In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Load the data
train = pd.read_csv('train.csv')
# Use the `columns=` keyword: passing the axis positionally (`drop('Survived', 1)`)
# was deprecated in pandas 1.3 and is rejected by pandas 2.0+.
x_train = train.drop(columns='Survived')
y_train = train['Survived']

x_test = pd.read_csv('test.csv')
# Keep the passenger ids; they are used later as the key of the exported predictions.
test_id = x_test['PassengerId']

In [3]:
# Inspect the dtypes and non-null counts of the training features
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [4]:
# Inspect the dtypes and non-null counts of the test features
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
# Feature selection
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Take explicit copies: later cells assign into these frames, and assigning
# into a column subset of another frame triggers SettingWithCopyWarning.
x_train = x_train[features].copy()
x_test = x_test[features].copy()

#### Fill missing values (NaN)

In [6]:
# Concatenate x_train and x_test so that missing entries are filled with the
# mean / most frequent value computed over the whole data set.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index
# gives the combined frame a clean 0..n-1 index, which the original
# `data.reset_index()` call did not do because its result was discarded.
data = pd.concat([x_train, x_test], ignore_index=True)
assert data.shape[0] == x_train.shape[0] + x_test.shape[0]

import math

# Fill values derived from the combined data
age_mean = data.Age.mean()
fare_mean = data.Fare.mean()
embarked_top = data.Embarked.value_counts().index[0]  # most frequent port

# Age is filled with the mean and floored to an integer; Embarked with the
# most frequent value; Fare with the mean.
# Series.map keeps each frame's own index (the previous
# pd.Series(map(...)) built a fresh 0..n-1 index and only aligned correctly
# because the frames happened to carry a default RangeIndex).
# assign() returns a new frame, so nothing is written into a possible view.
# Every fill is applied to both frames so a gap in either split is handled.
x_train = x_train.assign(
    Age=x_train.Age.fillna(age_mean).map(math.floor),
    Embarked=x_train.Embarked.fillna(embarked_top),
    Fare=x_train.Fare.fillna(fare_mean),
)
x_test = x_test.assign(
    Age=x_test.Age.fillna(age_mean).map(math.floor),
    Embarked=x_test.Embarked.fillna(embarked_top),
    Fare=x_test.Fare.fillna(fare_mean),
)

# The three imputed columns must be complete in both splits from here on.
imputed_columns = ['Age', 'Embarked', 'Fare']
assert not x_train[imputed_columns].isna().any().any()
assert not x_test[imputed_columns].isna().any().any()

#### One-hot Encoding

In [7]:
# One-hot encode the features whose values carry no numeric ordering.
categorical_columns = ['Pclass', 'Sex', 'Embarked']
# dtype is pinned to uint8 so the dummies are numeric on every pandas version
# (pandas >= 2.0 would otherwise produce bool columns).
x_train = pd.get_dummies(x_train, columns=categorical_columns, dtype=np.uint8)
x_test = pd.get_dummies(x_test, columns=categorical_columns, dtype=np.uint8)

# The two frames are encoded independently, so a category that is absent from
# one split would yield a different set/order of columns. Align the test
# frame to the training layout; categories unseen in test become all-zero.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)
assert list(x_train.columns) == list(x_test.columns)

#### Normalization

In [8]:
def Scale(train_data, test_data, features=None, mode='Standar'):
    '''
    Fit a scaler on the training set and apply it to both train and test sets.

    train_data: training set; the scaler is fitted on it.

    test_data: test set; transformed with the scaler fitted on train_data.

    features: list of the features to scale. Default None scales every feature.

    mode: 'Standar' selects StandardScaler; 'MinMax' selects MinMaxScaler.

    Returns a (scaled_train, scaled_test) pair. When `mode` is not supported,
    or `features` is neither None nor a list, a message is printed and None
    is returned instead. The input frames are not modified.
    '''
    # Validate the arguments first, before building a scaler or copying data.
    if mode not in ('Standar', 'MinMax'):
        print('the Scale mode selected has not suppoted ')
        return None
    # `is None` instead of `== None`: the equality form is evaluated
    # element-wise for array-like arguments.
    if features is not None and not isinstance(features, list):
        print(' ''List'' type of features requested!')
        return None

    scaler = StandardScaler() if mode == 'Standar' else MinMaxScaler()

    # Scale every column: the scaler output is returned as is.
    if features is None:
        scaled_train = scaler.fit_transform(train_data)
        scaled_test = scaler.transform(test_data)
        return scaled_train, scaled_test

    # Scale only the requested columns on copies of the inputs.
    s_train = train_data.copy(deep=True)
    s_test = test_data.copy(deep=True)
    # Replace the columns rather than writing through `.loc[:, features]`:
    # the scaled values are floats, and setting them in place into integer
    # columns is an incompatible-dtype assignment on recent pandas versions.
    s_train[features] = scaler.fit_transform(s_train[features])
    s_test[features] = scaler.transform(s_test[features])
    return np.array(s_train), np.array(s_test)

In [9]:
# Standardized variant: only the 'Age' column goes through StandardScaler
ss_train, ss_test = Scale(
    train_data=x_train,
    test_data=x_test,
    features=['Age'],
    mode='Standar',
)

# Min-max variant: only the 'Age' column goes through MinMaxScaler
mn_train, mn_test = Scale(
    train_data=x_train,
    test_data=x_test,
    features=['Age'],
    mode='MinMax',
)

### Model

In [10]:
def export_csv(x_test_id, prediction, name):
    '''Save the predictions to "<name>.csv", keyed by PassengerId.'''
    submission = pd.DataFrame({'PassengerId': x_test_id, 'Survived': prediction})
    indexed = submission.set_index('PassengerId')
    target_path = name + '.csv'
    return indexed.to_csv(target_path)

#### sklearn 

In [11]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier as DTC

# Model definition.
# random_state is fixed so the exported predictions are reproducible: the
# tree permutes the features at each split, so ties between equally good
# splits can otherwise be resolved differently from run to run.
dtc = DTC(random_state=42)

# Model training
dtc.fit(ss_train, y_train)

# Prediction
ss_dtc_predictions = dtc.predict(ss_test)

# Save the predictions
export_csv(test_id, ss_dtc_predictions, name='ss_dtc_predictions')

In [12]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

# Define the model and train it (fit returns the fitted estimator itself)
LR = LogisticRegression().fit(ss_train, y_train)

# Predict on the standardized test set
ss_lr_predictions = LR.predict(ss_test)

# Save the predictions
export_csv(
    x_test_id=test_id,
    prediction=ss_lr_predictions,
    name='ss_lr_predictions',
)

#### TPOT 

In [13]:
from tpot import TPOTClassifier

# Configure the TPOT search
tpot = TPOTClassifier(
    generations=5,
    population_size=50,
    verbosity=2,
    random_state=42,
)

# Run the search on the standardized training set
tpot.fit(ss_train, y_train)

# Predict on the standardized test set
ss_tpot_prediction = tpot.predict(ss_test)

# Save the predictions
export_csv(
    x_test_id=test_id,
    prediction=ss_tpot_prediction,
    name='ss_tpot_prediction',
)

# Export the best pipeline found as a Python script
tpot.export('ss_pipeline.py')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=300.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.8328102441780179
Generation 2 - Current best internal CV score: 0.8328102441780179
Generation 3 - Current best internal CV score: 0.8350134957002073
Generation 4 - Current best internal CV score: 0.8350134957002073
Generation 5 - Current best internal CV score: 0.8350134957002073
Best pipeline: RandomForestClassifier(XGBClassifier(input_matrix, learning_rate=0.1, max_depth=5, min_child_weight=11, n_estimators=100, nthread=1, subsample=0.9000000000000001), bootstrap=False, criterion=gini, max_features=0.1, min_samples_leaf=5, min_samples_split=2, n_estimators=100)
