# 使用sklearn中的基本分类器来进行分类

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
#集成方法
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
%matplotlib inline

# 根据EDA分析，对训练集和测试集进行数据预处理

In [2]:
# Read the training and test splits from the local data directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the first rows of the test frame (it has no Survived column).
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Keep the test-set passenger ids in their own Series: the PassengerId column
# is dropped from df_test further down, but the ids are needed again when the
# submission files are written.
df_test_id = df_test['PassengerId']

In [6]:
# Show the first five saved passenger ids.
df_test_id[:5]

0    892
1    893
2    894
3    895
4    896
Name: PassengerId, dtype: int64

In [7]:
# Remove the passenger id, name and ticket columns from both frames; none of
# them is used as a model feature below.
unused_columns = ['PassengerId', 'Name', 'Ticket']
df_train = df_train.drop(unused_columns, axis=1)
df_test = df_test.drop(unused_columns, axis=1)

In [8]:
# Pull the target labels out of the training frame before the column is dropped.
df_train_y = df_train['Survived']

In [9]:
# The labels now live in df_train_y, so remove them from the feature frame.
df_train = df_train.drop('Survived', axis=1)

In [10]:
# Count missing values per column in the test frame.
df_test.isnull().sum()

Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Fare          1
Cabin       327
Embarked      0
dtype: int64

In [11]:
# (rows, columns) of the training features.
df_train.shape

(891, 8)

In [12]:
# (rows, columns) of the test features; the column count should match the training frame.
df_test.shape

(418, 8)

In [13]:
# Number of training rows; used later to split the combined frame back apart.
df_train_m = len(df_train)

In [14]:
# Stack train on top of test so that imputation and encoding are applied to
# both at once; ignore_index gives the combined frame a fresh 0..n-1 index.
df_data = pd.concat((df_train, df_test), axis=0, ignore_index=True)

# 处理缺失数据

In [15]:
# Flag whether a cabin was recorded (1) or the Cabin value is missing (0).
# notnull() is used instead of an `is np.nan` identity check, which only
# recognises the np.nan singleton and would mark any other missing-value
# object (None, float('nan'), ...) as "has cabin".
df_data['has_cabin'] = df_data['Cabin'].notnull().astype('int64')

In [16]:
# The raw Cabin text is no longer needed once has_cabin has been derived.
df_data = df_data.drop('Cabin', axis=1)

In [17]:
# Fill missing ages of female passengers with the most frequent known female age.
is_female = df_data['Sex'] == 'female'
age_known = df_data['Age'].notnull()
female_age_mode = df_data.loc[is_female & age_known, 'Age'].mode()[0]
df_data.loc[is_female & ~age_known, 'Age'] = female_age_mode

In [18]:
# Fill missing ages of male passengers with the most frequent known male age.
is_male = df_data['Sex'] == 'male'
age_known = df_data['Age'].notnull()
male_age_mode = df_data.loc[is_male & age_known, 'Age'].mode()[0]
df_data.loc[is_male & ~age_known, 'Age'] = male_age_mode

In [19]:
# Replace missing embarkation ports with the most common port.
embarked_mode = df_data['Embarked'].dropna().mode()[0]
df_data['Embarked'] = df_data['Embarked'].fillna(embarked_mode)

In [20]:
# Replace missing fares with the most frequent fare value.
fare_mode = df_data['Fare'].dropna().mode()[0]
df_data['Fare'] = df_data['Fare'].fillna(fare_mode)

# 对类别特征进行独热编码

In [21]:
# Confirm that no column has missing values left and check which columns are
# still of object dtype and therefore need encoding.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
Pclass       1309 non-null int64
Sex          1309 non-null object
Age          1309 non-null float64
SibSp        1309 non-null int64
Parch        1309 non-null int64
Fare         1309 non-null float64
Embarked     1309 non-null object
has_cabin    1309 non-null int64
dtypes: float64(2), int64(4), object(2)
memory usage: 81.9+ KB


In [22]:
# One-hot encode the categorical columns; each indicator column is named
# "<source column>_<value>".
dummies_Embarked, dummies_Sex, dummies_Pclass = (
    pd.get_dummies(df_data[column], prefix=column)
    for column in ('Embarked', 'Sex', 'Pclass')
)

# Append the indicator columns and drop the original categorical columns.
df_data = pd.concat(
    [df_data, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1
).drop(['Pclass', 'Sex', 'Embarked'], axis=1)
df_data.head()

Unnamed: 0,Age,SibSp,Parch,Fare,has_cabin,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
0,22.0,1,0,7.25,0,0,0,1,0,1,0,0,1
1,38.0,1,0,71.2833,1,1,0,0,1,0,1,0,0
2,26.0,0,0,7.925,0,0,0,1,1,0,0,0,1
3,35.0,1,0,53.1,1,0,0,1,1,0,1,0,0
4,35.0,0,0,8.05,0,0,0,1,0,1,0,0,1


In [23]:
# Per the EDA, Age and SibSp are only weakly correlated with Survived, and
# has_cabin is strongly positively correlated with Pclass_1, so only one of
# that pair is kept (has_cabin is the one removed).
weak_or_redundant = ['Age', 'SibSp', 'has_cabin']
df_data = df_data.drop(weak_or_redundant, axis=1)

In [24]:
# (rows, columns) of the combined frame after feature selection.
df_data.shape

(1309, 10)

In [25]:
# Split the combined frame back into its two parts. df_data was built with
# ignore_index=True, so its labels are 0..n-1 and positional slicing picks the
# same rows as the label-based slice: the first df_train_m rows are training
# data, the remainder is test data (re-indexed from 0).
df_train = df_data.iloc[:df_train_m, :]
df_test = df_data.iloc[df_train_m:, :].reset_index(drop=True)

In [26]:
# Sanity check: the training part should have its original row count.
df_train.shape

(891, 10)

In [27]:
# Sanity check: the test part should have its original row count.
df_test.shape

(418, 10)

In [28]:
# The combined frame itself is left unchanged by the split.
df_data.shape

(1309, 10)

In [29]:
# Verify that the processed test features contain no missing values.
df_test.isnull().sum()

Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
Sex_female    0
Sex_male      0
Pclass_1      0
Pclass_2      0
Pclass_3      0
dtype: int64

# 使用sklearn LogisticRegression建模

In [30]:
# Logistic regression on standardised features. The scaler is fitted on the
# training data inside the pipeline and re-applied to the test data at
# prediction time.
lr_steps = [
    ('sc1', StandardScaler()),
    ('clf', LogisticRegression(random_state=1)),
]
pipe_lr = Pipeline(lr_steps)
y_pred = pipe_lr.fit(df_train, df_train_y).predict(df_test)

In [31]:
# Pair every test passenger id with its predicted label and write the
# submission file without the index column.
df_result = pd.DataFrame({'PassengerId': df_test_id})
df_result['Survived'] = y_pred
df_result.to_csv('data/lr_submission.csv', index=False)

# 使用sklearn svm建模（线性核与高斯核）

In [33]:
# Support vector classifier with a linear kernel, on standardised features.
svm_steps = [
    ('sc1', StandardScaler()),
    ('clf', SVC(kernel='linear')),
]
pipe_svm = Pipeline(svm_steps)
y_pred = pipe_svm.fit(df_train, df_train_y).predict(df_test)

In [34]:
# Pair every test passenger id with the linear-SVM prediction and write the
# submission file without the index column.
df_result = pd.DataFrame({'PassengerId': df_test_id})
df_result['Survived'] = y_pred
df_result.to_csv('data/svm_submission.csv', index=False)

In [35]:
# Support vector classifier with a Gaussian (RBF) kernel, on standardised features.
pipe_rbf = Pipeline([('sc1',StandardScaler()),('clf',SVC(kernel='rbf'))])
pipe_rbf.fit(df_train, df_train_y)
# Predict with the RBF pipeline fitted above. The earlier version called
# pipe_lr.predict here, so rbf_submission.csv silently contained the
# logistic-regression predictions instead.
y_pred = pipe_rbf.predict(df_test)

df_result = pd.DataFrame({'PassengerId':df_test_id,'Survived':y_pred})
df_result.to_csv('data/rbf_submission.csv',index=False)

# 使用sklearn 决策树建模

In [37]:
# Decision tree with default hyper-parameters. random_state is pinned so the
# fitted tree, and therefore the submission file, is the same on every run;
# the value 1 matches the seed used for the logistic-regression model.
dt = DecisionTreeClassifier(random_state=1)
dt.fit(df_train,df_train_y)
y_pred = dt.predict(df_test)

# Pair every test passenger id with its predicted label and save.
df_result = pd.DataFrame({'PassengerId':df_test_id,'Survived':y_pred})
df_result.to_csv('data/dt_submission.csv',index=False)

# 使用sklearn k近邻建模

In [38]:
# k-nearest neighbours compares passengers by distance, so the features are
# standardised first (same scaler step as the logistic-regression and SVM
# pipelines); otherwise the wide-ranging Fare column would dominate the 0/1
# indicator columns.
knn = KNeighborsClassifier()
pipe_knn = Pipeline([('sc1', StandardScaler()), ('clf', knn)])
pipe_knn.fit(df_train, df_train_y)
y_pred = pipe_knn.predict(df_test)

# Pair every test passenger id with its predicted label and save.
df_result = pd.DataFrame({'PassengerId':df_test_id,'Survived':y_pred})
df_result.to_csv('data/knn_submission.csv',index=False)

# 使用sklearn 高斯朴素贝叶斯建模

In [39]:
# Gaussian naive Bayes with default settings, fitted on the unscaled features.
gnb = GaussianNB().fit(df_train, df_train_y)
y_pred = gnb.predict(df_test)

# Pair every test passenger id with its predicted label and save.
df_result = pd.DataFrame({'PassengerId': df_test_id})
df_result['Survived'] = y_pred
df_result.to_csv('data/gnb_submission.csv', index=False)

# 使用sklearn 高斯过程分类器

In [44]:
# Gaussian process classifier using a constant-scaled RBF kernel with a single
# length scale shared by all features (initial value 1.0).
kernel = 1.0 * RBF([1.0])
gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel)
gpc_rbf_isotropic.fit(df_train, df_train_y)
y_pred = gpc_rbf_isotropic.predict(df_test)

# Pair every test passenger id with its predicted label and save.
df_result = pd.DataFrame({'PassengerId': df_test_id})
df_result['Survived'] = y_pred
df_result.to_csv('data/gpc_rbf_submission.csv', index=False)

# 使用sklearn 随机森林建模

In [47]:
# Random forest with default hyper-parameters. random_state is pinned so the
# bootstrap samples and feature draws, and therefore the submission file, are
# the same on every run; the value 1 matches the logistic-regression seed.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(df_train, df_train_y)
y_pred = rfc.predict(df_test)

# Pair every test passenger id with its predicted label and save.
df_result = pd.DataFrame({'PassengerId':df_test_id,'Survived':y_pred})
df_result.to_csv('data/rfc_submission.csv',index=False)

# 使用sklearn AdaBoost建模

In [48]:
# AdaBoost with default hyper-parameters. random_state is pinned so repeated
# runs produce the same ensemble and submission file; the value 1 matches the
# logistic-regression seed.
adaboost = AdaBoostClassifier(random_state=1)
adaboost.fit(df_train, df_train_y)
y_pred = adaboost.predict(df_test)

# Pair every test passenger id with its predicted label and save.
df_result = pd.DataFrame({'PassengerId':df_test_id,'Survived':y_pred})
df_result.to_csv('data/ada_submission.csv',index=False)

# 使用sklearn gbm建模

In [49]:
# Gradient boosting with default hyper-parameters. random_state is pinned so
# repeated runs produce the same ensemble and submission file; the value 1
# matches the logistic-regression seed.
gbm = GradientBoostingClassifier(random_state=1)
gbm.fit(df_train,df_train_y)
y_pred = gbm.predict(df_test)

# Pair every test passenger id with its predicted label and save.
df_result = pd.DataFrame({'PassengerId':df_test_id,'Survived':y_pred})
df_result.to_csv('data/gbm_submission.csv',index=False)

# 使用默认的参数，对数据进行训练，发现gbm模型的得分最高0.799
## 特征工程还可以通过对连续数据的离散化来获取额外的信息，提高模型的泛化能力
## 通过交叉验证，网格搜索对模型参数进行调优