# 使用sklearn中的基本分类器来进行分类

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
#集成方法
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.pipeline import Pipeline
%matplotlib inline

# 根据EDA分析，对训练集和测试集进行数据预处理

In [2]:
# Read the training and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# Preview the first rows of the raw training frame.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the first rows of the raw test frame.
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Keep the test PassengerId column; the submission files written by the modelling cells use it.
df_test_id = df_test['PassengerId']

In [6]:
# Show the first five saved test ids.
df_test_id[:5]

0    892
1    893
2    894
3    895
4    896
Name: PassengerId, dtype: int64

# 开始进行数据处理

In [7]:
# Separate the label from the features: keep 'Survived' as the target Series and
# build a training frame without it (assigned back rather than dropped in place).
df_target = df_train['Survived']
df_train = df_train.drop(['Survived'], axis=1)
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(df_train.shape)
print(df_test.shape)

(891, 11)
(418, 11)


In [8]:
# Alias the target Series under the name the modelling cells pass to fit().
df_train_y = df_target

In [9]:
# Stack train and test rows into one frame with a fresh 0..n-1 index so that the
# preprocessing cells below are applied to both sets in the same way.
frames_to_stack = [df_train, df_test]
df_clean = pd.concat(frames_to_stack, ignore_index=True)

## 1.填充缺失数据

In [10]:
# Impute missing values: median for the numeric columns, most frequent value for Embarked.
# The filled Series is assigned back to the frame. Calling fillna(inplace=True) on a
# column selection operates on an intermediate object, which pandas warns about and
# which does not update df_clean when copy-on-write is active.
# NOTE(review): the statistics are computed over train and test rows combined - confirm
# that this is intended.
df_clean['Age'] = df_clean['Age'].fillna(df_clean['Age'].median())
df_clean['Embarked'] = df_clean['Embarked'].fillna(df_clean['Embarked'].mode()[0])
df_clean['Fare'] = df_clean['Fare'].fillna(df_clean['Fare'].median())


## 2.创建新的特征

In [11]:
# Flag whether a Cabin value is recorded. notnull() detects every NaN, unlike an
# identity test against np.nan, which only matches that one specific float object.
df_clean['Has_Cabin'] = df_clean['Cabin'].notnull().astype('int64')
# Family members aboard, counting the passenger.
df_clean['FamilySize'] = df_clean['SibSp']+df_clean['Parch']+1
# Start with everyone flagged as travelling alone ...
df_clean['IsAlone'] = 1
# ... then clear the flag where the family has more than one member. A single .loc call
# on the frame replaces the chained df_clean['IsAlone'].loc[...] assignment, which
# raised SettingWithCopyWarning and is not guaranteed to write through to df_clean.
df_clean.loc[df_clean['FamilySize']>1, 'IsAlone'] = 0
# Extract the passenger's title from the name: the text between ', ' and the first '.'.
df_clean['Title'] = df_clean['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
# Discretise Fare into 4 quantile-based bins.
df_clean['FareBin'] = pd.qcut(df_clean['Fare'],4)
# Discretise Age (cast to int first) into 5 equal-width bins.
df_clean['AgeBin'] = pd.cut(df_clean['Age'].astype(int), 5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## 3.删除多余特征

In [12]:
# Columns removed from the feature frame: none of them is read by the later cells
# (the test PassengerId values were saved separately in df_test_id).
drop_column = ['PassengerId','Cabin','Ticket']
# Assign the result back instead of dropping in place.
df_clean = df_clean.drop(drop_column, axis=1)
# Confirm that no missing values remain; print() form works on Python 2 and Python 3.
print(df_clean.isnull().sum())

Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked      0
Has_Cabin     0
FamilySize    0
IsAlone       0
Title         0
FareBin       0
AgeBin        0
dtype: int64


In [13]:
# Titles occurring fewer than stat_min times are merged into a single 'Misc' category.
stat_min = 10
# Boolean Series indexed by title: True where the title's count is below stat_min.
title_names = (df_clean['Title'].value_counts()<stat_min)
# Look up each row's title in that Series and overwrite the rare ones in one vectorised
# assignment (replaces the per-row lambda with its `== True` comparison).
is_rare_title = df_clean['Title'].map(title_names).astype(bool)
df_clean.loc[is_rare_title, 'Title'] = 'Misc'
# print() form works on both Python 2 and Python 3.
print(df_clean['Title'].value_counts())

Mr        757
Miss      260
Mrs       197
Master     61
Misc       34
Name: Title, dtype: int64


In [14]:
# Summarise column dtypes and non-null counts of the cleaned frame.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
Pclass        1309 non-null int64
Name          1309 non-null object
Sex           1309 non-null object
Age           1309 non-null float64
SibSp         1309 non-null int64
Parch         1309 non-null int64
Fare          1309 non-null float64
Embarked      1309 non-null object
Has_Cabin     1309 non-null int64
FamilySize    1309 non-null int64
IsAlone       1309 non-null int64
Title         1309 non-null object
FareBin       1309 non-null category
AgeBin        1309 non-null category
dtypes: category(2), float64(2), int64(6), object(4)
memory usage: 125.5+ KB


## 4.格式转换

In [15]:
# Integer-encode the categorical columns; each '<column>_Code' holds the encoded values.
# One encoder instance is refitted per column, so after the loop `label` holds the
# classes of the last column only.
label = LabelEncoder()
for source_col in ['Sex', 'Embarked', 'Title', 'AgeBin', 'FareBin']:
    df_clean[source_col + '_Code'] = label.fit_transform(df_clean[source_col])

# Feature columns carried forward to the modelling frame.
df_x_bin = [
    'Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code',
    'FamilySize', 'AgeBin_Code', 'FareBin_Code', 'Has_Cabin',
]

In [22]:
# Preview the label-encoded feature columns. They are selected straight from df_clean:
# df_clean_bin is only created in a later cell, so referencing it here raises NameError
# when the notebook is run top to bottom.
df_clean[df_x_bin].head()

Unnamed: 0,Sex_Code,Pclass,Embarked_Code,Title_Code,FamilySize,AgeBin_Code,FareBin_Code,Has_Cabin
0,1,3,2,3,2,1,0,0
1,0,1,0,4,2,2,3,1
2,0,3,2,2,1,1,1,0
3,0,1,2,4,2,2,3,1
4,1,3,2,3,1,2,1,0


In [26]:
# Build the modelling frame: one-hot encode Pclass, Embarked_Code and Title_Code,
# and keep the remaining encoded columns as they are.
df_clean_bin = df_clean[df_x_bin]
passthrough_cols = ['Sex_Code','FamilySize','AgeBin_Code','FareBin_Code','Has_Cabin']

df_pclass_dummy = pd.get_dummies(df_clean_bin['Pclass'], prefix='Pclass')
df_embarked_dummy = pd.get_dummies(df_clean_bin['Embarked_Code'], prefix='Embarked_Code')
df_title_dummy = pd.get_dummies(df_clean_bin['Title_Code'], prefix='Title_Code')

# Column order: pass-through columns first, then the three dummy groups.
feature_parts = [
    df_clean_bin[passthrough_cols],
    df_pclass_dummy,
    df_embarked_dummy,
    df_title_dummy,
]
df_clean_bin = pd.concat(feature_parts, axis=1)

In [27]:
# Check the (rows, columns) size of the one-hot encoded feature frame.
df_clean_bin.shape

(1309, 16)

In [28]:
# Undo the earlier concat: the first df_train_m rows are the training set, the rest the test set.
# The row count is taken from the label Series instead of being hard-coded, so the split
# stays correct if the training file changes.
df_train_m = len(df_train_y)
# Positional slicing avoids relying on the index labels being exactly 0..n-1.
df_train = df_clean_bin.iloc[:df_train_m, :]
df_test = df_clean_bin.iloc[df_train_m:, :].reset_index(drop=True)

# 处理缺失数据

# 使用sklearn LogisticRegression建模

In [30]:
# Preview the first rows of the encoded training features that the models are fitted on.
df_train.head()

Unnamed: 0,Sex_Code,FamilySize,AgeBin_Code,FareBin_Code,Has_Cabin,Pclass_1,Pclass_2,Pclass_3,Embarked_Code_0,Embarked_Code_1,Embarked_Code_2,Title_Code_0,Title_Code_1,Title_Code_2,Title_Code_3,Title_Code_4
0,1,2,1,0,0,0,0,1,0,0,1,0,0,0,1,0
1,0,2,2,3,1,1,0,0,1,0,0,0,0,0,0,1
2,0,1,1,1,0,0,0,1,0,0,1,0,0,1,0,0
3,0,2,2,3,1,1,0,0,0,0,1,0,0,0,0,1
4,1,1,2,1,0,0,0,1,0,0,1,0,0,0,1,0


In [31]:
# Standardise the features, then fit a logistic-regression classifier (seeded with
# random_state=1) and predict the test set.
lr_steps = [
    ('sc1', StandardScaler()),
    ('clf', LogisticRegression(random_state=1)),
]
pipe_lr = Pipeline(lr_steps).fit(df_train, df_train_y)
y_pred = pipe_lr.predict(df_test)

In [32]:
# Pair each test PassengerId with its predicted label and save the result as CSV
# without the index column.
df_result = pd.DataFrame(
    {'PassengerId': df_test_id, 'Survived': y_pred},
    columns=['PassengerId', 'Survived'],
)
df_result.to_csv('data/lr_submission.csv', index=False)

# 使用sklearn SVM建模（线性核与高斯核）

In [44]:
#线性核
pipe_svm = Pipeline([('sc1',StandardScaler()),('clf',SVC(kernel='linear'))])
pipe_svm.fit(df_train, df_train_y)
y_pred = pipe_svm.predict(df_test)

In [45]:
# Pair each test PassengerId with its predicted label and save the result as CSV
# without the index column.
df_result = pd.DataFrame(
    {'PassengerId': df_test_id, 'Survived': y_pred},
    columns=['PassengerId', 'Survived'],
)
df_result.to_csv('data/svm_submission.csv', index=False)

In [46]:
#高斯核
pipe_rbf = Pipeline([('sc1',StandardScaler()),('clf',SVC(kernel='rbf'))])
pipe_rbf.fit(df_train, df_train_y)
y_pred = pipe_lr.predict(df_test)

df_result = pd.DataFrame({'PassengerId':df_test_id,'Survived':y_pred})
df_result.to_csv('data/rbf_submission.csv',index=False)

# 使用sklearn 决策树建模

In [33]:
# Decision tree with default hyper-parameters. random_state is pinned (same seed as the
# logistic-regression cell) so the fitted tree and the submission file are reproducible
# across runs.
dt = DecisionTreeClassifier(random_state=1)
dt.fit(df_train,df_train_y)
y_pred = dt.predict(df_test)

# Pair each test PassengerId with its predicted label and save the submission file.
df_result = pd.DataFrame({'PassengerId':df_test_id,'Survived':y_pred})
df_result.to_csv('data/dt_submission.csv',index=False)

# 使用sklearn k近邻建模

In [48]:
# k-nearest-neighbours with default settings, fitted directly on the training features
# (no scaling step in this cell).
knn = KNeighborsClassifier().fit(df_train, df_train_y)
y_pred = knn.predict(df_test)

# Pair each test PassengerId with its predicted label and save the submission file.
df_result = pd.DataFrame(
    {'PassengerId': df_test_id, 'Survived': y_pred},
    columns=['PassengerId', 'Survived'],
)
df_result.to_csv('data/knn_submission.csv', index=False)

# 使用sklearn 高斯朴素贝叶斯建模

In [49]:
# Gaussian naive Bayes with default settings.
gnb = GaussianNB().fit(df_train, df_train_y)
y_pred = gnb.predict(df_test)

# Pair each test PassengerId with its predicted label and save the submission file.
df_result = pd.DataFrame(
    {'PassengerId': df_test_id, 'Survived': y_pred},
    columns=['PassengerId', 'Survived'],
)
df_result.to_csv('data/gnb_submission.csv', index=False)

# 使用sklearn 高斯过程分类器

In [34]:
# Gaussian-process classifier using a constant-scaled RBF kernel with one length scale.
kernel = 1.0 * RBF([1.0])
gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel)
gpc_rbf_isotropic.fit(df_train, df_train_y)
y_pred = gpc_rbf_isotropic.predict(df_test)

# Pair each test PassengerId with its predicted label and save the submission file.
df_result = pd.DataFrame(
    {'PassengerId': df_test_id, 'Survived': y_pred},
    columns=['PassengerId', 'Survived'],
)
df_result.to_csv('data/gpc_rbf_submission.csv', index=False)

# 使用sklearn 随机森林建模

In [51]:
# Random forest with default hyper-parameters. random_state is pinned (same seed as the
# logistic-regression cell) so the bootstrap samples, and therefore the predictions,
# are reproducible across runs.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(df_train, df_train_y)
y_pred = rfc.predict(df_test)

# Pair each test PassengerId with its predicted label and save the submission file.
df_result = pd.DataFrame({'PassengerId':df_test_id,'Survived':y_pred})
df_result.to_csv('data/rfc_submission.csv',index=False)

# 使用sklearn AdaBoost建模

In [52]:
# AdaBoost with default hyper-parameters. random_state is pinned (same seed as the
# logistic-regression cell) so repeated runs give the same model and submission file.
adaboost = AdaBoostClassifier(random_state=1)
adaboost.fit(df_train, df_train_y)
y_pred = adaboost.predict(df_test)

# Pair each test PassengerId with its predicted label and save the submission file.
df_result = pd.DataFrame({'PassengerId':df_test_id,'Survived':y_pred})
df_result.to_csv('data/ada_submission.csv',index=False)

# 使用sklearn gbm建模

In [35]:
# Gradient boosting with default hyper-parameters. random_state is pinned (same seed as
# the logistic-regression cell) so repeated runs give the same model and submission file.
gbm = GradientBoostingClassifier(random_state=1)
gbm.fit(df_train,df_train_y)
y_pred = gbm.predict(df_test)

# Pair each test PassengerId with its predicted label and save the submission file.
df_result = pd.DataFrame({'PassengerId':df_test_id,'Survived':y_pred})
df_result.to_csv('data/gbm_submission.csv',index=False)

# 使用默认的参数，对数据进行训练，发现gbm模型的得分最高0.799
## 特征工程还可以通过对连续数据的离散化来获取额外的信息，降低模型的方差（提高泛化能力）
## 通过交叉验证，网格搜索对模型参数进行调优