In [78]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

In [79]:
# 读取数据集
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')


In [80]:
# 选择用于训练的特征
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
x_train = train_data[features]
x_test = test_data[features]

y_train = train_data['Survived']

In [81]:
# 检查缺失值
print ('训练数据信息：')
x_train.info()
print('-'*30)
print('测试数据信息：')
x_test.info()

训练数据信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 48.8+ KB
------------------------------
测试数据信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         332 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        417 non-null float64
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


In [82]:
# 使用登录最多的港口来填充登录港口的nan值
print ('\n\n\n登录港口信息：')
print (x_train['Embarked'].value_counts())
x_train['Embarked'].fillna('S', inplace=True)
x_test['Embarked'].fillna('S', inplace=True)





登录港口信息：
S    644
C    168
Q     77
Name: Embarked, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [83]:
# 使用平均年龄来填充年龄中的nan值
x_train['Age'].fillna(x_train['Age'].mean(), inplace=True)
x_test['Age'].fillna(x_test['Age'].mean(), inplace=True)

In [84]:
# 使用票价的均值填充票价中的nan值
x_test['Fare'].fillna(x_test['Fare'].mean(), inplace=True)

In [85]:
# 将特征值转换成特征向量
dvec = DictVectorizer(sparse=False)

x_train = dvec.fit_transform(x_train.to_dict(orient='record'))
x_test = dvec.transform(x_test.to_dict(orient='record'))

In [86]:
# 打印特征向量格式
print ('\n\n\n特征向量格式')
print (dvec.feature_names_)




特征向量格式
['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']


In [87]:
# 支持向量机
svc = SVC()
# 决策树
dtc = DecisionTreeClassifier()
# 随机森林
rfc = RandomForestClassifier()
# 逻辑回归
lr = LogisticRegression()
# 贝叶斯
nb = MultinomialNB()
# K邻近
knn = KNeighborsClassifier()
# AdaBoost
boost = AdaBoostClassifier()

print ('\n\n\n模型验证:')
print ('SVM acc is', np.mean(cross_val_score(svc, x_train, y_train, cv=10)))
print ('DecisionTree acc is', np.mean(cross_val_score(dtc, x_train, y_train, cv=10)))
print ('RandomForest acc is', np.mean(cross_val_score(rfc, x_train, y_train, cv=10)))
print ('LogisticRegression acc is', np.mean(cross_val_score(lr, x_train, y_train, cv=10)))
print ('NaiveBayes acc is', np.mean(cross_val_score(nb, x_train, y_train, cv=10)))
print ('KNN acc is', np.mean(cross_val_score(knn, x_train, y_train, cv=10)))
print ('AdaBoost acc is', np.mean(cross_val_score(boost, x_train, y_train, cv=10)))







模型验证:




SVM acc is 0.7264374077857225
DecisionTree acc is 0.7790089093178979
RandomForest acc is 0.7935665645216207




LogisticRegression acc is 0.795800987402111
NaiveBayes acc is 0.6927267052547952
KNN acc is 0.7083591533310635
AdaBoost acc is 0.8104199296334128


In [88]:
# 训练
boost.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [89]:
# 预测
y_predict = boost.predict(x_test)

In [90]:
# 保存结果
result = {'PassengerId': test_data['PassengerId'],
          'Survived': y_predict}
result = pd.DataFrame(result)
result.to_csv('submission.csv',index=False)