In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer

# Load the training and test sets from CSV files in the working directory.
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')
# Data exploration: inspect the training set.
# pd.set_option('display.max_columns', None)  # uncomment to show all columns
print('查看数据信息：列名、非空个数、类型等')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
train_data.info()
print('-'*30)
# Summary statistics of the numeric columns.
print('查看数据摘要')
print(train_data.describe())
print('-'*30)
# Summary of the object-dtype (string) columns.
print('查看离散数据分布')
print(train_data.describe(include=['O']))
print('-'*30)
# First five rows.
print('查看前5条数据')
print(train_data.head())
print('-'*30)
# Last five rows.
print('查看后5条数据')
print(train_data.tail())

# Fill missing ages with the mean age of the respective data set.
# The filled column is assigned back explicitly: calling
# fillna(..., inplace=True) on df['col'] is a chained in-place operation that
# raises a FutureWarning in pandas 2.x and does not modify the frame at all
# once copy-on-write is enabled, which would leave the NaNs in place.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
# Fill missing fares with the mean fare of the respective data set.
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].mean())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())

print(train_data['Embarked'].value_counts())
# Fill missing ports of embarkation with 'S', taken to be the most frequent
# port in the value counts printed above.
train_data['Embarked'] = train_data['Embarked'].fillna('S')
test_data['Embarked'] = test_data['Embarked'].fillna('S')
# Feature selection: the columns fed to the models; 'Survived' is the target.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train_labels = train_data['Survived']
# Take the same column subset from both data sets.
train_features, test_features = train_data[features], test_data[features]
print('特征值')
print(train_features)

# Equivalent to one-hot encoding: DictVectorizer turns every string-valued
# feature into indicator columns and passes numeric values through.
dvec = DictVectorizer(sparse=False)
# orient='records' yields one {column: value} dict per row. The abbreviated
# spelling 'record' is rejected by pandas 2.0 and later.
# Note: train_features is rebound here from a DataFrame to a NumPy array, so
# this statement cannot be re-run without re-running the feature selection.
train_features = dvec.fit_transform(train_features.to_dict(orient='records'))
print(dvec.feature_names_)
# Decision tree splitting on entropy (information gain, as ID3 does);
# scikit-learn's implementation itself is an optimised CART.
# random_state is fixed so that repeated runs build the same tree.
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
# Train the tree on the encoded training set.
clf.fit(train_features, train_labels)

# Encode the test set with the vocabulary learned from the training set
# (transform, not fit_transform) so both matrices share the same columns.
test_features = dvec.transform(test_features.to_dict(orient='records'))
# Predict survival for the test set.
pred_labels = clf.predict(test_features)

# Accuracy of the decision tree measured on the training set itself.
acc_decision_tree = round(clf.score(train_features, train_labels), 6)
print(u'score准确率为 %.4lf' % acc_decision_tree)

# Mean accuracy of the decision tree over 10-fold cross-validation.
print(
    u'cross_val_score准确率为 %.4lf'
    % cross_val_score(clf, train_features, train_labels, cv=10).mean()
)


查看数据信息：列名、非空个数、类型等
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
------------------------------
查看数据摘要
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%

In [2]:
############################## tpot ################################################################

from tpot import TPOTClassifier
# Automated pipeline search with TPOT: 5 generations with a population of 20.
# random_state is fixed so that repeated runs arrive at the same pipeline.
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, random_state=42)
tpot.fit(train_features, train_labels)
# Export the best pipeline found as a standalone Python script.
tpot.export('tpot_titanic.py')
# In unseeded runs TPOT picked XGBoost, and sometimes a random forest instead.




HBox(children=(IntProgress(value=0, description='Optimization Progress', max=120, style=ProgressStyle(descript…


Generation 1 - Current best internal CV score: 0.8294080723118448
Generation 2 - Current best internal CV score: 0.8294394576611639
Generation 3 - Current best internal CV score: 0.8294394576611639
Generation 4 - Current best internal CV score: 0.8383968363567886
Generation 5 - Current best internal CV score: 0.8395329860021341
Best pipeline: XGBClassifier(input_matrix, learning_rate=0.001, max_depth=7, min_child_weight=1, n_estimators=100, nthread=1, subsample=0.9000000000000001)


In [3]:
# Fit and evaluate an XGBoost classifier.
# NOTE(review): these hyperparameters differ from the best pipeline shown in
# the recorded TPOT log (learning_rate=0.001, max_depth=7, subsample=0.9);
# presumably they come from a different TPOT run - confirm.
from xgboost import XGBClassifier
# n_jobs is the documented thread-count parameter; nthread is its deprecated
# alias.
exported_pipeline = XGBClassifier(learning_rate=0.1, max_depth=5, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.7500000000000001)
exported_pipeline.fit(train_features, train_labels)
# Predictions for the test set.
results = exported_pipeline.predict(test_features)
# Accuracy measured on the training set itself.
xgboost_rate = round(exported_pipeline.score(train_features, train_labels), 6)
print(u'xgboost score准确率为 %.4lf' % xgboost_rate)
# Mean accuracy of the XGBoost model over 10-fold cross-validation.
print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(exported_pipeline, train_features, train_labels, cv=10)))


xgboost score准确率为 0.9080
cross_val_score准确率为 0.8283


In [4]:
# TPOT sometimes recommends a random forest instead; fit and evaluate one.
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed because max_features=0.1 makes every split consider a
# random subset of the features, so unseeded runs give different forests.
exported_pipeline1 = RandomForestClassifier(bootstrap=False, criterion="gini", max_features=0.1, min_samples_leaf=15, min_samples_split=15, n_estimators=100, random_state=42)
exported_pipeline1.fit(train_features, train_labels)
# Predictions for the test set.
results = exported_pipeline1.predict(test_features)
# Accuracy measured on the training set itself.
random_score = round(exported_pipeline1.score(train_features, train_labels), 6)
print(u'随机森林 score准确率为 %.4lf' % random_score)
# Mean accuracy of the random forest over 10-fold cross-validation.
print(u'cross_val_score准确率为 %.4lf' % np.mean(cross_val_score(exported_pipeline1, train_features, train_labels, cv=10)))


随机森林 score准确率为 0.8361
cross_val_score准确率为 0.8115
