In [1]:
# 导入数据集
import pandas as pd 
data = pd.read_csv(filepath_or_buffer='../111 - 分类算法 - 决策树 - 泰坦尼克号乘客生存预测/titanic.csv')

In [2]:
data

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0000,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0000,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0000,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0000,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male
...,...,...,...,...,...,...,...,...,...,...,...
1308,1309,3rd,0,"Zakarian, Mr Artun",,,,,,,male
1309,1310,3rd,0,"Zakarian, Mr Maprieder",,,,,,,male
1310,1311,3rd,0,"Zenn, Mr Philip",,,,,,,male
1311,1312,3rd,0,"Zievens, Rene",,,,,,,female


In [3]:
# 选择合适的特征
data_x = data.loc[:, ['pclass', 'age', 'sex']]
data_y = data.loc[:, 'survived']

In [8]:
data_x.tail()

Unnamed: 0,pclass,age,sex
1308,3rd,,male
1309,3rd,,male
1310,3rd,,male
1311,3rd,,female
1312,3rd,,male


In [11]:
# 填补 age 中的 Nan 值
data_x.fillna(value=data_x['age'].mean(), inplace=True)

In [13]:
# 因为特征值中属于类别的较多，所以使用字典进行特征分类
data_x = data_x.to_dict(orient='records')

In [14]:
data_x

[{'pclass': '1st', 'age': 29.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 2.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 30.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 25.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 0.9167, 'sex': 'male'},
 {'pclass': '1st', 'age': 47.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 63.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 39.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 58.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 71.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 47.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 19.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'female'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '1st', 'age': 50.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 24.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 36.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 37.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 47.0, 

In [16]:
# 训练集划分
from sklearn import model_selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(data_x, data_y, random_state=233)

In [17]:
# 进行字典特征提取
from sklearn import feature_extraction
transfer = feature_extraction.DictVectorizer()
x_train = transfer.fit_transform(X=x_train)
x_test = transfer.transform(X=x_test)

In [19]:
# 获取随机森林预估计
from sklearn import ensemble
estimator = ensemble.RandomForestClassifier(criterion='entropy', n_estimators=)

In [22]:
# 加入网格搜索和交叉验证
from sklearn import model_selection

# 网格搜索的超参数：
param_dict = {'n_estimators': [100, 120, 150, 300, 400, 500],
              'max_depth': range(1, 15)}
estimator = model_selection.GridSearchCV(estimator=estimator, param_grid=param_dict, cv=3)
estimator.fit(X=x_train, y=y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(criterion='entropy'),
             param_grid={'max_depth': range(1, 15),
                         'n_estimators': [100, 120, 150, 300, 400, 500]})

In [24]:
# 模型评估
y_predict = estimator.predict(X=x_test)
print('模型预测结果和真实值的比对：', y_predict == y_test)

score = estimator.score(X=x_test, y=y_test)
print('\n准确率为：', score)

print('\n最佳参数为:', estimator.best_params_)
print('\n最佳预估器为:', estimator.best_estimator_)
print('\n最佳结果为:', estimator.best_score_)
print('\n交叉验证结果为:', estimator.cv_results_)

模型预测结果和真实值的比对： 1299     True
1085     True
86       True
293      True
745     False
        ...  
1161    False
471      True
1225     True
1204     True
618      True
Name: survived, Length: 329, dtype: bool

准确率为： 0.8145896656534954

最佳参数为: {'max_depth': 5, 'n_estimators': 400}

最佳预估器为: RandomForestClassifier(criterion='entropy', max_depth=5, n_estimators=400)

最佳结果为: 0.8302845528455286

交叉验证结果为: {'mean_fit_time': array([0.04149715, 0.05144397, 0.06061228, 0.11495733, 0.15392995,
       0.19005903, 0.04033597, 0.05307404, 0.06045397, 0.1186552 ,
       0.15832551, 0.19886748, 0.04351473, 0.05076687, 0.06379048,
       0.12620465, 0.17539756, 0.21632822, 0.04536573, 0.05356892,
       0.06660565, 0.13329697, 0.17622232, 0.22041806, 0.04776041,
       0.05743631, 0.07068006, 0.14035861, 0.18719602, 0.2345473 ,
       0.04945525, 0.05937481, 0.07538629, 0.15010802, 0.20693477,
       0.25289965, 0.05187583, 0.06778709, 0.07875586, 0.15679971,
       0.20796005, 0.25496252, 0.0529333 , 

In [30]:
# 可视化，不能这样可视化！！
from sklearn import tree
tree.export_graphviz(decision_tree=estimator.best_estimator_, out_file='random_tree.dot', feature_names=transfer.feature_names_)

AttributeError: 'RandomForestClassifier' object has no attribute 'tree_'