# 案例：泰坦尼克号乘客生存预测
- 根据乘客的特征预测其生还可能性（Kaggle 经典案例）
- 这里分别用决策树和随机森林进行预测

**流程分析：**
1. 获取数据
2. 筛选特征值、标签值
3. 数据处理
    - 缺失值处理
    - 特征值 -> 字典列表（每位乘客一个字典，便于通过字典特征抽取转换成 one-hot 编码）
4. 划分数据集
5. 特征工程
    - 字典特征抽取
6. 决策树预估器流程
7. 模型评估
8. 随机森林预测

# 一、获取数据

In [18]:
import pandas as pd

In [19]:
# Load the Titanic passenger records from the course-material CSV file.
data_1 = pd.read_csv(filepath_or_buffer='./day2资料/02-代码/titanic.csv')
# Show the first five rows as the cell output.
data_1.head(n=5)

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male


In [20]:
# Count the missing entries in the age column (the column does contain missing values).
data_1['age'].isna().sum()

np.int64(680)

# 二、筛选特征值和标签值

In [21]:
# Keep three features; the double brackets select several columns and return a DataFrame.
# .copy() gives x its own data, so the later in-place fill of missing ages modifies x only
# and does not raise SettingWithCopyWarning on pandas versions without copy-on-write.
x = data_1[['pclass', 'age', 'sex']].copy()
# Label column: the survived flag for each passenger.
y = data_1['survived']

In [22]:
# Preview the first five rows of the selected feature columns.
x.head(n=5)

Unnamed: 0,pclass,age,sex
0,1st,29.0,female
1,1st,2.0,female
2,1st,30.0,male
3,1st,25.0,female
4,1st,0.9167,male


In [23]:
# Preview the first five label values.
y.head(n=5)

0    1
1    0
2    0
3    0
4    1
Name: survived, dtype: int64

# 三、数据处理

## 1、缺失值处理

In [24]:
# Replace every missing age with the mean of the observed ages.
x.loc[:, 'age'] = x['age'].fillna(value=x['age'].mean())
# Confirm that no missing ages remain (expected count: 0).
x['age'].isna().sum()

np.int64(0)

## 2、转换成字典

In [25]:
# Turn the feature table into a list with one {column: value} dict per passenger,
# the input format used by the dict-based feature extraction step.
x = x.to_dict('records')
# Display the resulting list as the cell output.
x

[{'pclass': '1st', 'age': 29.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 2.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 30.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 25.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 0.9167, 'sex': 'male'},
 {'pclass': '1st', 'age': 47.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 63.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 39.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 58.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 71.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 47.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 19.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'female'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '1st', 'age': 31.19418104265403, 'sex': 'male'},
 {'pclass': '1st', 'age': 50.0, 'sex': 'female'},
 {'pclass': '1st', 'age': 24.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 36.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 37.0, 'sex': 'male'},
 {'pclass': '1st', 'age': 47.0, 

# 四、划分数据集

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Split features and labels into training and test parts. test_size=0.25 is the
# library default written out explicitly; the fixed seed keeps the split repeatable.
(x_train, x_test,
 y_train, y_test) = train_test_split(x, y, test_size=0.25, random_state=11)

# 五、特征工程
- 字典特征抽取

In [28]:
from sklearn.feature_extraction import DictVectorizer
import numpy as np

In [29]:
# Dict feature extraction: the vectorizer learns its feature mapping from the training
# records only, and the same fitted mapping is then applied to the test records.
trans = DictVectorizer()
# The tuple is evaluated left to right, so fitting happens before the test transform.
x_train, x_test = trans.fit_transform(x_train), trans.transform(x_test)

# 六、决策树预估器流程
# 七、模型评估

In [30]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [31]:
# Instantiate and train a decision tree with default settings; the tree depth is a
# hyperparameter that can be tuned.
# NOTE(review): no random_state is passed, so results may vary slightly between runs —
# confirm whether a fixed seed is wanted.
esti = DecisionTreeClassifier()
esti.fit(x_train, y_train)

# Visualize the tree: export it to a Graphviz DOT file. Feature names are taken from
# the vectorizer because one-hot encoding changes the column names.
export_graphviz(esti, out_file='my_titanic_tree.dot', feature_names=trans.get_feature_names_out())

# Model evaluation, method 1: compare the predictions with the true labels by hand.
y_pred = esti.predict(x_test)
print("y_predict =", y_pred)
# The mean of the element-wise match mask is the fraction of correct predictions.
accuracy_1 = np.mean(y_pred == y_test)
print("accuracy_1 =", accuracy_1)

# Model evaluation, method 2: let the estimator compute the accuracy itself.
accuracy_2 = esti.score(x_test, y_test)
print("accuracy_2 =", accuracy_2)


y_predict = [0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 1 1 0 1 0 0 0 1 0
 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 0
 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 1 0 0 1 1 0 0 1 0 0 0 0 0 1 1 1 0 0
 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 0 0
 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1
 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
accuracy_1 = 0.7933130699088146
accuracy_2 = 0.7933130699088146


- 网格搜索 + 交叉验证

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
# Candidate tree depths 1..9, searched with 5-fold cross-validation on the training data.
para_grid = range(1, 10)
# The existing tree estimator is used as the base model; esti is rebound to the search object.
esti = GridSearchCV(esti, param_grid={'max_depth': para_grid}, cv=5)
esti.fit(x_train, y_train)

# Model evaluation, method 1: compare the predictions with the true labels by hand.
y_pred = esti.predict(x_test)
print("y_predict =", y_pred)
# The mean of the element-wise match mask is the fraction of correct predictions.
print("accuracy_1 =", np.mean(y_pred == y_test))

# Model evaluation, method 2: let the search object compute the accuracy itself.
accuracy = esti.score(x_test, y_test)
print("accuracy_2 =", accuracy)

# Best parameter combination found by the search: best_params_
print("最佳参数：\n", esti.best_params_)
# Best cross-validation score: best_score_
print("最佳结果：\n", esti.best_score_)
# Estimator configured with the best parameters: best_estimator_
print("最佳估计器:\n", esti.best_estimator_)
# Full cross-validation results: cv_results_ (uncomment to inspect)
# print("交叉验证结果:\n", esti.cv_results_)

y_predict = [0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0
 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0
 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0
 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
accuracy_1 = 0.8115501519756839
accuracy_2 = 0.8115501519756839
最佳参数：
 {'max_depth': 2}
最佳结果：
 0.8262353672433441
最佳估计器:
 DecisionTreeClassifier(max_depth=2)


# 八、随机森林预测
- 用网格搜索调参

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Hyperparameter grid for the random forest: number of trees and maximum tree depth.
para_dict = {
    'n_estimators': list(range(50, 101, 10)),
    'max_depth': list(range(1, 7)),
}
# Wrap a default random forest in a grid search with 4-fold cross-validation.
esti = GridSearchCV(
    RandomForestClassifier(),
    param_grid=para_dict,
    cv=4,
)
# Fit on the training data; as the last expression, the fitted search object is the cell output.
esti.fit(x_train, y_train)

In [37]:
# Model evaluation, method 1: compare the predictions with the true labels by hand.
y_pred = esti.predict(x_test)
print("y_predict =", y_pred)
# The mean of the element-wise match mask is the fraction of correct predictions.
print("accuracy_1 =", np.mean(y_pred == y_test))

# Model evaluation, method 2: let the search object compute the accuracy itself.
accuracy = esti.score(x_test, y_test)
print("accuracy_2 =", accuracy)

# Best parameter combination found by the search: best_params_
print("最佳参数：\n", esti.best_params_)
# Best cross-validation score: best_score_
print("最佳结果：\n", esti.best_score_)
# Estimator configured with the best parameters: best_estimator_
print("最佳估计器:\n", esti.best_estimator_)
# Full cross-validation results: cv_results_ (uncomment to inspect)
# print("交叉验证结果:\n", esti.cv_results_)

y_predict = [0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0
 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 1 1 0 0
 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0
 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0]
accuracy_1 = 0.8145896656534954
accuracy_2 = 0.8145896656534954
最佳参数：
 {'max_depth': 3, 'n_estimators': 70}
最佳结果：
 0.8302845528455285
最佳估计器:
 RandomForestClassifier(max_depth=3, n_estimators=70)
