In [204]:
import pandas as pd

# 获取数据集 
data = pd.read_csv('titanic/titanic.csv')

# 数据预处理
# 划分特征值和目标值
x = data[['pclass','age','sex']]
y = data['survived']

# 缺失值处理
x['age'].fillna(x['age'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x['age'].fillna(x['age'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['age'].fillna(x['age'].mean(), inplace=True)


In [205]:
# 将x转换为字典型
x = x.to_dict(orient="records")


In [206]:
# 数据集划分
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=22) 

In [207]:
# 字典特征提取
from sklearn.feature_extraction import DictVectorizer
transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [208]:
x_train = x_train.toarray()
x_test = x_test.toarray()

In [209]:
import numpy as np
x1 = np.concatenate((x_test, x_train), axis=0)


In [210]:
# 随机森林
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV



grid_param = {"n_estimators":[100,200,400,600,800,1000,1200],"max_depth":[3,10,15,25]}
estimator = RandomForestClassifier()
estimator = GridSearchCV(estimator,param_grid=grid_param,cv=5)
estimator.fit(x_train, y_train)
y_predict = estimator.predict(x_test) 
print('结果:\n', y_test == y_predict)
print('准确率为\n',estimator.score(x_test, y_test))
print('最佳参数\n',estimator.best_params_)
print('最佳结果',estimator.best_score_)
print('最佳估计器\n',estimator.best_estimator_)
print('最佳交叉验证结果\n',estimator.cv_results_)

结果:
 831      True
261      True
1210     True
1155     True
255      True
        ...  
1146     True
1125    False
386      True
1025    False
337      True
Name: survived, Length: 329, dtype: bool
准确率为
 0.7811550151975684
最佳参数
 {'max_depth': 3, 'n_estimators': 200}
最佳结果 0.8404330260022792
最佳估计器
 RandomForestClassifier(max_depth=3, n_estimators=200)
最佳交叉验证结果
 {'mean_fit_time': array([0.05306659, 0.09281106, 0.18131447, 0.27433853, 0.3660985 ,
       0.49611058, 0.52775841, 0.05123324, 0.10087476, 0.20136151,
       0.3017312 , 0.40211697, 0.50096736, 0.60660992, 0.05077147,
       0.10112405, 0.20093813, 0.30235925, 0.4024302 , 0.50356574,
       0.60368099, 0.0510232 , 0.10143991, 0.20214086, 0.30767508,
       0.40286102, 0.50449314, 0.61332989]), 'std_fit_time': array([0.00608669, 0.00175207, 0.00670333, 0.01542245, 0.01517171,
       0.02845779, 0.01434141, 0.00039895, 0.0009277 , 0.00085317,
       0.00210686, 0.00128205, 0.00415665, 0.0108884 , 0.00064965,
       0.00068021, 0.

In [211]:
# 用全数据集重新训练
final_model = RandomForestClassifier(max_depth=3, n_estimators=400, random_state=42)
final_model.fit(x1, y)
y_predict = final_model.predict(x1)
print('结果:\n', y == y_predict)
print('准确率为\n',final_model.score(x1, y))


结果:
 0       False
1        True
2        True
3        True
4       False
        ...  
1308     True
1309     True
1310     True
1311     True
1312     True
Name: survived, Length: 1313, dtype: bool
准确率为
 0.6626047220106626
