In [25]:
import pandas as pd
from sklearn.datasets import fetch_openml

# 用sklearn加载泰坦尼克号幸存者数据集
titanic = fetch_openml('titanic', version=1, as_frame=True)
titanic_df = titanic.frame

# 显示前几行数据以确认加载成功
print(titanic_df.head())
titanic_df.head()


   pclass survived                                             name     sex  \
0       1        1                    Allen, Miss. Elisabeth Walton  female   
1       1        1                   Allison, Master. Hudson Trevor    male   
2       1        0                     Allison, Miss. Helen Loraine  female   
3       1        0             Allison, Mr. Hudson Joshua Creighton    male   
4       1        0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

       age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.0000      0      0   24160  211.3375       B5        S    2    NaN   
1   0.9167      1      2  113781  151.5500  C22 C26        S   11    NaN   
2   2.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.0000      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0                     St Louis,

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [26]:
# 查找titanic_df的缺失值
missing_values = titanic_df.isnull().sum()
print("各列缺失值数量：")
print(missing_values)


各列缺失值数量：
pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64


In [27]:
# 用age的均值填充age的缺失值
titanic_df['age'].fillna(titanic_df['age'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_df['age'].fillna(titanic_df['age'].mean(), inplace=True)


In [28]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.sans-serif'] = ['SimHei']  # 设置中文字体为黑体
plt.rcParams['axes.unicode_minus'] = False    # 正常显示负号

# 只保留分析需要的列
df = titanic_df[['pclass', 'sex', 'age', 'survived','parch','sibsp']].copy()

# 创建3x1的子图布局
# fig, axes = plt.subplots(3, 1, figsize=(10, 15))

# # 1. pclass与survived的关系
# sns.barplot(ax=axes[0], x='pclass', y='survived', data=df, errorbar=None)
# axes[0].set_title('不同舱位(pclass)的生存率')
# axes[0].set_ylabel('生存率')
# axes[0].set_xlabel('舱位等级')

# # 2. sex与survived的关系
# sns.barplot(ax=axes[1], x='sex', y='survived', data=df, errorbar=None)
# axes[1].set_title('不同性别(sex)的生存率')
# axes[1].set_ylabel('生存率')
# axes[1].set_xlabel('性别')

# # 3. age与survived的关系
# sns.histplot(ax=axes[2], data=df, x='age', hue='survived', bins=30, kde=True, 
#              stat='density', common_norm=False)
# axes[2].set_title('年龄(age)与生存(survived)的关系')
# axes[2].set_xlabel('年龄')
# axes[2].set_ylabel('密度')
# axes[2].legend(['未生存(0)', '生存(1)'])

# plt.tight_layout()
# plt.show()

# # 4. 多变量联合分析：性别和舱位对生存率的影响
# plt.figure(figsize=(8,5))
# sns.barplot(x='pclass', y='survived', hue='sex', data=df, errorbar=None)
# plt.title('不同舱位和性别的生存率')
# plt.ylabel('生存率')
# plt.xlabel('舱位等级')
# plt.show()


In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
import matplotlib.pyplot as plt

# 准备数据
X = df[['pclass', 'sex', 'age','parch']]
# 将性别转换为数值
X['sex'] = X['sex'].map({'male': 0, 'female': 1})

y = df['survived']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 创建决策树模型
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(X_train, y_train)

# 预测并评估模型
y_pred = clf.predict(X_test)
print(f"模型准确率: {accuracy_score(y_test, y_pred):.2f}")

# # 可视化决策树
# plt.figure(figsize=(12,8))
# tree.plot_tree(clf, 
#                feature_names=['pclass', 'sex', 'age'], 
#                class_names=['Not Survived', 'Survived'],
#                filled=True)
# plt.show()


模型准确率: 0.74


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sex'] = X['sex'].map({'male': 0, 'female': 1})


In [30]:
from sklearn.model_selection import GridSearchCV

# 设置参数网格，包含max_depth, min_samples_split, min_impurity_decrease
param_grid = {
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.05, 0.1]
}

# 创建网格搜索对象
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), 
                          param_grid, 
                          cv=5, 
                          scoring='accuracy')

# 执行网格搜索
grid_search.fit(X_train, y_train)

# 输出最佳参数和对应的准确率
print(f"最佳参数组合: {grid_search.best_params_}")
print(f"最佳模型准确率: {grid_search.best_score_:.2f}")

# 使用最佳参数重新训练模型
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
print(f"测试集准确率: {accuracy_score(y_test, y_pred):.2f}")


最佳参数组合: {'max_depth': 4, 'min_impurity_decrease': 0.0, 'min_samples_split': 2}
最佳模型准确率: 0.80
测试集准确率: 0.74


In [32]:
from sklearn.ensemble import RandomForestClassifier

# 创建随机森林分类器
rf_clf = RandomForestClassifier(n_estimators=500, 
                               max_depth=5, 
                               random_state=42)

# 训练模型
rf_clf.fit(X_train, y_train)

# 预测并评估模型
y_pred_rf = rf_clf.predict(X_test)
print(f"随机森林模型准确率: {accuracy_score(y_test, y_pred_rf):.2f}")

# # 特征重要性可视化
# importances = rf_clf.feature_importances_
# feature_names = ['pclass', 'sex', 'age']
# plt.figure(figsize=(8, 4))
# plt.barh(feature_names, importances)
# plt.title("Feature Importance")
# plt.xlabel("Importance Score")
# plt.ylabel("Features")
# plt.show()


随机森林模型准确率: 0.76
