In [28]:
import pandas as pd
import numpy as np
titanic_train = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')

### 观察数据集

In [29]:
from IPython.display import display

print("--- 训练集 (Train Set) ---")
display(titanic_train.head())
#乘客序号 是否存活 船票等级(1为头等) 名字 性别 年龄 船上的兄弟姐妹配偶数 船上的父母孩子数 票号 票价 船舱号 登船港口(C,Q,S)
print("\n--- 考试集 (Test Set) ---")
display(titanic_test.head())
#乘客序号 船票等级(1为头等) 名字 性别 年龄 船上的兄弟姐妹配偶数 船上的父母孩子数 票号 票价 船舱号 登船港口(C,Q,S) 

--- 训练集 (Train Set) ---


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



--- 考试集 (Test Set) ---


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [30]:
print("--------------------- 训练集 (Train Set) info-------------------")
titanic_train.info()
print("---------------------考试集 (Teat Set) info-----------------------")
titanic_test.info()

--------------------- 训练集 (Train Set) info-------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
---------------------考试集 (Teat Set) info-----------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column  

### 特征工程1

In [31]:
df = titanic_train.copy()
df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.' , expand = False)#读取姓名中隐含的人物社会地位的信息,比如Mr. Master，
#标准化称谓
title_mapping = {'Mlls': 'Miss' ,  'Ms':'Miss' , 'Mme':'Mrs'}
df['Title'] = df['Title'].replace(title_mapping)
#处理非主流称谓
title_mask = ~df['Title'].isin(['Mr' , 'Miss' , 'Mrs' , 'Master'])
df.loc[title_mask, 'Title'] = df.loc[title_mask , 'Sex'].map({'male':'Mr' , 'female':'Mrs'})

### 数据清洗

In [32]:
#中位数填充
title_age_medians = {
    'Mr' : 32.32,
    'Miss' : 21.68,
    'Mrs' : 35.86,
    'Master' : 4.57
}
for title , median_age in title_age_medians.items():
    age_mask = (df['Age'].isnull()) & (df['Title'] == title)
    df.loc[age_mask , 'Age'] = median_age
# 填充'Embarked'和'Fare'的缺失值
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0]) #众数填充
df['Fare'] = df['Fare'].fillna(df['Fare'].median()) #中位数填充

### 特征工程2

In [33]:
# 交互特征
df['Age*Class'] = df['Age'] * df['Pclass']
df['Age*Fare'] = df['Age'] * df['Fare']    

# 分类变量 One-Hot
df_sex = pd.get_dummies(df['Sex'], prefix='sex', drop_first=True, dtype=int)
df_Pclass = pd.get_dummies(df['Pclass'], prefix='Pclass',drop_first=True, dtype=int)
df_Embarked = pd.get_dummies(df['Embarked'], prefix='sEmbarked', drop_first=True, dtype=int)
df_Title = pd.get_dummies(df['Title'], prefix='Title', drop_first=True, dtype=int)

# 合并新生成的特征列
df = pd.concat([df, df_sex, df_Pclass, df_Embarked, df_Title], axis=1)

# 家庭规模特征
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# 将连续变量分箱 (Binning)
df['AgeBand'] = pd.cut(df['Age'], bins=[0, 12, 20, 40, 60, np.inf], labels=[0, 1, 2, 3, 4])
df['FareBand'] = pd.qcut(df['Fare'], q=4, labels=[0, 1, 2, 3])

# 对数变换
df['Fare_log'] = np.log1p(df['Fare'])

# 清理与缩放
# 丢弃已经处理过或不再需要的原始列
df = df.drop(['Sex','Pclass','Name','Ticket','Embarked','Cabin', 'Title','Fare', 'SibSp', 'Parch'], axis=1)

# 特征缩放 (Standard Scaling)
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
mew = df[numeric_columns].mean(axis=0)
std = df[numeric_columns].std(axis=0)
df[numeric_columns] = (df[numeric_columns] - mew) / std

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    float64 
 1   Survived     891 non-null    float64 
 2   Age          891 non-null    float64 
 3   Age*Class    891 non-null    float64 
 4   Age*Fare     891 non-null    float64 
 5   sex_male     891 non-null    float64 
 6   Pclass_2     891 non-null    float64 
 7   Pclass_3     891 non-null    float64 
 8   sEmbarked_Q  891 non-null    float64 
 9   sEmbarked_S  891 non-null    float64 
 10  Title_Miss   891 non-null    float64 
 11  Title_Mr     891 non-null    float64 
 12  Title_Mrs    891 non-null    float64 
 13  FamilySize   891 non-null    float64 
 14  IsAlone      891 non-null    float64 
 15  AgeBand      891 non-null    category
 16  FareBand     891 non-null    category
 17  Fare_log     891 non-null    float64 
dtypes: category(2), float64(16)
me

### 切分数据集

In [None]:
from sklearn.model_selection import train_test_split
dataset = df.copy()
target = dataset['Survived']
features = dataset.drop('Survived' , axis=1)
Train_features, Test_features, Train_target, Test_target = train_test_split(features , target , 
                                                                            test_size = 0.25 , random_state = 33,
                                                                            stratify = target
                                                                            )

    Age  Pclass_2  Pclass_3  Sex_male
0  22.0     False      True      True
1  38.0     False     False     False
2  26.0     False      True     False
3  35.0     False     False     False
4  35.0     False      True      True


### 决策树

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

dtree = DecisionTreeClassifier(random_state=33)
dtree.fit(Train_features, Train_target)

predictions = dtree.predict(Test_features)
print("准确率 (Accuracy):", accuracy_score(Test_target, predictions))
print("\n分类报告 (Classification Report):\n", classification_report(Test_target, predictions))
print("\n编码后的特征列名：", features.columns.tolist())

准确率 (Accuracy): 0.8340807174887892

分类报告 (Classification Report):
               precision    recall  f1-score   support

           0       0.84      0.90      0.87       134
           1       0.82      0.74      0.78        89

    accuracy                           0.83       223
   macro avg       0.83      0.82      0.82       223
weighted avg       0.83      0.83      0.83       223


编码后的特征列名： ['Age', 'Pclass_2', 'Pclass_3', 'Sex_male']


### 提交

In [41]:
test = pd.get_dummies(test , columns=['Pclass', 'Sex'] , drop_first = True)

dtree2 = DecisionTreeClassifier(random_state=33)
dtree2.fit(features, target)

predictions = dtree2.predict(test)
sequence_numbers = range(892 , 892 + len(predictions))
output_df = pd.DataFrame({
    'PassengerId': sequence_numbers,
    'Survived': predictions
})
output_df.to_csv('predictions.csv', index=False)