In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training and test sets from the local data directory
train_df, test_df = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

# Basic info: (rows, columns) of each set, train first
# expected: (8693, 14) for train and (4277, 13) for test
print(train_df.shape, test_df.shape, sep='\n')

# Preview the first rows of the training set
print(train_df.head())

(8693, 14)
(4277, 13)
  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False

第三步：数据预处理与特征工程（Preprocessing & Feature Engineering）
我们需要处理缺失值、提取特征、编码分类变量。
1. 合并 train 和 test（方便统一处理）

In [18]:
# Keep the test PassengerId values for the submission file
test_passenger_ids = test_df['PassengerId'].copy()

# Tag every row with its origin. `assign` returns a new frame, so the raw
# train_df / test_df loaded earlier are left untouched and this cell can be
# re-run without side effects.
# No placeholder 'Transported' column is added to the test rows: concat fills
# columns that are missing from one of its inputs with NaN.
df = pd.concat(
    [train_df.assign(is_train=True), test_df.assign(is_train=False)],
    ignore_index=True,
)

2. 提取有用信息

In [19]:
# Group = the part of PassengerId before '_' (e.g. '0003' for '0003_02');
# GroupSize = number of rows in the combined frame sharing that Group.
# The vectorized .str accessor replaces a per-row Python lambda and yields NaN
# for a missing id instead of raising.
df['Group'] = df['PassengerId'].str.split('_').str[0]
df['GroupSize'] = df.groupby('Group')['Group'].transform('count')

# Cabin looks like 'B/0/P': split it into Deck / CabinNum / Side.
# All three stay strings; a missing Cabin gives NaN in each new column.
df[['Deck', 'CabinNum', 'Side']] = df['Cabin'].str.split('/', expand=True)

# Total spend across the five amenity columns. sum(axis=1) skips NaN by
# default, so a passenger with only missing amounts gets 0 here.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['TotalSpent'] = df[spend_cols].sum(axis=1)
df['HasSpent'] = df['TotalSpent'] > 0

3. 填充缺失值（简单策略）


In [20]:
# Categorical columns: fill gaps with the most frequent value
# (the result is assigned back to the column rather than using inplace=True)
fill_values = {
    name: df[name].mode()[0]
    for name in ('HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side')
}

# Numeric columns: median for Age, 0 for the total and each spending column
fill_values['Age'] = df['Age'].median()
for name in ('TotalSpent', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    fill_values[name] = 0

# Apply every fill in one pass, column by column
for name, value in fill_values.items():
    df[name] = df[name].fillna(value)

第四步：特征编码 & 准备训练集


In [21]:
from sklearn.preprocessing import LabelEncoder

# Features fed to the model
features = [
    'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'GroupSize', 'Deck', 'Side', 'TotalSpent', 'HasSpent'
]

# Encode into a copy: `df` keeps its original category values, so re-running
# this cell always encodes from the same input and `label_encoders` keeps the
# real category names (encoding `df` in place would re-encode integers on a
# second run).
df_encoded = df.copy()

# Integer-encode the string categoricals; keep each fitted encoder so the
# codes can be mapped back to their labels later.
label_encoders = {}
for col in ['HomePlanet', 'Destination', 'Deck', 'Side']:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Boolean flags -> 0/1. astype(bool) turns NaN into True, so fail loudly if
# any missing value survived the imputation step instead of mislabeling it.
bool_cols = ['CryoSleep', 'VIP', 'HasSpent']
missing_flags = df[bool_cols].isna().sum()
assert not missing_flags.any(), (
    f"fill missing values before encoding: {missing_flags[missing_flags > 0].to_dict()}"
)
for col in bool_cols:
    df_encoded[col] = df[col].astype(bool).astype(int)

# Split back into train and test rows
train = df_encoded[df_encoded['is_train']].copy()
test = df_encoded[~df_encoded['is_train']].copy()
assert len(train) + len(test) == len(df_encoded)

X_train = train[features]
# Same NaN -> True hazard applies to the target, so check it before casting
assert train['Transported'].notna().all(), "training rows with a missing target"
y_train = train['Transported'].astype(bool)
X_test = test[features]

第五步：训练模型（用 RandomForest 入门）


In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest hyper-parameters, gathered in one place
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
}
model = RandomForestClassifier(**rf_params)

# 5-fold cross-validated accuracy on the training set
scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='accuracy', cv=5)
print(f"CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

# Fit the final model on all training rows, then predict the test set
y_pred = model.fit(X_train, y_train).predict(X_test)

CV Accuracy: 0.7973 ± 0.0105


第六步：生成提交文件


In [23]:
# Build the submission table: test passenger ids plus their predictions
submission = pd.DataFrame({'PassengerId': test_passenger_ids}).assign(Transported=y_pred)

# Write it out without the index column and confirm
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)
print("✅ 提交文件已保存为 submission.csv")

✅ 提交文件已保存为 submission.csv
