In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")
submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

In [3]:
test['Fare'] = test['Fare'].fillna(test['Fare'].mode()[0])

In [4]:
train['Sex_encoded'] = train['Sex'].map({'male': 1, 'female': 0})
test['Sex_encoded'] = test['Sex'].map({'male': 1, 'female': 0})

In [5]:
train.dropna(subset=['Embarked'], inplace=True)
test.dropna(subset=['Embarked'], inplace=True)

In [6]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# 初始化编码器（必须设置 handle_unknown='ignore'）
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# 训练集拟合并转换
embarked_train_encoded = ohe.fit_transform(train[['Embarked']])  # 输入必须是二维
embarked_cols = ohe.get_feature_names_out(['Embarked'])  # 获取列名（如 ['Embarked_C', 'Embarked_Q', 'Embarked_S']）

# 测试集转换（自动对齐训练集的列，未知类别全0）
embarked_test_encoded = ohe.transform(test[['Embarked']])

# 将编码结果转为DataFrame
train_encoded_df = pd.DataFrame(embarked_train_encoded, columns=embarked_cols, index=train.index)
test_encoded_df = pd.DataFrame(embarked_test_encoded, columns=embarked_cols, index=test.index)

# 合并到原数据
train = pd.concat([train, train_encoded_df], axis=1)
test = pd.concat([test, test_encoded_df], axis=1)

# train.drop(['Embarked_S', 'Embarked_C', 'Embarked_Q'], axis=1, inplace=True)
# test.drop(['Embarked_S', 'Embarked_C', 'Embarked_Q'], axis=1, inplace=True)

In [7]:
from sklearn.impute import KNNImputer

# 选择相关特征并标准化
from sklearn.preprocessing import StandardScaler
age_train = train[['Age']].values
age_test = test[['Age']].values

scaler = StandardScaler()

scaled_age_train = scaler.fit_transform(age_train)
scaled_age_test = scaler.transform(age_test)

train['Age_scaled'] = scaled_age_train
test['Age_scaled'] = scaled_age_test

# KNN填充
imputer = KNNImputer(n_neighbors=5)

train['Age'] = imputer.fit_transform(train[['Age_scaled']])
test['Age'] = imputer.transform(test[['Age_scaled']])

train.drop(['Age_scaled'], axis=1, inplace=True)
test.drop(['Age_scaled'], axis=1, inplace=True)

In [8]:
X_train = train.drop(['PassengerId', 'Survived', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1)
y_train = train['Survived']

X_test = test.drop(['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# model = RandomForestClassifier(n_estimators=100, random_state=42)
model = XGBClassifier(n_estimators=200, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

In [10]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

predictions = model.predict(X_test)
print(classification_report(submission['Survived'], predictions))

              precision    recall  f1-score   support

           0       0.87      0.90      0.88       266
           1       0.81      0.76      0.79       152

    accuracy                           0.85       418
   macro avg       0.84      0.83      0.83       418
weighted avg       0.85      0.85      0.85       418

