# XGBoost

In [9]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

### Original Dataset

In [10]:
# Read the raw training file and turn it into a model-ready frame in one chain:
#   1. discard the identifier / free-text columns,
#   2. fill missing Age with the column median and missing Embarked with the
#      most frequent value,
#   3. one-hot encode Sex and Embarked, dropping the first level of each.
df_orig = (
    pd.read_csv("train.csv")
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
    .assign(
        Age=lambda frame: frame['Age'].fillna(frame['Age'].median()),
        Embarked=lambda frame: frame['Embarked'].fillna(frame['Embarked'].mode()[0]),
    )
    .pipe(pd.get_dummies, columns=['Sex', 'Embarked'], drop_first=True)
)

# Separate the target column from the feature matrix.
y_orig = df_orig['Survived']
X_orig = df_orig.drop(columns='Survived')

#### Grid Search

In [11]:
# Base classifier for the original feature set; the seed is fixed at 42.
xgb_orig = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

# Candidate hyper-parameters: 2 x 2 x 2 = 8 combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5],
    learning_rate=[0.05, 0.1],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold CV.
grid_orig = GridSearchCV(
    estimator=xgb_orig,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_orig.fit(X_orig, y_orig)

print("🔍 XGB (original) best params:", grid_orig.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
🔍 XGB (original) best params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


#### Predict

In [12]:
# Build the submission file for the model trained on the original features.
test_orig = pd.read_csv("test.csv")
passenger_ids_orig = test_orig['PassengerId']  # kept aside for the submission file
test_orig = test_orig.drop(columns=['Name', 'Ticket', 'Cabin'])

# Fill numeric gaps with the TRAINING medians so the test rows receive the same
# transformation the model was fitted on. X_orig['Age'] is already
# median-imputed, which leaves its median unchanged.
test_orig['Age'] = test_orig['Age'].fillna(X_orig['Age'].median())
test_orig['Fare'] = test_orig['Fare'].fillna(X_orig['Fare'].median())
# NOTE(review): the raw training 'Embarked' column no longer exists here (it was
# one-hot encoded into df_orig), so the test-set mode is still used for this fill.
test_orig['Embarked'] = test_orig['Embarked'].fillna(test_orig['Embarked'].mode()[0])

# Encode every category level (no drop_first) and let the training columns
# decide which dummies survive. Dropping the first level independently on the
# test set would pick a different baseline whenever a category is absent from it.
test_orig = pd.get_dummies(test_orig, columns=['Sex', 'Embarked'])

# Align with the training features: same columns, same order; dummy columns the
# test set never produced are filled with 0, and extras (PassengerId, baseline
# dummies) are discarded.
test_orig = test_orig.reindex(columns=X_orig.columns, fill_value=0)

# Predict with the refitted best estimator from the grid search.
preds_orig = grid_orig.best_estimator_.predict(test_orig)
submission_orig = pd.DataFrame({
    "PassengerId": passenger_ids_orig,
    "Survived": preds_orig
})

# Make sure the output directory exists so the write works on a fresh checkout.
submission_path_orig = Path("submission/submission_xgb_original.csv")
submission_path_orig.parent.mkdir(parents=True, exist_ok=True)
submission_orig.to_csv(submission_path_orig, index=False)
print("✅ submission_xgb_original.csv saved.")


✅ submission_xgb_original.csv saved.


### Engineered Data

In [13]:
# Load the pre-engineered training set and separate the target from the features.
df_clean = pd.read_csv("train_cleaned.csv")
y_clean = df_clean['Survived']
X_clean = df_clean.drop(columns='Survived')

#### Grid Search

In [14]:
# Base classifier for the engineered feature set, configured the same way as
# xgb_orig (seed fixed at 42).
xgb_clean = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

# Same search as for the original features: reuses the `param_grid` defined in
# the earlier grid-search cell, accuracy scoring, 5-fold CV.
grid_clean = GridSearchCV(
    estimator=xgb_clean,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_clean.fit(X_clean, y_clean)

print("🔍 XGB (cleaned) best params:", grid_clean.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
🔍 XGB (cleaned) best params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


In [15]:
# Build the submission file for the model trained on the engineered features.
test_clean = pd.read_csv("test_cleaned.csv")
passenger_ids_clean = test_clean['PassengerId']  # kept aside for the submission file

# Select the training feature columns, in training order, as the original-data
# prediction cell does. A column missing from the test file raises a KeyError
# here instead of a feature-mismatch error inside predict().
X_test_clean = test_clean.drop('PassengerId', axis=1)[X_clean.columns]

# Predict with the refitted best estimator from the grid search.
preds_clean = grid_clean.best_estimator_.predict(X_test_clean)
submission_clean = pd.DataFrame({
    "PassengerId": passenger_ids_clean,
    "Survived": preds_clean
})

# Make sure the output directory exists so the write works on a fresh checkout.
submission_path_clean = Path("submission/submission_xgb_cleaned.csv")
submission_path_clean.parent.mkdir(parents=True, exist_ok=True)
submission_clean.to_csv(submission_path_clean, index=False)
print("✅ submission_xgb_cleaned.csv saved.")

✅ submission_xgb_cleaned.csv saved.
