In [1]:
import pandas as pd
import numpy as np

In [67]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform

In [63]:
# Load the training set and key every row by its PassengerId.
train_df = pd.read_csv("train.csv").set_index("PassengerId")

In [53]:
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with engineered passenger features.

    Reads the ``Name``, ``Cabin``, ``SibSp``, ``Parch``, ``Fare``, ``Pclass``
    and ``Age`` columns and adds:

    * ``Title``         - honorific taken from ``Name``; French/alternate forms
      are folded into ``Miss``/``Mrs`` and uncommon titles into ``Rare``.
    * ``Has_Cabin``     - 1 when ``Cabin`` is present, else 0 (int64).
    * ``Deck``          - first character of ``Cabin``; missing values and
      deck ``T`` become ``U``.
    * ``FamilySize``    - ``SibSp + Parch + 1``.
    * ``IsAlone``       - 1 when ``FamilySize`` is 1, else 0 (int64).
    * ``FarePerPerson`` - ``Fare / FamilySize``.
    * ``AgeClass``      - ``Age * Pclass``.

    ``Pclass`` is converted to ``object`` dtype so that dtype-based column
    selection treats it as categorical. ``Name``, ``Ticket`` and ``Cabin`` are
    dropped from the result when present.

    Parameters
    ----------
    df : pd.DataFrame
        Raw passenger frame. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the columns described above.
    """
    X = df.copy()

    # "Braund, Mr. Owen Harris" -> "Mr": the text between the first comma and
    # the period that follows it.
    X["Title"] = (
        X["Name"].str.split(",").str[1].str.split(".").str[0].str.strip()
    )

    rare_titles = ['Lady', 'the Countess', 'Capt', 'Col', 'Don', 'Dr',
                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

    # Fold equivalent spellings together first, then bucket the uncommon titles.
    X["Title"] = X["Title"].replace({
        "Mlle": "Miss",
        "Ms": "Miss",
        "Mme": "Mrs",
    })
    X["Title"] = X["Title"].replace(rare_titles, "Rare")

    # Use a fixed-width int64 rather than the platform-dependent ``int``
    # (32-bit on Windows with NumPy < 2) so that dtype-based column selection
    # downstream sees the same dtype on every platform.
    X["Has_Cabin"] = X["Cabin"].notna().astype("int64")
    X["Deck"] = X["Cabin"].str[0].fillna("U").replace({"T": "U"})

    # +1 counts the passenger themselves, so FamilySize is never zero and the
    # division below is safe.
    X["FamilySize"] = X["SibSp"] + X["Parch"] + 1
    X["IsAlone"] = (X["FamilySize"] == 1).astype("int64")
    X["FarePerPerson"] = X["Fare"] / X["FamilySize"]

    # Pclass becomes categorical (object); the interaction term still needs its
    # numeric value.
    X["Pclass"] = X["Pclass"].astype(object)
    X["AgeClass"] = X["Age"] * X["Pclass"].astype("int64")

    return X.drop(columns=["Name", "Ticket", "Cabin"], errors="ignore")


In [74]:
# Wrap the feature-engineering function as a pipeline step; validate=False so
# the DataFrame is handed to add_features as-is.
fe = FunctionTransformer(func=add_features, validate=False)

In [75]:
# Numeric branch: fill gaps with the column mean, then standardise.
num_proc = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

In [76]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time.
cat_proc = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

In [77]:
# Route columns by dtype: every numeric column goes through `num_proc`,
# object/category columns through `cat_proc`.
# "number" is used instead of an explicit ["int64", "float64"] list so that
# numeric columns of another width (e.g. int32 produced by `astype(int)` on
# Windows with NumPy < 2) are not silently discarded by ColumnTransformer's
# default remainder="drop".
preprocessor = ColumnTransformer([
    ("num", num_proc, selector(dtype_include="number")),
    ("cat", cat_proc, selector(dtype_include=["object", "category"])),
])

In [78]:
# End-to-end model: feature engineering -> dtype-based preprocessing -> XGBoost.
# The classifier runs single-threaded (n_jobs=1).
pipe_xgb = Pipeline(steps=[
    ("fe", fe),
    ("preprocessor", preprocessor),
    (
        "model",
        XGBClassifier(
            n_estimators=500,
            learning_rate=0.03,
            max_depth=4,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            tree_method="hist",
            eval_metric="logloss",
            n_jobs=1,
        ),
    ),
])

In [79]:
# Five shuffled, class-stratified folds with a fixed seed for repeatable splits.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [80]:
# Exhaustive search space for the "model" step of the pipeline.
# 3 * 3 * 3 * 2 * 3 * 3 * 3 * 2 = 2,916 combinations, each fitted once per CV fold.
param_grid = {
    "model__max_depth":        [3, 4, 5],
    "model__learning_rate":    [0.03, 0.05, 0.1],
    # No early stopping inside CV, so the number of trees is searched instead.
    "model__n_estimators":     [200, 400, 800],
    "model__min_child_weight": [1, 3],
    "model__subsample":        [0.7, 0.9, 1.0],
    "model__colsample_bytree": [0.7, 0.9, 1.0],
    "model__reg_lambda":       [1.0, 3.0, 10.0],
    "model__reg_alpha":        [0.0, 0.1],
}

# Score by accuracy, use all cores across candidates, and refit the best
# configuration once the search finishes.
grid = GridSearchCV(
    pipe_xgb,
    param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    refit=True,
)

In [81]:
# Separate the target column from the predictors.
y = train_df["Survived"]
X = train_df.drop(columns=["Survived"])

In [82]:
# Run the full cross-validated grid search.
grid.fit(X, y=y)

In [84]:
# Show the winning hyper-parameters and their cross-validated score, then keep
# the best pipeline for prediction.
print(grid.best_params_)
print(grid.best_score_)
best_model = grid.best_estimator_

{'model__colsample_bytree': 0.7, 'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__min_child_weight': 1, 'model__n_estimators': 800, 'model__reg_alpha': 0.1, 'model__reg_lambda': 10.0, 'model__subsample': 1.0}
0.8484652564183041


In [100]:
# Load the held-out test set, keyed by PassengerId like the training frame.
X_test = pd.read_csv("test.csv").set_index("PassengerId")

In [101]:
# Predict with the best pipeline; it applies its own feature engineering and
# preprocessing steps to the raw test frame.
y_pred = best_model.predict(X=X_test)

In [102]:
# One "Survived" prediction per test passenger, indexed by PassengerId.
submission_df = pd.DataFrame(data={"Survived": y_pred}, index=X_test.index)

In [103]:
# Promote the PassengerId index to a regular column for the CSV export.
# Guarded so that re-running this cell does not add a spurious "index" column.
if "PassengerId" not in submission_df.columns:
    submission_df = submission_df.reset_index()

In [104]:
# Write the submission file without the positional row index.
submission_df.to_csv(path_or_buf="submission.csv", index=False)

In [106]:
# Upload submission.csv to the Kaggle "titanic" competition through the Kaggle
# CLI, labelling the entry with the message passed to -m.
!kaggle competitions submit -c titanic -f submission.csv -m "grid_search_XGBModel_FE"

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 19.0kB/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 4.92kB/s]
