In [None]:
import numpy as np
import pandas as pd
import joblib
import json
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# ----------------------------
# Load raw data
# ----------------------------
train_df = pd.read_csv("../dataset/train.csv")
y_full_train = train_df["Survived"]

# ----------------------------
# 1. Custom Feature Engineering
# ----------------------------
def feature_engineering(df: pd.DataFrame):
    df = df.copy()
    
    # Fill missing values
    df["Embarked"] = df["Embarked"].fillna("C")
    df["Age"] = df["Age"].fillna(df["Age"].median())
    df["Fare"] = df["Fare"].fillna(df["Fare"].median())

    # New engineered features
    df["norm_fare"] = np.log(df["Fare"] + 1)
    df["cabin_multiple"] = df["Cabin"].apply(
        lambda x: 0 if pd.isna(x) else len(x.split(" "))
    ).astype(str)
    df["cabin_categories"] = df["Cabin"].apply(lambda x: str(x)[0])
    df["name_title"] = df["Name"].apply(lambda x: x.split(",")[1].split(".")[0].strip())

    # Convert to string for categorical columns
    df["Pclass"] = df["Pclass"].astype(str)
    
    # Drop unused columns
    drop_cols = ["PassengerId", "Name", "Cabin", "Ticket", "Survived"]
    df = df.drop(columns=[c for c in drop_cols if c in df.columns])

    return df

def clf_performance(model, name, X_val, y_val):
    print(f"\nBest {name} Model Evaluation:")
    print("Best parameters:", model.best_params_)

    y_pred = model.predict(X_val)

    print("Accuracy:", accuracy_score(y_val, y_pred))
    print("F1 Score:", f1_score(y_val, y_pred))
    print("Precision:", precision_score(y_val, y_pred))
    print("Recall:", recall_score(y_val, y_pred))
    print("Classification Report:\n", classification_report(y_val, y_pred))


# ----------------------------
# 2. ColumnTransformer (encoding + scaling)
# ----------------------------
categorical_cols = [
    "Pclass",
    "Sex",
    "Embarked",
    "cabin_categories",
    "cabin_multiple",
    "name_title",
]
numerical_cols = ["Age", "SibSp", "Parch", "norm_fare"]

preprocessor = ColumnTransformer([
    ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ("numerical", StandardScaler(), numerical_cols)
])

# ----------------------------
# 3. Build full pipeline
# ----------------------------
rf_pipeline = Pipeline([
    ("feature_eng", FunctionTransformer(feature_engineering)),
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(random_state=29))
])

svc_pipeline = Pipeline([
    ("feature_eng", FunctionTransformer(feature_engineering)),
    ("preprocessor", preprocessor),
    ("model", SVC(probability=True, random_state=29))
])

# ----------------------------
# 4. Train / validation split
# ----------------------------
X_train, X_val, y_train, y_val = train_test_split(
    train_df, y_full_train, test_size=0.2, random_state=29, stratify=y_full_train
)

# ----------------------------
# 5. GridSearchCV for RF
# ----------------------------
param_grid_rf = {
    "model__n_estimators": [100, 200, 400, 500],
    "model__criterion": ["gini", "entropy"],
    "model__max_depth": [15, 20, 25],
    "model__max_features": ["log2", "sqrt", 10],
    "model__min_samples_leaf": [2, 3],
    "model__min_samples_split": [2, 3],
}

clf_rf = GridSearchCV(rf_pipeline, param_grid=param_grid_rf, cv=5, verbose=2, n_jobs=-1)
best_clf_rf = clf_rf.fit(X_train, y_train)

clf_performance(best_clf_rf, "Random Forest", X_val, y_val)

# ----------------------------
# 6. GridSearchCV for SVC
# ----------------------------
param_grid_svc = [
    {"model__kernel": ["rbf"], "model__gamma": [0.1, 0.5, 1, 2, 5], "model__C": [0.1, 1, 10, 100]},
    {"model__kernel": ["linear"], "model__C": [0.1, 1, 10, 100]},
    {"model__kernel": ["poly"], "model__degree": [2, 3, 4], "model__C": [0.1, 1, 10, 100]},
]

clf_svc = GridSearchCV(svc_pipeline, param_grid=param_grid_svc, cv=5, verbose=2, n_jobs=-1)
best_clf_svc = clf_svc.fit(X_train, y_train)

clf_performance(best_clf_svc, "SVC", X_val, y_val)

# ----------------------------
# 7. Save the pipelines
# ----------------------------
joblib.dump(best_clf_rf.best_estimator_, "../models/best_rf_pipeline.pkl")
joblib.dump(best_clf_svc.best_estimator_, "../models/best_svc_pipeline.pkl")


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
[CV] END model__criterion=gini, model__max_depth=15, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time=   0.5s
[CV] END model__criterion=gini, model__max_depth=15, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time=   0.4s
[CV] END model__criterion=gini, model__max_depth=15, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time=   0.4s
[CV] END model__criterion=gini, model__max_depth=15, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time=   0.4s
[CV] END model__criterion=gini, model__max_depth=15, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=2, model__n_estimators=100; total time=   0.4s
[CV] END model__criterion=gini, model__max_dep

['../models/best_svc_pipeline.pkl']

In [3]:
# Pick 5 passengers from the validation set
sample_idx = X_val.sample(5, random_state=42).index

# Extract original (raw) data from train_df so it's in the same format as the API input
validation_samples = train_df.loc[sample_idx]

# Show their true labels
for idx in sample_idx:
    print(f"PassengerID: {idx}, Survived: {y_val.loc[idx]}")

samples_json = validation_samples.to_dict(orient="records")

print(json.dumps(samples_json, indent=2))

PassengerID: 113, Survived: 0
PassengerID: 730, Survived: 1
PassengerID: 274, Survived: 1
PassengerID: 360, Survived: 0
PassengerID: 306, Survived: 1
[
  {
    "PassengerId": 114,
    "Survived": 0,
    "Pclass": 3,
    "Name": "Jussila, Miss. Katriina",
    "Sex": "female",
    "Age": 20.0,
    "SibSp": 1,
    "Parch": 0,
    "Ticket": "4136",
    "Fare": 9.825,
    "Cabin": NaN,
    "Embarked": "S"
  },
  {
    "PassengerId": 731,
    "Survived": 1,
    "Pclass": 1,
    "Name": "Allen, Miss. Elisabeth Walton",
    "Sex": "female",
    "Age": 29.0,
    "SibSp": 0,
    "Parch": 0,
    "Ticket": "24160",
    "Fare": 211.3375,
    "Cabin": "B5",
    "Embarked": "S"
  },
  {
    "PassengerId": 275,
    "Survived": 1,
    "Pclass": 3,
    "Name": "Healy, Miss. Hanora \"Nora\"",
    "Sex": "female",
    "Age": NaN,
    "SibSp": 0,
    "Parch": 0,
    "Ticket": "370375",
    "Fare": 7.75,
    "Cabin": NaN,
    "Embarked": "Q"
  },
  {
    "PassengerId": 361,
    "Survived": 0,
    "Pclass": 