## 1. Importing Dependencies

In [None]:
import os
from pathlib import Path

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from scipy import stats
from scipy.stats import norm
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import (
    ElasticNet,
    Lasso,
    LinearRegression,
    LogisticRegression,
    Ridge,
)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier, XGBRegressor

## 2. Load The Data

We read from the 04-encoded folder we just created.

In [None]:
# Locate the project relative to the notebook's working directory: the
# encoded splits are read from <parent of cwd>/data/04-encoded.
PROJECT_ROOT = Path.cwd().resolve().parent
DATA_DIR = PROJECT_ROOT.joinpath("data", "04-encoded")

# Feature matrices are kept as DataFrames so column names stay available.
X_train, X_val = (
    pd.read_csv(DATA_DIR / "X_train_encoded.csv"),
    pd.read_csv(DATA_DIR / "X_val_encoded.csv"),
)

# Targets are flattened to 1-D NumPy arrays (the shape scikit-learn expects
# for y); presumably each CSV holds a single label column - TODO confirm.
y_train, y_val = (
    pd.read_csv(DATA_DIR / target_csv).to_numpy().ravel()
    for target_csv in ("y_train.csv", "y_val.csv")
)

print(f"Data Loaded. X_train shape: {X_train.shape}")

## 3. Scaling The Data

### I\. The Safety Check (Age Imputation)

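**Crucial:** If you didn't explicitly fill missing values in the `age` column during your EDA/Feature Engineering phase, the `NaN`s will survive scaling (`StandardScaler` ignores them when fitting and passes them through when transforming), and several of the models below (e.g. Logistic Regression, KNN, SVM, MLP) will **crash** when they are fitted.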

In [None]:
# Impute missing ages in BOTH splits with the median learned from TRAIN only
# (deriving the fill value from validation rows would leak information).
# Each split is checked independently so that gaps in X_val are filled even
# when X_train itself has no missing ages.
age_median = X_train["age"].median()

for split_name, frame in (("X_train", X_train), ("X_val", X_val)):
    n_missing = int(frame["age"].isna().sum())
    if n_missing > 0:
        print(f"Found {n_missing} missing ages in {split_name}. Filling with Median...")
        # Column assignment mutates the DataFrame object itself, so the
        # notebook-level X_train / X_val see the filled values.
        frame["age"] = frame["age"].fillna(age_median)

# Report the state after imputation instead of assuming it succeeded
# (the fill is a no-op if the train median itself is NaN).
remaining = int(X_train["age"].isna().sum()) + int(X_val["age"].isna().sum())
if remaining == 0:
    print("No missing values in Age.")
else:
    print(f"WARNING: {remaining} missing ages remain after imputation.")

### II\. Scale & Save (The MLOps Step)

We only scale the continuous columns (`age`, `fare`, `familysize`). We do **not** touch the binary columns (like sex\_male, pclass\_2).

In [None]:
# 1. Define columns to scale (continuous features only; binary columns are
#    left untouched)
scale_cols = ["age", "fare", "familysize"]

# 2. Initialize and fit the scaler on TRAIN only, so that validation
#    statistics never influence the transformation
scaler = StandardScaler()
scaler.fit(X_train[scale_cols])

# 3. Transform data
# Whole-column assignment replaces the columns with the scaled float values.
# Writing floats through `.loc[:, cols]` into a column that was loaded as
# integer relies on silent upcasting, which pandas deprecated in 2.1.
X_train[scale_cols] = scaler.transform(X_train[scale_cols])
X_val[scale_cols] = scaler.transform(X_val[scale_cols])

# 4. Save the scaler (CRITICAL for the API later)
MODEL_DIR = PROJECT_ROOT / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, MODEL_DIR / "scaler.pkl")

# Save the list of columns that the model expects, next to the scaler, so
# the API input can be aligned to the training layout later.
joblib.dump(X_train.columns.tolist(), MODEL_DIR / "model_columns.pkl")

print("Model columns saved. We will use this to align the API input.")

print(f"Data scaled and scaler saved to {MODEL_DIR}/scaler.pkl")
print(X_train[scale_cols].head())

In [None]:
# Display the final feature columns (the same list that was saved to
# model_columns.pkl above).
X_train.columns

## 4. Model Tournament

You don't want to just pick one model and hope for the best. You want to pit 4-5 different algorithms against each other, see which one learns the patterns best, and then "promote" the winner to the next round (Hyperparameter Tuning).

### The "Model Tournament" Bracket

1.  **The Qualifiers (Spot Checking):** We try 5-6 standard algorithms with their default settings. We don't tune them yet; we just want to see who has potential.
    
2.  **The Semi-Finals (Cross Validation):** We verify the scores aren't just "luck" from a specific train/test split.
    
3.  **The Finals (Hyperparameter Tuning):** We take the top 2 models and tweak their settings (the knobs and dials) to squeeze out every drop of performance.

### The Setup (The Qualifiers)

We will test these 6 classic competitors:

1.  **Logistic Regression:** (Your Baseline - simple and interpretable).
    
2.  **K-Nearest Neighbors (KNN):** Looks for "similar" passengers.
    
3.  **Support Vector Machine (SVM):** Draws complex boundary lines.
    
4.  **Random Forest:** A team of Decision Trees voting together.
    
5.  **XGBoost:** The "Kaggle King" (Gradient Boosting).

6.  **Neural Network (MLPClassifier):** Neural networks are the undisputed kings of unstructured data (images, audio, text); here one enters as a challenger on tabular data.

In [None]:
# Candidate classifiers, all with default hyperparameters apart from the
# reproducibility / convergence settings shown here.
# SVC is created with probability=True so predict_proba is available later.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
    # Neural-network challenger; max_iter=1000 raises the iteration cap
    "Neural Network": MLPClassifier(max_iter=1000, random_state=42),
}

# Per-model arrays of fold accuracies, in the same order as `names`.
names = list(models)
results = []

print("üèÜ Running Model Tournament...")

for label in names:
    # 5-fold cross-validation on the training split only
    fold_scores = cross_val_score(models[label], X_train, y_train, cv=5, scoring="accuracy")
    results.append(fold_scores)

    mean_pct = fold_scores.mean() * 100
    std_pct = fold_scores.std() * 100
    print(f"{label}: {mean_pct:.2f}% Accuracy (+/- {std_pct:.2f}%)")

# One box per model showing the spread of its fold accuracies.
fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(results, labels=names)
ax.set_title("Model Comparison (Accuracy)")
ax.set_ylabel("Accuracy Score")
plt.show()

### Overall takeaway

SVM and Random Forest show the **highest median accuracy**, while XGBoost has the **lowest median** and the most inconsistent performance. Logistic Regression and KNN are stable mid-performers, and the Neural Network is moderate but more variable.

--------------------------------------------------
### Model-by-model interpretation

**SVM**

*   Highest median accuracy (~0.84)
    
*   Relatively tight spread 👉 Strong and consistent performer
    

**Random Forest**

*   Similar median to SVM
    
*   Wider spread than SVM 👉 High peak performance but slightly less stable
    

**Logistic Regression**

*   Median around ~0.83
    
*   Small spread 👉 Reliable and stable baseline model
    

**KNN**

*   Slightly lower median (~0.82)
    
*   Fairly consistent 👉 Decent but not top-performing
    

**Neural Network**

*   Moderate median (~0.82)
    
*   Noticeable variability 👉 Performance depends heavily on conditions
    

**XGBoost**

*   Lowest median (~0.79)
    
*   Wide spread 👉 Most inconsistent and weakest performer in this comparison

-----------------------------------------------------------------------------------------

### Variability insight

*   Narrower boxes = more stable performance
    
*   Wider boxes/long whiskers = higher variability
    
*   Random Forest and XGBoost fluctuate more across runs
    
*   Logistic Regression and KNN are more predictable
    

### Conclusion

> SVM and Random Forest achieved the highest median accuracy, indicating superior predictive performance. Logistic Regression and KNN provided stable and competitive results with lower variance. The Neural Network showed moderate performance with some variability, while XGBoost had the lowest median accuracy and the widest spread, suggesting inconsistent behavior across runs. Overall, SVM appears to offer the best balance between accuracy and stability.


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Import all competitors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# -------------------------------------------------------------------------
# 1. SETUP: Candidate estimators paired with the grids to search
# -------------------------------------------------------------------------
# Every (label, estimator, grid) triple below becomes one
# {"model": estimator, "params": grid} entry keyed by its label, in the
# order listed.
tournament_config = {
    label: {"model": estimator, "params": grid}
    for label, estimator, grid in [
        (
            "Logistic Regression",
            LogisticRegression(max_iter=1000, random_state=42),
            {"C": [0.1, 1, 10], "solver": ["liblinear", "lbfgs"]},
        ),
        (
            "KNN",
            KNeighborsClassifier(),
            {
                "n_neighbors": [5, 9, 15],
                "weights": ["uniform", "distance"],
                "p": [1, 2],  # Minkowski power: 1 = Manhattan, 2 = Euclidean
            },
        ),
        (
            "SVM",
            # probability=True keeps predict_proba available on the tuned model
            SVC(probability=True, random_state=42),
            {"C": [1, 10, 100], "kernel": ["rbf", "linear"], "gamma": ["scale", "auto"]},
        ),
        (
            "Random Forest",
            RandomForestClassifier(random_state=42),
            {
                "n_estimators": [100, 200],
                "max_depth": [10, 20, None],
                "min_samples_split": [2, 5],
            },
        ),
        (
            "XGBoost",
            XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
            {
                "n_estimators": [100, 200],
                "learning_rate": [0.01, 0.1],
                "max_depth": [3, 5, 7],
            },
        ),
        (
            "Neural Network",
            MLPClassifier(max_iter=1000, random_state=42),
            {
                "hidden_layer_sizes": [(50,), (100,), (50, 50)],
                "activation": ["tanh", "relu"],
                "alpha": [0.0001, 0.05],
            },
        ),
    ]
}

# -------------------------------------------------------------------------
# 2. EXECUTION: Tune every candidate, then score it on the validation split
# -------------------------------------------------------------------------
results = []  # one summary row (dict) per model
best_models = {}  # model name -> refitted best estimator

print("üèÜ STARTING MODEL TOURNAMENT...")
print("-" * 60)

for model_name, entry in tournament_config.items():
    print(f"‚öôÔ∏è Tuning {model_name}...")

    # A. Exhaustive grid search with 5-fold CV on the training split,
    #    using all CPU cores and no progress output.
    search = GridSearchCV(
        entry["model"],
        entry["params"],
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
        verbose=0,
    ).fit(X_train, y_train)

    # B. Score the best estimator found by the search on the validation split.
    champion = search.best_estimator_
    cv_accuracy = search.best_score_
    holdout_accuracy = accuracy_score(y_val, champion.predict(X_val))

    # C. Record the summary row; "Gap" is the absolute CV-vs-validation
    #    difference.
    summary_row = {
        "Model": model_name,
        "CV Score (Avg)": cv_accuracy,
        "Validation Score": holdout_accuracy,
        "Gap": abs(cv_accuracy - holdout_accuracy),
        "Best Params": search.best_params_,
    }
    results.append(summary_row)

    # Keep the fitted estimator itself for the save step.
    best_models[model_name] = champion

    print(f"   -> CV: {cv_accuracy:.2%} | Val: {holdout_accuracy:.2%}")

# -------------------------------------------------------------------------
# 3. RESULTS: The Leaderboard
# -------------------------------------------------------------------------
print("-" * 60)
print("üèÅ FINAL LEADERBOARD")
print("-" * 60)

# Rank by validation accuracy and break ties with the cross-validated score.
# With a small validation set several models can land on exactly the same
# validation accuracy; sorting on that column alone would then pick the
# "winner" arbitrarily.
leaderboard = pd.DataFrame(results).sort_values(
    by=["Validation Score", "CV Score (Avg)"],
    ascending=False,
)

# Pretty print
print(leaderboard[["Model", "CV Score (Avg)", "Validation Score", "Gap"]])

# -------------------------------------------------------------------------
# 4. SAVE THE WINNER
# -------------------------------------------------------------------------
winner_name = leaderboard.iloc[0]["Model"]
winner_model = best_models[winner_name]

import joblib

# Save next to scaler.pkl / model_columns.pkl in MODEL_DIR rather than via a
# string path relative to whatever the current working directory is now.
model_path = MODEL_DIR / "model.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(winner_model, model_path)
print(f"\nüíæ The Winner ({winner_name}) has been saved to '{model_path}'")

Testing submission

In [None]:
# -------------------------------
# Fix Python path so `src` is found
# -------------------------------
import sys
from pathlib import Path

# Adjust this if your notebook is nested deeper
PROJECT_ROOT = Path.cwd().parent

if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Sanity check (optional)
assert (PROJECT_ROOT / "src").exists(), "src/ folder not found. Check PROJECT_ROOT."

# -------------------------------
# Imports
# -------------------------------
import pandas as pd

from src.utils import load_config

# -------------------------------
# Load config
# -------------------------------
config = load_config()

# -------------------------------
# Load your model predictions
# -------------------------------
output_path = PROJECT_ROOT / config["data"]["predictions_path"]
df_new = pd.read_csv(output_path)

# -------------------------------
# Load raw test data
# -------------------------------
test_path = PROJECT_ROOT / config["data"]["test_path"]
df_test = pd.read_csv(test_path)

# -------------------------------
# Gender baseline (all females survive)
# -------------------------------
# Vectorised: 1 where Sex == "female", 0 for anything else (including NaN).
df_test["Gender_Model"] = (df_test["Sex"] == "female").astype(int)

# -------------------------------
# Compare predictions
# -------------------------------
# The comparison is positional, so both files must have the same number of
# rows; fail with a clear message instead of an opaque pandas alignment error.
# NOTE(review): this assumes both files list passengers in the same order -
# if the predictions file carries an id column, merging on it would be safer.
if len(df_new) != len(df_test):
    raise ValueError(
        f"Row count mismatch: predictions have {len(df_new)} rows, "
        f"test data has {len(df_test)} rows."
    )

diff = int((df_new["Survived"].to_numpy() != df_test["Gender_Model"].to_numpy()).sum())

print(f"Total Rows: {len(df_new)}")
print(f"Differences from Gender Baseline: {diff}")

# Optional: percentage difference (guarded so an empty predictions file
# reports 0% instead of raising ZeroDivisionError)
pct_diff = diff / len(df_new) * 100 if len(df_new) else 0.0
print(f"Difference Percentage: {pct_diff:.2f}%")