## 1. Importing Dependencies

In [None]:
import os
from pathlib import Path

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from scipy import stats
from scipy.stats import norm
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from xgboost import XGBRegressor

## 2. Load The Data

We read from the 04-encoded folder we just created.

In [None]:
# Locate the encoded data relative to the project root, which is taken to be
# the parent of the current working directory.
PROJECT_ROOT = Path(".").resolve().parent
DATA_DIR = PROJECT_ROOT.joinpath("data", "04-encoded")

# Feature matrices are kept as DataFrames so columns stay addressable by name.
X_train = pd.read_csv(DATA_DIR.joinpath("X_train_encoded.csv"))
X_val = pd.read_csv(DATA_DIR.joinpath("X_val_encoded.csv"))

# Targets are read as DataFrames and then flattened into 1-D NumPy arrays.
y_train = pd.read_csv(DATA_DIR.joinpath("y_train.csv")).to_numpy().ravel()
y_val = pd.read_csv(DATA_DIR.joinpath("y_val.csv")).to_numpy().ravel()

print(f"Data Loaded. X_train shape: {X_train.shape}")

## 3. Scaling The Data

### 1\. The Safety Check (Age Imputation)

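**Crucial:** If you didn't explicitly fill missing values in the Age column during your EDA/Feature Engineering phase, they will not be fixed by scaling: `StandardScaler` ignores NaNs when fitting and passes them through unchanged, so any downstream model that cannot handle NaN will **crash**.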

In [None]:
# Impute missing ages in BOTH splits using the median computed on TRAIN only,
# so no statistic from the validation split is used.
# Each split is checked on its own: the validation set can contain missing
# ages even when the training set has none.
n_missing_train = int(X_train["age"].isna().sum())
n_missing_val = int(X_val["age"].isna().sum())

if n_missing_train or n_missing_val:
    print(
        f"Found {n_missing_train + n_missing_val} missing ages "
        f"(train: {n_missing_train}, val: {n_missing_val}). Filling with Median..."
    )

    # Calculate median on TRAIN
    age_median = X_train["age"].median()

    # Fill on all
    X_train["age"] = X_train["age"].fillna(age_median)
    X_val["age"] = X_val["age"].fillna(age_median)

# Verify the result instead of assuming it: if the train median itself is NaN
# (no observed ages in train), fillna leaves the gaps in place.
n_remaining = int(X_train["age"].isna().sum() + X_val["age"].isna().sum())
if n_remaining:
    print(f"WARNING: {n_remaining} missing values remain in Age after imputation.")
else:
    print("✅ No missing values in Age.")

### 2\. Scale & Save (The MLOps Step)

We only scale the continuous columns (age, fare, familysize). We do **not** touch the binary columns (like sex\_male, pclass\_2).

In [None]:
# 1. Columns to standardize; every other column is left untouched.
scale_cols = ["age", "fare", "familysize"]

# 2. Initialize and fit the scaler on TRAIN only, so the validation split
#    does not influence the learned mean/variance.
scaler = StandardScaler()
scaler.fit(X_train[scale_cols])

# 3. Transform both splits with the train-fitted scaler.
#    Plain column assignment replaces the columns outright, so any column
#    stored with an integer dtype simply becomes float. Writing the float
#    result through .loc[:, cols] instead sets values into the existing
#    columns and triggers pandas' incompatible-dtype setitem deprecation.
X_train[scale_cols] = scaler.transform(X_train[scale_cols])
X_val[scale_cols] = scaler.transform(X_val[scale_cols])

# 4. Persist the artifacts needed to reproduce this preprocessing later.
#    Both files go through MODEL_DIR so they always land in the same folder,
#    regardless of how the working directory relates to the project root.
MODEL_DIR = PROJECT_ROOT / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, MODEL_DIR / "scaler.pkl")
# Save the list of columns (in order) that the model expects, so that
# incoming data can be aligned to the same layout.
joblib.dump(X_train.columns.tolist(), MODEL_DIR / "model_columns.pkl")

print("Model columns saved. We will use this to align the API input.")

print(f"Data scaled and scaler saved to {MODEL_DIR}/scaler.pkl")
print(X_train[scale_cols].head())