In [None]:
# Notebook-wide imports: pandas, IPython's rich display helper, and the
# project-local helpers from the `src` package.
import pandas as pd
from IPython.display import display

# NOTE(review): only `load_data` is called in the cells visible in this
# notebook; `preprocess_data` and `train_and_evaluate` appear unused here —
# confirm before removing.
from src.preprocessing import load_data, preprocess_data
from src.train_model import train_and_evaluate

# Marker output showing that the import cell ran without raising.
print("Notebook ready.")


In [None]:
# === Load Dataset ===
# Fetch the dataset through the project loader, then show a quick preview
# (first rows, dimensions, and column names) to sanity-check what was loaded.
df = load_data()

display(df.head())
print(f"Shape: {df.shape}")
print(f"Columns: {[*df.columns]}")


In [None]:
# === Data Cleaning & Preprocessing ===
# Produces `df_clean`, the feature matrix `X`, the target `y`, and an 80/20
# train/test split of both.

from sklearn.model_selection import train_test_split

# Columns not useful for regression are removed; errors="ignore" means a
# column that is already absent does not raise.
cols_to_drop = ["RecordID", "HealthImpactClass"]
df_clean = df.drop(columns=cols_to_drop, errors="ignore")

# Target is HealthImpactScore; every remaining column is a feature.
y = df_clean["HealthImpactScore"]
X = df_clean.drop(columns=["HealthImpactScore"])

print(f"Cleaned Data Shape: {df_clean.shape}")
print(f"Feature Shape: {X.shape}")
print(f"Target Shape: {y.shape}")

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTrain: {X_train.shape} {y_train.shape}")
print(f"Test: {X_test.shape} {y_test.shape}")


## Baseline Model Training & Comparison

In [None]:
# === Baseline Model Comparison ===

!pip install xgboost lightgbm --quiet

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
    "ExtraTrees": ExtraTreesRegressor(n_estimators=200, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(),
    "KNN": KNeighborsRegressor(),
    "SVR": SVR(kernel='rbf'),
    "XGBoost": XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8),
    "LightGBM": LGBMRegressor(n_estimators=300, learning_rate=0.05)
}

results = []

print("Training baseline models...\n")

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    results.append([name, mae, rmse, r2])
    print(f"{name}: MAE={mae:.4f}, RMSE={rmse:.4f}, R²={r2:.4f}")

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=["Model", "MAE", "RMSE", "R2"])
print("\n=== Baseline Model Comparison ===")
display(results_df.sort_values("R2", ascending=False))


## LightGBM Hyperparameter Tuning

In [None]:
# === LightGBM Hyperparameter Tuning ===
# Exhaustive 3-fold grid search (216 parameter combinations) over LightGBM
# settings, scored by R², followed by one evaluation of the refit best model
# on the held-out test split.

from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

print("Running LightGBM Grid Search...")

lgbm = LGBMRegressor(
    random_state=42,
    # LightGBM only applies `subsample` (bagging_fraction) when bagging is
    # enabled through `subsample_freq` (bagging_freq) > 0. With the default of
    # 0, the `subsample` values in the grid would be ignored and half of the
    # candidates would be exact duplicates of the other half.
    subsample_freq=1,
    # The grid search below already runs candidates in parallel on all cores;
    # keep each individual fit single-threaded to avoid thread contention.
    n_jobs=1,
)

param_grid_lgbm = {
    "n_estimators": [300, 500, 800],
    "learning_rate": [0.05, 0.1],
    "max_depth": [-1, 5, 10],
    "num_leaves": [31, 50, 70],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.7, 1.0]
}

grid_lgbm = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid_lgbm,
    scoring="r2",
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_lgbm.fit(X_train, y_train)

print("\nBest Params (LightGBM):", grid_lgbm.best_params_)
print("Best CV R² (LightGBM):", grid_lgbm.best_score_)

# best_estimator_ is the model refit on the training data passed to fit()
# using the best parameter combination.
best_lgbm = grid_lgbm.best_estimator_

# Evaluate on the test set
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

pred_lgbm = best_lgbm.predict(X_test)

mae_lgbm = mean_absolute_error(y_test, pred_lgbm)
# RMSE is derived as the square root of the mean squared error.
rmse_lgbm = np.sqrt(mean_squared_error(y_test, pred_lgbm))
r2_lgbm = r2_score(y_test, pred_lgbm)

print("\n=== Tuned LightGBM Results ===")
print(f"MAE:  {mae_lgbm:.4f}")
print(f"RMSE: {rmse_lgbm:.4f}")
print(f"R²:   {r2_lgbm:.4f}")


In [None]:
# === XGBoost Hyperparameter Tuning ===
# Exhaustive 3-fold grid search (243 parameter combinations) over XGBoost
# settings, scored by R². The refit best model is exposed as `best_xgb`.

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

print("Running XGBoost Grid Search...")

xgb_model = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    # Parallelism is delegated to GridSearchCV (n_jobs=-1 below). Letting each
    # XGBoost fit also use every core creates thread contention, which slows
    # both down, so individual fits are kept single-threaded.
    n_jobs=1
)

param_grid_xgb = {
    "n_estimators": [300, 600, 900],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [4, 6, 8],
    "subsample": [0.7, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.9, 1.0]
}

grid_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    cv=3,
    scoring="r2",
    verbose=1,
    n_jobs=-1
)

grid_xgb.fit(X_train, y_train)

print("\nBest Params (XGBoost):", grid_xgb.best_params_)
print("Best CV R² (XGBoost):", grid_xgb.best_score_)

# best_estimator_ is the model refit on the training data passed to fit()
# using the best parameter combination.
best_xgb = grid_xgb.best_estimator_


In [None]:
# === Evaluate Final Tuned XGBoost Model ===
# Scores the grid-search winner (`best_xgb`) on the held-out test split and
# reports MAE, RMSE and R².

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

print("Evaluating final tuned XGBoost model...")

# Test-set predictions from the tuned model.
y_pred_xgb = best_xgb.predict(X_test)

# The three metrics are independent of each other; RMSE is sqrt(MSE).
r2 = r2_score(y_test, y_pred_xgb)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
mae = mean_absolute_error(y_test, y_pred_xgb)

# Emit the whole report in one call, one item per line.
print(
    "\n=== Tuned XGBoost Results ===",
    f"MAE:  {mae:.4f}",
    f"RMSE: {rmse:.4f}",
    f"R²:   {r2:.4f}",
    sep="\n",
)


In [None]:
# === LightGBM Feature Importance ===
# Tabulates and plots the feature importances reported by the tuned LightGBM
# model, ordered from most to least important.

import matplotlib.pyplot as plt
import pandas as pd

# The tuned model produced by the LightGBM grid search.
model_lgb = best_lgbm

# Pair each training column with its importance, then rank descending.
importance_df = pd.DataFrame(
    {
        "feature": X_train.columns,
        "importance": model_lgb.feature_importances_,
    }
)
importance_df = importance_df.sort_values(by="importance", ascending=False)

print("\n=== LightGBM Feature Importance ===")
print(importance_df)

# Horizontal bar chart; the y-axis is flipped so the top-ranked feature is
# drawn at the top of the figure.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df["feature"], importance_df["importance"])
ax.invert_yaxis()
ax.set_title("LightGBM Feature Importance")
ax.set_xlabel("Importance")
plt.show()
