# Modeling Experimentation

## Imports

In [1]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# Machine learning libraries
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

In [3]:
# Useful paths
from pathlib import Path

raw_data_folder = Path("../data/raw")
processed_data_folder = Path("../data/processed") 
submission_folder = Path("../submissions")
figures_folder = Path("../reports/figures")

## Data

### Load data

In [4]:
raw_data = pd.read_csv(raw_data_folder / "train.csv")
to_predict = pd.read_csv(raw_data_folder / "test.csv")
sample_sub = pd.read_csv(raw_data_folder / "sample_submission.csv")

### Data split

* There is not a lot of data so it is not possible to do the split stratifying by y because some classes have unique values. 
* Dividing temperatures by ranges may be worth exploring.

In [5]:
SEED = 42

X = raw_data.drop(columns=["id", "Tm", "SMILES"])
X_id = raw_data["id"]
X_SMILES = raw_data["SMILES"]
y = raw_data["Tm"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED, shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=SEED, shuffle=True)

In [6]:
X_train.shape, X_val.shape, X_test.shape

((1863, 424), (399, 424), (400, 424))

## Model Selection

We know from the EDA notebook:
* Mean (Tm):    278.26  
* Std Dev (Tm): 85.12

In [23]:
def evaluate_simple_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    train_mae = mean_absolute_error(y_train, model.predict(X_train))
    val_mae = mean_absolute_error(y_val, y_pred)

    pred_mean = y_pred.mean()
    actual_mean = y_val.mean()

    pred_std = y_pred.std()
    actual_std = y_val.std()

    # Results Summary
    print(f"Train MAE: {train_mae:.2f} K")
    print(f"Validation MAE: {val_mae:.2f} K")

    print("-"*20)

    print(f"Predicted Mean Tm: {pred_mean:.2f} K")
    print(f"Actual Mean Tm: {actual_mean:.2f} K")
    print(f"Relative Error in Mean: {abs(pred_mean - actual_mean) / actual_mean * 100:.2f} %")
    
    print("-"*20)

    print(f"Predicted Std Tm: {pred_std:.2f} K")
    print(f"Actual Std Tm: {actual_std:.2f} K")
    print(f"Relative Error in Std: {abs(pred_std - actual_std) / actual_std * 100:.2f} %")

    return model, val_mae, train_mae, pred_mean, pred_std


In [25]:
# Let's try some simple models first

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=SEED),
    "XGBoost": xgb.XGBRegressor(n_estimators=100, random_state=SEED),
    "LightGBM": lgb.LGBMRegressor(n_estimators=100, random_state=SEED),
    "CatBoost": cb.CatBoostRegressor(n_estimators=100, random_state=SEED, verbose=0)
}
results = {}

In [30]:
for name, model in models.items():
    print(f"Evaluating {name}...")
    trained_model, val_mae, train_mae, pred_mean, pred_std = evaluate_simple_model(model, X_train, y_train, X_val, y_val)
    results[name] = {
        "model": trained_model,
        "val_mae": val_mae,
        "train_mae": train_mae,
        "pred_mean": pred_mean,
        "pred_std": pred_std
    }
    print("\n")

Evaluating Linear Regression...
Train MAE: 30.20 K
Validation MAE: 37.24 K
--------------------
Predicted Mean Tm: 275.75 K
Actual Mean Tm: 276.07 K
Relative Error in Mean: 0.12 %
--------------------
Predicted Std Tm: 76.70 K
Actual Std Tm: 89.29 K
Relative Error in Std: 14.10 %


Evaluating Ridge Regression...
Train MAE: 31.96 K
Validation MAE: 38.39 K
--------------------
Predicted Mean Tm: 275.87 K
Actual Mean Tm: 276.07 K
Relative Error in Mean: 0.07 %
--------------------
Predicted Std Tm: 68.74 K
Actual Std Tm: 89.29 K
Relative Error in Std: 23.02 %


Evaluating Lasso Regression...
Train MAE: 36.42 K
Validation MAE: 42.68 K
--------------------
Predicted Mean Tm: 276.59 K
Actual Mean Tm: 276.07 K
Relative Error in Mean: 0.19 %
--------------------
Predicted Std Tm: 61.52 K
Actual Std Tm: 89.29 K
Relative Error in Std: 31.10 %


Evaluating Random Forest...
Train MAE: 15.05 K
Validation MAE: 37.33 K
--------------------
Predicted Mean Tm: 269.53 K
Actual Mean Tm: 276.07 K
Relative

In [31]:
# Save true values statistics for later comparison
true_val_mean = y_val.mean()
true_val_std = y_val.std()

In [37]:
# Show results
results_df = pd.DataFrame(results).T
results_df["relative val-train mae"] = (results_df["val_mae"] - results_df["train_mae"]) / results_df["train_mae"]
results_df["distance to true mean"] = abs(results_df["pred_mean"] - true_val_mean) / true_val_mean
results_df["distance to true std"] = abs(results_df["pred_std"] - true_val_std) / true_val_std

try:
    results_df = results_df.drop(columns=["model"])
except:
    pass

In [43]:
order_col = "distance to true mean"
results_df = results_df.sort_values(by=order_col)
results_df

Unnamed: 0,val_mae,train_mae,pred_mean,pred_std,relative val-train mae,distance to true mean,distance to true std
Ridge Regression,38.386378,31.960609,275.87399,68.738413,0.201053,0.000714,0.230163
LightGBM,44.597371,34.314312,276.327549,63.404069,0.299673,0.000929,0.289905
Linear Regression,37.242059,30.195999,275.750818,76.702514,0.233344,0.00116,0.140969
Lasso Regression,42.682576,36.417954,276.592482,61.519464,0.17202,0.001889,0.311011
CatBoost,40.697272,31.83918,276.860632,61.62271,0.278214,0.00286,0.309855
XGBoost,36.654803,21.844591,277.391754,69.115356,0.677981,0.004784,0.225941
Random Forest,37.333986,15.054103,269.531247,71.293209,1.479987,0.023689,0.20155
