# Modeling Experimentation

## Imports

In [4]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Machine learning libraries
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

In [5]:
# Useful paths
from pathlib import Path

# Project directories, all relative to the folder this notebook runs from.
raw_data_folder = Path("..", "data", "raw")
processed_data_folder = Path("..", "data", "processed")
submission_folder = Path("..", "submissions")
figures_folder = Path("..", "reports", "figures")

## Data

### Load data

In [6]:
# Read the three CSV files from the raw data folder, in this order:
# training table, rows to predict, and the sample submission.
raw_data, to_predict, sample_sub = (
    pd.read_csv(raw_data_folder / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

### Data split

* The dataset is small, so the split cannot be stratified by `y`: some target values occur only once, and a stratified split needs at least two samples per class.
* Binning the temperatures into ranges and stratifying on those bins may be worth exploring.

In [12]:
SEED = 42

# Separate the target ("Tm") from the features. The "id" and "SMILES" columns
# are kept in their own variables and are not used as model inputs.
X = raw_data.drop(columns=["id", "Tm", "SMILES"])
X_id = raw_data["id"]
X_SMILES = raw_data["SMILES"]
y = raw_data["Tm"]

# Two-stage split: hold out 30% of the rows, then halve that holdout into the
# validation and test sets (roughly 70/15/15 overall). The intermediate holdout
# gets its own name so that X_test / y_test only ever mean the final test set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=SEED, shuffle=True
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SEED, shuffle=True
)

# Sanity check: the split sizes must add up to the full dataset.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

In [13]:
# (rows, columns) of the train, validation and test feature matrices, in that order.
tuple(split.shape for split in (X_train, X_val, X_test))

((1863, 424), (399, 424), (400, 424))

## Model Selection

**TODO:** finish this note — the sentence "We know from …" was left incomplete.

In [17]:
# Baseline: start with a simple random forest.
# Note: despite the "clf" suffix, rf_clf is a RandomForestRegressor.
rf_clf = RandomForestRegressor(n_estimators=100, random_state=SEED).fit(X_train, y_train)

# Score the baseline on the validation split.
y_pred = rf_clf.predict(X_val)
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)

In [None]:
# Summary of results: training error next to validation error, followed by the
# mean and standard deviation of the validation predictions.
print(
    f"Train MAE: {mean_absolute_error(y_train, rf_clf.predict(X_train)):.2f} K",
    f"Validation MAE: {mae:.2f} K",
    f"Mean predicted Tm: {y_pred.mean():.2f} K",
    f"Std predicted Tm: {y_pred.std():.2f} K",
    sep="\n",
)

Train MAE: 15.05 K
Validation MAE: 37.33 K
Mean predicted Tm: 269.53 K
Std predicted Tm: 71.29 K


The train MAE (15.05 K) is far below the validation MAE (37.33 K), so the model is clearly overfitting.