In [19]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import Ridge

# Single seed reused by the CV splitter and by every estimator that takes a random_state.
RANDOM = 123
# Also seed NumPy's legacy global RNG so any implicit np.random draws are repeatable.
np.random.seed(RANDOM)

In [20]:
# Read the train and test CSVs in one pass (order matters: train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/CW1_train.csv', '../data/CW1_test.csv')
)

# Split the training frame into the regression target and the predictor columns.
# train_df itself is left untouched; drop() returns a new frame.
TARGET_COL = 'outcome'
y = train_df[TARGET_COL]
X = train_df.drop(columns=[TARGET_COL])

In [21]:
# Partition the predictors by dtype so each group can get its own preprocessing.
#
# 'string' is requested alongside 'object' because pandas 3 stores text columns
# with a dedicated string dtype. Asking for 'object' alone emits the
# string-migration deprecation warning and is slated to stop matching text
# columns; the pair below works on both pandas 2 and pandas 3.
categorical_cols = X.select_dtypes(include=['object', 'string']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f'Categorical columns: {categorical_cols}')
print(f'Numerical columns: {numerical_cols}')

Categorical columns: ['cut', 'color', 'clarity']
Numerical columns: ['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4', 'b5', 'a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7', 'b8', 'b9', 'b10']


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_cols = X.select_dtypes(include=['object']).columns.tolist()


In [22]:
def _one_hot():
    """Return a fresh one-hot encoder with the settings shared by both preprocessors."""
    # NOTE(review): with drop='first' an unseen category is encoded as all zeros,
    # which looks the same as the dropped reference level - confirm this is intended.
    return OneHotEncoder(handle_unknown='ignore', drop='first')


# Linear models: standardise the numeric columns, one-hot encode the categoricals.
preprocess_linear = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_cols),
        ('cat', _one_hot(), categorical_cols),
    ],
    remainder='drop',
)

# Tree models: numeric columns pass through unscaled, categoricals are one-hot encoded.
preprocess_tree = ColumnTransformer(
    [
        ('num', 'passthrough', numerical_cols),
        ('cat', _one_hot(), categorical_cols),
    ],
    remainder='drop',
)

In [23]:
# One shuffled 5-fold splitter, seeded, so every model is scored on identical folds.
cv = KFold(n_splits=5, random_state=RANDOM, shuffle=True)


def cv_r2(model_pipeline):
    """Cross-validate ``model_pipeline`` on the notebook-level ``X``/``y`` and summarise R².

    Parameters
    ----------
    model_pipeline : estimator
        Unfitted estimator (here a ``Pipeline``) handed to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-fold R² scores; the std is the population
        standard deviation (NumPy's default ``ddof=0``).
    """
    fold_scores = cross_val_score(
        estimator=model_pipeline,
        X=X,
        y=y,
        scoring='r2',
        cv=cv,
        n_jobs=-1,  # evaluate folds in parallel on all available cores
    )
    return np.mean(fold_scores), np.std(fold_scores)

# Baseline 1: Regularized Linear Model

In [24]:
# Linear baseline: the linear preprocessor feeding an L2-penalised regression.
ridge_model = Ridge(alpha=1.0, random_state=RANDOM)
ridge = Pipeline([
    ('preprocess', preprocess_linear),
    ('model', ridge_model),
])

# Score on the shared CV folds and report mean ± std of R².
ridge_mean, ridge_std = cv_r2(ridge)
print(f'Ridge Regression CV R2: {ridge_mean:.4f} ± {ridge_std:.4f}')

Ridge Regression CV R2: 0.2825 ± 0.0133


# Random Forest

In [25]:
# Random forest hyper-parameters, kept together so they are easy to tweak.
rf_params = dict(
    n_estimators=600,
    max_depth=None,       # grow trees until the leaf-size constraint stops them
    min_samples_leaf=2,
    random_state=RANDOM,
    n_jobs=-1,
)

rf = Pipeline([
    ("preprocess", preprocess_tree),
    ("model", RandomForestRegressor(**rf_params)),
])

# Score on the shared CV folds and report mean ± std of R².
rf_mean, rf_std = cv_r2(rf)
print(f"RandomForest CV R²: {rf_mean:.4f} ± {rf_std:.4f}")


RandomForest CV R²: 0.4566 ± 0.0142


In [26]:
# Histogram gradient boosting hyper-parameters, kept together so they are easy to tweak.
hgb_params = dict(
    learning_rate=0.05,
    max_depth=6,
    max_iter=800,
    random_state=RANDOM,
)

hgb = Pipeline([
    ("preprocess", preprocess_tree),
    ("model", HistGradientBoostingRegressor(**hgb_params)),
])

# Score on the shared CV folds and report mean ± std of R².
hgb_mean, hgb_std = cv_r2(hgb)
print(f"HistGB CV R²: {hgb_mean:.4f} ± {hgb_std:.4f}")


HistGB CV R²: 0.4371 ± 0.0195


In [27]:
# One (name, mean, std) row per model, in the order the models were evaluated.
summary_rows = [
    ("Ridge", ridge_mean, ridge_std),
    ("RandomForest", rf_mean, rf_std),
    ("HistGradientBoosting", hgb_mean, hgb_std),
]

# Rank the models from best to worst mean cross-validated R².
results = (
    pd.DataFrame(summary_rows, columns=["model", "cv_r2_mean", "cv_r2_std"])
    .sort_values("cv_r2_mean", ascending=False)
)

results


Unnamed: 0,model,cv_r2_mean,cv_r2_std
1,RandomForest,0.456558,0.014241
2,HistGradientBoosting,0.437065,0.019519
0,Ridge,0.282544,0.013289


Among the evaluated models, Random Forest achieved the highest cross-validated R² and served as a strong nonlinear baseline.

Given its strong and stable performance across folds, the Random Forest regressor was selected as the initial candidate for the final model.