In [None]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer                                                                                                                                                                 
from sklearn.preprocessing import OneHotEncoder, StandardScaler                                                                                                                                               
from sklearn.pipeline import Pipeline                                                                                                                                                                         
from sklearn.impute import SimpleImputer

from sklearn.model_selection import KFold, cross_val_score

from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

import time
import warnings

# Single seed for the notebook: applied to NumPy's legacy global RNG here and
# reused below as the random_state of the CV splitter and the estimators.
RANDOM = 123
np.random.seed(seed=RANDOM)

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

In [2]:
# Load the training table and separate the regression target from the features.
TARGET_COL = "outcome"
train = pd.read_csv("../data/CW1_train.csv")
y = train[TARGET_COL]
X = train.drop(columns=[TARGET_COL])

# Partition the feature names by dtype: object columns are treated as
# categorical, anything numeric as numerical.
categorical_cols = list(X.select_dtypes(include=['object']).columns)
numerical_cols = list(X.select_dtypes(include=[np.number]).columns)

print(f'Categorical columns: {categorical_cols}')
print(f'Numerical columns: {numerical_cols}')

Categorical columns: ['cut', 'color', 'clarity']
Numerical columns: ['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4', 'b5', 'a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7', 'b8', 'b9', 'b10']


In [4]:
  # Preprocessor                                                                                                                                                                                                
def _make_preprocessor(numeric_step):
    """Build the shared ColumnTransformer.

    Categorical columns are one-hot encoded (first level of each dropped,
    dense output); ``numeric_step`` is applied to the numerical columns and
    may be any transformer or the string ``'passthrough'``.
    """
    return ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols),
            ('num', numeric_step, numerical_cols),
        ]
    )


# Numerical features left untouched.
preprocessor = _make_preprocessor('passthrough')

# Numerical features standardised.
preprocessor_scaled = _make_preprocessor(StandardScaler())

# Cross-validation setup: 5 shuffled folds, seeded so every model sees the same splits.
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM)

In [None]:
# Candidate regressors, keyed by display name and grouped by the
# preprocessing they are paired with during evaluation.

# Tree ensembles.
# NOTE: the variable name is misspelled ("moldels") but the evaluation loop
# refers to it under this exact name, so it is kept as-is.
tree_moldels = dict(
    RandomForest=RandomForestRegressor(n_estimators=200, random_state=RANDOM),
    HistGradientBoosting=HistGradientBoostingRegressor(random_state=RANDOM),
    GradientBoosting=GradientBoostingRegressor(n_estimators=200, random_state=RANDOM),
)

# Linear, neighbour-based and neural models.
# NOTE: likewise misspelled ("sclae") and referenced under this exact name later.
sclae_models = dict(
    Ridge=Ridge(random_state=RANDOM),
    Lasso=Lasso(random_state=RANDOM),
    ElasticNet=ElasticNet(random_state=RANDOM),
    KNeighbors=KNeighborsRegressor(),
    MLP=MLPRegressor(hidden_layer_sizes=(100,50), max_iter=500, random_state=RANDOM),
)


In [6]:
# Accumulates one summary row (model name, mean/std R2, wall time) per model.
results = []

print("Evaluating tree-based models...")
for name, model in tree_moldels.items():
    pipeline = Pipeline([('prep', preprocessor), ('model', model)])

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='r2', n_jobs=-1)
    elapsed_time = time.perf_counter() - start_time

    # Compute the fold statistics once so the stored row and the printed
    # line are guaranteed to agree.
    mean_r2 = scores.mean()
    std_r2 = scores.std()

    results.append({
        "Model": name,
        "Mean R2": mean_r2,
        "Std R2": std_r2,
        "Time (s)": round(elapsed_time, 2)
    })
    print(f"  {name}: {mean_r2:.4f} ± {std_r2:.4f} ({elapsed_time:.1f}s)")


Evaluating tree-based models...
  RandomForest: 0.4548 ± 0.0141 (26.6s)
  HistGradientBoosting: 0.4602 ± 0.0163 (1.2s)
  GradientBoosting: 0.4686 ± 0.0175 (11.0s)


In [7]:
print("\nEvaluating scaled models...")

# Make the cell idempotent: drop rows left over from a previous run of this
# cell so re-executing it does not duplicate these models in `results`.
results = [row for row in results if row["Model"] not in sclae_models]

for name, model in sclae_models.items():
    pipeline = Pipeline([('prep', preprocessor_scaled), ('model', model)])

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='r2', n_jobs=-1)
    elapsed_time = time.perf_counter() - start_time

    # Compute the fold statistics once so the stored row and the printed
    # line are guaranteed to agree.
    mean_r2 = scores.mean()
    std_r2 = scores.std()

    results.append({
        "Model": name,
        "Mean R2": mean_r2,
        "Std R2": std_r2,
        "Time (s)": round(elapsed_time, 2)
    })
    print(f"  {name}: {mean_r2:.4f} ± {std_r2:.4f} ({elapsed_time:.1f}s)")


Evaluating scaled models...
  Ridge: 0.2825 ± 0.0133 (0.9s)
  Lasso: 0.2550 ± 0.0091 (0.7s)
  ElasticNet: 0.2318 ± 0.0065 (0.1s)
  KNeighbors: 0.0850 ± 0.0155 (0.1s)
  MLP: -0.2005 ± 0.0951 (13.4s)


In [8]:
# Rank all evaluated models by mean R2 (best first) with a fresh 0-based index.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="Mean R2", ascending=False)
    .reset_index(drop=True)
)

print("\nSummary of results:")
print(results_df.to_string(index=False))



Summary of results:
               Model   Mean R2   Std R2  Time (s)
    GradientBoosting  0.468646 0.017460     11.04
HistGradientBoosting  0.460189 0.016262      1.23
        RandomForest  0.454774 0.014112     26.55
               Ridge  0.282544 0.013289      0.85
               Lasso  0.254963 0.009101      0.73
          ElasticNet  0.231820 0.006534      0.05
          KNeighbors  0.084957 0.015477      0.12
                 MLP -0.200509 0.095094     13.40


In [10]:
print("Top 3 models:")
# Take the rank from iteration order rather than from the index label, so the
# numbering stays 1, 2, 3 even if results_df does not carry a 0-based index.
top_models = results_df.head(3)
for rank, (_, row) in enumerate(top_models.iterrows(), start=1):
    print(f"  {rank}. {row['Model']}: {row['Mean R2']:.4f} ± {row['Std R2']:.4f}")

Top 3 models:
  1. GradientBoosting: 0.4686 ± 0.0175
  2. HistGradientBoosting: 0.4602 ± 0.0163
  3. RandomForest: 0.4548 ± 0.0141
