In [15]:
import numpy as np
import pandas as pd

# preprocessing
from sklearn.compose import ColumnTransformer                                                                                                                                                           
from sklearn.preprocessing import OneHotEncoder                                                                                                                                                         
from sklearn.pipeline import Pipeline    

# Models                                                                                                                                                                                                
from sklearn.ensemble import (                                                                                                                                                                          
    GradientBoostingRegressor,                                                                                                                                                                          
    HistGradientBoostingRegressor,                                                                                                                                                                      
    RandomForestRegressor                                                                                                                                                                               
)              

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
                                                                                                                                                                                                        
# Tuning                                                                                                                                                                                                
from sklearn.model_selection import (                                                                                                                                                                   
    cross_val_score,                                                                                                                                                                                    
    KFold,                                                                                                                                                                                              
    RandomizedSearchCV,                                                                                                                                                                                 
    GridSearchCV                                                                                                                                                                                        
)       

# Utilities                                                                                                                                                                                             
import time                                                                                                                                                                                             
import warnings                                                                                                                                                                                         
warnings.filterwarnings('ignore')                                                                                                                                                                       
                                                                                                                                                                                                        
# Random seed                                                                                                                                                                                           
RANDOM = 123                                                                                                                                                                                            
np.random.seed(RANDOM)        

In [16]:
train = pd.read_csv('../data/CW1_train.csv')
X = train.drop(columns=['outcome', 'price', 'x', 'y', 'z'])  # Reduced features                                                                                                                            
y = train['outcome']


# Feature groups                                                                                                                                                                                        
categorical_cols = ['cut', 'color', 'clarity']                                                                                                                                                          
numeric_cols = [col for col in X.columns if col not in categorical_cols]                                                                                                                                
                                                                                                                                                                                                        
# Preprocessor                                                                                                                                                                                          
preprocessor = ColumnTransformer([                                                                                                                                                                      
    ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols),                                                                                                                        
    ('num', 'passthrough', numeric_cols)                                                                                                                                                                
])                                                                                                                                                                                                      
                                                                                                                                                                                                        
# CV setup                                                                                                                                                                                              
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM)             


In [None]:
 # Define parameter search space                                                                                                                                                                         
param_dist_gb = {                                                                                                                                                                                       
    'model__n_estimators': [100, 200, 300, 400, 500],                                                                                                                                                   
    'model__learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],                                                                                                                                               
    'model__max_depth': [3, 4, 5, 6, 7, 8],                                                                                                                                                             
    'model__min_samples_split': [2, 5, 10, 20],                                                                                                                                                         
    'model__min_samples_leaf': [1, 2, 4, 8],                                                                                                                                                            
    'model__subsample': [0.7, 0.8, 0.9, 1.0],                                                                                                                                                           
    'model__max_features': ['sqrt', 'log2', 0.5, 0.7, None]                                                                                                                                             
}                                                                                                                                                                                                       
                                                                                                                                                                                                          
  # Create pipeline                                                                                                                                                                                       
pipe_gb = Pipeline([                                                                                                                                                                                    
    ('prep', preprocessor),                                                                                                                                                                             
    ('model', GradientBoostingRegressor(random_state=RANDOM))                                                                                                                                           
])                                                                                                                                                                                                      
                                                                                                                                                                                                          
  # Randomized search (100 iterations)                                                                                                                                                                    
print("Tuning GradientBoosting...")                                                                                                                                                                     
start = time.time()                                                                                                                                                                                     
                                                                                                                                                                                                        
search_gb = RandomizedSearchCV(                                                                                                                                                                         
    pipe_gb,                                                                                                                                                                                            
    param_distributions=param_dist_gb,                                                                                                                                                                  
    n_iter=100,                                                                                                                                                                                         
    cv=cv,                                                                                                                                                                                              
    scoring='r2',                                                                                                                                                                                       
    n_jobs=-1,                                                                                                                                                                                          
    random_state=RANDOM,                                                                                                                                                                                
    verbose=1                                                                                                                                                                                           
)                                                                                                                                                                                                       
                                                                                                                                                                                                        
search_gb.fit(X, y)                                                                                                                                                                                     
elapsed = time.time() - start                                                                                                                                                                           
                                                                                                                                                                                                        
print(f"\nCompleted in {elapsed:.1f}s")                                                                                                                                                                 
print(f"Best CV R²: {search_gb.best_score_:.4f}")                                                                                                                                                       
print(f"\nBest parameters:")                                                                                                                                                                            
for param, value in search_gb.best_params_.items():                                                                                                                                                     
    print(f"  {param}: {value}")         

Tuning GradientBoosting...
Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
# Focused grid around best parameters                                                                                                                                                                   
param_grid_gb = {                                                                                                                                                                                       
    'model__n_estimators': [150, 200, 250, 300],                                                                                                                                                        
    'model__learning_rate': [0.03, 0.05, 0.07],                                                                                                                                                         
    'model__max_depth': [2, 3, 4],                                                                                                                                                                      
    'model__min_samples_split': [2, 3],                                                                                                                                                                 
    'model__min_samples_leaf': [3, 4, 5],                                                                                                                                                               
    'model__subsample': [0.85, 0.9, 0.95],                                                                                                                                                              
    'model__max_features': [None, 0.8]                                                                                                                                                                  
}                                                                                                                                                                                                       
                                                                                                                                                                                                        
print(f"Grid size: {np.prod([len(v) for v in param_grid_gb.values()])} combinations")                                                                                                                   
                                                                                                                                                                                                        
# Grid search                                                                                                                                                                                           
print("\nRefining GradientBoosting...")                                                                                                                                                                 
start = time.time()                                                                                                                                                                                     
                                                                                                                                                                                                        
grid_gb = GridSearchCV(                                                                                                                                                                                 
    pipe_gb,                                                                                                                                                                                            
    param_grid=param_grid_gb,                                                                                                                                                                           
    cv=cv,                                                                                                                                                                                              
    scoring='r2',                                                                                                                                                                                       
    n_jobs=-1,                                                                                                                                                                                          
    verbose=1                                                                                                                                                                                           
)                                                                                                                                                                                                       
                                                                                                                                                                                                        
grid_gb.fit(X, y)                                                                                                                                                                                       
elapsed = time.time() - start                                                                                                                                                                           
                                                                                                                                                                                                        
print(f"\nCompleted in {elapsed:.1f}s")                                                                                                                                                                 
print(f"Best CV R²: {grid_gb.best_score_:.4f}")                                                                                                                                                         
print(f"\nBest parameters:")                                                                                                                                                                            
for param, value in grid_gb.best_params_.items():                                                                                                                                                       
    print(f"  {param}: {value}") 

Grid size: 1296 combinations

Refining GradientBoosting...
Fitting 5 folds for each of 1296 candidates, totalling 6480 fits

Completed in 4032.8s
Best CV R²: 0.4741

Best parameters:
  model__learning_rate: 0.05
  model__max_depth: 2
  model__max_features: None
  model__min_samples_leaf: 5
  model__min_samples_split: 2
  model__n_estimators: 300
  model__subsample: 0.85


In [None]:
from xgboost import XGBRegressor

In [None]:
pipe_xgb = Pipeline([                                                                                                                                   
    ('prep', preprocessor),                                                                                                                             
    ('model', XGBRegressor(random_state=RANDOM, n_jobs=-1))                                                                                             
])                                                                                                                                                      
                                                                                                                                                        
# Parameter search space                                                                                                                                
param_dist_xgb = {                                                                                                                                      
    'model__n_estimators': [100, 200, 300, 400, 500],                                                                                                   
    'model__learning_rate': [0.01, 0.03, 0.05, 0.1, 0.15],                                                                                              
    'model__max_depth': [2, 3, 4, 5, 6],                                                                                                                
    'model__min_child_weight': [1, 3, 5, 7],                                                                                                            
    'model__subsample': [0.7, 0.8, 0.9, 1.0],                                                                                                           
    'model__colsample_bytree': [0.7, 0.8, 0.9, 1.0],                                                                                                    
    'model__reg_alpha': [0, 0.01, 0.1, 1],                                                                                                              
    'model__reg_lambda': [0.1, 1, 5, 10]                                                                                                                
}                                                                                                                                                       
                                                                                                                                                        
print("Tuning XGBoost...")                                                                                                                              
start = time.time()                                                                                                                                     
                                                                                                                                                        
search_xgb = RandomizedSearchCV(                                                                                                                        
    pipe_xgb,                                                                                                                                           
    param_distributions=param_dist_xgb,                                                                                                                 
    n_iter=100,                                                                                                                                         
    cv=cv,                                                                                                                                              
    scoring='r2',                                                                                                                                       
    n_jobs=-1,                                                                                                                                          
    random_state=RANDOM,                                                                                                                                
    verbose=1                                                                                                                                           
)                                                                                                                                                       
                                                                                                                                                        
search_xgb.fit(X, y)                                                                                                                                    
elapsed = time.time() - start                                                                                                                           
                                                                                                                                                        
print(f"\nCompleted in {elapsed:.1f}s")                                                                                                                 
print(f"Best CV R²: {search_xgb.best_score_:.4f}")                                                                                                      
print(f"\nBest parameters:")                                                                                                                            
for param, value in search_xgb.best_params_.items():                                                                                                    
    print(f"  {param}: {value}")  

Tuning XGBoost...
Fitting 5 folds for each of 100 candidates, totalling 500 fits



Completed in 35.2s
Best CV R²: 0.4739

Best parameters:
  model__subsample: 0.7
  model__reg_lambda: 5
  model__reg_alpha: 1
  model__n_estimators: 400
  model__min_child_weight: 1
  model__max_depth: 2
  model__learning_rate: 0.03
  model__colsample_bytree: 0.7


In [None]:
from lightgbm import LGBMRegressor

In [7]:
pipe_lgb = Pipeline([                                                                                                                                        
      ('prep', preprocessor),                                                                                                                                  
      ('model', LGBMRegressor(random_state=RANDOM, n_jobs=-1, verbose=-1))                                                                                     
  ])                                                                                                                                                           
                                                                                                                                                            
param_dist_lgb = {                                                                                                                                           
    'model__n_estimators': [100, 200, 300, 400, 500],                                                                                                        
    'model__learning_rate': [0.01, 0.03, 0.05, 0.1],                                                                                                         
    'model__max_depth': [2, 3, 4, 5, -1],                                                                                                                    
    'model__num_leaves': [7, 15, 31, 63],                                                                                                                    
    'model__min_child_samples': [5, 10, 20, 30],                                                                                                             
    'model__subsample': [0.7, 0.8, 0.9, 1.0],                                                                                                                
    'model__colsample_bytree': [0.7, 0.8, 0.9, 1.0],                                                                                                         
    'model__reg_alpha': [0, 0.01, 0.1, 1],                                                                                                                   
    'model__reg_lambda': [0, 0.1, 1, 5]                                                                                                                      
}                                                                                                                                                            
                                                                                                                                                            
print("Tuning LightGBM...")                                                                                                                                  
start = time.time()                                                                                                                                          
                                                                                                                                                            
search_lgb = RandomizedSearchCV(                                                                                                                             
    pipe_lgb,                                                                                                                                                
    param_distributions=param_dist_lgb,                                                                                                                      
    n_iter=100,                                                                                                                                              
    cv=cv,                                                                                                                                                   
    scoring='r2',                                                                                                                                            
    n_jobs=-1,                                                                                                                                               
    random_state=RANDOM,                                                                                                                                     
    verbose=1                                                                                                                                                
)                                                                                                                                                            
                                                                                                                                                            
search_lgb.fit(X, y)                                                                                                                                         
elapsed = time.time() - start                                                                                                                                
                                                                                                                                                            
print(f"\nCompleted in {elapsed:.1f}s")                                                                                                                      
print(f"Best CV R²: {search_lgb.best_score_:.4f}")                                                                                                           
print(f"\nBest parameters:")                                                                                                                                 
for param, value in search_lgb.best_params_.items():                                                                                                         
    print(f"  {param}: {value}") 

Tuning LightGBM...
Fitting 5 folds for each of 100 candidates, totalling 500 fits

Completed in 132.8s
Best CV R²: 0.4753

Best parameters:
  model__subsample: 0.8
  model__reg_lambda: 0
  model__reg_alpha: 0.01
  model__num_leaves: 7
  model__n_estimators: 300
  model__min_child_samples: 20
  model__max_depth: 5
  model__learning_rate: 0.03
  model__colsample_bytree: 0.8


In [11]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge


In [12]:
gb_best = Pipeline([                                                                                                                                         
    ('prep', preprocessor),                                                                                                                                  
    ('model', GradientBoostingRegressor(                                                                                                                     
        learning_rate=0.05, max_depth=2, n_estimators=300,                                                                                                   
        min_samples_leaf=5, subsample=0.85, random_state=RANDOM                                                                                              
    ))                                                                                                                                                       
])                                                                                                                                                           
                                                                                                                                                            
xgb_best = Pipeline([                                                                                                                                        
    ('prep', preprocessor),                                                                                                                                  
    ('model', XGBRegressor(                                                                                                                                  
        learning_rate=0.03, max_depth=2, n_estimators=400,                                                                                                   
        min_child_weight=1, subsample=0.7, colsample_bytree=0.7,                                                                                             
        reg_alpha=1, reg_lambda=5, random_state=RANDOM, n_jobs=-1                                                                                            
    ))                                                                                                                                                       
])                                                                                                                                                           
                                                                                                                                                            
lgb_best = Pipeline([                                                                                                                                        
    ('prep', preprocessor),                                                                                                                                  
    ('model', LGBMRegressor(                                                                                                                                 
        learning_rate=0.03, max_depth=5, n_estimators=300,                                                                                                   
        num_leaves=7, min_child_samples=20, subsample=0.8,                                                                                                   
        colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=0,                                                                                                  
        random_state=RANDOM, n_jobs=-1, verbose=-1                                                                                                           
    ))                                                                                                                                                       
])                        

In [13]:
stack = StackingRegressor(                                                                                                                                   
    estimators=[                                                                                                                                             
        ('gb', gb_best),                                                                                                                                     
        ('xgb', xgb_best),                                                                                                                                   
        ('lgb', lgb_best)                                                                                                                                    
    ],                                                                                                                                                       
    final_estimator=Ridge(alpha=1.0),                                                                                                                        
    cv=5,                                                                                                                                                    
    n_jobs=-1                                                                                                                                                
)       

In [14]:
print("Evaluating Stacking Ensemble...")                                                                                                                     
start = time.time()                                                                                                                                          
scores = cross_val_score(stack, X, y, cv=cv, scoring='r2', n_jobs=-1)                                                                                        
elapsed = time.time() - start                                                                                                                                
                                                                                                                                                            
print(f"\nCompleted in {elapsed:.1f}s")                                                                                                                      
print(f"Stacking CV R²: {scores.mean():.4f} ± {scores.std():.4f}")    

Evaluating Stacking Ensemble...

Completed in 39.8s
Stacking CV R²: 0.4760 ± 0.0176


In [15]:
from sklearn.ensemble import RandomForestRegressor                                                                                                           
                                                                                                                                                            
rf_tuned = Pipeline([                                                                                                                                        
    ('prep', preprocessor),                                                                                                                                  
    ('model', RandomForestRegressor(                                                                                                                         
        n_estimators=600, max_depth=None, min_samples_leaf=2,                                                                                                
        random_state=RANDOM, n_jobs=-1                                                                                                                       
    ))                                                                                                                                                       
])                                                                                                                                                           
                                                                                                                                                            
stack_with_rf = StackingRegressor(                                                                                                                           
    estimators=[                                                                                                                                             
        ('gb', gb_best),                                                                                                                                     
        ('xgb', xgb_best),                                                                                                                                   
        ('lgb', lgb_best),                                                                                                                                   
        ('rf', rf_tuned)                                                                                                                                     
    ],                                                                                                                                                       
    final_estimator=Ridge(alpha=1.0),                                                                                                                        
    cv=5,                                                                                                                                                    
    n_jobs=-1                                                                                                                                                
)                                                                                                                                                            
                                                                                                                                                            
print("Evaluating Stacking with RF...")                                                                                                                      
start = time.time()                                                                                                                                          
scores = cross_val_score(stack_with_rf, X, y, cv=cv, scoring='r2', n_jobs=-1)                                                                                
elapsed = time.time() - start                                                                                                                                
                                                                                                                                                            
print(f"\nCompleted in {elapsed:.1f}s")                                                                                                                      
print(f"Stacking+RF CV R²: {scores.mean():.4f} ± {scores.std():.4f}")     

Evaluating Stacking with RF...

Completed in 276.6s
Stacking+RF CV R²: 0.4758 ± 0.0176


In [20]:
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

In [21]:
nn_preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ]
)

In [22]:
X_nn = nn_preprocessor.fit_transform(X)
y_nn = y.values

print(f"Input dimensions: {X_nn.shape[1]} features")
print(f"Output dimensions: {y_nn.shape[0]} samples")

Input dimensions: 40 features
Output dimensions: 10000 samples


In [23]:
mlp = MLPRegressor(
    hidden_layer_sizes=(128, 64, 32),
    activation='relu',
    solver='adam',
    alpha=0.001,
    batch_size=64,
    learning_rate='adaptive',
    learning_rate_init=0.001,
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20,
    random_state=RANDOM
)

In [24]:
print("Evaluating MLP...")                                                                                                                                   
start = time.time()                                                                                                                                          
scores = cross_val_score(mlp, X_nn, y_nn, cv=cv, scoring='r2', n_jobs=-1)                                                                                    
elapsed = time.time() - start                                                                                                                                
                                                                                                                                                            
print(f"\nCompleted in {elapsed:.1f}s")                                                                                                                      
print(f"MLP CV R²: {scores.mean():.4f} ± {scores.std():.4f}")            

Evaluating MLP...

Completed in 3.8s
MLP CV R²: 0.3904 ± 0.0127


In [10]:
latent_uniform = ['a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4', 'b5']                                                                                
latent_gaussian = ['a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7', 'b8', 'b9', 'b10']                                                                             
                                                                                                                                                            
# Create engineered features                                                                                                                                 
X_eng = X.copy()                                                                                                                                             
                                                                                                                                                            
# 1. Interactions within latent groups (a*b pairs)                                                                                                           
for i in range(1, 11):                                                                                                                                       
    X_eng[f'ab_{i}'] = X[f'a{i}'] * X[f'b{i}']                                                                                                               
                                                                                                                                                            
# 2. Sum/mean aggregations                                                                                                                                   
X_eng['a_sum'] = X[[f'a{i}' for i in range(1,11)]].sum(axis=1)                                                                                               
X_eng['b_sum'] = X[[f'b{i}' for i in range(1,11)]].sum(axis=1)                                                                                               
X_eng['ab_diff'] = X_eng['a_sum'] - X_eng['b_sum']                                                                                                           
                                                                                                                                                            
# 3. Gaussian latent squared terms (capture non-linearity)                                                                                                   
for col in latent_gaussian:                                                                                                                                  
    X_eng[f'{col}_sq'] = X[col] ** 2                                                                                                                         
                                                                                                                                                            
print(f"Original features: {X.shape[1]}")                                                                                                                    
print(f"Engineered features: {X_eng.shape[1]}")                                                                                                              
                                                                                                                                                            
# Update column lists                                                                                                                                        
cat_cols_eng = categorical_cols                                                                                                                                      
num_cols_eng = [c for c in X_eng.columns if c not in categorical_cols]                                                                                               
                                                                                                                                                            
# New preprocessor                                                                                                                                           
preprocessor_eng = ColumnTransformer([                                                                                                                       
    ('cat', OneHotEncoder(drop='first', sparse_output=False), cat_cols_eng),                                                                                 
    ('num', 'passthrough', num_cols_eng)                                                                                                                     
])                                                                                                                                                           
                                                                                                                                                            
# Test with LightGBM (fastest)                                                                                                                               
pipe_lgb_eng = Pipeline([                                                                                                                                    
    ('prep', preprocessor_eng),                                                                                                                              
    ('model', LGBMRegressor(**search_lgb.best_params_, random_state=RANDOM, n_jobs=-1, verbose=-1))                                                          
])                                                                                                                                                           
                                                                                                                                                            
# Remove 'model__' prefix from params                                                                                                                        
lgb_params = {k.replace('model__', ''): v for k, v in search_lgb.best_params_.items()}                                                                       
pipe_lgb_eng = Pipeline([                                                                                                                                    
    ('prep', preprocessor_eng),                                                                                                                              
    ('model', LGBMRegressor(**lgb_params, random_state=RANDOM, n_jobs=-1, verbose=-1))                                                                       
])                                                                                                                                                           
                                                                                                                                                            
print("\nEvaluating LightGBM with engineered features...")                                                                                                   
start = time.time()                                                                                                                                          
scores = cross_val_score(pipe_lgb_eng, X_eng, y, cv=cv, scoring='r2', n_jobs=-1)                                                                             
elapsed = time.time() - start                                                                                                                                
                                                                                                                                                            
print(f"Completed in {elapsed:.1f}s")                                                                                                                        
print(f"LightGBM + Engineered CV R²: {scores.mean():.4f} ± {scores.std():.4f}")                                                                              
print(f"vs baseline LightGBM: 0.4753")   

Original features: 26
Engineered features: 49


NameError: name 'search_lgb' is not defined

In [14]:
 # Update all pipelines with engineered features                                                                                                              
gb_eng = Pipeline([                                                                                                                                          
    ('prep', preprocessor_eng),                                                                                                                              
    ('model', GradientBoostingRegressor(                                                                                                                     
        learning_rate=0.05, max_depth=2, n_estimators=300,                                                                                                   
        min_samples_leaf=5, subsample=0.85, random_state=RANDOM                                                                                              
    ))                                                                                                                                                       
])                                                                                                                                                           
                                                                                                                                                            
xgb_eng = Pipeline([                                                                                                                                         
    ('prep', preprocessor_eng),                                                                                                                              
    ('model', XGBRegressor(                                                                                                                                  
        learning_rate=0.03, max_depth=2, n_estimators=400,                                                                                                   
        min_child_weight=1, subsample=0.7, colsample_bytree=0.7,                                                                                             
        reg_alpha=1, reg_lambda=5, random_state=RANDOM, n_jobs=-1                                                                                            
    ))                                                                                                                                                       
])                                                                                                                                                           
                                                                                                                                                            
lgb_eng = Pipeline([                                                                                                                                         
    ('prep', preprocessor_eng),                                                                                                                              
    ('model', LGBMRegressor(**lgb_params, random_state=RANDOM, n_jobs=-1, verbose=-1))                                                                       
])                                                                                                                                                           
                                                                                                                                                            
# Stacking with engineered features                                                                                                                          
stack_eng = StackingRegressor(                                                                                                                               
    estimators=[                                                                                                                                             
        ('gb', gb_eng),                                                                                                                                      
        ('xgb', xgb_eng),                                                                                                                                    
        ('lgb', lgb_eng)                                                                                                                                     
    ],                                                                                                                                                       
    final_estimator=Ridge(alpha=1.0),                                                                                                                        
    cv=5,                                                                                                                                                    
    n_jobs=-1                                                                                                                                                
)                                                                                                                                                            
                                                                                                                                                            
print("Evaluating Stacking with engineered features...")                                                                                                     
start = time.time()                                                                                                                                          
scores = cross_val_score(stack_eng, X_eng, y, cv=cv, scoring='r2', n_jobs=-1)                                                                                
elapsed = time.time() - start                                                                                                                                
                                                                                                                                                            
print(f"\nCompleted in {elapsed:.1f}s")                                                                                                                      
print(f"Stacking + Engineered CV R²: {scores.mean():.4f} ± {scores.std():.4f}")                                                                              
print(f"vs baseline Stacking: 0.4760")                                                                                                                       
print(f"vs LightGBM + Engineered: 0.4788")

NameError: name 'lgb_params' is not defined

In [28]:
# Add a few more features                                                                                                                                    
X_eng2 = X_eng.copy()                                                                                                                                        
                                                                                                                                                            
# Cross-group: uniform * gaussian interactions                                                                                                               
for i in range(1, 6):                                                                                                                                        
    X_eng2[f'a{i}_x_a{i+5}'] = X[f'a{i}'] * X[f'a{i+5}']                                                                                                     
    X_eng2[f'b{i}_x_b{i+5}'] = X[f'b{i}'] * X[f'b{i+5}']                                                                                                     
                                                                                                                                                            
# Absolute values of gaussian (symmetric signal)                                                                                                             
for col in latent_gaussian:                                                                                                                                  
    X_eng2[f'{col}_abs'] = X[col].abs()                                                                                                                      
                                                                                                                                                            
print(f"Features: {X_eng2.shape[1]}")                                                                                                                        
                                                                                                                                                            
# Quick test with LightGBM                                                                                                                                   
num_cols_eng2 = [c for c in X_eng2.columns if c not in categorical_cols]                                                                                             
preprocessor_eng2 = ColumnTransformer([                                                                                                                      
    ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols),                                                                                     
    ('num', 'passthrough', num_cols_eng2)                                                                                                                    
])                                                                                                                                                           
                                                                                                                                                            
pipe_test = Pipeline([                                                                                                                                       
    ('prep', preprocessor_eng2),                                                                                                                             
    ('model', LGBMRegressor(**lgb_params, random_state=RANDOM, n_jobs=-1, verbose=-1))                                                                       
])                                                                                                                                                           
                                                                                                                                                            
print("Quick test with extended features...")                                                                                                                
scores = cross_val_score(pipe_test, X_eng2, y, cv=cv, scoring='r2', n_jobs=-1)                                                                               
print(f"LightGBM + Extended: {scores.mean():.4f} ± {scores.std():.4f}")              

Features: 69
Quick test with extended features...


LightGBM + Extended: 0.4784 ± 0.0171


# ============================================================
# Phase 7: Advanced Experiments
# ============================================================
# 1. CatBoost
# 2. Feature Selection (Permutation Importance)
# 3. Target Encoding
# 4. Weighted Averaging vs Stacking
# 5. Learning Curve & Residual Analysis
# ============================================================

In [None]:
# Experiment 1: CatBoost - SKIPPED
# CatBoost fails to build on Clang 20 / Python 3.14 (Conan compiler version not supported)
# Moving on to remaining experiments

In [13]:
# --- Experiment 2: Feature Selection (Permutation Importance) ---
from sklearn.inspection import permutation_importance

# Train LightGBM on full data to get importance
pipe_lgb_eng.fit(X_eng, y)

# Permutation importance (slower but more reliable than built-in)
print("Computing permutation importance...")
start = time.time()
perm_result = permutation_importance(
    pipe_lgb_eng, X_eng, y, 
    n_repeats=10, random_state=RANDOM, scoring='r2', n_jobs=-1
)
elapsed = time.time() - start
print(f"Completed in {elapsed:.1f}s")

# Get feature names after preprocessing
cat_feature_names = pipe_lgb_eng.named_steps['prep'].transformers_[0][1].get_feature_names_out(categorical_cols).tolist()
all_feature_names = cat_feature_names + num_cols_eng

# Sort by importance
importance_df = pd.DataFrame({
    'feature': all_feature_names,
    'importance_mean': perm_result.importances_mean,
    'importance_std': perm_result.importances_std
}).sort_values('importance_mean', ascending=False)

print("\nTop 20 features:")
print(importance_df.head(20).to_string(index=False))
print(f"\nFeatures with negative importance (noise):")
noise_features = importance_df[importance_df['importance_mean'] <= 0]
print(f"  {len(noise_features)} features contribute nothing or hurt performance")
print(noise_features['feature'].tolist())

NameError: name 'pipe_lgb_eng' is not defined

In [None]:
# --- Experiment 3: Target Encoding ---
from sklearn.preprocessing import TargetEncoder

preprocessor_te = ColumnTransformer([
    ('cat', TargetEncoder(random_state=RANDOM), categorical_cols),
    ('num', 'passthrough', num_cols_eng)
])

pipe_lgb_te = Pipeline([
    ('prep', preprocessor_te),
    ('model', LGBMRegressor(**lgb_params, random_state=RANDOM, n_jobs=-1, verbose=-1))
])

print("Evaluating LightGBM with Target Encoding...")
start = time.time()
scores_te = cross_val_score(pipe_lgb_te, X_eng, y, cv=cv, scoring='r2', n_jobs=-1)
elapsed = time.time() - start

print(f"Completed in {elapsed:.1f}s")
print(f"LightGBM + Target Encoding CV R²: {scores_te.mean():.4f} ± {scores_te.std():.4f}")
print(f"vs LightGBM + OneHot:             0.4788")

In [None]:
# --- Experiment 4: Weighted Averaging ---
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score

# Get out-of-fold predictions from each model
print("Getting out-of-fold predictions...")
start = time.time()

pred_gb = cross_val_predict(gb_eng, X_eng, y, cv=cv, n_jobs=-1)
pred_xgb = cross_val_predict(xgb_eng, X_eng, y, cv=cv, n_jobs=-1)
pred_lgb = cross_val_predict(lgb_eng, X_eng, y, cv=cv, n_jobs=-1)

elapsed = time.time() - start
print(f"Completed in {elapsed:.1f}s")

# Test different weight combinations
best_r2 = 0
best_weights = None

for w1 in np.arange(0.1, 0.8, 0.05):
    for w2 in np.arange(0.1, 0.8, 0.05):
        w3 = 1 - w1 - w2
        if w3 < 0.05:
            continue
        blend = w1 * pred_gb + w2 * pred_xgb + w3 * pred_lgb
        r2 = r2_score(y, blend)
        if r2 > best_r2:
            best_r2 = r2
            best_weights = (w1, w2, w3)

print(f"\nBest weighted average R²: {best_r2:.4f}")
print(f"Weights: GB={best_weights[0]:.2f}, XGB={best_weights[1]:.2f}, LGB={best_weights[2]:.2f}")
print(f"vs Stacking (Ridge meta): 0.4807")

In [None]:
# --- Experiment 5: Learning Curve & Residual Analysis ---
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

print("Computing learning curve (this may take a few minutes)...")
start = time.time()

train_sizes, train_scores, val_scores = learning_curve(
    lgb_eng, X_eng, y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=cv, scoring='r2', n_jobs=-1
)

elapsed = time.time() - start
print(f"Completed in {elapsed:.1f}s")

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Learning Curve
ax = axes[0]
ax.plot(train_sizes, train_scores.mean(axis=1), 'o-', label='Train R²')
ax.plot(train_sizes, val_scores.mean(axis=1), 'o-', label='Validation R²')
ax.fill_between(train_sizes,
                val_scores.mean(axis=1) - val_scores.std(axis=1),
                val_scores.mean(axis=1) + val_scores.std(axis=1), alpha=0.2)
ax.set_xlabel('Training Set Size')
ax.set_ylabel('R² Score')
ax.set_title('Learning Curve (LightGBM)')
ax.legend()
ax.grid(True, alpha=0.3)

# Plot 2: Residual Analysis
pred_oof = cross_val_predict(lgb_eng, X_eng, y, cv=cv, n_jobs=-1)
residuals = y - pred_oof

ax = axes[1]
ax.scatter(pred_oof, residuals, alpha=0.15, s=5)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel('Predicted Value')
ax.set_ylabel('Residual')
ax.set_title(f'Residuals (OOF R² = {r2_score(y, pred_oof):.4f})')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Diagnose: is the gap closing?
print(f"\nTrain R² at full data:      {train_scores.mean(axis=1)[-1]:.4f}")
print(f"Validation R² at full data: {val_scores.mean(axis=1)[-1]:.4f}")
gap = train_scores.mean(axis=1)[-1] - val_scores.mean(axis=1)[-1]
print(f"Gap: {gap:.4f}")
if gap > 0.15:
    print("-> Large gap: model is overfitting. More regularization or data could help.")
elif val_scores.mean(axis=1)[-1] - val_scores.mean(axis=1)[-3] < 0.005:
    print("-> Validation curve is flat: approaching irreducible noise ceiling.")
else:
    print("-> Validation still improving: more data would likely help.")