# S4E4 Hyperparameter Tuning

## Imports and Reading in Data

In [None]:
import numpy as np 
import pandas as pd 
import optuna
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor


# Number of Optuna trials run by each model's study below
N_TRIALS = 300

# Accumulates the per-model result text that is written to a file at the end
output = ""

In [None]:
# Load the feature-reduced training set, using the 'id' column as the index
data = pd.read_csv('data/feature_reduced_new_train.csv', index_col='id')

# Peek at the first rows
data.head()

# 'Rings' is the target; the remaining columns go through get_dummies
y = data['Rings']
X = pd.get_dummies(data.drop(columns=['Rings']))

# Hold out 20% of the rows for scoring, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define RMSLE as a scorer
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Negative predictions are clipped to zero before scoring, since the
    logarithmic error is not defined for negative values.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values. The input is never modified.

    Returns:
        Square root of ``mean_squared_log_error`` computed on ``y_true`` and
        the clipped predictions.
    """
    # np.clip returns a new array, so the caller's predictions stay intact
    # and plain Python sequences are accepted as well.
    clipped_pred = np.clip(np.asarray(y_pred), 0, None)
    return np.sqrt(mean_squared_log_error(y_true, clipped_pred))

## Optimize a LightGBM Regressor

In [None]:
def objective(trial):
    """Optuna objective: fit a LightGBM regressor and score it with RMSLE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSLE of the fitted model's predictions on ``X_test``.
    """
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # LightGBM only performs row subsampling when the bagging frequency
        # is non-zero; without this the sampled 'subsample' has no effect.
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # Create and train LightGBM model; the hold-out split is passed as
    # eval_set, but no early-stopping callback is configured here.
    model = lgb.LGBMRegressor(**param)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

    preds = model.predict(X_test)

    # Calculate and return RMSLE
    return rmsle(y_test, preds)

# Run a minimizing Optuna study over the LightGBM search space
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=N_TRIALS)

# Append the best score and its hyperparameters to the report text
output += (
    f"LGBM Model with {study.best_trial.value}\n"
    f"Best hyperparameters:{study.best_params}\n"
)

## Optimize XGBoost Regressor

In [None]:
def objective(trial):
    """Optuna objective: fit an XGBoost regressor and score it with RMSLE.

    Training stops early when the RMSE on the hold-out split has not
    improved for 100 rounds.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSLE of the fitted model's predictions on ``X_test``.
    """
    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples the same distribution.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

    # Train XGBoost model. early_stopping_rounds is given to the constructor
    # (XGBoost >= 1.6) because passing it to fit() is deprecated and no
    # longer accepted by recent releases.
    model = xgb.XGBRegressor(**param, early_stopping_rounds=100)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    preds = model.predict(X_test)

    # Calculate and return RMSLE
    return rmsle(y_test, preds)

# Run a minimizing Optuna study over the XGBoost search space
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=N_TRIALS)

# Append the best score and its hyperparameters to the report text
output += (
    f"XGBoosts Model with {study.best_trial.value}\n"
    f"Best hyperparameters:{study.best_params}\n"
)

## Optimize CatBoost Regressor

In [None]:
def objective(trial):
    """Optuna objective: fit a CatBoost regressor and score it with RMSLE.

    Training stops early when the metric on the hold-out split has not
    improved for 100 rounds.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSLE of the fitted model's predictions on ``X_test``.
    """
    params = {
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples the same distribution.
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),  # Excluded 'Poisson'
    }

    # The subsample rate is only sampled for the Bernoulli bootstrap type
    if params['bootstrap_type'] == 'Bernoulli':
        params['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    # Initialize and train CatBoost model
    model = cb.CatBoostRegressor(**params, verbose=0)
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100)

    preds = model.predict(X_test)

    # Calculate and return RMSLE
    return rmsle(y_test, preds)

# Run a minimizing Optuna study over the CatBoost search space
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=N_TRIALS)

# Append the best score and its hyperparameters to the report text
output += (
    f"CatBoost Model with {study.best_trial.value}\n"
    f"Best hyperparameters:{study.best_params}\n"
)

## Optimize Random Forest

In [None]:
def objective(trial):
    """Optuna objective: fit a random forest regressor and score it with RMSLE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSLE of the fitted forest's predictions on ``X_test``.
    """
    # Sample the search space (same names and bounds, in the same order)
    rf_params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 100),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 50),
    }

    # Fixed seed so that a given parameter set always yields the same forest
    forest = RandomForestRegressor(random_state=42, **rf_params)
    forest.fit(X_train, y_train)

    return rmsle(y_test, forest.predict(X_test))

# Run a minimizing Optuna study over the random forest search space
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=N_TRIALS)

# Append the best score and its hyperparameters to the report text
output += (
    f"RF Model with {study.best_trial.value}\n"
    f"Best hyperparameters:{study.best_params}\n"
)

## Optimize K-Nearest Neighbors Regressor

In [None]:
def objective(trial):
    """Optuna objective: fit a KNN regressor on scaled data and score it with RMSLE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSLE of the fitted model's predictions on the scaled ``X_test``.
    """
    # Sample the search space (same names and choices, in the same order)
    knn_params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 30),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'metric': trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'minkowski']),
    }

    # Standardize features using statistics learned from the training split only
    standardizer = StandardScaler().fit(X_train)
    train_features = standardizer.transform(X_train)
    test_features = standardizer.transform(X_test)

    # Fit the KNN regressor and score it on the hold-out split
    regressor = KNeighborsRegressor(**knn_params)
    regressor.fit(train_features, y_train)

    return rmsle(y_test, regressor.predict(test_features))

# Run a minimizing Optuna study over the KNN search space
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=N_TRIALS)

# Append the best score and its hyperparameters to the report text
output += (
    f"KNN Model with {study.best_trial.value}\n"
    f"Best hyperparameters:{study.best_params}\n"
)

## Save the best parameters

In [None]:
# Persist the accumulated tuning report to disk
with open("hyperparameters.txt", mode="w") as file:
    file.write(output)