In [7]:
import numpy as np
import pandas as pd 
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
import catboost as cb


train = pd.read_csv('data/reduced_new_train.csv', index_col='id')
X = train.drop(['Rings'], axis=1)
y = train['Rings']
X = pd.get_dummies(X)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)


def rmsle(y_true, y_pred):
    # Ensure predictions are non-negative since log of negative is undefined
    y_pred[y_pred < 0] = 0
    log_pred = np.log1p(y_pred)
    log_true = np.log1p(y_true)
    return np.sqrt(np.mean((log_pred - log_true) ** 2))


In [8]:
# LGBM Model with 0.14824868023245932
# Best hyperparameters:{'learning_rate': 0.031683333363955915, 'n_estimators': 957, 'max_depth': 16, 'num_leaves': 86, 'min_child_samples': 43, 'subsample': 0.9556403670916472, 'colsample_bytree': 0.6768299497756863}
# XGBoosts Model with 0.14895712403656916
# Best hyperparameters:{'learning_rate': 0.015135259972875157, 'n_estimators': 722, 'max_depth': 9, 'min_child_weight': 29, 'subsample': 0.822530521576753, 'colsample_bytree': 0.7400797719516602, 'reg_alpha': 3.1216939887945054e-06, 'reg_lambda': 0.010567667191794925}
# CatBoost Model with 0.1488679203959043
# Best hyperparameters:{'learning_rate': 0.10083192209799738, 'n_estimators': 894, 'max_depth': 7, 'l2_leaf_reg': 1.9327532000297312, 'random_strength': 0.0021895994884562485, 'bootstrap_type': 'MVS'}
# RF Model with 0.15002047134998422
# Best hyperparameters:{'n_estimators': 479, 'max_depth': 17, 'min_samples_split': 39, 'min_samples_leaf': 4}
# KNN Model with 0.15477076651737331
# Best hyperparameters:{'n_neighbors': 30, 'weights': 'uniform', 'metric': 'manhattan'}

lgb_model = lgb.LGBMRegressor(learning_rate=0.031683333363955915, n_estimators=957, max_depth=16, num_leaves=86, min_child_samples=43, subsample=0.9556403670916472, colsample_bytree=0.6768299497756863, verbose=-1)
xgb_model = xgb.XGBRegressor(learning_rate=0.015135259972875157, n_estimators=722, max_depth=9, min_child_weight=29, subsample=0.822530521576753, colsample_bytree=0.7400797719516602, reg_alpha=3.1216939887945054e-06, reg_lambda=0.010567667191794925, verbosity=0)
catboost_model = cb.CatBoostRegressor(learning_rate=0.10083192209799738, n_estimators=894, max_depth=7, l2_leaf_reg=1.9327532000297312, random_strength=0.0021895994884562485, bootstrap_type='MVS', verbose=0)
rf_model = RandomForestRegressor(n_estimators=479, max_depth=17, min_samples_split=39, min_samples_leaf=4)
knn_model = KNeighborsRegressor(n_neighbors=30, weights='uniform', metric='manhattan')

lgb_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
catboost_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train)

lgb_preds = lgb_model.predict(X_val)
xgb_preds = xgb_model.predict(X_val)
catboost_preds = catboost_model.predict(X_val)
rf_preds = rf_model.predict(X_val)
knn_preds = knn_model.predict(X_val)


def objective(trial):
    # Suggest weights for each model
    weights = {
        'lgb': trial.suggest_float('lgb_weight', 0, 1),
        'xgb': trial.suggest_float('xgb_weight', 0, 1),
        'catboost': trial.suggest_float('catboost_weight', 0, 1),
        'rf': trial.suggest_float('rf_weight', 0, 1),
        'knn': trial.suggest_float('knn_weight', 0, 1)
    }
    
    # Initialize and train models
    preds = {
        'lgb': lgb_preds,
        'xgb': xgb_preds,
        'catboost': catboost_preds,
        'rf': rf_preds,
        'knn': knn_preds
    }
    
    # Ensemble predictions
    weighted_preds = np.sum(np.array([weights[name] * preds[name] for name in preds.keys()]), axis=0) / np.sum(list(weights.values()))
    
    # Calculate and return RMSLE
    return rmsle(y_val, weighted_preds)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=1000)

[I 2024-04-05 06:23:39,964] A new study created in memory with name: no-name-005edd1c-6b76-4179-8bcb-cd6cdb665fa2
[I 2024-04-05 06:23:39,970] Trial 0 finished with value: 0.14846128608029882 and parameters: {'lgb_weight': 0.133597258136548, 'xgb_weight': 0.6000030798631807, 'catboost_weight': 0.8234112296201658, 'rf_weight': 0.9285017745548354, 'knn_weight': 0.8135408281726052}. Best is trial 0 with value: 0.14846128608029882.
[I 2024-04-05 06:23:39,977] Trial 1 finished with value: 0.148455134108002 and parameters: {'lgb_weight': 0.477653908921563, 'xgb_weight': 0.3080461068511754, 'catboost_weight': 0.39944374760995105, 'rf_weight': 0.664259360725117, 'knn_weight': 0.761451552454175}. Best is trial 1 with value: 0.148455134108002.
[I 2024-04-05 06:23:39,986] Trial 2 finished with value: 0.14863210008621985 and parameters: {'lgb_weight': 0.20116950224377272, 'xgb_weight': 0.32087120500785515, 'catboost_weight': 0.21936657794446823, 'rf_weight': 0.7391935186359717, 'knn_weight': 0.5160

In [9]:
# Print best weights and best RMSLE 
print('Best weights:', study.best_params)
print('Best RMSLE:', study.best_value)


Best weights: {'lgb_weight': 0.8245068405028535, 'xgb_weight': 0.9818928246383205, 'catboost_weight': 0.7361790799722313, 'rf_weight': 3.802790467692451e-05, 'knn_weight': 0.22267671277922838}
Best RMSLE: 0.14792064072916333


In [12]:
# train new model with best weights
weights = study.best_params

# rename weights to be 
names = ['lgb', 'xgb', 'catboost', 'rf', 'knn']
weights = {names[i]: weights[i] for i in len(names)}


lgb_model.fit(X, y)
xgb_model.fit(X, y)
catboost_model.fit(X, y)
rf_model.fit(X, y)
knn_model.fit(X, y)


test = pd.read_csv('data/test.csv', index_col='id')
X_test = pd.get_dummies(test)

lgb_preds = lgb_model.predict(X_test)
xgb_preds = xgb_model.predict(X_test)
catboost_preds = catboost_model.predict(X_test)
rf_preds = rf_model.predict(X_test)
knn_preds = knn_model.predict(X_test)

preds = {
        'lgb': lgb_preds,
        'xgb': xgb_preds,
        'catboost': catboost_preds,
        'rf': rf_preds,
        'knn': knn_preds
    }

weighted_preds = np.sum(np.array([weights[name] * preds[name] for name in preds.keys()]), axis=0) / np.sum(list(weights.values()))

# Save predictions to a CSV file
submission = pd.DataFrame({'id': test.index, 'Rings': weighted_preds})
submission.to_csv('submissions/ensemble3.csv', index=False)