In [1]:
import numpy as np
import pandas as pd

In [4]:
# Load the pre-cleaned training table (path is relative to this notebook).
df = pd.read_csv(filepath_or_buffer="../../data/cleaned/cleaned.csv")

In [5]:
# Display the loaded training frame as the cell output.
df

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,1,0.550,0.430,0.150,0.7715,0.3285,0.1465,0.2400,11
1,1,0.630,0.490,0.145,1.1300,0.4580,0.2765,0.3200,11
2,2,0.160,0.110,0.025,0.0210,0.0055,0.0030,0.0050,6
3,0,0.595,0.475,0.150,0.9145,0.3755,0.2055,0.2500,10
4,2,0.555,0.425,0.130,0.7820,0.3695,0.1600,0.1975,9
...,...,...,...,...,...,...,...,...,...
79212,1,0.650,0.525,0.185,1.7070,0.6605,0.3545,0.4735,14
79213,0,0.335,0.235,0.075,0.1585,0.0685,0.0370,0.0450,6
79214,0,0.555,0.425,0.150,0.8790,0.3865,0.1815,0.2400,9
79215,2,0.435,0.330,0.095,0.3215,0.1510,0.0785,0.0815,6


In [12]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Separate the regression target ("Rings") from the predictor columns.
# NOTE(review): if cleaned.csv was written with its index, an "Unnamed: 0"
# column would end up in X as a feature — confirm against df.columns.
y = df["Rings"]
X = df.drop(columns=["Rings"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
def objective(trial):
    """Optuna objective: fit an XGBoost regressor and score it by validation RMSLE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters listed in ``params``.

    Returns
    -------
    float
        Root mean squared logarithmic error of the predictions on
        ``(X_valid, y_valid)``; lower is better.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``. Training itself minimises squared error and early-stops on
    RMSE; RMSLE is only the value reported back to Optuna.
    """
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'verbosity': 0,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
    }

    # early_stopping_rounds is an estimator parameter: passing it to fit() was
    # deprecated in XGBoost 1.6 and is no longer accepted by current releases.
    model = xgb.XGBRegressor(**params, early_stopping_rounds=10)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

    y_pred = model.predict(X_valid)

    # log1p is undefined for values <= -1; a squared-error model is not
    # constrained to be non-negative, so clamp before taking logs.
    y_pred = np.clip(y_pred, 0, None)

    squared_log_error = np.square(np.log1p(y_pred) - np.log1p(y_valid))
    rmsle = float(np.sqrt(np.mean(squared_log_error)))

    return rmsle



In [24]:
# Seed the sampler so the hyperparameter search is reproducible across
# Restart-and-Run-All; without it every run explores a different sequence.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

[I 2024-04-20 02:27:28,485] A new study created in memory with name: no-name-0e556f0d-ab1f-41e3-98bc-90eb7d531367
[I 2024-04-20 02:27:33,559] Trial 0 finished with value: 0.14960036717781036 and parameters: {'n_estimators': 601, 'learning_rate': 0.08948049804127299, 'max_depth': 4, 'subsample': 0.661204210463614, 'colsample_bytree': 0.6247605481898805, 'reg_alpha': 3.5659191714371915e-05, 'reg_lambda': 0.00012435550085182072}. Best is trial 0 with value: 0.14960036717781036.
[I 2024-04-20 02:27:39,589] Trial 1 finished with value: 0.14930476191743852 and parameters: {'n_estimators': 150, 'learning_rate': 0.02557966851715941, 'max_depth': 8, 'subsample': 0.8528654693317053, 'colsample_bytree': 0.6061225511910913, 'reg_alpha': 0.44384326979362604, 'reg_lambda': 0.09584903010619371}. Best is trial 1 with value: 0.14930476191743852.
[I 2024-04-20 02:27:44,474] Trial 2 finished with value: 0.15773241386864506 and parameters: {'n_estimators': 233, 'learning_rate': 0.012639945690414442, 'max_

In [25]:
best_params = study.best_params
# The objective returns RMSLE (log-space error), not RMSE. The variable name
# `best_rmse` is kept unchanged in case other cells refer to it.
best_rmse = study.best_value

print("Best RMSLE:", best_rmse)
print("Best hyperparameters:", best_params)

Best RMSE: 0.14793347862796888
Best hyperparameters: {'n_estimators': 958, 'learning_rate': 0.011115281944860396, 'max_depth': 8, 'subsample': 0.7432055195483133, 'colsample_bytree': 0.7679586306623761, 'reg_alpha': 4.02058965750995, 'reg_lambda': 4.6917706741543785}


# Testing

In [27]:
# Load the held-out test table (path is relative to this notebook).
test = pd.read_csv(filepath_or_buffer="../../data/test/test.csv")

In [29]:
# Keep the row identifiers aside; they are needed again for the submission file.
testId = test.loc[:, "id"]

In [30]:
# "id" is an identifier, not a feature, so remove it before predicting.
# Note: this rebinds `test`, so re-running the cell raises KeyError.
test = test.drop(columns=["id"])

In [32]:
# Integer codes for the categorical "Sex" column.
# NOTE(review): must match the encoding used to produce cleaned.csv — confirm.
sexMap = {"M": 0, "F": 1, "I": 2}

# Series.map returns NaN for every value missing from the mapping. That happens
# for unseen categories and also when this cell is re-run on already-encoded
# data, so fail loudly instead of silently feeding NaNs to the model.
sex_codes = test["Sex"].map(sexMap)
unmapped = test.loc[sex_codes.isna(), "Sex"].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Cannot encode 'Sex': unexpected values {unmapped.tolist()} "
        "(was this cell already run?)"
    )

test["Sex"] = sex_codes

In [34]:
# Refit on every labelled row (training + validation) with the tuned settings.
# NOTE(review): tuning scored early-stopped models, while this fit trains the
# full n_estimators with no early stopping — confirm that is intended.
best_model = xgb.XGBRegressor(
    **best_params,
)
best_model.fit(X, y)

In [42]:
# Predict ring counts for the encoded test features.
y_pred_test = best_model.predict(X=test)

In [37]:
# Optional: round predictions to whole ring counts (left disabled here).
#y_pred_test = np.round(y_pred_test).astype(int)

In [43]:
# Display the test-set predictions as the cell output.
y_pred_test

array([ 9.83746 ,  9.810269, 10.079804, ..., 12.546453, 13.254879,
        8.391922], dtype=float32)

In [44]:
# Pair each test id with its predicted ring count.
submision = pd.DataFrame(
    data={
        "id": testId,
        "Rings": y_pred_test,
    }
)

In [41]:
# Write the submission next to the notebook, without the DataFrame index.
submision.to_csv(path_or_buf="v1SubmisionNotRound.csv", index=False)