# S03E08 - Gemstone Pricing

In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

import optuna
from optuna.samplers import TPESampler
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [2]:
train_df = pd.read_csv("./data/train.csv").assign(sample = 'train', original=0)

test_df = pd.read_csv("./data/test.csv").assign(sample = 'test', original=0)

original_df = pd.read_csv("./data/cubic_zirconia.csv").assign(sample = 'train', original = 1).drop(columns='Unnamed: 0')

df = pd.concat([train_df, test_df, original_df], sort=False).sample(frac=1, random_state=13)

In [3]:
def clarity_scale(df):
    df["clarity"] = df["clarity"].apply(
        lambda x: 0 if x == "IF" else 1 if x == "VVS1" else 2 if x == "VVS2" else 3 if x == "VS1" else 4 if x == "VS2" else 5 if x == "SI1" else 6 if x == "SI2" else 7).astype(np.int8)
    return df



In [4]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories=[['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']])
df[['cut']] = enc.fit_transform(df[['cut']]).astype('int8')
enc = OrdinalEncoder(categories=[['J', 'I','H','G','F','E','D']])
df[['color']] = enc.fit_transform(df[['color']]).astype('int8')

In [5]:
# define ordinal encoding
#encoder = OneHotEncoder(sparse_output=False).fit(df[['color']])
#encoded = encoder.transform(df[['color']])
# transform data
#encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())

#df = pd.concat([df[['id', 'sample','carat','clarity' ,'cut', 'color', 'depth', 'table', 'x', 'y', 'z','original' ,'price']]\
#                .reset_index(drop=True), encoded_df], axis=1).sample(frac=1, random_state=13)
df = clarity_scale(df)

In [6]:
df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,sample,original
174091,174091.0,0.51,4,4,4,61.7,56.0,5.14,5.17,3.18,1591.0,train,0
5599,5599.0,0.30,4,2,1,61.3,55.0,4.34,4.33,2.66,684.0,train,0
187609,187609.0,0.30,4,3,5,60.8,57.0,4.37,4.40,2.67,432.0,train,0
185622,185622.0,0.40,2,3,3,61.0,60.0,4.75,4.79,2.91,1026.0,train,0
73490,73490.0,0.53,4,4,3,62.0,55.0,5.20,5.23,3.23,1778.0,train,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25295,218868.0,0.40,4,3,0,61.5,55.0,4.74,4.78,2.93,,test,0
164706,164706.0,0.73,2,6,6,63.6,56.0,5.65,5.70,3.62,2155.0,train,0
63169,256742.0,2.04,2,0,5,59.6,61.0,8.29,8.33,4.93,,test,0
59883,253456.0,1.53,2,0,3,61.0,60.0,7.42,7.45,4.54,,test,0


In [7]:
original_features = list(df.drop(columns=['id','price', 'sample']).columns)

In [8]:
features = original_features
#features = ['carat']
target = 'price'

train = df.query("sample == 'train'").drop(columns=['sample']).reset_index(drop=True).copy()
X_train = train[features].copy()
#X_train[cat_features] = X_train[cat_features].astype('category')
y_train = train['price']

test_df = df.query("sample == 'test'").drop(columns=['sample']).copy()
test_df = test_df[features + ['id']]
test = test_df.reset_index(drop=True)

In [11]:
mod = lgb.LGBMRegressor().fit(X_train, y_train)

In [15]:
def objective(trial):
    """Define the objective function"""

    params = {
        #'gpu_id':0,
        #'tree_method':'auto',
        'tree_method': trial.suggest_categorical('tree_method', ['exact','approx', 'hist']),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 1000, 10000),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 0.1),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1),
        'min_child_weight': trial.suggest_int('min_child_weight', 0, 5),
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 100, 700),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 1),
        #'enable_categorical': True,
        'eval_metric': 'rmse',
        'objective':'reg:squarederror',
        'seed':13,
    }


    cv = KFold(10, shuffle=True, random_state=13)
    fold_scores = []
    for i, (train_idx,val_idx) in enumerate(cv.split(train[features],train[target])):
        X_train, y_train = train.loc[train_idx, features],train.loc[train_idx, target]
        X_val, y_val = train.loc[val_idx, features],train.loc[val_idx, target]
        
        model = xgb.XGBRegressor(**params)
        model.fit(X_train,
                 y_train,
                 eval_set= [(X_val,y_val)],
                 verbose=0)
        
        pred_val = model.predict(X_val)
        #pred_test = model.predict(test[features])

        score = np.sqrt(mean_squared_error(y_val, pred_val))
        fold_scores.append(score)

    return np.mean(fold_scores)

In [22]:
def objective(trial):
    params = {
        #         "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 10000),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000),
        "max_depth": trial.suggest_int("max_depth", 3, 14),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000),
        "max_bin": trial.suggest_int("max_bin", 100, 1000),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.001, 1),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 15),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95),
        'early_stopping_round': trial.suggest_int('early_stopping_round', 100, 700),

        "metric":"rmse",
        "categorical_features": "1,2,3,9",
        "verbose": -1
    }
    
    cv = KFold(10, shuffle=True, random_state=13)
    fold_scores = []
    for i, (train_idx,val_idx) in enumerate(cv.split(train[features],train[target])):
        X_train, y_train = train.loc[train_idx, features],train.loc[train_idx, target]
        X_val, y_val = train.loc[val_idx, features],train.loc[val_idx, target]
        
        
        model = lgb.LGBMRegressor(**params)
        model.fit(X_train,
                 y_train,
                eval_set= [(X_val,y_val)],
                 verbose=-1)

        
        
        pred_val = model.predict(X_val)
        #pred_test = model.predict(test[features])

        score = np.sqrt(mean_squared_error(y_val, pred_val))
        fold_scores.append(score)

    return np.mean(fold_scores)


In [23]:
study = optuna.create_study(direction='minimize', sampler = TPESampler())
study.optimize(objective, n_trials=100)

[32m[I 2023-02-25 09:55:58,239][0m A new study created in memory with name: no-name-b7f53182-21e9-4927-be5b-c24c960284b4[0m




































[32m[I 2023-02-25 09:57:46,602][0m Trial 0 finished with value: 1519.6537767686332 and parameters: {'n_estimators': 4067, 'learning_rate': 0.021242335149006124, 'num_leaves': 300, 'max_depth': 14, 'min_data_in_leaf': 9915, 'max_bin': 249, 'reg_alpha': 0.6573590605413907, 'min_split_gain': 0.4541195491032368, 'bagging_fraction': 0.2861929218201531, 'bagging_freq': 7, 'feature_fraction': 0.8572586662401951, 'early_stopping_round': 316}. Best is trial 0 with value: 1519.6537767686332.[0m








































[32m[I 2023-02-25 09:59:20,043][0m Trial 1 finished with value: 733.0532103272745 and parameters: {'n_estimators': 2046, 'learning_rate': 0.0578743698380125, 'num_leaves': 2178, 'max_depth': 9, 'min_data_in_leaf': 7892, 'max_bin': 246, 'reg_alpha': 0.4242336521102798, 'min_split_gain': 7.973618483072526, 'bagging_fraction': 0.7254757128197726, 'bagging_freq': 1, 'feature_fraction': 0.42407565275435233, 'early_stopping_round': 520}. Best is trial 1 with value: 733.0532103272745.[0m








































[32m[I 2023-02-25 10:04:47,482][0m Trial 2 finished with value: 741.0516999571879 and parameters: {'n_estimators': 5338, 'learning_rate': 0.07440436180209196, 'num_leaves': 1442, 'max_depth': 10, 'min_data_in_leaf': 8393, 'max_bin': 288, 'reg_alpha': 0.5560050381232897, 'min_split_gain': 7.187740694781539, 'bagging_fraction': 0.6795258865938931, 'bagging_freq': 3, 'feature_fraction': 0.7183483448227523, 'early_stopping_round': 630}. Best is trial 1 with value: 733.0532103272745.[0m








































[32m[I 2023-02-25 10:05:25,713][0m Trial 3 finished with value: 588.6053038729062 and parameters: {'n_estimators': 1356, 'learning_rate': 0.08077394930490576, 'num_leaves': 2446, 'max_depth': 3, 'min_data_in_leaf': 837, 'max_bin': 248, 'reg_alpha': 0.7216149073036372, 'min_split_gain': 12.001493906153257, 'bagging_fraction': 0.5083805251334126, 'bagging_freq': 1, 'feature_fraction': 0.6995619776980326, 'early_stopping_round': 668}. Best is trial 3 with value: 588.6053038729062.[0m








































[32m[I 2023-02-25 10:08:18,740][0m Trial 4 finished with value: 951.035518905104 and parameters: {'n_estimators': 3824, 'learning_rate': 0.004334544779786953, 'num_leaves': 1102, 'max_depth': 11, 'min_data_in_leaf': 5492, 'max_bin': 290, 'reg_alpha': 0.7142063285656756, 'min_split_gain': 1.604053809728263, 'bagging_fraction': 0.35844522456261907, 'bagging_freq': 2, 'feature_fraction': 0.40595070985887466, 'early_stopping_round': 617}. Best is trial 3 with value: 588.6053038729062.[0m








































[32m[I 2023-02-25 10:15:42,741][0m Trial 5 finished with value: 656.945724779516 and parameters: {'n_estimators': 9033, 'learning_rate': 0.06120873354242745, 'num_leaves': 699, 'max_depth': 7, 'min_data_in_leaf': 4311, 'max_bin': 298, 'reg_alpha': 0.9266897809918326, 'min_split_gain': 8.806914390539431, 'bagging_fraction': 0.4223818990349014, 'bagging_freq': 10, 'feature_fraction': 0.8402260230405403, 'early_stopping_round': 577}. Best is trial 3 with value: 588.6053038729062.[0m








































[32m[I 2023-02-25 10:19:13,215][0m Trial 6 finished with value: 581.7361859525911 and parameters: {'n_estimators': 6779, 'learning_rate': 0.06778331444168217, 'num_leaves': 411, 'max_depth': 3, 'min_data_in_leaf': 3107, 'max_bin': 288, 'reg_alpha': 0.6121914104889178, 'min_split_gain': 2.114873201672371, 'bagging_fraction': 0.8419889286595663, 'bagging_freq': 4, 'feature_fraction': 0.8883465230806191, 'early_stopping_round': 212}. Best is trial 6 with value: 581.7361859525911.[0m








































[32m[I 2023-02-25 10:20:39,195][0m Trial 7 finished with value: 1130.38941907184 and parameters: {'n_estimators': 3040, 'learning_rate': 0.08322914834874287, 'num_leaves': 1347, 'max_depth': 4, 'min_data_in_leaf': 5145, 'max_bin': 213, 'reg_alpha': 0.8547941967391893, 'min_split_gain': 12.043845839405732, 'bagging_fraction': 0.2093010572830989, 'bagging_freq': 5, 'feature_fraction': 0.8850417499144854, 'early_stopping_round': 671}. Best is trial 6 with value: 581.7361859525911.[0m








































[32m[I 2023-02-25 10:21:27,350][0m Trial 8 finished with value: 891.7583581563755 and parameters: {'n_estimators': 1149, 'learning_rate': 0.06930710452873376, 'num_leaves': 306, 'max_depth': 5, 'min_data_in_leaf': 8663, 'max_bin': 236, 'reg_alpha': 0.7682599373254679, 'min_split_gain': 7.928397600489099, 'bagging_fraction': 0.52333575372674, 'bagging_freq': 6, 'feature_fraction': 0.6613750354691474, 'early_stopping_round': 639}. Best is trial 6 with value: 581.7361859525911.[0m








































[32m[I 2023-02-25 10:28:56,604][0m Trial 9 finished with value: 586.6173055619855 and parameters: {'n_estimators': 9238, 'learning_rate': 0.024172327886351513, 'num_leaves': 597, 'max_depth': 5, 'min_data_in_leaf': 3329, 'max_bin': 262, 'reg_alpha': 0.8760496408001737, 'min_split_gain': 12.229807396736403, 'bagging_fraction': 0.47080705524854766, 'bagging_freq': 1, 'feature_fraction': 0.4889418650039508, 'early_stopping_round': 489}. Best is trial 6 with value: 581.7361859525911.[0m








































[32m[I 2023-02-25 10:31:16,338][0m Trial 10 finished with value: 578.232328249825 and parameters: {'n_estimators': 7256, 'learning_rate': 0.09300348529068023, 'num_leaves': 1996, 'max_depth': 7, 'min_data_in_leaf': 729, 'max_bin': 271, 'reg_alpha': 0.21869920796011805, 'min_split_gain': 3.4207346837337207, 'bagging_fraction': 0.9382641741502498, 'bagging_freq': 4, 'feature_fraction': 0.31243043185490643, 'early_stopping_round': 114}. Best is trial 10 with value: 578.232328249825.[0m








































[32m[I 2023-02-25 10:33:38,448][0m Trial 11 finished with value: 603.6904846668156 and parameters: {'n_estimators': 7129, 'learning_rate': 0.09837220610060327, 'num_leaves': 2038, 'max_depth': 7, 'min_data_in_leaf': 367, 'max_bin': 269, 'reg_alpha': 0.11127759425530499, 'min_split_gain': 3.2268602384671503, 'bagging_fraction': 0.9436522645467911, 'bagging_freq': 4, 'feature_fraction': 0.21937970492442657, 'early_stopping_round': 101}. Best is trial 10 with value: 578.232328249825.[0m








































[32m[I 2023-02-25 10:35:58,600][0m Trial 12 finished with value: 573.3133790386034 and parameters: {'n_estimators': 6903, 'learning_rate': 0.09650732668170756, 'num_leaves': 1776, 'max_depth': 7, 'min_data_in_leaf': 2233, 'max_bin': 276, 'reg_alpha': 0.2741804796742497, 'min_split_gain': 3.7806774884616345, 'bagging_fraction': 0.942987585482177, 'bagging_freq': 8, 'feature_fraction': 0.9435510494619632, 'early_stopping_round': 110}. Best is trial 12 with value: 573.3133790386034.[0m








































[32m[I 2023-02-25 10:38:05,269][0m Trial 13 finished with value: 572.6171098322757 and parameters: {'n_estimators': 7525, 'learning_rate': 0.0953122615768411, 'num_leaves': 2952, 'max_depth': 7, 'min_data_in_leaf': 1794, 'max_bin': 271, 'reg_alpha': 0.24980044879634017, 'min_split_gain': 3.972180950727102, 'bagging_fraction': 0.9290714513616877, 'bagging_freq': 8, 'feature_fraction': 0.5853020759080453, 'early_stopping_round': 100}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 10:44:23,275][0m Trial 14 finished with value: 573.4117362054556 and parameters: {'n_estimators': 8139, 'learning_rate': 0.09939986573638182, 'num_leaves': 2760, 'max_depth': 12, 'min_data_in_leaf': 3164, 'max_bin': 274, 'reg_alpha': 0.346983702359864, 'min_split_gain': 5.020662544210832, 'bagging_fraction': 0.8356575718213362, 'bagging_freq': 9, 'feature_fraction': 0.5862617939223239, 'early_stopping_round': 228}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 10:48:25,758][0m Trial 15 finished with value: 573.5736877938791 and parameters: {'n_estimators': 5763, 'learning_rate': 0.08607011657183543, 'num_leaves': 2998, 'max_depth': 8, 'min_data_in_leaf': 1847, 'max_bin': 228, 'reg_alpha': 0.003605592379036182, 'min_split_gain': 5.285335876364358, 'bagging_fraction': 0.6500345588171492, 'bagging_freq': 8, 'feature_fraction': 0.7640464713207042, 'early_stopping_round': 377}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 10:53:42,851][0m Trial 16 finished with value: 583.713069132239 and parameters: {'n_estimators': 5936, 'learning_rate': 0.08943055094199254, 'num_leaves': 1676, 'max_depth': 6, 'min_data_in_leaf': 6407, 'max_bin': 259, 'reg_alpha': 0.3421223324088589, 'min_split_gain': 0.33877615639092085, 'bagging_fraction': 0.8133474662505524, 'bagging_freq': 8, 'feature_fraction': 0.5939432605642904, 'early_stopping_round': 210}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 10:56:42,549][0m Trial 17 finished with value: 575.6783612644064 and parameters: {'n_estimators': 9879, 'learning_rate': 0.07844525711769978, 'num_leaves': 2490, 'max_depth': 9, 'min_data_in_leaf': 1815, 'max_bin': 277, 'reg_alpha': 0.2643390208071234, 'min_split_gain': 5.296758163279648, 'bagging_fraction': 0.6134804513360251, 'bagging_freq': 10, 'feature_fraction': 0.7923354057654581, 'early_stopping_round': 165}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 11:02:03,660][0m Trial 18 finished with value: 573.3031458074148 and parameters: {'n_estimators': 7947, 'learning_rate': 0.048815330777231915, 'num_leaves': 1823, 'max_depth': 6, 'min_data_in_leaf': 1931, 'max_bin': 260, 'reg_alpha': 0.49169696940011753, 'min_split_gain': 3.60772595030274, 'bagging_fraction': 0.7539800160159155, 'bagging_freq': 7, 'feature_fraction': 0.646962483144468, 'early_stopping_round': 319}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 11:08:30,046][0m Trial 19 finished with value: 577.4380177441111 and parameters: {'n_estimators': 7971, 'learning_rate': 0.047703484031790956, 'num_leaves': 894, 'max_depth': 5, 'min_data_in_leaf': 3989, 'max_bin': 205, 'reg_alpha': 0.4881867394450944, 'min_split_gain': 2.3023791881225883, 'bagging_fraction': 0.7504411898939711, 'bagging_freq': 6, 'feature_fraction': 0.6112005290475171, 'early_stopping_round': 289}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 11:16:10,308][0m Trial 20 finished with value: 682.0535905626909 and parameters: {'n_estimators': 8328, 'learning_rate': 0.045986845046650175, 'num_leaves': 2528, 'max_depth': 8, 'min_data_in_leaf': 6384, 'max_bin': 238, 'reg_alpha': 0.47722689909296995, 'min_split_gain': 14.761416401929147, 'bagging_fraction': 0.6051787147068133, 'bagging_freq': 7, 'feature_fraction': 0.5217077118598082, 'early_stopping_round': 418}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 11:18:38,127][0m Trial 21 finished with value: 573.1902623905937 and parameters: {'n_estimators': 6540, 'learning_rate': 0.09134111599102349, 'num_leaves': 1771, 'max_depth': 6, 'min_data_in_leaf': 1946, 'max_bin': 260, 'reg_alpha': 0.20390716354464097, 'min_split_gain': 3.931230271803741, 'bagging_fraction': 0.8874322517578149, 'bagging_freq': 8, 'feature_fraction': 0.917106960235646, 'early_stopping_round': 153}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 11:22:27,857][0m Trial 22 finished with value: 572.9846279801552 and parameters: {'n_estimators': 6251, 'learning_rate': 0.0867488488088383, 'num_leaves': 1113, 'max_depth': 6, 'min_data_in_leaf': 2285, 'max_bin': 258, 'reg_alpha': 0.15950512366556877, 'min_split_gain': 4.538787334376424, 'bagging_fraction': 0.8779257846396955, 'bagging_freq': 9, 'feature_fraction': 0.7347787141661671, 'early_stopping_round': 299}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 11:24:13,150][0m Trial 23 finished with value: 572.9012002955163 and parameters: {'n_estimators': 5034, 'learning_rate': 0.08913944283063778, 'num_leaves': 1188, 'max_depth': 6, 'min_data_in_leaf': 1208, 'max_bin': 260, 'reg_alpha': 0.15690884009372483, 'min_split_gain': 5.880886413311163, 'bagging_fraction': 0.868404252211136, 'bagging_freq': 9, 'feature_fraction': 0.7581821572013334, 'early_stopping_round': 171}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 11:25:52,972][0m Trial 24 finished with value: 572.9929003742127 and parameters: {'n_estimators': 5008, 'learning_rate': 0.08498486919870507, 'num_leaves': 1173, 'max_depth': 4, 'min_data_in_leaf': 954, 'max_bin': 255, 'reg_alpha': 0.144973472789672, 'min_split_gain': 5.9724806779631106, 'bagging_fraction': 0.8817714831611544, 'bagging_freq': 9, 'feature_fraction': 0.7309276546489464, 'early_stopping_round': 247}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 11:29:22,539][0m Trial 25 finished with value: 574.2394818044703 and parameters: {'n_estimators': 4861, 'learning_rate': 0.09067222216076254, 'num_leaves': 905, 'max_depth': 8, 'min_data_in_leaf': 2704, 'max_bin': 240, 'reg_alpha': 0.03037870086274705, 'min_split_gain': 6.358089187792789, 'bagging_fraction': 0.7903706318124365, 'bagging_freq': 9, 'feature_fraction': 0.7902933936194815, 'early_stopping_round': 175}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 11:31:46,150][0m Trial 26 finished with value: 574.1621639635255 and parameters: {'n_estimators': 6119, 'learning_rate': 0.07550019735978021, 'num_leaves': 47, 'max_depth': 4, 'min_data_in_leaf': 1329, 'max_bin': 227, 'reg_alpha': 0.10420588808446987, 'min_split_gain': 4.763298155147471, 'bagging_fraction': 0.8828386187482571, 'bagging_freq': 10, 'feature_fraction': 0.6719762571427856, 'early_stopping_round': 264}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 11:35:43,892][0m Trial 27 finished with value: 575.6627788552441 and parameters: {'n_estimators': 4374, 'learning_rate': 0.09943265347127318, 'num_leaves': 1218, 'max_depth': 6, 'min_data_in_leaf': 4115, 'max_bin': 267, 'reg_alpha': 0.15597405833432054, 'min_split_gain': 6.754561296172416, 'bagging_fraction': 0.7951959759485308, 'bagging_freq': 9, 'feature_fraction': 0.7417569871053327, 'early_stopping_round': 370}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 11:39:50,118][0m Trial 28 finished with value: 574.7666985533897 and parameters: {'n_estimators': 3318, 'learning_rate': 0.0863435251495327, 'num_leaves': 1510, 'max_depth': 10, 'min_data_in_leaf': 2684, 'max_bin': 282, 'reg_alpha': 0.044821277648498095, 'min_split_gain': 6.058831249687407, 'bagging_fraction': 0.726064438046333, 'bagging_freq': 10, 'feature_fraction': 0.8058267789785063, 'early_stopping_round': 157}. Best is trial 13 with value: 572.6171098322757.[0m








































[32m[I 2023-02-25 11:40:49,513][0m Trial 29 finished with value: 572.2117861528021 and parameters: {'n_estimators': 4442, 'learning_rate': 0.0790912559858966, 'num_leaves': 926, 'max_depth': 5, 'min_data_in_leaf': 203, 'max_bin': 252, 'reg_alpha': 0.07956820483699967, 'min_split_gain': 1.0150491017289305, 'bagging_fraction': 0.8628111544155899, 'bagging_freq': 7, 'feature_fraction': 0.8471235151406886, 'early_stopping_round': 326}. Best is trial 29 with value: 572.2117861528021.[0m








































[32m[I 2023-02-25 11:42:19,015][0m Trial 30 finished with value: 575.4545215389996 and parameters: {'n_estimators': 4379, 'learning_rate': 0.07946101373970657, 'num_leaves': 859, 'max_depth': 14, 'min_data_in_leaf': 241, 'max_bin': 251, 'reg_alpha': 0.08683887788155598, 'min_split_gain': 0.819226374616477, 'bagging_fraction': 0.8377949465382662, 'bagging_freq': 7, 'feature_fraction': 0.8935383424180832, 'early_stopping_round': 339}. Best is trial 29 with value: 572.2117861528021.[0m








































[32m[I 2023-02-25 11:44:41,653][0m Trial 31 finished with value: 572.1171252249242 and parameters: {'n_estimators': 5283, 'learning_rate': 0.09188065507765517, 'num_leaves': 1004, 'max_depth': 5, 'min_data_in_leaf': 1199, 'max_bin': 253, 'reg_alpha': 0.19302595922292556, 'min_split_gain': 1.6845693258669727, 'bagging_fraction': 0.8972350756328882, 'bagging_freq': 7, 'feature_fraction': 0.8376398177318876, 'early_stopping_round': 293}. Best is trial 31 with value: 572.1171252249242.[0m








































[32m[I 2023-02-25 11:46:53,383][0m Trial 32 finished with value: 572.2617801603951 and parameters: {'n_estimators': 2752, 'learning_rate': 0.09286592347282557, 'num_leaves': 561, 'max_depth': 5, 'min_data_in_leaf': 1244, 'max_bin': 246, 'reg_alpha': 0.07012447186983185, 'min_split_gain': 1.3340193957023498, 'bagging_fraction': 0.9470678780439172, 'bagging_freq': 7, 'feature_fraction': 0.8326630333332988, 'early_stopping_round': 436}. Best is trial 31 with value: 572.1171252249242.[0m








































[32m[I 2023-02-25 11:49:10,334][0m Trial 33 finished with value: 572.8742870442395 and parameters: {'n_estimators': 2953, 'learning_rate': 0.09445110141796917, 'num_leaves': 619, 'max_depth': 5, 'min_data_in_leaf': 1381, 'max_bin': 244, 'reg_alpha': 0.06036922839632984, 'min_split_gain': 0.20812945165472652, 'bagging_fraction': 0.9273937956683324, 'bagging_freq': 6, 'feature_fraction': 0.830231220801196, 'early_stopping_round': 451}. Best is trial 31 with value: 572.1171252249242.[0m








































[32m[I 2023-02-25 11:50:40,033][0m Trial 34 finished with value: 582.0479740812375 and parameters: {'n_estimators': 2410, 'learning_rate': 0.09988878007832344, 'num_leaves': 30, 'max_depth': 3, 'min_data_in_leaf': 231, 'max_bin': 229, 'reg_alpha': 0.07948909249966349, 'min_split_gain': 1.318129248109674, 'bagging_fraction': 0.8984798420601686, 'bagging_freq': 5, 'feature_fraction': 0.834916416489684, 'early_stopping_round': 446}. Best is trial 31 with value: 572.1171252249242.[0m








































[32m[I 2023-02-25 11:52:11,209][0m Trial 35 finished with value: 573.8517562496238 and parameters: {'n_estimators': 2068, 'learning_rate': 0.081845191996181, 'num_leaves': 464, 'max_depth': 4, 'min_data_in_leaf': 811, 'max_bin': 249, 'reg_alpha': 0.008179426804114542, 'min_split_gain': 2.6076249941569145, 'bagging_fraction': 0.9482181014220273, 'bagging_freq': 7, 'feature_fraction': 0.8506245086450666, 'early_stopping_round': 518}. Best is trial 31 with value: 572.1171252249242.[0m








































[32m[I 2023-02-25 11:55:16,362][0m Trial 36 finished with value: 741.8607901798579 and parameters: {'n_estimators': 3583, 'learning_rate': 0.09346966758813531, 'num_leaves': 807, 'max_depth': 5, 'min_data_in_leaf': 9533, 'max_bin': 221, 'reg_alpha': 0.20297567024663443, 'min_split_gain': 1.6637872990470028, 'bagging_fraction': 0.7844562843136038, 'bagging_freq': 7, 'feature_fraction': 0.6907834094987216, 'early_stopping_round': 360}. Best is trial 31 with value: 572.1171252249242.[0m








































[32m[I 2023-02-25 11:57:37,256][0m Trial 37 finished with value: 582.3003794187096 and parameters: {'n_estimators': 4097, 'learning_rate': 0.07576678259109432, 'num_leaves': 979, 'max_depth': 3, 'min_data_in_leaf': 1481, 'max_bin': 252, 'reg_alpha': 0.3591580974552242, 'min_split_gain': 0.960023578743425, 'bagging_fraction': 0.8429634802596624, 'bagging_freq': 6, 'feature_fraction': 0.9471379243778466, 'early_stopping_round': 416}. Best is trial 31 with value: 572.1171252249242.[0m








































[32m[I 2023-02-25 11:59:13,081][0m Trial 38 finished with value: 575.4471055264602 and parameters: {'n_estimators': 2427, 'learning_rate': 0.07105919059889265, 'num_leaves': 1351, 'max_depth': 4, 'min_data_in_leaf': 615, 'max_bin': 245, 'reg_alpha': 0.2702178810528131, 'min_split_gain': 2.5039305317636575, 'bagging_fraction': 0.6840897961634168, 'bagging_freq': 5, 'feature_fraction': 0.8686242687000224, 'early_stopping_round': 527}. Best is trial 31 with value: 572.1171252249242.[0m








































[32m[I 2023-02-25 12:04:28,071][0m Trial 39 finished with value: 572.8502254664066 and parameters: {'n_estimators': 5376, 'learning_rate': 0.08361828079669714, 'num_leaves': 2282, 'max_depth': 7, 'min_data_in_leaf': 3553, 'max_bin': 234, 'reg_alpha': 0.11025329834921721, 'min_split_gain': 0.18872555146037318, 'bagging_fraction': 0.9135082281567716, 'bagging_freq': 8, 'feature_fraction': 0.7091899671419488, 'early_stopping_round': 276}. Best is trial 31 with value: 572.1171252249242.[0m








































[32m[I 2023-02-25 12:05:58,759][0m Trial 40 finished with value: 576.381013684475 and parameters: {'n_estimators': 1711, 'learning_rate': 0.0796890107899485, 'num_leaves': 205, 'max_depth': 5, 'min_data_in_leaf': 2546, 'max_bin': 300, 'reg_alpha': 0.05833669408883502, 'min_split_gain': 1.5724661239872118, 'bagging_fraction': 0.9112421396264845, 'bagging_freq': 6, 'feature_fraction': 0.8065632224018365, 'early_stopping_round': 465}. Best is trial 31 with value: 572.1171252249242.[0m








































[32m[I 2023-02-25 12:11:00,141][0m Trial 41 finished with value: 573.0248499181934 and parameters: {'n_estimators': 5506, 'learning_rate': 0.09351127444224233, 'num_leaves': 2228, 'max_depth': 7, 'min_data_in_leaf': 3602, 'max_bin': 233, 'reg_alpha': 0.12294693048890615, 'min_split_gain': 1.554928529511706, 'bagging_fraction': 0.9093616742258245, 'bagging_freq': 8, 'feature_fraction': 0.7007905071101859, 'early_stopping_round': 271}. Best is trial 31 with value: 572.1171252249242.[0m
















[33m[W 2023-02-25 12:13:17,197][0m Trial 42 failed with parameters: {'n_estimators': 4732, 'learning_rate': 0.08296625452728301, 'num_leaves': 2820, 'max_depth': 9, 'min_data_in_leaf': 4647, 'max_bin': 242, 'reg_alpha': 0.09326612315731606, 'min_split_gain': 0.027968818581725813, 'bagging_fraction': 0.8448709276751496, 'bagging_freq': 8, 'feature_fraction': 0.7740902480391918, 'early_stopping_round': 327} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/Users/Jacob/.conda/envs/py38/lib/python3.8/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/43/mkchbv_x1qz9cxc3rr08t8c40000gp/T/ipykernel_17373/575197089.py", line 30, in objective
    model.fit(X_train,
  File "/Users/Jacob/.conda/envs/py38/lib/python3.8/site-packages/lightgbm/sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "/Users/Jacob/.conda/envs/

In [10]:
def run_results(param_input):
    test_md = test.copy()

    X = train[features]
    Y = train['price'] 

    test_data = test[features]

    XGB_cv_scores, XGB_imp = list(), list()
    preds = list()

    skf = KFold(10, random_state = 13, shuffle = True)

    for train_ix, test_ix in skf.split(X, Y):

        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]

        ## Building RF model
        XGB_md = xgb.XGBRegressor(**param_input).fit(X_train, Y_train,
                                               eval_set= [(X_test,Y_test)],
                             verbose=0)
        #XGB_md.feature_importances_


        ## Predicting on X_test and test
        XGB_pred_1 = XGB_md.predict(X_test)
        XGB_pred_2 = XGB_md.predict(test_data)

        ## Computing roc-auc score
        XGB_cv_scores.append(np.sqrt(mean_squared_error(Y_test, XGB_pred_1)))
        preds.append(XGB_pred_2)

    #XGB_cv_score = np.mean(XGB_cv_scores)    
    print('The average RMSE over 10-folds is:', np.mean(XGB_cv_scores))
    
    return pd.DataFrame(preds)

In [8]:
params = {'tree_method': 'exact', 'max_depth': 9, 'learning_rate': 0.0028019118673335355, 'n_estimators': 4008, 'reg_lambda': 0.03396770709801297, 'gamma': 0.8546917842299727, 'subsample': 0.33198783521275654, 'colsample_bytree': 0.9804536473910398, 'min_child_weight': 5, 'early_stopping_rounds': 201, 'reg_alpha': 0.09887091760877176}

In [12]:
out_df = run_results(params)

The average RMSE over 10-folds is: 566.5574511783434


In [15]:
print(params)

{'tree_method': 'exact', 'max_depth': 9, 'learning_rate': 0.0028019118673335355, 'n_estimators': 4008, 'reg_lambda': 0.03396770709801297, 'gamma': 0.8546917842299727, 'subsample': 0.33198783521275654, 'colsample_bytree': 0.9804536473910398, 'min_child_weight': 5, 'early_stopping_rounds': 201, 'reg_alpha': 0.09887091760877176}


In [13]:
submission_file = pd.DataFrame(np.transpose(np.vstack([test_df['id'].astype('int'), pd.DataFrame(out_df).mean(axis=0)])), columns=['id', 'price'])
submission_file['id'] = submission_file['id'].astype('int')
submission_file.to_csv("./submissions/8.csv", index=False)

| Feature Set | Tuned Hyperparameters | CV (5-fold) Score | LB Score |
|---|---|---|---|
| All features + OneHotEncoded | {'max_depth': 8, 'learning_rate': 0.011372635640766581, 'n_estimators': 4999, 'reg_lambda': 0.08113388104914292, 'gamma': 0.4703233640126939, 'subsample': 0.572912477610107, 'colsample_bytree': 0.8466912196410705, 'min_child_weight': 5, 'early_stopping_rounds': 378, 'reg_alpha': 0.013284008444197512} | 0.576 | 0.584 |
| All Features - (x,y,z)| {'tree_method': 'hist', 'max_depth': 7, 'learning_rate': 0.050996117076570605, 'n_estimators': 2526, 'reg_lambda': 0.07610652509281068, 'gamma': 0.1414009438093801, 'subsample': 0.8244491165581733, 'colsample_bytree': 0.9318723417490518, 'min_child_weight': 5, 'early_stopping_rounds': 476, 'reg_alpha': 0.1425023600600033} | 0.580 | 0.589 |
| All features + OneHotEncoded w/ Original label (w/ Original Data) | {'tree_method': 'exact', 'max_depth': 8, 'learning_rate': 0.005855954960496774, 'n_estimators': 7141, 'reg_lambda': 0.037816359453527223, 'gamma': 0.07151677988518425, 'subsample': 0.7557679812273866, 'colsample_bytree': 0.5926578940053994, 'min_child_weight': 4, 'early_stopping_rounds': 208, 'reg_alpha': 0.413828386261195} | 0.568 | 0.578 |
| All features + OneHotEncoded w/ Original label (w/ Original Data) - original features | {'tree_method': 'approx', 'max_depth': 7, 'learning_rate': 0.0037171191585069555, 'n_estimators': 3663, 'reg_lambda': 0.04860253888263786, 'gamma': 0.04054562087853457, 'subsample': 0.46613454139173527, 'colsample_bytree': 0.5611846478420512, 'min_child_weight': 1, 'early_stopping_rounds': 432, 'reg_alpha': 0.08333588167604829} | 0.571 | 0.578 |
| All features + OneHotEncoded w/ Original label (w/ Original Data) + single carat column | {'tree_method': 'exact', 'max_depth': 8, 'learning_rate': 0.006208783389130772, 'n_estimators': 3513, 'reg_lambda': 0.06577619549392931, 'gamma': 0.1310414716253491, 'subsample': 0.7250297630603487, 'colsample_bytree': 0.7813594794765958, 'min_child_weight': 5, 'early_stopping_rounds': 566, 'reg_alpha': 0.18205513684157554} | 0.566 | 0.577 |
| All features (+ original) + Ordinal Encoded Carat and Cut columns | {'tree_method': 'exact', 'max_depth': 9, 'learning_rate': 0.0028019118673335355, 'n_estimators': 4008, 'reg_lambda': 0.03396770709801297, 'gamma': 0.8546917842299727, 'subsample': 0.33198783521275654, 'colsample_bytree': 0.9804536473910398, 'min_child_weight': 5, 'early_stopping_rounds': 201, 'reg_alpha': 0.09887091760877176} | 0.566 | 0.576

In [14]:
train.corr()['price']

id         -0.001392
carat       0.940673
clarity     0.179931
cut        -0.086346
depth       0.001260
table       0.167787
x           0.899137
y           0.895102
z           0.887524
original   -0.002407
price       1.000000
color_D    -0.090452
color_E    -0.113868
color_F    -0.035322
color_G     0.018188
color_H     0.083528
color_I     0.118717
color_J     0.081868
Name: price, dtype: float64

In [191]:
XGB_md = xgb.XGBRegressor(**study.best_params).fit(X_train, Y_train, verbose=0)

NameError: name 'Y_train' is not defined