In [1]:
%pip install "scikit-learn<1.6"

Collecting scikit-learn<1.6
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.0
    Uninstalling scikit-learn-1.6.0:
      Successfully uninstalled scikit-learn-1.6.0
Successfully installed scikit-learn-1.5.2


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('/content/sample_data/final_df.csv', index_col=0)

In [4]:
# Split train and test
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# split features and labels
X_train = train_set.drop("WAR", axis=1).to_numpy()
y_train = train_set["WAR"].copy().to_numpy()
X_test = test_set.drop("WAR", axis=1).to_numpy()
y_test = test_set["WAR"].copy().to_numpy()

## NonLinear SVM

In [5]:
from sklearn.metrics import mean_squared_error, make_scorer
scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV


svm_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ("SVR", SVR(kernel='poly')),
    ])

param_distribs = {'SVR__degree': np.arange(2, 11),
                  'SVR__C': np.linspace(0.01, 0.1, 10),
                  'SVR__epsilon': np.linspace(0.01, 0.1, 10),
                  }

svm_search = RandomizedSearchCV(
    svm_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


svm_search.fit(X_train, y_train)

In [7]:
pd.set_option('display.max_colwidth', 10)
svm_search.cv_results_ = pd.DataFrame(svm_search.cv_results_)
svm_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_SVR__epsilon,param_SVR__degree,param_SVR__C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
5,0.004388,0.000122,0.003783,0.001658,0.02,3,0.09,{'SVR_...,-0.428794,-0.945117,-0.431153,-0.601688,0.242843,1
1,0.003949,0.000129,0.003037,0.000122,0.08,3,0.1,{'SVR_...,-0.423333,-0.972412,-0.435273,-0.610339,0.256071,2
2,0.004494,0.000273,0.00314,0.00015,0.02,7,0.03,{'SVR_...,-0.561934,-0.92275,-0.564344,-0.683009,0.169525,3
13,0.004277,0.000188,0.003314,0.000484,0.01,5,0.02,{'SVR_...,-0.674866,-0.991156,-0.511874,-0.725965,0.198974,4
4,0.004615,0.000131,0.003313,0.001086,0.1,5,0.01,{'SVR_...,-0.699401,-0.998753,-0.553005,-0.750387,0.185513,5
7,0.006098,0.003272,0.003008,0.001538,0.01,4,0.02,{'SVR_...,-0.664008,-1.161217,-0.619326,-0.81485,0.245597,6
6,0.006071,0.001475,0.003228,0.000994,0.1,4,0.04,{'SVR_...,-0.710128,-1.143201,-0.597442,-0.816924,0.235255,7
15,0.005101,0.000466,0.004559,0.001989,0.01,7,0.07,{'SVR_...,-0.685391,-1.236361,-0.62139,-0.847714,0.276054,8
3,0.007535,0.003247,0.002952,0.000276,0.09,6,0.07,{'SVR_...,-0.993322,-1.167435,-0.544574,-0.901777,0.262392,9
10,0.004858,0.000664,0.004083,0.00154,0.1,6,0.02,{'SVR_...,-1.031885,-1.190145,-0.615375,-0.945802,0.242415,10


In [8]:
svm_search.best_params_

{'SVR__epsilon': 0.020000000000000004,
 'SVR__degree': 3,
 'SVR__C': 0.09000000000000001}

In [9]:
best_svm_model = svm_search.best_estimator_

In [29]:
from sklearn.metrics import root_mean_squared_error
y_pred = best_svm_model.predict(X_train)
rmse = root_mean_squared_error(np.exp(y_train) + 1.351, np.exp(y_pred) + 1.351)
rmse

0.9467002429355457

In [34]:
y_pred = best_svm_model.predict(X_test)
rmse = root_mean_squared_error(np.exp(y_test) + 1.351, np.exp(y_pred) + 1.351)
rmse

1.272263222208523

In [120]:
# Chang-Ki Hong (6.67) 2023
ChangKiHong = np.array([400, np.log(109), np.log(1), np.log(216), np.log(65), np.log(88), np.log(3), np.log(6), 0.332, 0.444, 0.412, 0.856])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_svm_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([-1.351])

In [26]:
# Austin (4.97) 2023
Auntin = np.array([520, np.log(87), np.log(23), np.log(269), np.log(95), np.log(53), np.log(2), np.log(7), 0.313, 0.376, 0.517, 0.893])
Auntin = Auntin.reshape(1, -1)
y_pred = best_svm_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([43.89935254])

In [35]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([407, np.log(62), np.log(6), np.log(168), np.log(49), np.log(45), np.log(1), np.log(1), 0.295, 0.366, 0.413, 0.779])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_svm_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([-1.35099968])

In [40]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([382, np.log(65), np.log(12), np.log(152), np.log(41), np.log(65), np.log(1), np.log(2), 0.254, 0.379, 0.398, 0.777])
Choo = Choo.reshape(1, -1)
y_pred = best_svm_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([-1.35007202])

In [42]:
# EunWonJung (0.74) 2023
EunWonJung = np.array([388, np.log(50), np.log(2), np.log(104), np.log(30), np.log(62), np.log(1), np.log(1), 0.222, 0.333, 0.268, 0.601])
EunWonJung = EunWonJung.reshape(1, -1)
y_pred = best_svm_model.predict(EunWonJung)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([-1.35083971])

## RandomForest

In [43]:
from sklearn.ensemble import RandomForestRegressor

randomforest_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ('RandomForest', RandomForestRegressor(random_state=42)),
    ])

param_distribs = {
    'RandomForest__n_estimators': np.arange(10, 100),
    'RandomForest__max_depth': np.arange(4, 20),
}

randomforest_search = RandomizedSearchCV(
    randomforest_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


randomforest_search.fit(X_train, y_train)

In [44]:
pd.set_option('display.max_colwidth', 10)
randomforest_search.cv_results_ = pd.DataFrame(randomforest_search.cv_results_)
randomforest_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_RandomForest__n_estimators,param_RandomForest__max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
4,0.298675,0.014299,0.012521,0.001922,98,10,{'Rand...,-0.33238,-0.975023,-0.324877,-0.544093,0.304729,1
18,0.129584,0.003888,0.006328,0.00087,46,7,{'Rand...,-0.342522,-0.973113,-0.321453,-0.545696,0.302352,2
19,0.232224,0.042163,0.008822,0.002055,90,7,{'Rand...,-0.341801,-0.975083,-0.320208,-0.545697,0.303749,3
10,0.181103,0.014827,0.007869,0.000392,59,10,{'Rand...,-0.346665,-0.968846,-0.328572,-0.548028,0.297655,4
0,0.245509,0.011721,0.00958,0.000326,88,5,{'Rand...,-0.350472,-0.977318,-0.319047,-0.548946,0.303176,5
16,0.229358,0.019674,0.012096,0.002226,75,11,{'Rand...,-0.353364,-0.971708,-0.323491,-0.549521,0.29878,6
1,0.221163,0.012331,0.00888,0.001045,75,10,{'Rand...,-0.352378,-0.972026,-0.32487,-0.549758,0.2988,7
17,0.142549,0.006923,0.008752,0.00212,46,12,{'Rand...,-0.346568,-0.972346,-0.331846,-0.550253,0.298525,8
6,0.15019,0.005443,0.007621,0.000684,50,19,{'Rand...,-0.349959,-0.971165,-0.330124,-0.550416,0.297624,9
13,0.158918,0.010389,0.011852,0.004371,50,14,{'Rand...,-0.350254,-0.971165,-0.330124,-0.550514,0.297558,10


In [45]:
randomforest_search.best_params_

{'RandomForest__n_estimators': 98, 'RandomForest__max_depth': 10}

In [46]:
best_randomforest_model = randomforest_search.best_estimator_

In [50]:
y_pred = best_randomforest_model.predict(X_train)
rmse = root_mean_squared_error(np.exp(y_train) + 1.351, np.exp(y_pred) + 1.351)
rmse

0.2767681295483611

In [51]:
y_pred = best_randomforest_model.predict(X_test)
rmse = root_mean_squared_error(np.exp(y_test) + 1.351, np.exp(y_pred) + 1.351)
rmse

0.8474352439713385

In [52]:
# Chang-Ki Hong (6.67) 2023
ChangKiHong = np.array([524, np.log(109), np.log(1), np.log(216), np.log(65), np.log(88), np.log(3), np.log(6), 0.332, 0.444, 0.412, 0.856])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_randomforest_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([5.79739126])

In [53]:
# Austin (4.97) 2023
Auntin = np.array([520, np.log(87), np.log(23), np.log(269), np.log(95), np.log(53), np.log(2), np.log(7), 0.313, 0.376, 0.517, 0.893])
Auntin = Auntin.reshape(1, -1)
y_pred = best_randomforest_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([4.63072955])

In [54]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([407, np.log(62), np.log(6), np.log(168), np.log(49), np.log(45), np.log(1), np.log(1), 0.295, 0.366, 0.413, 0.779])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_randomforest_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.27237073])

In [55]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([382, np.log(65), np.log(12), np.log(152), np.log(41), np.log(65), np.log(1), np.log(2), 0.254, 0.379, 0.398, 0.777])
Choo = Choo.reshape(1, -1)
y_pred = best_randomforest_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.68867808])

In [56]:
# EunWonJung (0.74) 2023
EunWonJung = np.array([388, np.log(50), np.log(2), np.log(104), np.log(30), np.log(62), np.log(1), np.log(1), 0.222, 0.333, 0.268, 0.601])
EunWonJung = EunWonJung.reshape(1, -1)
y_pred = best_randomforest_model.predict(EunWonJung)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.43677977])

## Adaboost

In [57]:
from sklearn.ensemble import AdaBoostRegressor

adaboost_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ('Adaboost', AdaBoostRegressor(random_state=42)),
    ])

param_distribs = {
    'Adaboost__n_estimators': np.arange(10, 100),
    'Adaboost__learning_rate': np.linspace(1e-4, 1e-2),
}

adaboost_search = RandomizedSearchCV(
    adaboost_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


adaboost_search.fit(X_train, y_train)

In [58]:
pd.set_option('display.max_colwidth', 10)
adaboost_search.cv_results_ = pd.DataFrame(adaboost_search.cv_results_)
adaboost_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_Adaboost__n_estimators,param_Adaboost__learning_rate,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
8,0.155173,0.006252,0.014542,0.001078,50,0.000302,{'Adab...,-0.298067,-0.988626,-0.334729,-0.540474,0.317245,1
14,0.190723,0.00756,0.024538,0.007988,65,0.007576,{'Adab...,-0.305832,-0.984763,-0.337236,-0.54261,0.312912,2
0,0.186695,0.011956,0.016818,0.000491,60,0.001918,{'Adab...,-0.303364,-0.989754,-0.336898,-0.543338,0.31596,3
1,0.286986,0.02797,0.029196,0.008252,92,0.008384,{'Adab...,-0.313737,-0.984364,-0.336959,-0.54502,0.310808,4
15,0.231923,0.006806,0.0206,0.000922,77,0.009192,{'Adab...,-0.310967,-0.983993,-0.340489,-0.545149,0.310543,5
5,0.10148,0.000563,0.011029,0.000329,34,0.007778,{'Adab...,-0.313009,-0.983869,-0.339669,-0.545516,0.310154,6
10,0.16951,0.003311,0.016732,0.000639,59,0.001716,{'Adab...,-0.305638,-0.987006,-0.34495,-0.545865,0.312347,7
2,0.126491,0.004574,0.013979,0.002012,42,0.006969,{'Adab...,-0.310052,-0.98443,-0.343464,-0.545982,0.31033,8
4,0.087012,0.011209,0.011484,0.00192,26,0.01,{'Adab...,-0.310952,-0.982536,-0.345853,-0.546447,0.30869,9
17,0.114537,0.013903,0.009719,0.000235,34,0.00111,{'Adab...,-0.310433,-0.987165,-0.342375,-0.546658,0.311759,10


In [59]:
adaboost_search.best_params_

{'Adaboost__n_estimators': 50,
 'Adaboost__learning_rate': 0.0003020408163265306}

In [60]:
best_adaboost_model = adaboost_search.best_estimator_

In [64]:
y_pred = best_adaboost_model.predict(X_train)
rmse = root_mean_squared_error(np.exp(y_train) + 1.351, np.exp(y_pred) + 1.351)
rmse

0.7801460746969361

In [72]:
y_pred = best_adaboost_model.predict(X_test)
rmse = root_mean_squared_error(np.exp(y_test) + 1.351, np.exp(y_pred) + 1.351)
rmse

0.9476350970502071

In [65]:
# Chang-Ki Hong (6.67) 2023
ChangKiHong = np.array([524, np.log(109), np.log(1), np.log(216), np.log(65), np.log(88), np.log(3), np.log(6), 0.332, 0.444, 0.412, 0.856])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_adaboost_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.5402321])

In [67]:
# Austin (4.97) 2023
Auntin = np.array([520, np.log(87), np.log(23), np.log(269), np.log(95), np.log(53), np.log(2), np.log(7), 0.313, 0.376, 0.517, 0.893])
Auntin = Auntin.reshape(1, -1)
y_pred = best_adaboost_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.53098968])

In [68]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([407, np.log(62), np.log(6), np.log(168), np.log(49), np.log(45), np.log(1), np.log(1), 0.295, 0.366, 0.413, 0.779])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_adaboost_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.29225286])

In [69]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([382, np.log(65), np.log(12), np.log(152), np.log(41), np.log(65), np.log(1), np.log(2), 0.254, 0.379, 0.398, 0.777])
Choo = Choo.reshape(1, -1)
y_pred = best_adaboost_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.30835721])

In [70]:
# EunWonJung (0.74) 2023
EunWonJung = np.array([388, np.log(50), np.log(2), np.log(104), np.log(30), np.log(62), np.log(1), np.log(1), 0.222, 0.333, 0.268, 0.601])
EunWonJung = EunWonJung.reshape(1, -1)
y_pred = best_adaboost_model.predict(EunWonJung)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.84948742])

## Gradientboost

In [74]:
from sklearn.ensemble import GradientBoostingRegressor

gradientboost_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ('Gradientboost', GradientBoostingRegressor(random_state=42)),
    ])

param_distribs = {
    'Gradientboost__n_estimators': np.arange(10, 100),
    'Gradientboost__max_depth': np.arange(4, 20),
    'Gradientboost__learning_rate': np.linspace(1e-4, 1e-2),
}

gradientboost_search = RandomizedSearchCV(
    gradientboost_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


gradientboost_search.fit(X_train, y_train)

In [75]:
pd.set_option('display.max_colwidth', 10)
gradientboost_search.cv_results_ = pd.DataFrame(gradientboost_search.cv_results_)
gradientboost_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_Gradientboost__n_estimators,param_Gradientboost__max_depth,param_Gradientboost__learning_rate,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
9,0.37147,0.049335,0.005,0.002102,91,14,0.009394,{'Grad...,-0.376148,-1.072258,-0.437587,-0.628664,0.314669,1
2,0.234722,0.011463,0.005431,0.002341,86,5,0.007778,{'Grad...,-0.410306,-1.089565,-0.474553,-0.658141,0.306188,2
12,0.321308,0.030477,0.005597,0.00403,75,11,0.008384,{'Grad...,-0.412263,-1.099652,-0.481442,-0.664452,0.309026,3
17,0.345145,0.050442,0.009397,0.002594,77,8,0.007576,{'Grad...,-0.422652,-1.106729,-0.492466,-0.673949,0.307346,4
6,0.257644,0.032795,0.005777,0.003567,63,17,0.008384,{'Grad...,-0.443143,-1.118449,-0.507523,-0.689705,0.304305,5
13,0.238565,0.045105,0.005477,0.003511,55,15,0.008788,{'Grad...,-0.456971,-1.12697,-0.518782,-0.700908,0.302326,6
14,0.114075,0.027273,0.003083,0.000749,45,5,0.009192,{'Grad...,-0.486049,-1.138438,-0.54239,-0.722292,0.295157,7
8,0.331856,0.026348,0.006322,0.002633,60,12,0.005757,{'Grad...,-0.507299,-1.155924,-0.561866,-0.741696,0.293749,8
18,0.451245,0.081964,0.007587,0.004526,83,18,0.003939,{'Grad...,-0.51431,-1.160916,-0.568522,-0.747916,0.292872,9
19,0.088082,0.016265,0.002222,0.000409,32,19,0.01,{'Grad...,-0.518151,-1.161822,-0.569452,-0.749808,0.292089,10


In [76]:
gradientboost_search.best_params_

{'Gradientboost__n_estimators': 91,
 'Gradientboost__max_depth': 14,
 'Gradientboost__learning_rate': 0.009393877551020408}

In [77]:
best_gradientboost_model = gradientboost_search.best_estimator_

In [78]:
y_pred = best_gradientboost_model.predict(X_train)
rmse = root_mean_squared_error(np.exp(y_train) + 1.351, np.exp(y_pred) + 1.351)
rmse

0.9742550622403962

In [79]:
y_pred = best_gradientboost_model.predict(X_test)
rmse = root_mean_squared_error(np.exp(y_test) + 1.351, np.exp(y_pred) + 1.351)
rmse

1.0724951889846688

In [80]:
# Chang-Ki Hong (6.67) 2023
ChangKiHong = np.array([524, np.log(109), np.log(1), np.log(216), np.log(65), np.log(88), np.log(3), np.log(6), 0.332, 0.444, 0.412, 0.856])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_gradientboost_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.06706534])

In [81]:
# Austin (4.97) 2023
Auntin = np.array([520, np.log(87), np.log(23), np.log(269), np.log(95), np.log(53), np.log(2), np.log(7), 0.313, 0.376, 0.517, 0.893])
Auntin = Auntin.reshape(1, -1)
y_pred = best_gradientboost_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([2.08312472])

In [82]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([407, np.log(62), np.log(6), np.log(168), np.log(49), np.log(45), np.log(1), np.log(1), 0.295, 0.366, 0.413, 0.779])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_gradientboost_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.68548991])

In [83]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([382, np.log(65), np.log(12), np.log(152), np.log(41), np.log(65), np.log(1), np.log(2), 0.254, 0.379, 0.398, 0.777])
Choo = Choo.reshape(1, -1)
y_pred = best_gradientboost_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.92948888])

In [84]:
# EunWonJung (0.74) 2023
EunWonJung = np.array([388, np.log(50), np.log(2), np.log(104), np.log(30), np.log(62), np.log(1), np.log(1), 0.222, 0.333, 0.268, 0.601])
EunWonJung = EunWonJung.reshape(1, -1)
y_pred = best_gradientboost_model.predict(EunWonJung)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.71355435])

## xgboost

In [85]:
import xgboost as xgb

xgboost_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ('xgboost', xgb.XGBRegressor(random_state=42)),
    ])

param_distribs = {
    'xgboost__n_estimators': np.arange(10, 100),
    'xgboost__max_depth': np.arange(4, 20),
}

xgboost_search = RandomizedSearchCV(
    xgboost_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


xgboost_search.fit(X_train, y_train)

In [86]:
pd.set_option('display.max_colwidth', 10)
xgboost_search.cv_results_ = pd.DataFrame(xgboost_search.cv_results_)
xgboost_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgboost__n_estimators,param_xgboost__max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
19,0.055043,0.012964,0.003769,0.000882,90,7,{'xgbo...,-0.282672,-0.967799,-0.33452,-0.528331,0.311471,1
18,0.05452,0.002519,0.00477,0.000954,46,7,{'xgbo...,-0.282656,-0.967799,-0.334537,-0.528331,0.311472,2
13,0.069686,0.002781,0.004545,0.000974,50,14,{'xgbo...,-0.299342,-0.972814,-0.332346,-0.534834,0.309992,3
5,0.075515,0.002197,0.005233,0.001072,74,14,{'xgbo...,-0.299366,-0.972814,-0.332342,-0.534841,0.309987,4
8,0.070862,0.007826,0.004407,0.001216,39,17,{'xgbo...,-0.299476,-0.97282,-0.33238,-0.534892,0.309953,5
6,0.081783,0.016965,0.008199,0.003287,50,19,{'xgbo...,-0.299523,-0.972814,-0.332346,-0.534895,0.309946,6
15,0.06915,0.003844,0.004923,0.000905,55,18,{'xgbo...,-0.299545,-0.972814,-0.332342,-0.5349,0.309941,7
12,0.074376,0.004871,0.006957,0.001225,63,17,{'xgbo...,-0.299545,-0.972814,-0.332342,-0.5349,0.309941,8
7,0.078434,0.001379,0.007007,0.001584,82,19,{'xgbo...,-0.299545,-0.972814,-0.332342,-0.5349,0.309941,9
17,0.080781,0.010024,0.004853,0.000744,46,12,{'xgbo...,-0.299305,-0.972814,-0.333221,-0.535113,0.309811,10


In [87]:
xgboost_search.best_params_

{'xgboost__n_estimators': 90, 'xgboost__max_depth': 7}

In [88]:
best_xgboost_model = xgboost_search.best_estimator_

In [89]:
y_pred = best_xgboost_model.predict(X_train)
rmse = root_mean_squared_error(np.exp(y_train) + 1.351, np.exp(y_pred) + 1.351)
rmse

0.0025976832793214353

In [90]:
y_pred = best_xgboost_model.predict(X_test)
rmse = root_mean_squared_error(np.exp(y_test) + 1.351, np.exp(y_pred) + 1.351)
rmse

0.9208960388005969

In [91]:
# Chang-Ki Hong (6.67) 2023
ChangKiHong = np.array([524, np.log(109), np.log(1), np.log(216), np.log(65), np.log(88), np.log(3), np.log(6), 0.332, 0.444, 0.412, 0.856])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_xgboost_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([8.252278], dtype=float32)

In [92]:
# Austin (4.97) 2023
Auntin = np.array([520, np.log(87), np.log(23), np.log(269), np.log(95), np.log(53), np.log(2), np.log(7), 0.313, 0.376, 0.517, 0.893])
Auntin = Auntin.reshape(1, -1)
y_pred = best_xgboost_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([5.4272914], dtype=float32)

In [93]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([407, np.log(62), np.log(6), np.log(168), np.log(49), np.log(45), np.log(1), np.log(1), 0.295, 0.366, 0.413, 0.779])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_xgboost_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.1823974], dtype=float32)

In [94]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([382, np.log(65), np.log(12), np.log(152), np.log(41), np.log(65), np.log(1), np.log(2), 0.254, 0.379, 0.398, 0.777])
Choo = Choo.reshape(1, -1)
y_pred = best_xgboost_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.3462124], dtype=float32)

In [95]:
# EunWonJung (0.74) 2023
EunWonJung = np.array([388, np.log(50), np.log(2), np.log(104), np.log(30), np.log(62), np.log(1), np.log(1), 0.222, 0.333, 0.268, 0.601])
EunWonJung = EunWonJung.reshape(1, -1)
y_pred = best_xgboost_model.predict(EunWonJung)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.4725465], dtype=float32)

## lightgbm

In [96]:
%pip install lightgbm



In [97]:
from lightgbm import LGBMRegressor

lightgbm_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ('lightgbm', LGBMRegressor(random_state=42)),
    ])

param_distribs = {
    'lightgbm__n_estimators': np.arange(10, 100),
    'lightgbm__max_depth': np.arange(4, 20),
}

lightgbm_search = RandomizedSearchCV(
    lightgbm_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


lightgbm_search.fit(X_train, y_train)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 152, number of used features: 11
[LightGBM] [Info] Start training from score 0.644009


In [98]:
pd.set_option('display.max_colwidth', 10)
lightgbm_search.cv_results_ = pd.DataFrame(lightgbm_search.cv_results_)
lightgbm_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lightgbm__n_estimators,param_lightgbm__max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
11,0.060924,0.000861,0.019181,0.007296,35,5,{'ligh...,-0.305881,-1.009656,-0.335553,-0.550363,0.324995,1
2,0.022893,0.019678,0.007999,0.004015,18,10,{'ligh...,-0.280133,-1.031269,-0.345947,-0.55245,0.339641,2
8,0.05454,0.002765,0.015495,0.003005,39,17,{'ligh...,-0.312079,-1.009085,-0.338217,-0.553127,0.322588,3
17,0.062485,0.012217,0.01742,0.011226,46,12,{'ligh...,-0.317787,-1.008799,-0.337541,-0.554709,0.321192,4
18,0.047161,0.0211,0.013685,0.006742,46,7,{'ligh...,-0.317787,-1.008799,-0.337541,-0.554709,0.321192,4
13,0.091809,0.014008,0.014875,0.010483,50,14,{'ligh...,-0.320559,-1.009161,-0.337302,-0.555674,0.320737,6
6,0.047894,0.006014,0.010405,0.004075,50,19,{'ligh...,-0.320559,-1.009161,-0.337302,-0.555674,0.320737,6
15,0.06951,0.038455,0.014708,0.008627,55,18,{'ligh...,-0.322637,-1.009332,-0.338013,-0.556661,0.320148,8
12,0.086093,0.006329,0.020156,0.005796,63,17,{'ligh...,-0.324179,-1.009764,-0.337869,-0.557271,0.32001,9
10,0.084503,0.0011,0.020032,0.008989,59,10,{'ligh...,-0.32316,-1.009393,-0.339295,-0.557283,0.319758,10


In [99]:
lightgbm_search.best_params_

{'lightgbm__n_estimators': 35, 'lightgbm__max_depth': 5}

In [100]:
best_lightgbm_model = lightgbm_search.best_estimator_

In [101]:
y_pred = best_lightgbm_model.predict(X_train)
rmse = root_mean_squared_error(np.exp(y_train) + 1.351, np.exp(y_pred) + 1.351)
rmse

0.5968350219110156

In [102]:
y_pred = best_lightgbm_model.predict(X_test)
rmse = root_mean_squared_error(np.exp(y_test) + 1.351, np.exp(y_pred) + 1.351)
rmse

0.7813401824138744

In [103]:
# Chang-Ki Hong (6.67) 2023
ChangKiHong = np.array([524, np.log(109), np.log(1), np.log(216), np.log(65), np.log(88), np.log(3), np.log(6), 0.332, 0.444, 0.412, 0.856])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_lightgbm_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([4.44356748])

In [104]:
# Austin (4.97) 2023
Auntin = np.array([520, np.log(87), np.log(23), np.log(269), np.log(95), np.log(53), np.log(2), np.log(7), 0.313, 0.376, 0.517, 0.893])
Auntin = Auntin.reshape(1, -1)
y_pred = best_lightgbm_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([4.4871861])

In [105]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([407, np.log(62), np.log(6), np.log(168), np.log(49), np.log(45), np.log(1), np.log(1), 0.295, 0.366, 0.413, 0.779])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_lightgbm_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([2.72821165])

In [106]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([382, np.log(65), np.log(12), np.log(152), np.log(41), np.log(65), np.log(1), np.log(2), 0.254, 0.379, 0.398, 0.777])
Choo = Choo.reshape(1, -1)
y_pred = best_lightgbm_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.63276198])

In [107]:
# EunWonJung (0.74) 2023
EunWonJung = np.array([388, np.log(50), np.log(2), np.log(104), np.log(30), np.log(62), np.log(1), np.log(1), 0.222, 0.333, 0.268, 0.601])
EunWonJung = EunWonJung.reshape(1, -1)
y_pred = best_lightgbm_model.predict(EunWonJung)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([0.87049048])

## catboost

In [108]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [109]:
from catboost import CatBoostRegressor as Catboost

catboost_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ('catboost', Catboost(random_state=42)),
    ])

param_distribs = {
    'catboost__n_estimators': np.arange(10, 100),
    'catboost__max_depth': np.arange(4, 20),
}

catboost_search = RandomizedSearchCV(
    catboost_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


catboost_search.fit(X_train, y_train)

15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/catboost/core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text

Learning rate set to 0.215334
0:	learn: 0.8379678	total: 55.1ms	remaining: 4.9s
1:	learn: 0.7609917	total: 62.4ms	remaining: 2.74s
2:	learn: 0.7024511	total: 69.3ms	remaining: 2.01s
3:	learn: 0.6570651	total: 72.6ms	remaining: 1.56s
4:	learn: 0.6312244	total: 77.6ms	remaining: 1.32s
5:	learn: 0.6089074	total: 81.3ms	remaining: 1.14s
6:	learn: 0.5884607	total: 85.3ms	remaining: 1.01s
7:	learn: 0.5588328	total: 92ms	remaining: 943ms
8:	learn: 0.5407704	total: 96.2ms	remaining: 866ms
9:	learn: 0.5039799	total: 100ms	remaining: 803ms
10:	learn: 0.4752397	total: 102ms	remaining: 734ms
11:	learn: 0.4623382	total: 103ms	remaining: 668ms
12:	learn: 0.4446557	total: 105ms	remaining: 623ms
13:	learn: 0.4310252	total: 110ms	remaining: 595ms
14:	learn: 0.4138502	total: 115ms	remaining: 575ms
15:	learn: 0.3923775	total: 120ms	remaining: 557ms
16:	learn: 0.3812194	total: 125ms	remaining: 536ms
17:	learn: 0.3695018	total: 128ms	remaining: 512ms
18:	learn: 0.3611609	total: 131ms	remaining: 491ms
19:	l

In [110]:
pd.set_option('display.max_colwidth', 10)
catboost_search.cv_results_ = pd.DataFrame(catboost_search.cv_results_)
catboost_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_catboost__n_estimators,param_catboost__max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
19,0.621295,0.174729,0.013197,0.004557,90,7,{'catb...,-0.271027,-0.972788,-0.313841,-0.519219,0.321198,1
0,0.27498,0.053393,0.008943,0.000751,88,5,{'catb...,-0.266508,-0.981199,-0.324236,-0.523981,0.32416,2
18,0.37623,0.020148,0.009646,0.005339,46,7,{'catb...,-0.266589,-0.975622,-0.334143,-0.525452,0.319511,3
3,1.372822,0.3824,0.019601,0.013049,75,4,{'catb...,-0.276996,-0.99137,-0.317323,-0.528563,0.327668,4
11,0.122707,0.016313,0.007749,0.002287,35,5,{'catb...,-0.302986,-0.967321,-0.321613,-0.53064,0.308874,5
14,0.199263,0.032768,0.007346,0.001987,61,5,{'catb...,-0.271326,-0.97604,-0.35196,-0.533109,0.314925,6
16,2.327398,0.123996,0.005983,0.000303,75,11,{'catb...,-0.286869,-0.979864,-0.342585,-0.536439,0.314373,7
9,0.657902,0.093802,0.01065,0.002779,74,6,{'catb...,-0.284033,-0.977939,-0.351649,-0.537874,0.312395,8
4,4.737945,1.148594,0.015835,0.007715,98,10,{'catb...,-0.284546,-0.992042,-0.34313,-0.539906,0.320602,9
1,5.06535,0.622023,0.01626,0.003235,75,10,{'catb...,-0.290046,-0.987913,-0.345147,-0.541035,0.31679,10


In [111]:
catboost_search.best_params_

{'catboost__n_estimators': 90, 'catboost__max_depth': 7}

In [112]:
best_catboost_model = catboost_search.best_estimator_

In [113]:
y_pred = best_catboost_model.predict(X_train)
rmse = root_mean_squared_error(np.exp(y_train) + 1.351, np.exp(y_pred) + 1.351)
rmse

0.1748032793685214

In [114]:
y_pred = best_catboost_model.predict(X_test)
rmse = root_mean_squared_error(np.exp(y_test) + 1.351, np.exp(y_pred) + 1.351)
rmse

0.7647288101444115

In [115]:
# Chang-Ki Hong (6.67) 2023
ChangKiHong = np.array([524, np.log(109), np.log(1), np.log(216), np.log(65), np.log(88), np.log(3), np.log(6), 0.332, 0.444, 0.412, 0.856])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_catboost_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.69694534])

In [116]:
# Austin (4.97) 2023
Auntin = np.array([520, np.log(87), np.log(23), np.log(269), np.log(95), np.log(53), np.log(2), np.log(7), 0.313, 0.376, 0.517, 0.893])
Auntin = Auntin.reshape(1, -1)
y_pred = best_catboost_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([4.63091603])

In [117]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([407, np.log(62), np.log(6), np.log(168), np.log(49), np.log(45), np.log(1), np.log(1), 0.295, 0.366, 0.413, 0.779])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_catboost_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([2.69681995])

In [118]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([382, np.log(65), np.log(12), np.log(152), np.log(41), np.log(65), np.log(1), np.log(2), 0.254, 0.379, 0.398, 0.777])
Choo = Choo.reshape(1, -1)
y_pred = best_catboost_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.80705903])

In [119]:
# EunWonJung (0.74) 2023
EunWonJung = np.array([388, np.log(50), np.log(2), np.log(104), np.log(30), np.log(62), np.log(1), np.log(1), 0.222, 0.333, 0.268, 0.601])
EunWonJung = EunWonJung.reshape(1, -1)
y_pred = best_catboost_model.predict(EunWonJung)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.11949409])