In [1]:
%pip install "scikit-learn<1.6"

Collecting scikit-learn<1.6
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.0
    Uninstalling scikit-learn-1.6.0:
      Successfully uninstalled scikit-learn-1.6.0
Successfully installed scikit-learn-1.5.2


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [17]:
df = pd.read_csv('/content/sample_data/final_df.csv', index_col=0)

In [27]:
# Split train and test
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# split features and labels
X_train = train_set.drop("WAR", axis=1).to_numpy()
y_train = train_set["WAR"].copy().to_numpy()
X_test = test_set.drop("WAR", axis=1).to_numpy()
y_test = test_set["WAR"].copy().to_numpy()

## NonLinear SVM

In [28]:
from sklearn.metrics import mean_squared_error, make_scorer
scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV


svm_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ("SVR", SVR(kernel='poly')),
    ])

param_distribs = {'SVR__degree': np.arange(2, 11),
                  'SVR__C': np.linspace(0.01, 0.1, 10),
                  'SVR__epsilon': np.linspace(0.01, 0.1, 10),
                  }

svm_search = RandomizedSearchCV(
    svm_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


svm_search.fit(X_train, y_train)

In [30]:
pd.set_option('display.max_colwidth', 10)
svm_search.cv_results_ = pd.DataFrame(svm_search.cv_results_)
svm_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_SVR__epsilon,param_SVR__degree,param_SVR__C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
5,0.008902,0.00448,0.004442,0.001292,0.02,3,0.09,{'SVR_...,-0.72853,-1.061496,-0.582076,-0.790701,0.200599,1
1,0.004577,0.00113,0.003897,0.000934,0.08,3,0.1,{'SVR_...,-0.772431,-1.053078,-0.580924,-0.802144,0.193898,2
6,0.007613,0.001576,0.003527,0.000457,0.1,4,0.04,{'SVR_...,-0.691769,-1.209716,-0.678354,-0.859946,0.247385,3
7,0.00614,0.000608,0.003178,0.000558,0.01,4,0.02,{'SVR_...,-0.699475,-1.224789,-0.68921,-0.871158,0.25009,4
4,0.006989,0.004538,0.003288,0.000708,0.1,5,0.01,{'SVR_...,-0.957968,-1.206923,-0.661499,-0.94213,0.22295,5
13,0.003921,0.000136,0.002271,3.9e-05,0.01,5,0.02,{'SVR_...,-0.942975,-1.22206,-0.671094,-0.945376,0.224937,6
10,0.008229,0.004493,0.003209,0.000176,0.1,6,0.02,{'SVR_...,-1.061825,-1.217858,-0.69215,-0.990611,0.220448,7
3,0.009691,0.0048,0.004857,0.002719,0.09,6,0.07,{'SVR_...,-1.378454,-1.278795,-0.738223,-1.131824,0.281276,8
2,0.011012,0.004297,0.003304,0.000215,0.02,7,0.03,{'SVR_...,-4.013151,-1.355875,-0.793901,-2.054309,1.403983,9
17,0.018875,0.017338,0.002332,0.000792,0.03,8,0.05,{'SVR_...,-3.206412,-1.685347,-1.463529,-2.118429,0.774631,10


In [31]:
svm_search.best_params_

{'SVR__epsilon': 0.020000000000000004,
 'SVR__degree': 3,
 'SVR__C': 0.09000000000000001}

In [32]:
best_svm_model = svm_search.best_estimator_

In [33]:
from sklearn.metrics import root_mean_squared_error
y_pred = best_svm_model.predict(X_train)
rmsle = root_mean_squared_error(y_train, y_pred)
rmsle

0.772957630205037

In [34]:
y_pred = best_svm_model.predict(X_test)
rmsle = root_mean_squared_error(y_test, y_pred)
rmsle

0.7615784540732059

In [35]:
# Chang-Ki Hong(6.67) 2023
ChangKiHong = np.array([np.log(109), np.log(65), np.log(216)])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_svm_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.94797459])

In [41]:
# Austin (4.97) 2023
Auntin = np.array([np.log(65), np.log(95), np.log(269)])
Auntin = Auntin.reshape(1, -1)
y_pred = best_svm_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.44419758])

In [39]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([np.log(62), np.log(49), np.log(168)])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_svm_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.17039867])

In [43]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([np.log(65), np.log(41), np.log(152)])
Choo = Choo.reshape(1, -1)
y_pred = best_svm_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.16200567])

In [44]:
# Jae-Hwan Kim (0.44) 2023
JaeHwanKim = np.array([np.log(40), np.log(46), np.log(134)])
JaeHwanKim = JaeHwanKim.reshape(1, -1)
y_pred = best_svm_model.predict(JaeHwanKim)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([0.97223634])

## RandomForest

In [46]:
from sklearn.ensemble import RandomForestRegressor

randomforest_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ('RandomForest', RandomForestRegressor(random_state=42)),
    ])

param_distribs = {
    'RandomForest__n_estimators': np.arange(10, 100),
    'RandomForest__max_depth': np.arange(4, 20),
}

randomforest_search = RandomizedSearchCV(
    randomforest_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


randomforest_search.fit(X_train, y_train)

In [47]:
pd.set_option('display.max_colwidth', 10)
randomforest_search.cv_results_ = pd.DataFrame(randomforest_search.cv_results_)
randomforest_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_RandomForest__n_estimators,param_RandomForest__max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
11,0.145102,0.007782,0.014606,0.006968,35,5,{'Rand...,-0.488279,-1.069157,-0.577023,-0.711486,0.255493,1
3,0.297695,0.074937,0.011619,0.004435,75,4,{'Rand...,-0.524028,-1.054902,-0.556247,-0.711726,0.243018,2
14,0.136692,0.008236,0.00665,0.000216,61,5,{'Rand...,-0.517221,-1.061649,-0.572766,-0.717212,0.244607,3
0,0.298448,0.069218,0.016286,0.005287,88,5,{'Rand...,-0.535934,-1.060068,-0.568518,-0.721507,0.239768,4
9,0.326875,0.014113,0.018855,0.004257,74,6,{'Rand...,-0.536391,-1.059244,-0.570884,-0.722173,0.238761,5
1,0.436939,0.025195,0.017688,0.002827,75,10,{'Rand...,-0.54161,-1.061862,-0.570809,-0.724761,0.238665,6
10,0.252496,0.00585,0.012999,0.00154,59,10,{'Rand...,-0.524283,-1.066746,-0.583253,-0.724761,0.243016,7
16,0.16924,0.011088,0.008385,3.5e-05,75,11,{'Rand...,-0.539861,-1.060916,-0.573539,-0.724772,0.238087,8
5,0.295856,0.031048,0.014085,0.002536,74,14,{'Rand...,-0.542887,-1.060566,-0.574014,-0.725823,0.23704,9
8,0.173567,0.008422,0.014207,0.001976,39,17,{'Rand...,-0.510305,-1.072222,-0.601206,-0.727911,0.246277,10


In [48]:
randomforest_search.best_params_

{'RandomForest__n_estimators': 35, 'RandomForest__max_depth': 5}

In [49]:
best_randomforest_model = randomforest_search.best_estimator_

In [50]:
y_pred = best_randomforest_model.predict(X_train)
rmsle = root_mean_squared_error(y_train, y_pred)
rmsle

0.3376841906106003

In [51]:
y_pred = best_randomforest_model.predict(X_test)
rmsle = root_mean_squared_error(y_test, y_pred)
rmsle

1.4781144096168983

In [52]:
# Chang-Ki Hong(6.67) 2023
ChangKiHong = np.array([np.log(109), np.log(65), np.log(216)])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_randomforest_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.24896176])

In [53]:
# Austin (4.97) 2023
Auntin = np.array([np.log(65), np.log(95), np.log(269)])
Auntin = Auntin.reshape(1, -1)
y_pred = best_randomforest_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.57359769])

In [54]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([np.log(62), np.log(49), np.log(168)])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_randomforest_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.59414944])

In [55]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([np.log(65), np.log(41), np.log(152)])
Choo = Choo.reshape(1, -1)
y_pred = best_randomforest_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.94486647])

In [56]:
# Jae-Hwan Kim (0.44) 2023
JaeHwanKim = np.array([np.log(40), np.log(46), np.log(134)])
JaeHwanKim = JaeHwanKim.reshape(1, -1)
y_pred = best_randomforest_model.predict(JaeHwanKim)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([0.51621439])

## Adaboost

In [58]:
from sklearn.ensemble import AdaBoostRegressor

adaboost_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ('Adaboost', AdaBoostRegressor(random_state=42)),
    ])

param_distribs = {
    'Adaboost__n_estimators': np.arange(10, 100),
    'Adaboost__learning_rate': np.linspace(1e-4, 1e-2),
}

adaboost_search = RandomizedSearchCV(
    adaboost_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


adaboost_search.fit(X_train, y_train)

In [59]:
pd.set_option('display.max_colwidth', 10)
adaboost_search.cv_results_ = pd.DataFrame(adaboost_search.cv_results_)
adaboost_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_Adaboost__n_estimators,param_Adaboost__learning_rate,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
12,0.072926,0.03351,0.013601,0.001589,13,0.005555,{'Adab...,-0.385392,-1.09066,-0.449627,-0.641893,0.318408,1
18,0.127657,0.050414,0.021379,0.002882,12,0.002524,{'Adab...,-0.390735,-1.084514,-0.450685,-0.641978,0.313876,2
4,0.096007,0.025666,0.009382,0.003134,26,0.01,{'Adab...,-0.382203,-1.082497,-0.467633,-0.644111,0.311941,3
0,0.220835,0.056163,0.028462,0.009251,60,0.001918,{'Adab...,-0.382054,-1.080412,-0.471071,-0.644512,0.310362,4
3,0.088064,0.022158,0.008538,0.000437,26,0.00111,{'Adab...,-0.385687,-1.084454,-0.467834,-0.645992,0.311848,5
19,0.172336,0.043249,0.016401,0.007699,48,0.005757,{'Adab...,-0.385924,-1.082082,-0.470031,-0.646013,0.310254,6
9,0.18671,0.008526,0.01834,0.000922,75,0.003737,{'Adab...,-0.381174,-1.082129,-0.475076,-0.646126,0.310675,7
10,0.145423,0.000649,0.014963,0.000519,59,0.001716,{'Adab...,-0.384203,-1.082348,-0.472175,-0.646242,0.310458,8
14,0.228327,0.052495,0.021703,0.006158,65,0.007576,{'Adab...,-0.382971,-1.080281,-0.4755,-0.646251,0.309222,9
13,0.10828,0.015418,0.010501,0.004796,24,0.002727,{'Adab...,-0.385386,-1.085297,-0.468419,-0.646367,0.312216,10


In [60]:
adaboost_search.best_params_

{'Adaboost__n_estimators': 13, 'Adaboost__learning_rate': 0.005555102040816327}

In [61]:
best_adaboost_model = adaboost_search.best_estimator_

In [62]:
y_pred = best_adaboost_model.predict(X_train)
rmsle = root_mean_squared_error(y_train, y_pred)
rmsle

0.6473108557225109

In [63]:
y_pred = best_adaboost_model.predict(X_test)
rmsle = root_mean_squared_error(y_test, y_pred)
rmsle

0.3339275177043217

In [64]:
# Chang-Ki Hong(6.67) 2023
ChangKiHong = np.array([np.log(109), np.log(65), np.log(216)])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_adaboost_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([2.897678])

In [65]:
# Austin (4.97) 2023
Auntin = np.array([np.log(65), np.log(95), np.log(269)])
Auntin = Auntin.reshape(1, -1)
y_pred = best_adaboost_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([2.07421191])

In [66]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([np.log(62), np.log(49), np.log(168)])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_adaboost_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.63157333])

In [67]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([np.log(65), np.log(41), np.log(152)])
Choo = Choo.reshape(1, -1)
y_pred = best_adaboost_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.66085333])

In [68]:
# Jae-Hwan Kim (0.44) 2023
JaeHwanKim = np.array([np.log(40), np.log(46), np.log(134)])
JaeHwanKim = JaeHwanKim.reshape(1, -1)
y_pred = best_adaboost_model.predict(JaeHwanKim)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([0.56071351])

## Gradientboost

In [69]:
from sklearn.ensemble import GradientBoostingRegressor

gradientboost_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ('Gradientboost', GradientBoostingRegressor(random_state=42)),
    ])

param_distribs = {
    'Gradientboost__n_estimators': np.arange(10, 100),
    'Gradientboost__max_depth': np.arange(4, 20),
    'Gradientboost__learning_rate': np.linspace(1e-4, 1e-2),
}

gradientboost_search = RandomizedSearchCV(
    gradientboost_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


gradientboost_search.fit(X_train, y_train)

In [70]:
pd.set_option('display.max_colwidth', 10)
gradientboost_search.cv_results_ = pd.DataFrame(gradientboost_search.cv_results_)
gradientboost_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_Gradientboost__n_estimators,param_Gradientboost__max_depth,param_Gradientboost__learning_rate,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.184843,0.003028,0.008742,0.004361,86,5,0.007778,{'Grad...,-0.485939,-1.144582,-0.566033,-0.732184,0.293437,1
9,0.140274,0.01276,0.005013,0.003023,91,14,0.009394,{'Grad...,-0.494246,-1.13263,-0.575874,-0.73425,0.283661,2
17,0.098148,0.003837,0.003253,0.000532,77,8,0.007576,{'Grad...,-0.508856,-1.157156,-0.573208,-0.746407,0.29163,3
12,0.098889,0.002872,0.002476,0.000481,75,11,0.008384,{'Grad...,-0.514598,-1.154457,-0.577669,-0.748908,0.28792,4
6,0.165163,0.015103,0.003919,0.000335,63,17,0.008384,{'Grad...,-0.528281,-1.161297,-0.581979,-0.757186,0.28659,5
13,0.074883,8e-05,0.002391,0.000429,55,15,0.008788,{'Grad...,-0.536678,-1.165861,-0.584045,-0.762195,0.28609,6
14,0.075568,0.007126,0.002825,0.000804,45,5,0.009192,{'Grad...,-0.542704,-1.176119,-0.584856,-0.767893,0.289172,7
8,0.126269,0.020265,0.003189,0.001148,60,12,0.005757,{'Grad...,-0.565632,-1.185966,-0.598812,-0.78347,0.28493,8
18,0.124982,0.009001,0.004511,0.001305,83,18,0.003939,{'Grad...,-0.572024,-1.1892,-0.602286,-0.787837,0.284076,9
19,0.051432,0.01767,0.002254,0.000515,32,19,0.01,{'Grad...,-0.572504,-1.190479,-0.603228,-0.788737,0.284351,10


In [71]:
gradientboost_search.best_params_

{'Gradientboost__n_estimators': 86,
 'Gradientboost__max_depth': 5,
 'Gradientboost__learning_rate': 0.0077775510204081645}

In [72]:
best_gradientboost_model = gradientboost_search.best_estimator_

In [73]:
y_pred = best_gradientboost_model.predict(X_train)
rmsle = root_mean_squared_error(y_train, y_pred)
rmsle

0.5317335928106108

In [74]:
y_pred = best_gradientboost_model.predict(X_test)
rmsle = root_mean_squared_error(y_test, y_pred)
rmsle

0.9828252676490122

In [75]:
# Chang-Ki Hong(6.67) 2023
ChangKiHong = np.array([np.log(109), np.log(65), np.log(216)])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_gradientboost_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.55721309])

In [77]:
# Austin (4.97) 2023
Auntin = np.array([np.log(65), np.log(95), np.log(269)])
Auntin = Auntin.reshape(1, -1)
y_pred = best_gradientboost_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([2.09899725])

In [78]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([np.log(62), np.log(49), np.log(168)])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_gradientboost_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.21983976])

In [79]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([np.log(65), np.log(41), np.log(152)])
Choo = Choo.reshape(1, -1)
y_pred = best_gradientboost_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.39727344])

In [80]:
# Jae-Hwan Kim (0.44) 2023
JaeHwanKim = np.array([np.log(40), np.log(46), np.log(134)])
JaeHwanKim = JaeHwanKim.reshape(1, -1)
y_pred = best_gradientboost_model.predict(JaeHwanKim)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([0.55638003])

## xgboost

In [81]:
import xgboost as xgb

xgboost_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ('xgboost', xgb.XGBRegressor(random_state=42)),
    ])

param_distribs = {
    'xgboost__n_estimators': np.arange(10, 100),
    'xgboost__max_depth': np.arange(4, 20),
}

xgboost_search = RandomizedSearchCV(
    xgboost_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


xgboost_search.fit(X_train, y_train)

In [82]:
pd.set_option('display.max_colwidth', 10)
xgboost_search.cv_results_ = pd.DataFrame(xgboost_search.cv_results_)
xgboost_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgboost__n_estimators,param_xgboost__max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
3,0.031797,0.0005,0.003799,0.00075,75,4,{'xgbo...,-0.433684,-1.047199,-0.564588,-0.681824,0.263828,1
11,0.023335,0.000788,0.004646,0.000349,35,5,{'xgbo...,-0.443354,-1.054231,-0.554007,-0.683864,0.265756,2
5,0.057854,0.005094,0.00398,0.000507,74,14,{'xgbo...,-0.452999,-1.06541,-0.561866,-0.693425,0.266762,3
13,0.092044,0.030184,0.004707,0.001223,50,14,{'xgbo...,-0.452997,-1.06541,-0.561868,-0.693425,0.266762,4
6,0.049281,0.001346,0.00352,0.00019,50,19,{'xgbo...,-0.452997,-1.06523,-0.562373,-0.693534,0.266595,5
12,0.067162,0.022081,0.006596,0.0035,63,17,{'xgbo...,-0.452999,-1.06523,-0.562376,-0.693535,0.266594,6
7,0.052569,0.00278,0.003301,0.000248,82,19,{'xgbo...,-0.452999,-1.06523,-0.562376,-0.693535,0.266594,7
15,0.061253,0.016081,0.004845,0.00033,55,18,{'xgbo...,-0.452999,-1.06523,-0.562375,-0.693535,0.266594,8
8,0.045156,0.001288,0.005209,0.000407,39,17,{'xgbo...,-0.452999,-1.0653,-0.562387,-0.693562,0.266625,9
14,0.037043,0.007474,0.004925,0.000119,61,5,{'xgbo...,-0.455674,-1.056842,-0.569075,-0.693864,0.260807,10


In [83]:
xgboost_search.best_params_

{'xgboost__n_estimators': 75, 'xgboost__max_depth': 4}

In [84]:
best_xgboost_model = xgboost_search.best_estimator_

In [85]:
y_pred = best_xgboost_model.predict(X_train)
rmsle = root_mean_squared_error(y_train, y_pred)
rmsle

0.06915987252816153

In [86]:
y_pred = best_xgboost_model.predict(X_test)
rmsle = root_mean_squared_error(y_test, y_pred)
rmsle

1.9943645964426153

In [87]:
# Chang-Ki Hong(6.67) 2023
ChangKiHong = np.array([np.log(109), np.log(65), np.log(216)])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_xgboost_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([3.2734656], dtype=float32)

In [88]:
# Austin (4.97) 2023
Auntin = np.array([np.log(65), np.log(95), np.log(269)])
Auntin = Auntin.reshape(1, -1)
y_pred = best_xgboost_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([0.5019624], dtype=float32)

In [89]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([np.log(62), np.log(49), np.log(168)])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_xgboost_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([2.639875], dtype=float32)

In [90]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([np.log(65), np.log(41), np.log(152)])
Choo = Choo.reshape(1, -1)
y_pred = best_xgboost_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([2.8394623], dtype=float32)

In [91]:
# Jae-Hwan Kim (0.44) 2023
JaeHwanKim = np.array([np.log(40), np.log(46), np.log(134)])
JaeHwanKim = JaeHwanKim.reshape(1, -1)
y_pred = best_xgboost_model.predict(JaeHwanKim)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([0.10930967], dtype=float32)

## lightgbm

In [92]:
%pip install lightgbm



In [93]:
from lightgbm import LGBMRegressor

lightgbm_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ('lightgbm', LGBMRegressor(random_state=42)),
    ])

param_distribs = {
    'lightgbm__n_estimators': np.arange(10, 100),
    'lightgbm__max_depth': np.arange(4, 20),
}

lightgbm_search = RandomizedSearchCV(
    lightgbm_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


lightgbm_search.fit(X_train, y_train)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 133
[LightGBM] [Info] Number of data points in the train set: 152, number of used features: 3
[LightGBM] [Info] Start training from score 0.644009


In [94]:
pd.set_option('display.max_colwidth', 10)
lightgbm_search.cv_results_ = pd.DataFrame(lightgbm_search.cv_results_)
lightgbm_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lightgbm__n_estimators,param_lightgbm__max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.015084,0.007138,0.003852,0.000801,18,10,{'ligh...,-0.371338,-1.089593,-0.474003,-0.644978,0.317172,1
11,0.030586,0.026551,0.00925,0.008385,35,5,{'ligh...,-0.384202,-1.074517,-0.488504,-0.649074,0.303832,2
8,0.019348,0.00168,0.005907,0.000767,39,17,{'ligh...,-0.386425,-1.073613,-0.488706,-0.649581,0.302729,3
17,0.035455,0.008172,0.015438,0.004006,46,12,{'ligh...,-0.390046,-1.073182,-0.490428,-0.651219,0.301174,4
18,0.049518,0.028404,0.011822,0.008989,46,7,{'ligh...,-0.390046,-1.073182,-0.490428,-0.651219,0.301174,4
13,0.017074,0.004402,0.004802,0.001094,50,14,{'ligh...,-0.391153,-1.074039,-0.491975,-0.652389,0.300979,6
6,0.014688,0.002653,0.003711,8e-05,50,19,{'ligh...,-0.391153,-1.074039,-0.491975,-0.652389,0.300979,6
15,0.032712,0.023746,0.008244,0.008099,55,18,{'ligh...,-0.392667,-1.0744,-0.491807,-0.652958,0.30074,8
10,0.069649,0.002757,0.015982,0.007321,59,10,{'ligh...,-0.393707,-1.074729,-0.492885,-0.653774,0.300402,9
12,0.02289,0.010419,0.00365,0.000845,63,17,{'ligh...,-0.396129,-1.074466,-0.492797,-0.654464,0.299597,10


In [95]:
lightgbm_search.best_params_

{'lightgbm__n_estimators': 18, 'lightgbm__max_depth': 10}

In [96]:
best_lightgbm_model = lightgbm_search.best_estimator_

In [97]:
y_pred = best_lightgbm_model.predict(X_train)
rmsle = root_mean_squared_error(y_train, y_pred)
rmsle

0.6735379612906522

In [98]:
y_pred = best_lightgbm_model.predict(X_test)
rmsle = root_mean_squared_error(y_test, y_pred)
rmsle

0.3068938521147489

In [99]:
# Chang-Ki Hong(6.67) 2023
ChangKiHong = np.array([np.log(109), np.log(65), np.log(216)])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_lightgbm_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([2.85164379])

In [100]:
# Austin (4.97) 2023
Auntin = np.array([np.log(65), np.log(95), np.log(269)])
Auntin = Auntin.reshape(1, -1)
y_pred = best_lightgbm_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.51093398])

In [101]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([np.log(62), np.log(49), np.log(168)])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_lightgbm_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.34126239])

In [102]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([np.log(65), np.log(41), np.log(152)])
Choo = Choo.reshape(1, -1)
y_pred = best_lightgbm_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.34126239])

In [103]:
# Jae-Hwan Kim (0.44) 2023
JaeHwanKim = np.array([np.log(40), np.log(46), np.log(134)])
JaeHwanKim = JaeHwanKim.reshape(1, -1)
y_pred = best_lightgbm_model.predict(JaeHwanKim)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([0.54057758])

## catboost

In [104]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [105]:
from catboost import CatBoostRegressor as Catboost

catboost_model = Pipeline([
    ("StandardScaler", StandardScaler()),
    ('catboost', Catboost(random_state=42)),
    ])

param_distribs = {
    'catboost__n_estimators': np.arange(10, 100),
    'catboost__max_depth': np.arange(4, 20),
}

catboost_search = RandomizedSearchCV(
    catboost_model,
    param_distributions=param_distribs,
    n_iter=20,
    random_state=42,
    scoring=scorer,
    cv=3,
    n_jobs=-1
    )


catboost_search.fit(X_train, y_train)

15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/catboost/core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text

Learning rate set to 0.5
0:	learn: 0.7678694	total: 118ms	remaining: 2.01s
1:	learn: 0.6728582	total: 156ms	remaining: 1.25s
2:	learn: 0.6335057	total: 172ms	remaining: 862ms
3:	learn: 0.5751989	total: 185ms	remaining: 648ms
4:	learn: 0.5627762	total: 190ms	remaining: 493ms
5:	learn: 0.5215974	total: 198ms	remaining: 397ms
6:	learn: 0.4880667	total: 224ms	remaining: 352ms
7:	learn: 0.4600441	total: 237ms	remaining: 296ms
8:	learn: 0.4351721	total: 263ms	remaining: 263ms
9:	learn: 0.4161057	total: 266ms	remaining: 213ms
10:	learn: 0.4020560	total: 275ms	remaining: 175ms
11:	learn: 0.3887569	total: 277ms	remaining: 139ms
12:	learn: 0.3710154	total: 298ms	remaining: 115ms
13:	learn: 0.3617339	total: 308ms	remaining: 87.9ms
14:	learn: 0.3487859	total: 321ms	remaining: 64.2ms
15:	learn: 0.3361386	total: 329ms	remaining: 41.1ms
16:	learn: 0.3220807	total: 340ms	remaining: 20ms
17:	learn: 0.3130323	total: 350ms	remaining: 0us


In [106]:
pd.set_option('display.max_colwidth', 10)
catboost_search.cv_results_ = pd.DataFrame(catboost_search.cv_results_)
catboost_search.cv_results_.sort_values(by=['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_catboost__n_estimators,param_catboost__max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.257745,0.051644,0.003937,0.000235,18,10,{'catb...,-0.480481,-1.053818,-0.534781,-0.689693,0.258428,1
1,0.911011,0.138564,0.018506,0.015702,75,10,{'catb...,-0.431437,-1.046578,-0.618158,-0.698724,0.257511,2
4,1.235471,0.26555,0.013568,0.008156,98,10,{'catb...,-0.431948,-1.082641,-0.583422,-0.699337,0.278002,3
0,0.306242,0.095174,0.01038,0.001901,88,5,{'catb...,-0.440216,-1.080581,-0.600857,-0.707218,0.272031,4
14,0.185354,0.04574,0.004658,0.000472,61,5,{'catb...,-0.465371,-1.0685,-0.592488,-0.708787,0.259596,5
18,0.204411,0.038613,0.003516,0.00025,46,7,{'catb...,-0.432705,-1.086351,-0.607934,-0.708997,0.276253,6
19,0.38234,0.07463,0.017228,0.014977,90,7,{'catb...,-0.490709,-1.070237,-0.56985,-0.710265,0.256581,7
9,0.153597,0.019052,0.00656,0.003756,74,6,{'catb...,-0.461412,-1.052771,-0.642673,-0.718952,0.247373,8
3,0.241604,0.084124,0.011767,0.00341,75,4,{'catb...,-0.459091,-1.04733,-0.658406,-0.721609,0.24427,9
16,1.60155,0.105394,0.018435,0.007115,75,11,{'catb...,-0.492688,-1.066264,-0.634979,-0.73131,0.243868,10


In [107]:
catboost_search.best_params_

{'catboost__n_estimators': 18, 'catboost__max_depth': 10}

In [108]:
best_catboost_model = catboost_search.best_estimator_

In [109]:
y_pred = best_catboost_model.predict(X_train)
rmsle = root_mean_squared_error(y_train, y_pred)
rmsle

0.31303231846534124

In [110]:
y_pred = best_catboost_model.predict(X_test)
rmsle = root_mean_squared_error(y_test, y_pred)
rmsle

1.174824478878005

In [111]:
# Chang-Ki Hong(6.67) 2023
ChangKiHong = np.array([np.log(109), np.log(65), np.log(216)])
ChangKiHong = ChangKiHong.reshape(1, -1)
y_pred = best_catboost_model.predict(ChangKiHong)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([4.29470032])

In [112]:
# Austin (4.97) 2023
Auntin = np.array([np.log(65), np.log(95), np.log(269)])
Auntin = Auntin.reshape(1, -1)
y_pred = best_catboost_model.predict(Auntin)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([2.38403209])

In [113]:
# Jae-Gyun Hwang (3.14) 2023
JaeGyunHwang = np.array([np.log(62), np.log(49), np.log(168)])
JaeGyunHwang = JaeGyunHwang.reshape(1, -1)
y_pred = best_catboost_model.predict(JaeGyunHwang)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([1.00014808])

In [114]:
# Shin-Soo Choo (1.72) 2023
Choo = np.array([np.log(65), np.log(41), np.log(152)])
Choo = Choo.reshape(1, -1)
y_pred = best_catboost_model.predict(Choo)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([2.20120885])

In [115]:
# Jae-Hwan Kim (0.44) 2023
JaeHwanKim = np.array([np.log(40), np.log(46), np.log(134)])
JaeHwanKim = JaeHwanKim.reshape(1, -1)
y_pred = best_catboost_model.predict(JaeHwanKim)
y_pred = np.exp(y_pred) - 1.351
y_pred

array([0.68052738])