## Import

In [6]:
import numpy as np
import pandas as pd
import warnings ; warnings.filterwarnings('ignore')

# model
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor
# from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

## Read Data

In [7]:
feature = pd.read_parquet('../data/feature_price_0918.pqt')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(feature.iloc[:,:-1], feature.iloc[:,-1], test_size=0.2, random_state=2023)
print(f'Train: {X_train.shape[0]}, Test: {X_test.shape[0]}')

Train: 368, Test: 93


In [9]:
num = ['n_grade','running_time', 'intermission', '선예매기간', 'date_gap', 'G1', 'G2', 'G3', 'G4', 'G5', 'play_st_time']
# MinMaxScaler, RobustScaler, Normalizer, StandardScaler, MaxAbsScaler
scaler = StandardScaler()
X_train[num] = scaler.fit_transform(X_train[num])
X_test[num] = scaler.transform(X_test[num])

## Modeling

- BayesianRidge

In [10]:
model = BayesianRidge(n_iter=200) 

In [11]:
model.fit(X_train, y_train)

BayesianRidge(n_iter=200)

In [12]:
# RMSE: 62082.62974131265
print(f'RMSE: {np.sqrt(mean_squared_error(model.predict(X_test), y_test))}')

RMSE: 55207.831158531066


- LinearRegression

In [13]:
model = LinearRegression()

In [14]:
model.fit(X_train, y_train)

LinearRegression()

In [15]:
# RMSE: 52043.3039726002
print(f'RMSE: {np.sqrt(mean_squared_error(model.predict(X_test), y_test))}')

RMSE: 54364.8256370772


- ElasticNet

In [16]:
model = ElasticNet(max_iter=200, alpha=.1)

In [17]:
model.fit(X_train, y_train)

ElasticNet(alpha=0.1, max_iter=200)

In [18]:
# RMSE: 54200.19573514512
print(f'RMSE: {np.sqrt(mean_squared_error(model.predict(X_test), y_test))}')

RMSE: 55314.70341040544


- SVR

In [19]:
model = SVR(C=100, gamma=.1, epsilon=.1)
# model = SVR(kernel='linear', C=100, gamma='auto')

In [20]:
model.fit(X_train, y_train)

SVR(C=100, gamma=0.1)

In [21]:
# RMSE: 107330.42850364567
print(f'RMSE: {np.sqrt(mean_squared_error(model.predict(X_test), y_test))}')

RMSE: 107274.59842419706


- DecisionTree

In [22]:
model = DecisionTreeRegressor(max_depth=10, min_samples_leaf=10, random_state=2023)

In [23]:
model.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=10, min_samples_leaf=10, random_state=2023)

In [24]:
# RMSE: 48413.14755723657
print(f'RMSE: {np.sqrt(mean_squared_error(model.predict(X_test), y_test))}')

RMSE: 55389.74457427213


- RandomForest

In [25]:
model = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_leaf=10, random_state=2023)

In [26]:
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, min_samples_leaf=10, random_state=2023)

In [27]:
# RMSE: 59558.21631506445
print(f'RMSE: {np.sqrt(mean_squared_error(model.predict(X_test), y_test))}')

RMSE: 58950.59422297001


- XGB

In [28]:
parms = {'n_estimators' : 200, 'learning_rate' : 0.03, 'max_depth' : 10}
model = XGBRegressor(**parms)

In [35]:
model.fit(X_train.values, y_train.values)

Learning rate set to 0.034961
0:	learn: 84786.1698855	total: 165ms	remaining: 2m 44s
1:	learn: 83400.0069997	total: 166ms	remaining: 1m 23s
2:	learn: 82409.6546954	total: 168ms	remaining: 55.7s
3:	learn: 81622.9877059	total: 169ms	remaining: 42s
4:	learn: 80634.8798256	total: 170ms	remaining: 33.8s
5:	learn: 79567.2220173	total: 171ms	remaining: 28.3s
6:	learn: 78479.9692962	total: 172ms	remaining: 24.4s
7:	learn: 77429.9444498	total: 173ms	remaining: 21.5s
8:	learn: 76539.6141442	total: 174ms	remaining: 19.2s
9:	learn: 75785.2240278	total: 175ms	remaining: 17.4s
10:	learn: 75029.6640624	total: 177ms	remaining: 15.9s
11:	learn: 74200.8317944	total: 178ms	remaining: 14.7s
12:	learn: 73434.2202435	total: 179ms	remaining: 13.6s
13:	learn: 72667.2912762	total: 181ms	remaining: 12.7s
14:	learn: 71857.6236244	total: 182ms	remaining: 11.9s
15:	learn: 71013.5864425	total: 183ms	remaining: 11.3s
16:	learn: 70343.9764779	total: 184ms	remaining: 10.7s
17:	learn: 69530.4510196	total: 186ms	remaini

162:	learn: 28657.2640414	total: 348ms	remaining: 1.78s
163:	learn: 28527.9466997	total: 349ms	remaining: 1.78s
164:	learn: 28396.8013362	total: 350ms	remaining: 1.77s
165:	learn: 28339.9749198	total: 351ms	remaining: 1.76s
166:	learn: 28187.3754180	total: 352ms	remaining: 1.76s
167:	learn: 28104.2538963	total: 354ms	remaining: 1.75s
168:	learn: 27948.4785230	total: 355ms	remaining: 1.74s
169:	learn: 27831.7397766	total: 356ms	remaining: 1.74s
170:	learn: 27749.5394290	total: 358ms	remaining: 1.73s
171:	learn: 27679.1345726	total: 359ms	remaining: 1.73s
172:	learn: 27532.3114188	total: 361ms	remaining: 1.72s
173:	learn: 27460.1019700	total: 362ms	remaining: 1.72s
174:	learn: 27344.1975555	total: 363ms	remaining: 1.71s
175:	learn: 27302.0186184	total: 364ms	remaining: 1.71s
176:	learn: 27185.2250516	total: 366ms	remaining: 1.7s
177:	learn: 27102.5782195	total: 367ms	remaining: 1.7s
178:	learn: 26992.7512844	total: 368ms	remaining: 1.69s
179:	learn: 26878.8836689	total: 370ms	remaining: 

312:	learn: 16786.2181346	total: 522ms	remaining: 1.15s
313:	learn: 16747.6416801	total: 524ms	remaining: 1.14s
314:	learn: 16705.6272436	total: 525ms	remaining: 1.14s
315:	learn: 16636.7523543	total: 526ms	remaining: 1.14s
316:	learn: 16596.9154900	total: 527ms	remaining: 1.14s
317:	learn: 16537.0448965	total: 529ms	remaining: 1.13s
318:	learn: 16463.8077047	total: 530ms	remaining: 1.13s
319:	learn: 16408.3919117	total: 531ms	remaining: 1.13s
320:	learn: 16382.4661174	total: 532ms	remaining: 1.13s
321:	learn: 16310.0701206	total: 533ms	remaining: 1.12s
322:	learn: 16258.8650826	total: 534ms	remaining: 1.12s
323:	learn: 16222.5651252	total: 536ms	remaining: 1.12s
324:	learn: 16147.7241355	total: 537ms	remaining: 1.11s
325:	learn: 16077.8189628	total: 538ms	remaining: 1.11s
326:	learn: 16013.2792505	total: 539ms	remaining: 1.11s
327:	learn: 15963.9608021	total: 540ms	remaining: 1.11s
328:	learn: 15927.6723064	total: 541ms	remaining: 1.1s
329:	learn: 15885.2172933	total: 543ms	remaining:

459:	learn: 10877.2980501	total: 691ms	remaining: 811ms
460:	learn: 10856.2786599	total: 692ms	remaining: 809ms
461:	learn: 10823.4285574	total: 693ms	remaining: 807ms
462:	learn: 10794.1518004	total: 694ms	remaining: 805ms
463:	learn: 10741.4757470	total: 696ms	remaining: 803ms
464:	learn: 10721.4076754	total: 697ms	remaining: 802ms
465:	learn: 10705.3099190	total: 698ms	remaining: 800ms
466:	learn: 10653.4086650	total: 699ms	remaining: 798ms
467:	learn: 10620.5393034	total: 701ms	remaining: 796ms
468:	learn: 10594.2617097	total: 702ms	remaining: 794ms
469:	learn: 10584.1462136	total: 703ms	remaining: 793ms
470:	learn: 10563.1335725	total: 704ms	remaining: 791ms
471:	learn: 10545.5482254	total: 705ms	remaining: 789ms
472:	learn: 10511.4938134	total: 706ms	remaining: 787ms
473:	learn: 10484.9744270	total: 707ms	remaining: 785ms
474:	learn: 10440.6726131	total: 708ms	remaining: 783ms
475:	learn: 10408.9008442	total: 710ms	remaining: 781ms
476:	learn: 10366.6292779	total: 711ms	remaining

617:	learn: 7329.8959518	total: 872ms	remaining: 539ms
618:	learn: 7313.3256020	total: 873ms	remaining: 537ms
619:	learn: 7289.0681485	total: 874ms	remaining: 536ms
620:	learn: 7273.4209705	total: 876ms	remaining: 534ms
621:	learn: 7244.2983996	total: 877ms	remaining: 533ms
622:	learn: 7225.9968561	total: 878ms	remaining: 531ms
623:	learn: 7211.0152340	total: 880ms	remaining: 530ms
624:	learn: 7197.8229060	total: 881ms	remaining: 529ms
625:	learn: 7174.8927829	total: 882ms	remaining: 527ms
626:	learn: 7148.8794729	total: 883ms	remaining: 525ms
627:	learn: 7128.1443648	total: 884ms	remaining: 524ms
628:	learn: 7099.7234055	total: 886ms	remaining: 522ms
629:	learn: 7088.2672225	total: 887ms	remaining: 521ms
630:	learn: 7062.6174085	total: 888ms	remaining: 519ms
631:	learn: 7041.6934849	total: 889ms	remaining: 518ms
632:	learn: 7025.0765141	total: 890ms	remaining: 516ms
633:	learn: 7007.4095939	total: 891ms	remaining: 515ms
634:	learn: 6993.1942196	total: 893ms	remaining: 513ms
635:	learn

912:	learn: 3646.6657875	total: 1.22s	remaining: 116ms
913:	learn: 3636.7949954	total: 1.22s	remaining: 115ms
914:	learn: 3625.8662908	total: 1.22s	remaining: 113ms
915:	learn: 3615.3989503	total: 1.22s	remaining: 112ms
916:	learn: 3608.4389945	total: 1.22s	remaining: 111ms
917:	learn: 3600.2743482	total: 1.22s	remaining: 109ms
918:	learn: 3597.6066848	total: 1.22s	remaining: 108ms
919:	learn: 3584.8626268	total: 1.23s	remaining: 107ms
920:	learn: 3574.7430766	total: 1.23s	remaining: 105ms
921:	learn: 3565.8239516	total: 1.23s	remaining: 104ms
922:	learn: 3557.8511473	total: 1.23s	remaining: 103ms
923:	learn: 3548.7000450	total: 1.23s	remaining: 101ms
924:	learn: 3541.9156743	total: 1.23s	remaining: 99.9ms
925:	learn: 3534.1621276	total: 1.23s	remaining: 98.6ms
926:	learn: 3526.3046701	total: 1.23s	remaining: 97.2ms
927:	learn: 3522.3147685	total: 1.24s	remaining: 95.9ms
928:	learn: 3510.1933418	total: 1.24s	remaining: 94.5ms
929:	learn: 3503.8477782	total: 1.24s	remaining: 93.2ms
930:

<catboost.core.CatBoostRegressor at 0x133b8dbbf10>

In [36]:
# RMSE: 44891.27926803884
print(f'RMSE: {np.sqrt(mean_squared_error(model.predict(X_test), y_test))}')

AttributeError: 'DataFrame' object has no attribute 'iteritems'

In [None]:
f_importance = model.get_booster().get_score(importance_type='gain')
pd.DataFrame(list(f_importance.values()), index=f_importance.keys()).plot.bar(figsize=(20,10))

- LGBM

In [None]:
parms = {'n_estimators': 200, 'max_depth': 10, 'num_leaves': 80}
model = LGBMRegressor(**parms, metrics='mse', random_state=2023)

In [30]:
model.fit(X_train, y_train)

AttributeError: module 'pandas' has no attribute 'Int64Index'

In [31]:
# RMSE: 68530.39434021145
print(f'RMSE: {np.sqrt(mean_squared_error(model.predict(X_test), y_test))}')

NotFittedError: need to call fit or load_model beforehand

- Catboost

In [32]:
model = CatBoostRegressor(iterations=1000, bootstrap_type ='Bayesian', random_state=2023)

In [33]:
model.fit(X_train, y_train)

AttributeError: 'DataFrame' object has no attribute 'iteritems'

In [None]:
# RMSE: 53353.17981981775
print(f'RMSE: {np.sqrt(mean_squared_error(model.predict(X_test), y_test))}')