In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('/content/sample_data/KBO increased data_over 30 AB.csv')

In [3]:
# train set과 test set 분리
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

In [4]:
R = train_set["R"].to_numpy()
R = R.reshape(-1, 1)

RBI = train_set["RBI"].to_numpy()
RBI = RBI.reshape(-1, 1)

R_AND_RBI = np.concatenate((R, RBI), axis=1)

y = train_set["WAR"].copy().to_numpy()
y = y.reshape(-1, 1)

In [5]:
R_test = test_set["R"].to_numpy()
R_test = R_test.reshape(-1, 1)

RBI_test = test_set["RBI"].to_numpy()
RBI_test = RBI_test.reshape(-1, 1)

R_AND_RBI_test = np.concatenate((R_test, RBI_test), axis=1)

y_test = test_set["WAR"].copy().to_numpy()
y_test = y_test.reshape(-1, 1)

## Voting Regressor

In [6]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


single_models = [
    ('linreg', LinearRegression()),
    ('ridge', Ridge(alpha=0.0001)),
    ('lasso', Lasso(alpha=0.0001)),
    ('elasticnet', ElasticNet(alpha=0.0001, l1_ratio=0.0001)),
    ('SVR', SVR(kernel='poly', C=0.012888951526111539, degree=3, epsilon=0.07498719388057506)),
    ('tree', DecisionTreeRegressor())
]

voting_reg = VotingRegressor(single_models)

pipeline = make_pipeline(StandardScaler(), voting_reg)
pipeline.fit(R_AND_RBI, y)

  y = column_or_1d(y, warn=True)


In [8]:
from sklearn.metrics import root_mean_squared_error
y_pred = pipeline.predict(R_AND_RBI)
rmse = root_mean_squared_error(y, y_pred)
rmse

0.7250126415921435

In [9]:
y_pred_test = pipeline.predict(R_AND_RBI_test)
rmse_test = root_mean_squared_error(y_test, y_pred_test)
rmse_test

0.6389957922445909

In [12]:
# 황재균 WAR(-0.30) 예측
hwang = pipeline.predict([[60, 58]])
hwang

array([1.45806174])

In [13]:
# 로하스 WAR(6.50) 예측
Rojas = pipeline.predict([[108, 112]])
Rojas

array([5.14681324])

In [14]:
# 김도영 WAR(8.32) 예측
DO_YEONG = pipeline.predict([[143, 109]])
DO_YEONG

array([7.30154443])

In [15]:
# 최정 WAR(4.55) 예측
Choi_Jung = pipeline.predict([[93, 107]])
Choi_Jung

array([4.02875769])

In [16]:
# 강백호 WAR(2.12) 예측
BaekHo = pipeline.predict([[92, 96]])
BaekHo

array([3.37141255])

## Random Forest

In [17]:
from sklearn.ensemble import RandomForestRegressor

rnd_reg = RandomForestRegressor(n_estimators=500, max_depth=6, n_jobs=-1, random_state=42)
rnd_reg.fit(R_AND_RBI, y)
y_pred = rnd_reg.predict(R_AND_RBI)
rmse = root_mean_squared_error(y, y_pred)
rmse

  return fit_method(estimator, *args, **kwargs)


0.5140228287232677

In [18]:
y_pred_test = rnd_reg.predict(R_AND_RBI_test)
rmse_test = root_mean_squared_error(y_test, y_pred_test)
rmse_test

0.7468626951525716

In [19]:
# 황재균 WAR(-0.30) 예측
hwang = rnd_reg.predict([[60, 58]])
hwang

array([0.79941513])

In [20]:
# 로하스 WAR(6.50) 예측
Rojas = rnd_reg.predict([[108, 112]])
Rojas

array([5.76215172])

In [21]:
# 김도영 WAR(8.32) 예측
DO_YEONG = rnd_reg.predict([[143, 109]])
DO_YEONG

array([7.27156262])

In [22]:
# 최정 WAR(4.55) 예측
Choi_Jung = rnd_reg.predict([[93, 107]])
Choi_Jung

array([4.65439369])

In [23]:
# 강백호 WAR(2.12) 예측
BaekHo = rnd_reg.predict([[92, 96]])
BaekHo

array([2.87098853])

In [24]:
# feature importance
for score, name in zip(rnd_reg.feature_importances_, ['R', 'RBI']):
    print(round(score, 2), name)

0.8 R
0.2 RBI


## Boosting

### AdaBoost

In [25]:
from sklearn.ensemble import AdaBoostRegressor

ada_reg = AdaBoostRegressor(n_estimators=500, learning_rate=0.01, random_state=42)
ada_reg.fit(R_AND_RBI, y)

y_pred = ada_reg.predict(R_AND_RBI)
rmse = root_mean_squared_error(y, y_pred)
rmse

  y = column_or_1d(y, warn=True)


0.7363483503638959

In [26]:
y_pred_test = ada_reg.predict(R_AND_RBI_test)
rmse_test = root_mean_squared_error(y_test, y_pred_test)
rmse_test

0.676871698369842

In [27]:
# 황재균 WAR(-0.30) 예측
hwang = ada_reg.predict([[60, 58]])
hwang

array([1.54675676])

In [28]:
# 로하스 WAR(6.50) 예측
Rojas = ada_reg.predict([[108, 112]])
Rojas

array([5.842])

In [29]:
# 김도영 WAR(8.32) 예측
DO_YEONG = ada_reg.predict([[143, 109]])
DO_YEONG

array([8.32])

In [30]:
# 최정 WAR(4.55) 예측
Choi_Jung = ada_reg.predict([[93, 107]])
Choi_Jung

array([4.687])

In [31]:
# 강백호 WAR(2.12) 예측
BaekHo = ada_reg.predict([[92, 96]])
BaekHo

array([3.34125])

### Gradient Boosting

In [32]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(n_estimators=500, learning_rate=0.01, random_state=42)
gbrt.fit(R_AND_RBI, y)

y_pred = gbrt.predict(R_AND_RBI)
rmse = root_mean_squared_error(y, y_pred)
rmse

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


0.5627534563442853

In [33]:
y_pred_test = gbrt.predict(R_AND_RBI_test)
rmse_test = root_mean_squared_error(y_test, y_pred_test)
rmse_test

0.7031977302327256

In [35]:
# 황재균 WAR(-0.30) 예측
hwang = gbrt.predict([[60, 58]])
hwang

array([1.17387752])

In [36]:
# 로하스 WAR(6.50) 예측
Rojas = gbrt.predict([[108, 112]])
Rojas

array([6.36044335])

In [44]:
# 김도영 WAR(8.32) 예측
DO_YEONG = gbrt.predict([[143, 109]])
DO_YEONG

array([8.02528506])

In [37]:
# 최정 WAR(4.55) 예측
Choi_Jung = gbrt.predict([[93, 107]])
Choi_Jung

array([4.50071835])

In [38]:
# 강백호 WAR(2.12) 예측
BaekHo = gbrt.predict([[92, 96]])
BaekHo

array([3.02872578])

### Stacking

In [39]:
from sklearn.ensemble import StackingRegressor

stacking_base_models = [
    ('voting_reg', voting_reg),
    ('ada_reg', ada_reg),
    ('gbrt', gbrt),
    ('rnd_reg', rnd_reg),
]

stacking_reg = StackingRegressor(
    estimators=stacking_base_models,
    final_estimator=RandomForestRegressor(random_state=42),
    cv=5
)

stacking_reg.fit(R_AND_RBI, y)

y_pred = stacking_reg.predict(R_AND_RBI)
rmse = root_mean_squared_error(y, y_pred)
rmse

  y = column_or_1d(y, warn=True)


0.7285586708472944

In [40]:
y_pred_test = stacking_reg.predict(R_AND_RBI_test)
rmse_test = root_mean_squared_error(y_test, y_pred_test)
rmse_test

0.7467877895119217

In [41]:
# 황재균 WAR(-0.30) 예측
hwang = stacking_reg.predict([[60, 58]])
hwang

array([1.3633])

In [42]:
# 로하스 WAR(6.50) 예측
Rojas = stacking_reg.predict([[108, 112]])
Rojas

array([6.9141])

In [43]:
# 김도영 WAR(8.32) 예측
DO_YEONG = stacking_reg.predict([[143, 109]])
DO_YEONG

array([7.1903])

In [45]:
# 최정 WAR(4.55) 예측
Choi_Jung = stacking_reg.predict([[93, 107]])
Choi_Jung

array([4.5705])

In [46]:
# 강백호 WAR(2.12) 예측
BaekHo = stacking_reg.predict([[92, 96]])
BaekHo

array([3.2911])

In [47]:
# save model
import joblib
joblib.dump(voting_reg, "voting_reg_with_R_and_RBI_model.pkl")
joblib.dump(ada_reg, "ada_reg_with_R_and_RBI_model.pkl")
joblib.dump(gbrt, "gbrt_with_R_and_RBI_model.pkl")
joblib.dump(rnd_reg, "rnd_reg_with_R_and_RBI_model.pkl")
joblib.dump(stacking_reg, "stacking_reg_with_R_and_RBI_model.pkl")

['stacking_reg_with_R_and_RBI_model.pkl']