In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import lightgbm as lgbm
import structureboost as stb
import ml_insights as mli
from structureboost import log_loss
# Let wide/long frames render fully: raise both display limits to 1000.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 1000)

In [None]:
# Load the game-level table from the CSV in the working directory.
df = pd.read_csv('df_bp1.csv')


In [None]:
# Column-by-column summary; max_cols is raised so wide frames are not truncated.
df.info(max_cols=1000)


In [None]:
# Number of rows per season, ordered by season rather than by frequency.
df['season'].value_counts().sort_index()


In [None]:
# Mean of the home_victory column over every row currently in df.
hv_mean = df['home_victory'].mean()
hv_mean

In [None]:
# Frequency of each run_diff value, most common first.
df['run_diff'].value_counts()


In [None]:
# Frequency of each outs_total value, most common first.
df['outs_total'].value_counts()


In [None]:
# Mean of home_victory restricted to rows whose outs_total equals 53.
df.loc[df['outs_total'] == 53, 'home_victory'].mean()


In [None]:
# Inspect the rows with outs_total == 53 where home_victory is not 1.
df[(df['outs_total'] == 53) & (df['home_victory'] != 1)]


In [None]:
# Number of rows per game_no_h value, ordered by game_no_h.
df['game_no_h'].value_counts().sort_index()


In [None]:
# Keep only rows with a non-zero run differential, then split by season:
# training is after 1980 through 2018, validation is 2019-2020, and the
# test set is 2021 onward.
df = df.query('run_diff != 0')
df_train = df.query('season > 1980 and season <= 2018')
df_valid = df.query('season >= 2019 and season <= 2020')
df_test = df.query('season >= 2021')

In [None]:
# Model inputs: the 162-window OBP and SLG columns for the _h and _v sides
# (presumably home and visiting team -- confirm against the data dictionary).
# Candidate extras left disabled: OBP_30_h/_v, SLG_30_h/_v, game_no_h.
features = [
    f'{stat}_162_{side}'
    for stat in ('OBP', 'SLG')
    for side in ('h', 'v')
]

# Binary label column the classifiers are trained to predict.
target = 'home_victory'

In [None]:
# Feature matrices: the selected feature columns of each season split.
X_train, X_valid, X_test = (
    split[features] for split in (df_train, df_valid, df_test)
)

# Matching label vectors, converted to NumPy arrays.
y_train, y_valid, y_test = (
    split[target].to_numpy() for split in (df_train, df_valid, df_test)
)

In [None]:
# (rows, columns) of the train / validation / test feature matrices.
tuple(frame.shape for frame in (X_train, X_valid, X_test))


In [None]:
# Training distribution of OBP_162_h over 150 equal-width bins on [0.25, 0.40].
plt.hist(X_train['OBP_162_h'], bins=np.linspace(0.25, 0.4, 151));


In [None]:
# Training distribution of SLG_162_v over 200 equal-width bins on [0.30, 0.50].
plt.hist(X_train['SLG_162_v'], bins=np.linspace(0.3, 0.5, 201));


In [None]:
# Shallow (depth-2) boosted trees with a small learning rate; up to 1000
# rounds are allowed, but training halts once the validation log loss has
# not improved for 50 rounds. Progress is logged every 10 rounds.
lgbm1 = lgbm.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=2,
)
lgbm1.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    eval_metric='logloss',
    callbacks=[
        lgbm.early_stopping(stopping_rounds=50),
        lgbm.log_evaluation(period=10),
    ],
)

In [None]:
# Second column of predict_proba: probability of the second class
# (home_victory == 1, assuming 0/1 labels -- confirm).
preds_lgbm = lgbm1.predict_proba(X_test)[:, 1]


In [None]:
# Test log loss of the LightGBM model next to a constant-prediction baseline.
# The baseline is the TRAINING base rate: hv_mean was computed on the whole
# unfiltered frame (test seasons and run_diff == 0 rows included), so using
# it here would leak test information into the benchmark.
(
    log_loss(y_test, preds_lgbm),
    log_loss(y_test, np.full(len(y_test), y_train.mean())),
)


In [None]:
# Per-feature value ranges derived from the training features, used as the
# sweep grid for the ICE plots.
rd = mli.get_range_dict(X_train)

# Three test rows to trace in the ICE plots; seeded so the pick is repeatable.
plot_pts = X_test.sample(n=3, random_state=42)

In [None]:
# ICE curves for the LightGBM model at the sampled test rows. Reuses the `rd`
# range dictionary built alongside `plot_pts` instead of recomputing it from
# X_train, matching the StructureBoost ICE cell.
mli.ice_plot(lgbm1, plot_pts, X_test.columns, rd, plots_per_row=2)


In [None]:
# Reliability diagram (with prediction histogram) for the LightGBM test
# predictions, drawn on a wide 16x6 figure.
plt.figure(figsize=(16, 6))
mli.plot_reliability_diagram(
    y_test,
    preds_lgbm,
    show_histogram=True,
);

In [None]:
# Feature configuration built from the training columns using
# StructureBoost's default settings.
fc = stb.get_basic_config(X_train, stb.default_config_dict())

# Depth-3 StructureBoost model, up to 2000 trees, evaluated on the
# validation split during training with early_stop_past_steps=5.
stb1 = stb.StructureBoost(
    max_depth=3,
    learning_rate=0.02,
    feature_configs=fc,
    num_trees=2000,
)
stb1.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    early_stop_past_steps=5,
)

In [None]:
# StructureBoost test predictions; they are scored with log_loss downstream,
# so they are presumably class-1 probabilities -- confirm against the
# installed structureboost version.
preds_stb = stb1.predict(X_test)


In [None]:
# Test log loss of the StructureBoost model next to a constant-prediction
# baseline. The baseline is the TRAINING base rate: hv_mean was computed on
# the whole unfiltered frame (test seasons and run_diff == 0 rows included),
# so using it here would leak test information into the benchmark.
(
    log_loss(y_test, preds_stb),
    log_loss(y_test, np.full(len(y_test), y_train.mean())),
)


In [None]:
# Reliability diagram (with prediction histogram) for the StructureBoost test
# predictions, drawn on a wide 16x6 figure.
plt.figure(figsize=(16, 6))
mli.plot_reliability_diagram(
    y_test,
    preds_stb,
    show_histogram=True,
);

In [None]:
# ICE curves for the StructureBoost model at the same sampled test rows.
# `rd` and `plot_pts` both come from the earlier ICE-setup cell; X_train has
# not changed since, so the range dictionary is reused rather than rebuilt.
mli.ice_plot(stb1, plot_pts, X_test.columns, rd, plots_per_row=2)