In [1]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

In [144]:
# Import csv path/file
csv_path = Path('../Project_2/Training_Data/2016_10_day.csv')
df = pd.read_csv(csv_path, index_col = 'Date')
csv_path_2 = Path('../10_Day_Lookback_2017.csv')
df_2 = pd.read_csv(csv_path_2)

In [145]:
df = df.replace([np.inf], 150)
df = df.replace([np.NaN], 0)
df = df.replace([-np.inf], -150)
df_2 = df_2.replace([np.inf], 150)
df_2 = df_2.replace([np.NaN], 0)
df_2 = df_2.replace([-np.inf], -150)

In [153]:
# Create our features
X = df.iloc[:, 10::]
# Create our target
y = df['home_win_loss']

In [154]:
X.head()

Unnamed: 0_level_0,Home_PitchingOBP_allowed,Home_PitchingSLG%_allowed,Visitor_PitchingK%,Visitor_PitchingBB%,Visitor_PitchingOBP_allowed,Visitor_PitchingSLG%_allowed,Home_HittingK%,Home_HittingBB%,Home_HittingOBP,Home_HittingSLG%,Visitor_HittingK%,Visitor_HittingBB%,Visitor_HittingOBP,Visitor_HittingSLG%
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2016-04-13,0.304348,0.353448,0.234568,0.119342,0.35,0.414286,0.247059,0.109804,0.279528,0.248889,0.218623,0.089069,0.355102,0.442396
2016-04-13,0.309091,0.338384,0.213058,0.113402,0.370242,0.445344,0.217391,0.117391,0.346491,0.383838,0.241509,0.120755,0.301527,0.290749
2016-04-13,0.280576,0.330677,0.219672,0.091803,0.334426,0.476015,0.251799,0.061151,0.275362,0.366142,0.227425,0.070234,0.298658,0.364964
2016-04-13,0.218884,0.309417,0.209738,0.108614,0.328244,0.371681,0.200692,0.131488,0.371528,0.440329,0.190283,0.089069,0.331967,0.422018
2016-04-13,0.341912,0.341991,0.164794,0.093633,0.349624,0.540426,0.239203,0.106312,0.369128,0.470588,0.273092,0.116466,0.322581,0.396313


In [155]:
len(y)

2318

In [156]:
len(y[y == 1])

1237

In [157]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=9, 
                                                    stratify=y)
X_train.shape

(1738, 14)

In [158]:
X_train = X[:1159]
X_test = X[1160:]
y_train = y[:1159]
y_test = y[1160:]

In [159]:
scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [160]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=9)

In [161]:
rf_model.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=9, verbose=0,
                       warm_start=False)

In [162]:
predictions = rf_model.predict(X_test_scaled)

In [163]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = balanced_accuracy_score(y_test, predictions)

In [164]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,208,312
Actual 1,253,385


Accuracy Score : 0.5017241379310344
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.45      0.40      0.60      0.42      0.49      0.24       520
          1       0.55      0.60      0.40      0.58      0.49      0.25       638

avg / total       0.51      0.51      0.49      0.51      0.49      0.24      1158



In [165]:
importances = rf_model.feature_importances_

In [166]:
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.07869475129381144, 'Visitor_HittingK%'),
 (0.07624074318963266, 'Home_HittingBB%'),
 (0.074917114287018, 'Visitor_HittingSLG%'),
 (0.07296456679006474, 'Home_PitchingOBP_allowed'),
 (0.0719591137085468, 'Visitor_PitchingK%'),
 (0.07098831493760045, 'Home_PitchingSLG%_allowed'),
 (0.07075552002876573, 'Visitor_HittingBB%'),
 (0.07069418941781165, 'Home_HittingK%'),
 (0.06942764239876868, 'Visitor_PitchingOBP_allowed'),
 (0.06941847160930609, 'Visitor_PitchingSLG%_allowed'),
 (0.06918794883964129, 'Visitor_HittingOBP'),
 (0.0685316768004627, 'Home_HittingOBP'),
 (0.06846049081925304, 'Visitor_PitchingBB%'),
 (0.06775945587931671, 'Home_HittingSLG%')]

In [167]:
clf = AdaBoostClassifier(n_estimators = 2500, random_state = 9)

In [168]:
clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=2500, random_state=9)

In [169]:
predictions = clf.predict(X_test)

In [170]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
acc_score = balanced_accuracy_score(y_test, predictions)

In [171]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,265,255
Actual 1,289,349


Accuracy Score : 0.5283186640945261
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.48      0.51      0.55      0.49      0.53      0.28       520
          1       0.58      0.55      0.51      0.56      0.53      0.28       638

avg / total       0.53      0.53      0.53      0.53      0.53      0.28      1158



In [179]:
actual_df = pd.DataFrame(y_test)
actual_df.reset_index(inplace = True)

In [180]:
predict_df = pd.DataFrame(predictions)
predict_df.head()

Unnamed: 0,0
0,0
1,0
2,1
3,0
4,1


In [181]:
actual_predict_df = pd.concat([actual_df,predict_df], axis = 1, join = 'inner')

In [183]:
actual_predict_df.set_index('Date',inplace = True)

In [185]:
actual_predict_df.columns = ['Actual','Predictions']

In [186]:
actual_predict_df.head()

Unnamed: 0_level_0,Actual,Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-07-06,1,0
2016-07-06,1,0
2016-07-06,0,1
2016-07-06,0,0
2016-07-06,1,1


In [216]:
odds_df_new = df[['home','visitor','home_open_odds','visitor_open_odds']][1160:]
odds_df_new.reset_index(inplace = True)
odds_df_new.drop(columns = ['Date'],inplace = True)

In [217]:
home_visitor_df = pd.concat([actual_df,predict_df, odds_df_new], axis = 1, join ='inner')

In [218]:
home_visitor_df.set_index('Date', inplace = True)

In [220]:
home_visitor_df.columns = ['Actual','Predicted','Home','Visitor','Home_Open_Odds','Visitor_Open_Odds']

In [221]:
home_visitor_df.head()

Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-07-06,1,0,MIN,OAK,-106,-104
2016-07-06,1,0,TOR,KAN,-180,160
2016-07-06,0,1,TAM,LAA,-150,135
2016-07-06,0,0,BOS,TEX,-185,165
2016-07-06,1,1,HOU,SEA,-165,145
