In [1]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

In [81]:
# Import csv path/file
csv_path = Path('../10_Day_Lookback.csv')
df = pd.read_csv(csv_path, index_col = 'Unnamed: 0')
csv_path_2 = Path('../10_Day_Lookback_2017.csv')
df_2 = pd.read_csv(csv_path_2)

In [120]:
df = df.replace([np.inf], 150)
df = df.replace([np.NaN], 0)
df = df.replace([-np.inf], -150)
df_2 = df_2.replace([np.inf], 150)
df_2 = df_2.replace([np.NaN], 0)
df_2 = df_2.replace([-np.inf], -150)

In [121]:
# Create our features
X = df.iloc[:, 10::]
# Create our target
y = df.iloc[:, 8]

In [122]:
X.head()

Unnamed: 0,Home_HittingG,Home_HittingPA,Home_HittingAB,Home_HittingR,Home_HittingH,Home_Hitting2B,Home_Hitting3B,Home_HittingHR,Home_HittingRBI,Home_HittingBB,...,Visitor_PitchingStr,Visitor_PitchingStL,Visitor_PitchingStS,Visitor_PitchingGB/FB,Visitor_PitchingLD,Visitor_PitchingPU,Visitor_PitchingWHIP,Visitor_PitchingBAbip,Visitor_PitchingSO9,Visitor_PitchingSO/W
2019-03-30,10,33,31,0,5,1,0,0,0,1,...,2.59,0.62,0.8,0.66,1.75,0.08,1.0,0.417,51.0,10.0
2019-03-30,11,37,31,10,7,0,0,3,10,6,...,3.03,0.53,0.78,2.29,0.0,0.0,9.7,0.523,53.1,4.0
2019-03-30,23,65,59,4,6,0,0,4,4,4,...,4.78,1.36,0.77,3.94,0.53,0.05,3.262,0.156,50.7,7.33
2019-03-30,21,67,61,10,15,1,0,5,10,4,...,5.74,1.49,1.04,4.59,2.11,0.13,7.646,1.238,60.5,9.0
2019-03-30,21,62,55,6,14,3,0,2,6,5,...,4.94,1.21,0.84,3.35,1.86,0.07,7.357,2.052,51.7,12.5


In [123]:
len(y)

2402

In [124]:
len(y[y == 1])

1271

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=9, 
                                                    stratify=y)
X_train.shape

(1801, 100)

In [126]:
X_train = X[:1159]
X_test = X[1160:]
y_train = y[:1159]
y_test = y[1160:]

In [127]:
scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [128]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=9)

In [129]:
rf_model.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=9, verbose=0,
                       warm_start=False)

In [130]:
predictions = rf_model.predict(X_test_scaled)

In [131]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = balanced_accuracy_score(y_test, predictions)

In [132]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,201,385
Actual 1,196,460


Accuracy Score : 0.5221114625822025
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.51      0.34      0.70      0.41      0.49      0.23       586
          1       0.54      0.70      0.34      0.61      0.49      0.25       656

avg / total       0.53      0.53      0.51      0.52      0.49      0.24      1242



In [133]:
importances = rf_model.feature_importances_

In [134]:
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.020939282830568243, 'Home_PitchingBAbip'),
 (0.01997053259693475, 'Home_PitchingLD'),
 (0.018544364653714958, 'Home_PitchingSO9'),
 (0.017805818104940606, 'Home_PitchingWHIP'),
 (0.01744468754754077, 'Home_PitchingStS'),
 (0.01645730179820481, 'Home_PitchingPU'),
 (0.016355939502478746, 'Visitor_PitchingSO/W'),
 (0.016283326968724605, 'Home_PitchingBB'),
 (0.01534164550184438, 'Visitor_PitchingStS'),
 (0.015203983113381203, 'Home_PitchingSO/W'),
 (0.015021181099755119, 'Visitor_PitchingPU'),
 (0.014615441100957173, 'Home_PitchingStL'),
 (0.014514727303144934, 'Home_PitchingGB/FB'),
 (0.014482803955705562, 'Visitor_PitchingSO9'),
 (0.014478767661033603, 'Visitor_PitchingLD'),
 (0.013956590032033847, 'Visitor_PitchingGB/FB'),
 (0.01353280988341586, 'Home_PitchingSO'),
 (0.013491152639593138, 'Visitor_PitchingBAbip'),
 (0.01336482952785747, 'Visitor_PitchingStL'),
 (0.013149838780468692, 'Visitor_HittingSO'),
 (0.012994793470672094, 'Visitor_PitchingWHIP'),
 (0.012874792225275005, 'Ho

In [135]:
clf = AdaBoostClassifier(n_estimators = 2500, random_state = 9)

In [136]:
clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=2500, random_state=9)

In [137]:
predictions = clf.predict(X_test)

In [138]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
acc_score = balanced_accuracy_score(y_test, predictions)

In [139]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,255,331
Actual 1,287,369


Accuracy Score : 0.4988267918088737
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.47      0.44      0.56      0.45      0.49      0.24       586
          1       0.53      0.56      0.44      0.54      0.49      0.25       656

avg / total       0.50      0.50      0.50      0.50      0.49      0.24      1242



In [140]:
actual_df = pd.DataFrame(y_test)
len(actual_df)

1242

In [141]:
predict_df = pd.DataFrame(predictions)
len(predict_df)

1242