In [26]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

In [27]:
# Import csv path/file
csv_path = Path('../10_Day_Lookback.csv')
df = pd.read_csv(csv_path, index_col = 'Unnamed: 0')

In [28]:
df = df.replace([np.inf], 100)
df = df.replace([np.NaN], 0)
df = df.replace([-np.inf], -100)

In [29]:
# Create our features
X = df.iloc[:, 10::]

# Create our target
y = df.iloc[:, 8]

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=9, 
                                                    stratify=y)
X_train.shape

(1801, 100)

In [35]:
scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [36]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)

In [37]:
rf_model.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [38]:
predictions = rf_model.predict(X_test_scaled)

In [39]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = balanced_accuracy_score(y_test, predictions)

In [40]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,138,145
Actual 1,121,197


Accuracy Score : 0.5535646820899172
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.53      0.49      0.62      0.51      0.55      0.30       283
          1       0.58      0.62      0.49      0.60      0.55      0.31       318

avg / total       0.56      0.56      0.55      0.56      0.55      0.30       601



In [41]:
importances = rf_model.feature_importances_

In [42]:
sorted(zip(rf_model.feature_importances_[0:10], X.columns), reverse=True)

[(0.012859874976278055, 'Home_HittingBB'),
 (0.01148202306575949, 'Home_HittingR'),
 (0.011445355753879779, 'Home_HittingH'),
 (0.010964991606277504, 'Home_HittingG'),
 (0.010708396074355246, 'Home_HittingRBI'),
 (0.010459432105721854, 'Home_HittingPA'),
 (0.010456454543165798, 'Home_HittingAB'),
 (0.01038136187011033, 'Home_Hitting2B'),
 (0.010216042849171075, 'Home_HittingHR'),
 (0.005082078227468171, 'Home_Hitting3B')]

In [43]:
clf = AdaBoostClassifier(n_estimators = 100, random_state = 9)

In [44]:
clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=100, random_state=9)

In [45]:
predictions = clf.predict(X)

In [46]:
cm = confusion_matrix(y, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
acc_score = balanced_accuracy_score(y, predictions)

In [47]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,722,409
Actual 1,364,907


Accuracy Score : 0.6759922253967128
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.66      0.64      0.71      0.65      0.67      0.45      1131
          1       0.69      0.71      0.64      0.70      0.67      0.46      1271

avg / total       0.68      0.68      0.67      0.68      0.67      0.46      2402

