In [1]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

In [289]:
# Import csv path/file
csv_path = Path('../Project_2/Training_Data/2019_10_day.csv')
df = pd.read_csv(csv_path, index_col = 'Date')
csv_path_2 = Path('../10_Day_Lookback_2017.csv')
df_2 = pd.read_csv(csv_path_2)

In [290]:
df = df.replace([np.inf], 150)
df = df.replace([np.NaN], 0)
df = df.replace([-np.inf], -150)
df_2 = df_2.replace([np.inf], 150)
df_2 = df_2.replace([np.NaN], 0)
df_2 = df_2.replace([-np.inf], -150)

In [291]:
# Create our features
X = df.iloc[:, 10::]
# Create our target
y = df['home_win_loss']

In [292]:
X.head()

Unnamed: 0_level_0,Home_PitchingOBP_allowed,Home_PitchingSLG%_allowed,Visitor_PitchingK%,Visitor_PitchingBB%,Visitor_PitchingOBP_allowed,Visitor_PitchingSLG%_allowed,Home_HittingK%,Home_HittingBB%,Home_HittingOBP,Home_HittingSLG%,Visitor_HittingK%,Visitor_HittingBB%,Visitor_HittingOBP,Visitor_HittingSLG%
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-03-30,0.235294,0.258065,0.424242,0.030303,0.212121,0.193548,0.424242,0.030303,0.212121,0.193548,0.352941,0.088235,0.235294,0.258065
2019-03-30,0.361111,0.366667,0.243243,0.162162,0.351351,0.516129,0.243243,0.162162,0.351351,0.516129,0.25,0.166667,0.361111,0.366667
2019-03-30,0.333333,0.432432,0.246154,0.061538,0.184615,0.305085,0.246154,0.061538,0.184615,0.305085,0.148148,0.08642,0.333333,0.432432
2019-03-30,0.353659,0.555556,0.19403,0.059701,0.298507,0.508197,0.19403,0.059701,0.298507,0.508197,0.253012,0.108434,0.353659,0.555556
2019-03-30,0.235294,0.269841,0.290323,0.080645,0.322581,0.418182,0.290323,0.080645,0.322581,0.418182,0.25,0.058824,0.235294,0.269841


In [293]:
len(y)

2402

In [294]:
len(y[y == 1])

1271

In [295]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=9, 
                                                    stratify=y)
X_train.shape

(1801, 14)

In [296]:
X_train = X[:1159]
X_test = X[1160:]
y_train = y[:1159]
y_test = y[1160:]

In [297]:
scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [298]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=9)

In [299]:
rf_model.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=9, verbose=0,
                       warm_start=False)

In [300]:
predictions = rf_model.predict(X_test_scaled)

In [301]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = balanced_accuracy_score(y_test, predictions)

In [302]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,246,340
Actual 1,273,383


Accuracy Score : 0.5018183426288187
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.47      0.42      0.58      0.45      0.50      0.24       586
          1       0.53      0.58      0.42      0.56      0.50      0.25       656

avg / total       0.50      0.51      0.50      0.50      0.50      0.25      1242



In [303]:
importances = rf_model.feature_importances_

In [304]:
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.0768952304372194, 'Home_HittingK%'),
 (0.07641172467243727, 'Home_PitchingOBP_allowed'),
 (0.07474198832561914, 'Visitor_PitchingOBP_allowed'),
 (0.07325347382735718, 'Visitor_PitchingSLG%_allowed'),
 (0.07295745047612547, 'Visitor_HittingSLG%'),
 (0.07249858784603377, 'Visitor_HittingBB%'),
 (0.07187883584120112, 'Home_HittingBB%'),
 (0.07173734768986574, 'Home_HittingSLG%'),
 (0.07061482132584573, 'Visitor_PitchingK%'),
 (0.07051536587039384, 'Home_PitchingSLG%_allowed'),
 (0.06729844265937747, 'Visitor_HittingK%'),
 (0.06722332144108022, 'Visitor_PitchingBB%'),
 (0.06716070400751911, 'Visitor_HittingOBP'),
 (0.06681270557992451, 'Home_HittingOBP')]

In [305]:
clf = AdaBoostClassifier(n_estimators = 2500, random_state = 9)

In [306]:
clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=2500, random_state=9)

In [307]:
predictions = clf.predict(X_test)

In [308]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
acc_score = balanced_accuracy_score(y_test, predictions)

In [309]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,291,295
Actual 1,307,349


Accuracy Score : 0.5142996129193373
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.49      0.50      0.53      0.49      0.51      0.26       586
          1       0.54      0.53      0.50      0.54      0.51      0.27       656

avg / total       0.52      0.52      0.51      0.52      0.51      0.26      1242



In [310]:
actual_df = pd.DataFrame(y_test)
actual_df.reset_index(inplace = True)

In [311]:
predict_df = pd.DataFrame(predictions)
predict_df.head()

Unnamed: 0,0
0,1
1,0
2,1
3,1
4,1


In [312]:
actual_predict_df = pd.concat([actual_df,predict_df], axis = 1, join = 'inner')

In [313]:
actual_predict_df.set_index('Date',inplace = True)

In [314]:
actual_predict_df.columns = ['Actual','Predictions']

In [315]:
actual_predict_df.head()

Unnamed: 0_level_0,Actual,Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-06-25,1,1
2019-06-26,1,0
2019-06-26,0,1
2019-06-26,1,1
2019-06-26,0,1


In [316]:
odds_df_new = df[['home','visitor','home_open_odds','visitor_open_odds']][1160:]
odds_df_new.reset_index(inplace = True)
odds_df_new.drop(columns = ['Date'],inplace = True)

In [434]:
home_visitor_df = pd.concat([actual_df,predict_df, odds_df_new], axis = 1, join ='inner')

In [435]:
home_visitor_df.set_index('Date', inplace = True)

In [436]:
home_visitor_df.columns = ['Actual','Predicted','Home','Visitor','Home_Open_Odds','Visitor_Open_Odds']

In [437]:
home_visitor_df.head()

Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-25,1,1,LAA,CIN,-150,130
2019-06-26,1,0,ARI,LOS,125,-145
2019-06-26,0,1,SFO,COL,115,-135
2019-06-26,1,1,PHI,NYM,-130,110
2019-06-26,0,1,MIA,WAS,160,-185


In [462]:
year = input()
csv_name = '../Project_2/Predictions_Vs_Actual/Predictions_Actual_' + year + '.csv'
csv_path = Path(csv_name)
df = pd.read_csv(csv_path, index_col = 'Date')
df.head()

 2017


Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-07-06,1,1,TAM,BOS,134,-149
2017-07-06,1,1,MIN,BAL,-156,141
2017-07-06,0,1,SEA,OAK,-144,129
2017-07-06,1,1,DET,SFO,-111,101
2017-07-06,1,0,CLE,SDG,-184,164


In [464]:
def find_total_profits(df):
    # Convert open odds to integers
    df['Home_Open_Odds'] = df['Home_Open_Odds'].astype(int)
    df['Visitor_Open_Odds'] = df['Visitor_Open_Odds'].astype(int)

    # Pull in predicted win and predicted loss
    home_win = df[df['Predicted'] == 1]
    home_loss = df[df['Predicted'] == 0]

    # Grab instances where the predicted winner is the favorite to win
    home_win_fav = home_win[home_win['Home_Open_Odds'] < 0]
    home_loss_fav = home_loss[home_loss['Visitor_Open_Odds'] < 0]

    # Grab instances where the predicted winner is the underdog
    home_win_dog = home_win[home_win['Home_Open_Odds'] > 0]
    home_loss_dog = home_loss[home_loss['Visitor_Open_Odds'] > 0]

    # Grab instances where the favs were the actual winners and predicted winners
    home_win_fav_true = home_win_fav[home_win_fav['Actual'] == 1]
    home_win_fav_true['Home_Open_Odds'] = ((10000)/abs(home_win_fav_true['Home_Open_Odds']))
    home_loss_fav_true = home_loss_fav[home_loss_fav['Actual'] == 1]
    home_loss_fav_true['Visitor_Open_Odds'] = ((10000)/abs(home_loss_fav_true['Visitor_Open_Odds']))

    # Grab instances where the dogs were winners and predicted winners
    home_win_dog_true = home_win_dog[home_win_dog['Actual']==1]
    home_loss_dog_true = home_loss_dog[home_loss_dog['Actual']==0]

    # Grab instances where the dogs were losers, but predicted winners
    home_win_dog_false = home_win_dog[home_win_dog['Actual']==0]
    home_loss_dog_false = home_loss_dog[home_loss_dog['Actual']==1]

    # Grab instances where the favs were predicted winners but lost
    home_win_fav_false = home_win_fav[home_win_fav['Actual'] == 0]
    home_win_fav_false['Home_Open_Odds'] = -100
    home_loss_fav_false = home_loss_fav[home_loss_fav['Actual'] == 1]
    home_loss_fav_false['Visitor_Open_Odds'] = -100

    # Concatinate the dataframes
    bet_results_fav_win = pd.concat([home_win_fav_false, home_win_fav_true]) # Home Open Odds
    bet_results_fav_loss = pd.concat([home_loss_fav_false, home_loss_fav_true]) # Visitor Open Odds
    bet_results_dog_win = pd.concat([home_win_dog_true, home_win_dog_false]) # Home Open Odds
    bet_results_dog_loss = pd.concat([home_loss_dog_true, home_loss_dog_false]) # Visitor Open Odds

    # Sum up final wins/loss money lines
    fav_win_sum = bet_results_fav_win['Home_Open_Odds'].sum()
    fav_loss_sum = bet_results_fav_loss['Visitor_Open_Odds'].sum()
    dog_win_sum = bet_results_dog_win['Home_Open_Odds'].sum()
    dog_loss_sum = bet_results_dog_loss['Visitor_Open_Odds'].sum()
    final_sum = fav_win_sum + fav_loss_sum + dog_win_sum + dog_loss_sum
    return final_sum

In [465]:
# Convert open odds to integers
df['Home_Open_Odds'] = df['Home_Open_Odds'].astype(int)
df['Visitor_Open_Odds'] = df['Visitor_Open_Odds'].astype(int)

# Pull in predicted win and predicted loss
home_win = df[df['Predicted'] == 1]
home_loss = df[df['Predicted'] == 0]

# Grab instances where the predicted winner is the favorite to win
home_win_fav = home_win[home_win['Home_Open_Odds'] < 0]
home_loss_fav = home_loss[home_loss['Visitor_Open_Odds'] < 0]

# Grab instances where the predicted winner is the underdog
home_win_dog = home_win[home_win['Home_Open_Odds'] > 0]
home_loss_dog = home_loss[home_loss['Visitor_Open_Odds'] > 0]

# Grab instances where the favs were the actual winners and predicted winners
home_win_fav_true = home_win_fav[home_win_fav['Actual'] == 1]
home_win_fav_true['Home_Open_Odds'] = ((10000)/abs(home_win_fav_true['Home_Open_Odds']))
home_loss_fav_true = home_loss_fav[home_loss_fav['Actual'] == 1]
home_loss_fav_true['Visitor_Open_Odds'] = ((10000)/abs(home_loss_fav_true['Visitor_Open_Odds']))

# Grab instances where the dogs were winners and predicted winners
home_win_dog_true = home_win_dog[home_win_dog['Actual']==1]
home_loss_dog_true = home_loss_dog[home_loss_dog['Actual']==0]

# Grab instances where the dogs were losers, but predicted winners
home_win_dog_false = home_win_dog[home_win_dog['Actual']==0]
home_loss_dog_false = home_loss_dog[home_loss_dog['Actual']==1]

# Grab instances where the favs were predicted winners but lost
home_win_fav_false = home_win_fav[home_win_fav['Actual'] == 0]
home_win_fav_false['Home_Open_Odds'] = -100
home_loss_fav_false = home_loss_fav[home_loss_fav['Actual'] == 1]
home_loss_fav_false['Visitor_Open_Odds'] = -100

# Concatinate the dataframes
bet_results_fav_win = pd.concat([home_win_fav_false, home_win_fav_true]) # Home Open Odds
print(bet_results_fav_win)
bet_results_fav_loss = pd.concat([home_loss_fav_false, home_loss_fav_true]) # Visitor Open Odds
print(bet_results_fav_loss)
bet_results_dog_win = pd.concat([home_win_dog_true, home_win_dog_false]) # Home Open Odds
print(bet_results_dog_win)
bet_results_dog_loss = pd.concat([home_loss_dog_true, home_loss_dog_false]) # Visitor Open Odds
print(bet_results_dog_loss)

# Sum up final wins/loss money lines
fav_win_sum = bet_results_fav_win['Home_Open_Odds'].sum()
fav_loss_sum = bet_results_fav_loss['Visitor_Open_Odds'].sum()
dog_win_sum = bet_results_dog_win['Home_Open_Odds'].sum()
dog_loss_sum = bet_results_dog_loss['Visitor_Open_Odds'].sum()
final_sum = fav_win_sum + fav_loss_sum + dog_win_sum + dog_loss_sum

            Actual  Predicted Home Visitor  Home_Open_Odds  Visitor_Open_Odds
Date                                                                         
2017-07-06       0          1  SEA     OAK     -100.000000                129
2017-07-06       0          1  CUB     MIL     -100.000000                141
2017-07-07       0          1  STL     NYM     -100.000000                107
2017-07-07       0          1  SFO     MIA     -100.000000               -107
2017-07-08       0          1  WAS     ATL     -100.000000                179
2017-07-08       0          1  CUB     PIT     -100.000000                160
2017-07-08       0          1  ARI     CIN     -100.000000                155
2017-07-08       0          1  MIN     BAL     -100.000000                125
2017-07-08       0          1  SEA     OAK     -100.000000                151
2017-07-08       0          1  COL     CWS     -100.000000                129
2017-07-09       0          1  ARI     CIN     -100.000000      

In [335]:
home_win['Home_Open_Odds'] = home_win['Home_Open_Odds'].astype(int)

In [343]:
home_win_dog_pred = home_win[home_win['Home_Open_Odds'] > 0]

In [356]:
home_win_dog = home_win[home_win['Predicted'] == 1]

In [357]:
home_win_dog_pred_win = home_win_dog_pred[home_win_dog_pred['Actual'] == 0]

In [358]:
home_win_dog_pred_loss.head()

Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-26,0,1,SFO,COL,115,-135
2019-06-26,0,1,MIA,WAS,160,-185
2019-06-26,0,1,DET,TEX,115,-135
2019-06-26,0,1,BAL,SDG,130,-150
2019-06-27,0,1,COL,LOS,151,-175


In [378]:
home_win_fav = home_win_dog[home_win_dog['Home_Open_Odds'] < 0]
home_win_fav.head()

Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-25,1,1,LAA,CIN,-150,130
2019-06-26,1,1,PHI,NYM,-130,110
2019-06-26,0,1,CUB,ATL,-125,105
2019-06-26,1,1,NYY,TOR,-265,218
2019-06-26,1,1,CLE,KAN,-205,172


In [385]:
home_win_fav_true = home_win_fav[home_win_fav['Actual'] == 1]
home_win_fav_true['Home_Open_Odds'] = ((10000)/abs(home_win_fav_true['Home_Open_Odds']))

In [419]:
home_win_fav_true.head()

Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-25,1,1,LAA,CIN,66.666667,130
2019-06-26,1,1,PHI,NYM,76.923077,110
2019-06-26,1,1,NYY,TOR,37.735849,218
2019-06-26,1,1,CLE,KAN,48.780488,172
2019-06-27,1,1,PHI,NYM,71.428571,120


In [386]:
home_win_fav_false = home_win_fav[home_win_fav['Actual'] == 0]
home_win_fav_false.head()
home_win_fav_false['Home_Open_Odds'] = -100

In [389]:
bet_results_fav = pd.concat([home_win_fav_false, home_win_fav_true])

In [420]:
bet_results_fav.tail()

Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-26,1,1,TEX,BOS,95.238095,-105
2019-09-27,1,1,PHI,MIA,55.555556,155
2019-09-27,1,1,NYM,ATL,84.033613,-101
2019-09-28,1,1,WAS,CLE,50.0,170
2019-09-29,1,1,STL,CUB,35.714286,230


In [421]:
bet_results_fav['Home_Open_Odds'].sum()

-766.0513122555149

In [422]:
home_win_dog = home_win_dog[home_win_dog['Home_Open_Odds'] > 0]

In [423]:
home_win_dog_true = home_win_dog[home_win_dog['Actual']==1]
home_win_dog_false = home_win_dog[home_win_dog['Actual']==0]

In [424]:
home_win_dog_true['Home_Open_Odds'] = home_win_dog_true['Home_Open_Odds']
home_win_dog_false['Home_Open_Odds'] = -100

In [425]:
home_win_dog_true['Home_Open_Odds'].sum() + home_win_dog_false['Home_Open_Odds'].sum()

-1389

In [426]:
bet_results_dog = pd.concat([home_win_dog_true, home_win_dog_false])

In [427]:
bet_results_total_100 = pd.concat([bet_results_dog,bet_results_fav])

In [428]:
bet_results_total_100['Home_Open_Odds'].sum()

-2155.051312255513