In [1]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

In [2]:
# Import csv path/file
csv_path = Path('../Project_2/Training_Data/2019_10_day.csv')
df = pd.read_csv(csv_path, index_col = 'Date')
csv_path_2 = Path('../10_Day_Lookback_2017.csv')
df_2 = pd.read_csv(csv_path_2)

In [290]:
df = df.replace([np.inf], 150)
df = df.replace([np.NaN], 0)
df = df.replace([-np.inf], -150)
df_2 = df_2.replace([np.inf], 150)
df_2 = df_2.replace([np.NaN], 0)
df_2 = df_2.replace([-np.inf], -150)

In [3]:
# Create our features
X = df.iloc[:, 10::]
# Create our target
y = df['home_win_loss']

In [292]:
X.head()

Unnamed: 0_level_0,Home_PitchingOBP_allowed,Home_PitchingSLG%_allowed,Visitor_PitchingK%,Visitor_PitchingBB%,Visitor_PitchingOBP_allowed,Visitor_PitchingSLG%_allowed,Home_HittingK%,Home_HittingBB%,Home_HittingOBP,Home_HittingSLG%,Visitor_HittingK%,Visitor_HittingBB%,Visitor_HittingOBP,Visitor_HittingSLG%
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-03-30,0.235294,0.258065,0.424242,0.030303,0.212121,0.193548,0.424242,0.030303,0.212121,0.193548,0.352941,0.088235,0.235294,0.258065
2019-03-30,0.361111,0.366667,0.243243,0.162162,0.351351,0.516129,0.243243,0.162162,0.351351,0.516129,0.25,0.166667,0.361111,0.366667
2019-03-30,0.333333,0.432432,0.246154,0.061538,0.184615,0.305085,0.246154,0.061538,0.184615,0.305085,0.148148,0.08642,0.333333,0.432432
2019-03-30,0.353659,0.555556,0.19403,0.059701,0.298507,0.508197,0.19403,0.059701,0.298507,0.508197,0.253012,0.108434,0.353659,0.555556
2019-03-30,0.235294,0.269841,0.290323,0.080645,0.322581,0.418182,0.290323,0.080645,0.322581,0.418182,0.25,0.058824,0.235294,0.269841


In [293]:
len(y)

2402

In [294]:
len(y[y == 1])

1271

In [295]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=9, 
                                                    stratify=y)
X_train.shape

(1801, 14)

In [5]:
X_train = X[:int(len(X)/3)]
X_test = X[int(len(X)/3)+1:]
y_train = y[:int(len(X)/3)]
y_test = y[int(len(X)/3)+1:]

In [6]:
scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=9)

In [8]:
rf_model.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=9, verbose=0,
                       warm_start=False)

In [9]:
predictions = rf_model.predict(X_test_scaled)

In [10]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = balanced_accuracy_score(y_test, predictions)

In [11]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,341,411
Actual 1,353,496


Accuracy Score : 0.5188370861839962
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.49      0.45      0.58      0.47      0.51      0.26       752
          1       0.55      0.58      0.45      0.56      0.51      0.27       849

avg / total       0.52      0.52      0.51      0.52      0.51      0.27      1601



In [12]:
importances = rf_model.feature_importances_

In [13]:
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.07909057599831773, 'Home_PitchingOBP_allowed'),
 (0.0779154845037173, 'Visitor_HittingSLG%'),
 (0.07684526932824559, 'Home_HittingK%'),
 (0.07581777430903472, 'Visitor_HittingBB%'),
 (0.07384103324215012, 'Visitor_PitchingSLG%_allowed'),
 (0.07284042454500667, 'Visitor_PitchingK%'),
 (0.07115504514945863, 'Visitor_PitchingBB%'),
 (0.07029899485720299, 'Visitor_PitchingOBP_allowed'),
 (0.07022531800713495, 'Home_HittingSLG%'),
 (0.06942075439658348, 'Home_HittingBB%'),
 (0.06879277725320407, 'Home_PitchingSLG%_allowed'),
 (0.06802793166883381, 'Visitor_HittingK%'),
 (0.06301129138560572, 'Visitor_HittingOBP'),
 (0.06271732535550431, 'Home_HittingOBP')]

In [14]:
clf = AdaBoostClassifier(n_estimators = 2500, random_state = 9)

In [15]:
clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=2500, random_state=9)

In [16]:
predictions = clf.predict(X_test)

In [17]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
acc_score = balanced_accuracy_score(y_test, predictions)

In [18]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,333,419
Actual 1,352,497


Accuracy Score : 0.5141068653985916
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.49      0.44      0.59      0.46      0.51      0.26       752
          1       0.54      0.59      0.44      0.56      0.51      0.26       849

avg / total       0.52      0.52      0.51      0.52      0.51      0.26      1601



In [19]:
actual_df = pd.DataFrame(y_test)
actual_df.reset_index(inplace = True)

In [20]:
predict_df = pd.DataFrame(predictions)
predict_df.head()

Unnamed: 0,0
0,1
1,1
2,0
3,1
4,1


In [21]:
actual_predict_df = pd.concat([actual_df,predict_df], axis = 1, join = 'inner')

In [22]:
actual_predict_df.set_index('Date',inplace = True)

In [23]:
actual_predict_df.columns = ['Actual','Predictions']

In [24]:
actual_predict_df.head()

Unnamed: 0_level_0,Actual,Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-05-29,0,1
2019-05-30,0,1
2019-05-30,0,0
2019-05-30,1,1
2019-05-30,0,1


In [26]:
odds_df_new = df[['home','visitor','home_open_odds','visitor_open_odds']][1160:]
odds_df_new.reset_index(inplace = True)
odds_df_new.drop(columns = ['Date'],inplace = True)

In [27]:
df = pd.concat([actual_df,predict_df, odds_df_new], axis = 1, join ='inner')

In [28]:
df.set_index('Date', inplace = True)

In [29]:
df.columns = ['Actual','Predicted','Home','Visitor','Home_Open_Odds','Visitor_Open_Odds']

In [30]:
df.head()

Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-05-29,0,1,LAA,CIN,-150,130
2019-05-30,0,1,ARI,LOS,125,-145
2019-05-30,0,0,SFO,COL,115,-135
2019-05-30,1,1,PHI,NYM,-130,110
2019-05-30,0,1,MIA,WAS,160,-185


In [33]:
year = input()

 2019


In [2]:
csv_name = '../Project_2/Predictions_Vs_Actual/Predictions_Actual_' + year + '.csv'
csv_path = Path(csv_name)
df = pd.read_csv(csv_path, index_col = 'Date')
df.head()

 2016


Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-07-06,1,0,MIN,OAK,-106,-104
2016-07-06,1,0,TOR,KAN,-180,160
2016-07-06,0,1,TAM,LAA,-150,135
2016-07-06,0,0,BOS,TEX,-185,165
2016-07-06,1,1,HOU,SEA,-165,145


In [31]:
def find_total_profits(df):
    # Convert open odds to integers
    df['Home_Open_Odds'] = df['Home_Open_Odds'].astype(int)
    df['Visitor_Open_Odds'] = df['Visitor_Open_Odds'].astype(int)


    # Pull in predicted win and predicted loss
    home_win = df[df['Predicted'] == 1]
    home_loss = df[df['Predicted'] == 0]


    # Grab instances where the predicted winner is the favorite to win
    home_win_fav = home_win[home_win['Home_Open_Odds'] < 0]
    home_loss_fav = home_loss[home_loss['Visitor_Open_Odds'] < 0]


    # Grab instances where the predicted winner is the underdog
    home_win_dog = home_win[home_win['Home_Open_Odds'] > 0]
    home_loss_dog = home_loss[home_loss['Visitor_Open_Odds'] > 0]


    # Grab instances where the favs were the actual winners and predicted winners
    home_win_fav_true = home_win_fav[home_win_fav['Actual'] == 1]
    home_win_fav_true['Home_Open_Odds'] = ((10000)/abs(home_win_fav_true['Home_Open_Odds']))
    home_loss_fav_true = home_loss_fav[home_loss_fav['Actual'] == 1]
    home_loss_fav_true['Visitor_Open_Odds'] = ((10000)/abs(home_loss_fav_true['Visitor_Open_Odds']))


    # Grab instances where the dogs were winners and predicted winners
    home_win_dog_true = home_win_dog[home_win_dog['Actual']==1]
    home_loss_dog_true = home_loss_dog[home_loss_dog['Actual']==0]


    # Grab instances where the dogs were losers, but predicted winners
    home_win_dog_false = home_win_dog[home_win_dog['Actual']==0]
    home_loss_dog_false = home_loss_dog[home_loss_dog['Actual']==1]


    # Grab instances where the favs were predicted winners but lost
    home_win_fav_false = home_win_fav[home_win_fav['Actual'] == 0]
    home_win_fav_false['Home_Open_Odds'] = -100
    home_loss_fav_false = home_loss_fav[home_loss_fav['Actual'] == 1]
    home_loss_fav_false['Visitor_Open_Odds'] = -100


    # Grab instances where the dgos were predicted winners but lost
    home_win_dog_false = home_win_fav[home_win_fav['Actual'] == 0]
    home_win_dog_false['Home_Open_Odds'] = -100
    home_loss_dog_false = home_loss_fav[home_loss_fav['Actual'] == 1]
    home_loss_dog_false['Visitor_Open_Odds'] = -100


    # Concatinate the dataframes
    bet_results_fav_win = pd.concat([home_win_fav_false, home_win_fav_true]) # Home Open Odds
    bet_results_fav_loss = pd.concat([home_loss_fav_false, home_loss_fav_true]) # Visitor Open Odds
    bet_results_dog_win = pd.concat([home_win_dog_true, home_win_dog_false]) # Home Open Odds
    bet_results_dog_loss = pd.concat([home_loss_dog_true, home_loss_dog_false]) # Visitor Open Odds


    # Sum up final wins/loss money lines
    fav_win_sum = round(bet_results_fav_win['Home_Open_Odds'].sum(),2)
    fav_loss_sum = round(bet_results_fav_loss['Visitor_Open_Odds'].sum(),2)
    dog_win_sum = round(bet_results_dog_win['Home_Open_Odds'].sum(),2)
    dog_loss_sum = round(bet_results_dog_loss['Visitor_Open_Odds'].sum(),2)
    final_sum = round(fav_win_sum + fav_loss_sum + dog_win_sum + dog_loss_sum,2)
    total_profits = print(f'Betting on the favorites to win at home: ${fav_win_sum}\nBetting on the favorites to win on the road: ${fav_loss_sum}\nBetting on the dogs to win at home: ${dog_win_sum}\nBetting on the dogs to win on the road: ${dog_loss_sum}\nTotal profits for the second half of the {year} season: ${final_sum}')
    return total_profits

In [42]:
# Convert open odds to integers
df['Home_Open_Odds'] = df['Home_Open_Odds'].astype(int)
df['Visitor_Open_Odds'] = df['Visitor_Open_Odds'].astype(int)


# Pull in predicted win and predicted loss
home_win = df[df['Predicted'] == 1]
home_loss = df[df['Predicted'] == 0]


# Grab instances where the predicted winner is the favorite to win
home_win_fav = home_win[home_win['Home_Open_Odds'] < 0]
home_loss_fav = home_loss[home_loss['Visitor_Open_Odds'] < 0]


# Grab instances where the predicted winner is the underdog
home_win_dog = home_win[home_win['Home_Open_Odds'] > 0]
home_loss_dog = home_loss[home_loss['Visitor_Open_Odds'] > 0]


# Grab instances where the favs were the actual winners and predicted winners
home_win_fav_true = home_win_fav[home_win_fav['Actual'] == 1]
home_win_fav_true['Home_Open_Odds'] = ((10000)/abs(home_win_fav_true['Home_Open_Odds']))
home_loss_fav_true = home_loss_fav[home_loss_fav['Actual'] == 1]
home_loss_fav_true['Visitor_Open_Odds'] = ((10000)/abs(home_loss_fav_true['Visitor_Open_Odds']))


# Grab instances where the dogs were winners and predicted winners
home_win_dog_true = home_win_dog[home_win_dog['Actual']==1]
home_loss_dog_true = home_loss_dog[home_loss_dog['Actual']==0]


# Grab instances where the dogs were losers, but predicted winners
home_win_dog_false = home_win_dog[home_win_dog['Actual']==0]
home_loss_dog_false = home_loss_dog[home_loss_dog['Actual']==1]


# Grab instances where the favs were predicted winners but lost
home_win_fav_false = home_win_fav[home_win_fav['Actual'] == 0]
home_win_fav_false['Home_Open_Odds'] = -100
home_loss_fav_false = home_loss_fav[home_loss_fav['Actual'] == 1]
home_loss_fav_false['Visitor_Open_Odds'] = -100


# Grab instances where the dgos were predicted winners but lost
home_win_dog_false = home_win_fav[home_win_fav['Actual'] == 0]
home_win_dog_false['Home_Open_Odds'] = -100
home_loss_dog_false = home_loss_fav[home_loss_fav['Actual'] == 1]
home_loss_dog_false['Visitor_Open_Odds'] = -100


# Concatinate the dataframes
bet_results_fav_win = pd.concat([home_win_fav_false, home_win_fav_true]) # Home Open Odds
bet_results_fav_loss = pd.concat([home_loss_fav_false, home_loss_fav_true]) # Visitor Open Odds
bet_results_dog_win = pd.concat([home_win_dog_true, home_win_dog_false]) # Home Open Odds
bet_results_dog_loss = pd.concat([home_loss_dog_true, home_loss_dog_false]) # Visitor Open Odds

# Determine the accuracy of each of the four betting methods
fav_win_win_len = len(home_win_fav_true)
fav_win_tot_len = len(bet_results_fav_win)
fav_win_acc = round(100*(fav_win_win_len/fav_win_tot_len),2)
fav_loss_win_len = len(home_loss_fav_true)
fav_loss_tot_len = len(bet_results_fav_loss)
fav_loss_acc = round(100*(fav_loss_win_len/fav_loss_tot_len),2)
dog_win_win_len = len(home_win_dog_true)
dog_win_tot_len = len(bet_results_dog_win)
dog_win_acc = round(100*(dog_win_win_len/dog_win_tot_len),2)
dog_loss_win_len = len(home_loss_dog_true)
dog_loss_tot_len = len(bet_results_dog_loss)
dog_loss_acc = round(100*(dog_loss_win_len/dog_loss_tot_len),2)

# Sum up final wins/loss money lines
fav_win_sum = round(bet_results_fav_win['Home_Open_Odds'].sum(),2)
fav_loss_sum = round(bet_results_fav_loss['Visitor_Open_Odds'].sum(),2)
dog_win_sum = round(bet_results_dog_win['Home_Open_Odds'].sum(),2)
dog_loss_sum = round(bet_results_dog_loss['Visitor_Open_Odds'].sum(),2)
final_sum = round(fav_win_sum + fav_loss_sum + dog_win_sum + dog_loss_sum,2)
print(f'Betting on the favorites to win at home: ${fav_win_sum}    The accuracy of betting on the favorites to win at home is {fav_win_acc}%\nBetting on the favorites to win on the road: ${fav_loss_sum}    The accuracy of betting on the favorites to win on the road is {fav_loss_acc}%\nBetting on the underdogs to win at home: ${dog_win_sum}    The accuracy of betting on the underdogs to win at home is {dog_win_acc}%\nBetting on the dogs to win on the road: ${dog_loss_sum}    The accuracy of betting on the underdogs to win on the road is {dog_loss_acc}%\nTotal profits for the second half of the {year} season: ${final_sum}')

Betting on the favorites to win at home: $-3693.23    The accuracy of betting on the favorites to win at home is 55.58%
Betting on the favorites to win on the road: $-3749.81    The accuracy of betting on the favorites to win on the road is 50.0%
Betting on the underdogs to win at home: $-5132    The accuracy of betting on the underdogs to win at home is 35.63%
Betting on the dogs to win on the road: $8285    The accuracy of betting on the underdogs to win on the road is 51.71%
Total profits for the second half of the 2019 season: $-4290.04


In [335]:
home_win['Home_Open_Odds'] = home_win['Home_Open_Odds'].astype(int)

In [343]:
home_win_dog_pred = home_win[home_win['Home_Open_Odds'] > 0]

In [356]:
home_win_dog = home_win[home_win['Predicted'] == 1]

In [357]:
home_win_dog_pred_win = home_win_dog_pred[home_win_dog_pred['Actual'] == 0]

In [358]:
home_win_dog_pred_loss.head()

Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-26,0,1,SFO,COL,115,-135
2019-06-26,0,1,MIA,WAS,160,-185
2019-06-26,0,1,DET,TEX,115,-135
2019-06-26,0,1,BAL,SDG,130,-150
2019-06-27,0,1,COL,LOS,151,-175


In [378]:
home_win_fav = home_win_dog[home_win_dog['Home_Open_Odds'] < 0]
home_win_fav.head()

Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-25,1,1,LAA,CIN,-150,130
2019-06-26,1,1,PHI,NYM,-130,110
2019-06-26,0,1,CUB,ATL,-125,105
2019-06-26,1,1,NYY,TOR,-265,218
2019-06-26,1,1,CLE,KAN,-205,172


In [385]:
home_win_fav_true = home_win_fav[home_win_fav['Actual'] == 1]
home_win_fav_true['Home_Open_Odds'] = ((10000)/abs(home_win_fav_true['Home_Open_Odds']))

In [419]:
home_win_fav_true.head()

Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-25,1,1,LAA,CIN,66.666667,130
2019-06-26,1,1,PHI,NYM,76.923077,110
2019-06-26,1,1,NYY,TOR,37.735849,218
2019-06-26,1,1,CLE,KAN,48.780488,172
2019-06-27,1,1,PHI,NYM,71.428571,120


In [386]:
home_win_fav_false = home_win_fav[home_win_fav['Actual'] == 0]
home_win_fav_false.head()
home_win_fav_false['Home_Open_Odds'] = -100

In [389]:
bet_results_fav = pd.concat([home_win_fav_false, home_win_fav_true])

In [420]:
bet_results_fav.tail()

Unnamed: 0_level_0,Actual,Predicted,Home,Visitor,Home_Open_Odds,Visitor_Open_Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-09-26,1,1,TEX,BOS,95.238095,-105
2019-09-27,1,1,PHI,MIA,55.555556,155
2019-09-27,1,1,NYM,ATL,84.033613,-101
2019-09-28,1,1,WAS,CLE,50.0,170
2019-09-29,1,1,STL,CUB,35.714286,230


In [421]:
bet_results_fav['Home_Open_Odds'].sum()

-766.0513122555149

In [422]:
home_win_dog = home_win_dog[home_win_dog['Home_Open_Odds'] > 0]

In [423]:
home_win_dog_true = home_win_dog[home_win_dog['Actual']==1]
home_win_dog_false = home_win_dog[home_win_dog['Actual']==0]

In [424]:
home_win_dog_true['Home_Open_Odds'] = home_win_dog_true['Home_Open_Odds']
home_win_dog_false['Home_Open_Odds'] = -100

In [425]:
home_win_dog_true['Home_Open_Odds'].sum() + home_win_dog_false['Home_Open_Odds'].sum()

-1389

In [426]:
bet_results_dog = pd.concat([home_win_dog_true, home_win_dog_false])

In [427]:
bet_results_total_100 = pd.concat([bet_results_dog,bet_results_fav])

In [428]:
bet_results_total_100['Home_Open_Odds'].sum()

-2155.051312255513