In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 15, 5
sns.set_style('darkgrid')
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, r2_score
from sklearn.preprocessing import StandardScaler,LabelEncoder,RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from IPython.display import display_markdown
warnings.filterwarnings("ignore")

In [2]:
def display(classification_list,scores):
    display_markdown(f'\n|            |{classification_list[pd.Series(scores).idxmax()][14:23].capitalize()}|{classification_list[pd.Series(scores).idxmax()][27:33].capitalize()}|{classification_list[pd.Series(scores).idxmax()][35:43].capitalize()}|{classification_list[pd.Series(scores).idxmax()][46:53].capitalize()}|\n|:---|-----|-----|-----|---:|\n| 0  |{classification_list[pd.Series(scores).idxmax()].split("support")[1][21:][:4]}|{classification_list[pd.Series(scores).idxmax()].split("support")[1][21:][10:14]}|{classification_list[pd.Series(scores).idxmax()].split("support")[1][21:][20:24]}|{classification_list[pd.Series(scores).idxmax()].split("support")[1][21:].split("           1")[0][31:-1]}|\n| 1  |{classification_list[pd.Series(scores).idxmax()].split("support")[1][21:].split("           1       ")[1][:4]}|{classification_list[pd.Series(scores).idxmax()].split("support")[1][21:].split("           1       ")[1][10:14]}|{classification_list[pd.Series(scores).idxmax()].split("support")[1][21:].split("           1       ")[1][20:24]}|{classification_list[pd.Series(scores).idxmax()].split("support")[1][21:].split("           1       ")[1].split("    accuracy")[0][31:-2]}|\n|Accuracy|||{classification_list[pd.Series(scores).idxmax()].split("accuracy                           ")[1][0:4]}|{classification_list[pd.Series(scores).idxmax()].split("accuracy                           ")[1].split("   macro avg")[0][11:-1]}|\n|Macro Avg|{classification_list[pd.Series(scores).idxmax()].split("macro avg       ")[1][:4]}|{classification_list[pd.Series(scores).idxmax()].split("macro avg       ")[1][10:14]}|{classification_list[pd.Series(scores).idxmax()].split("macro avg       ")[1][20:24]}|{classification_list[pd.Series(scores).idxmax()].split("macro avg       ")[1].split("weighted avg")[0][31:-1]}|\n|Weighted Avg|{classification_list[pd.Series(scores).idxmax()].split("weighted avg")[1][7:11]}|{classification_list[pd.Series(scores).idxmax()].split("weighted avg")[1][17:21]}|{classification_list[pd.Series(scores).idxmax()].split("weighted avg")[1][27:31]}|{classification_list[pd.Series(scores).idxmax()].split("weighted avg")[1][38:-1]}|\n', raw=True)
    return

In [3]:
window_list = ['7','10','15']
dataframe_list = []
for window in window_list:
    print("Starting " + window + " Day Model")
    concat_window = pd.read_csv(window + "_Concatenated.csv")
    concat_window.drop(columns = 'Unnamed: 0',inplace=True)
    X = concat_window.drop(['Home_Win','Visitor','Home'], axis=1)
    y = concat_window[['Home_Win','Date']]

    X_train = X[X['Date'] < X.Date.unique()[-1]]
    X_test = X[X['Date'] == X.Date.unique()[-1]]
    y_train = y[y['Date'] < y.Date.unique()[-1]]
    y_test = y[y['Date'] == y.Date.unique()[-1]]

    y_train.drop(columns='Date',inplace=True)
    y_test.drop(columns='Date',inplace=True)
    X_train.drop(columns='Date',inplace=True)
    X_test.drop(columns='Date',inplace=True)

    scores = []
    matricies_scaled = []
    classification_list = []
    prediction_list = []
    for x in range(50):
        mlp = MLPClassifier(max_iter=500, activation='relu',hidden_layer_sizes=(32,16))
        mlp.fit(X_train,y_train)
        predictions = mlp.predict(X_test)
        prediction_list.append(predictions)
        scores.append(r2_score(y_test,predictions))
        cm_scale = confusion_matrix(y_test,predictions)
        classif = classification_report(y_test,predictions)
        matricies_scaled.append(cm_scale)
        classification_list.append(classif)
    temp_df = concat_window[concat_window['Date'] == concat_window.Date.unique()[-1]][['Visitor','Home']]
    total_list = []
    for y in range(len(temp_df)):
        team_wl = []
        for x in range(50):
            team_wl.append(prediction_list[x][y])
        total_list.append(round((sum(team_wl)/len(team_wl))*100,2))
    temp_df['Home_Win'] = predictions
    temp_df['Home_Win_Percent'] = total_list
    dataframe_list.append(temp_df)
    print("Finished " + window + " Day Model")

Starting 7 Day Model
Finished 7 Day Model
Starting 10 Day Model
Finished 10 Day Model
Starting 15 Day Model
Finished 15 Day Model


In [4]:
dataframe_list[0]

Unnamed: 0,Visitor,Home,Home_Win,Home_Win_Percent
3123,TEX,WSN,1,84.0
3124,OAK,BOS,1,70.0
3125,CHN,NYA,1,54.0
3126,KCR,CLE,1,68.0
3127,TOR,DET,1,34.0
3128,PHI,MIA,1,34.0
3129,ATL,TBA,1,38.0
3130,SLN,CHA,1,80.0
3131,SEA,HOU,1,70.0
3132,CIN,MIL,1,58.0


In [5]:
dataframe_list[1]

Unnamed: 0,Visitor,Home,Home_Win,Home_Win_Percent
3193,TEX,WSN,0,78.0
3194,OAK,BOS,0,82.0
3195,CHN,NYA,0,52.0
3196,KCR,CLE,0,84.0
3197,TOR,DET,0,14.0
3198,PHI,MIA,0,26.0
3199,ATL,TBA,0,8.0
3200,SLN,CHA,1,72.0
3201,SEA,HOU,0,66.0
3202,CIN,MIL,1,68.0


In [6]:
dataframe_list[2]

Unnamed: 0,Visitor,Home,Home_Win,Home_Win_Percent
3078,TEX,WSN,1,62.0
3079,OAK,BOS,1,76.0
3080,CHN,NYA,0,60.0
3081,KCR,CLE,1,76.0
3082,TOR,DET,0,40.0
3083,PHI,MIA,0,42.0
3084,ATL,TBA,0,44.0
3085,SLN,CHA,1,78.0
3086,SEA,HOU,1,58.0
3087,CIN,MIL,1,72.0


In [3]:
window_list = ['7','10','15']
dataframe_list = []
for window in window_list:
    concat_window = pd.read_csv(window + "_Concatenated.csv")
    concat_window.drop(columns = 'Unnamed: 0',inplace=True)
    X = concat_window.drop(['Home_Win','Visitor','Home'], axis=1)
    y = concat_window[['Home_Win','Date']]

    X_train = X[X['Date'] < X.Date.unique()[-2]]
    X_test = X[X['Date'] == X.Date.unique()[-2]]
    y_train = y[y['Date'] < y.Date.unique()[-2]]
    y_test = y[y['Date'] == y.Date.unique()[-2]]

    y_train.drop(columns='Date',inplace=True)
    y_test.drop(columns='Date',inplace=True)
    X_train.drop(columns='Date',inplace=True)
    X_test.drop(columns='Date',inplace=True)

    scores = []
    matricies_scaled = []
    classification_list = []
    prediction_list = []
    for x in range(50):
        mlp = MLPClassifier(max_iter=500, activation='relu',hidden_layer_sizes=(32,16))
        mlp.fit(X_train,y_train)
        predictions = mlp.predict(X_test)
        prediction_list.append(predictions)
        scores.append(r2_score(y_test,predictions))
        cm_scale = confusion_matrix(y_test,predictions)
        classif = classification_report(y_test,predictions)
        matricies_scaled.append(cm_scale)
        classification_list.append(classif)
    temp_df = concat_window[concat_window['Date'] == concat_window.Date.unique()[-2]][['Visitor','Home']]
    total_list = []
    for y in range(len(temp_df)):
        team_wl = []
        for x in range(50):
            team_wl.append(prediction_list[x][y])
        total_list.append(round((sum(team_wl)/len(team_wl))*100,2))
    temp_df['Home_Win'] = predictions
    temp_df['Home_Win_Percent'] = total_list
    dataframe_list.append(temp_df)
    print("                 Best Classification Report")
    display(classification_list,scores)
    plt.show()

                 Best Classification Report



|            |Precision|Recall|F1-score|Support|
|:---|-----|-----|-----|---:|
| 0  |0.67|0.50|0.57|  4|
| 1  |0.82|0.90|0.86| 10|
|Accuracy|||0.79| 14|
|Macro Avg|0.74|0.70|0.71| 14|
|Weighted Avg|0.77|0.79|0.78| 14|


                 Best Classification Report



|            |Precision|Recall|F1-score|Support|
|:---|-----|-----|-----|---:|
| 0  |1.00|0.25|0.40|  4|
| 1  |0.77|1.00|0.87| 10|
|Accuracy|||0.79| 14|
|Macro Avg|0.88|0.62|0.63| 14|
|Weighted Avg|0.84|0.79|0.74| 14|


                 Best Classification Report



|            |Precision|Recall|F1-score|Support|
|:---|-----|-----|-----|---:|
| 0  |1.00|0.25|0.40|  4|
| 1  |0.77|1.00|0.87| 10|
|Accuracy|||0.79| 14|
|Macro Avg|0.88|0.62|0.63| 14|
|Weighted Avg|0.84|0.79|0.74| 14|


In [4]:
dataframe_list[0]

Unnamed: 0,Visitor,Home,Home_Win,Home_Win_Percent
3109,PIT,ARI,0,40.0
3110,OAK,BOS,0,58.0
3111,SLN,CHA,1,68.0
3112,KCR,CLE,1,78.0
3113,TOR,DET,0,12.0
3114,SEA,HOU,1,40.0
3115,LAA,LAN,1,96.0
3116,PHI,MIA,0,12.0
3117,CIN,MIL,0,58.0
3118,BAL,MIN,0,16.0


In [5]:
dataframe_list[1]

Unnamed: 0,Visitor,Home,Home_Win,Home_Win_Percent
3179,PIT,ARI,1,52.0
3180,OAK,BOS,1,82.0
3181,SLN,CHA,1,90.0
3182,KCR,CLE,1,84.0
3183,TOR,DET,0,20.0
3184,SEA,HOU,1,56.0
3185,LAA,LAN,1,98.0
3186,PHI,MIA,0,20.0
3187,CIN,MIL,1,74.0
3188,BAL,MIN,0,24.0


In [6]:
dataframe_list[2]

Unnamed: 0,Visitor,Home,Home_Win,Home_Win_Percent
3064,PIT,ARI,1,50.0
3065,OAK,BOS,1,72.0
3066,SLN,CHA,1,84.0
3067,KCR,CLE,1,44.0
3068,TOR,DET,0,30.0
3069,SEA,HOU,1,46.0
3070,LAA,LAN,1,88.0
3071,PHI,MIA,1,28.0
3072,CIN,MIL,1,66.0
3073,BAL,MIN,0,38.0


In [10]:
X_test['Home_Win'] = y_test
X_test['Predictions'] = dataframe_list[2]['Home_Win']
X_test

Unnamed: 0,IP_Pitch_Visitor,H_Pitch_Visitor,R_Pitch_Visitor,ER_Pitch_Visitor,BB_Pitch_Visitor,SO_Pitch_Visitor,HR_Pitch_Visitor,HBP_Pitch_Visitor,AB_Pitch_Visitor,2B_Pitch_Visitor,...,SO_Bat_Home,HBP_Bat_Home,SH_Bat_Home,SF_Bat_Home,GDP_Bat_Home,SB_Bat_Home,CS_Bat_Home,Games_Home,Home_Win,Predictions
3064,196.33,205.0,123.0,116.0,73.0,165.0,33.0,12.0,774.0,59.0,...,173,3,1,4,14,26,3,13,1,1
3065,190.0,227.0,144.0,140.0,92.0,203.0,19.0,7.0,768.0,55.0,...,204,10,0,10,6,17,3,13,1,1
3066,182.33,218.0,120.0,101.0,82.0,151.0,20.0,6.0,744.0,58.0,...,262,12,0,3,22,10,12,13,0,1
3067,193.0,226.0,136.0,120.0,73.0,146.0,40.0,7.0,783.0,41.0,...,153,10,0,1,17,34,2,14,1,1
3068,254.0,191.0,84.0,77.0,45.0,276.0,22.0,6.0,922.0,38.0,...,222,1,1,9,21,11,1,14,1,0
3069,207.33,174.0,80.0,68.0,43.0,158.0,19.0,2.0,777.0,36.0,...,135,2,3,4,19,17,3,14,1,1
3070,153.0,161.0,116.0,112.0,80.0,164.0,33.0,9.0,607.0,28.0,...,184,11,0,15,4,14,2,14,1,1
3071,210.0,150.0,52.0,49.0,42.0,219.0,23.0,3.0,754.0,17.0,...,176,4,2,1,22,4,4,14,1,1
3072,213.0,221.0,124.0,109.0,91.0,179.0,31.0,17.0,835.0,46.0,...,223,17,1,14,17,30,6,15,0,1
3073,208.0,177.0,88.0,76.0,60.0,196.0,27.0,8.0,779.0,35.0,...,154,9,1,3,12,12,5,13,0,0


In [12]:
1 - 6/len(temp_df)

0.5714285714285714