In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as bs
import requests

In [2]:
import os

df_w_features_path = os.getcwd()+"/data/df_w_features.csv"
game_summaries_path = os.getcwd()+"/data/game_summaries.csv"
teams_path = os.getcwd()+"/data/teams.csv"
encoder_model_path = os.getcwd()+"/data/encoder_model.pk"

if(not os.path.exists(df_w_features_path)):
    open(df_w_features_path, 'w').close()
if(not os.path.exists(game_summaries_path)):
    open(game_summaries_path, 'w').close()
if(not os.path.exists(encoder_model_path)):
    open(encoder_model_path, 'w').close()



In [3]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
import scrape_in_code_folder as scrape
import features_in_code_folder as features
import dataframe_utilities_in_code_folder as util

# Update DB & Get Today Games

In [5]:

last_day = pd.to_datetime(pd.read_csv(game_summaries_path)['date']).max()
get_day = last_day + pd.Timedelta(days=1)
get_day

Timestamp('2022-05-18 00:00:00')

In [6]:
while get_day.date() < pd.datetime.today().date():
    links = scrape.get_game_links(get_day)
    for l in links:
        scrape.process_link(l)
    get_day += + pd.Timedelta(days=1)

In [7]:
test_df = scrape.get_today_games()

no pitcher {'away_team_abbr': 'PIT', 'home_team_abbr': 'CHC', 'time': '7:40PM', 'away_pitcher': 'smylydr01'}


In [8]:
test_df

Unnamed: 0,away_team_abbr,home_team_abbr,time,away_pitcher,home_pitcher,date
0,ATL,MIL,1:10PM,friedma01,burneco01,2022-05-18
1,DET,TBR,1:10PM,rodried05,rasmudr01,2022-05-18
2,SFG,COL,3:10PM,webblo01,freelky01,2022-05-18
3,MIN,OAK,3:37PM,grayso01,jeffeda01,2022-05-18
4,ARI,LAD,4:10PM,davieza02,buehlwa01,2022-05-18
5,HOU,BOS,6:10PM,garcilu05,pivetni01,2022-05-18
6,CIN,CLE,6:10PM,mahlety01,quantca01,2022-05-18
7,WSN,MIA,6:40PM,grayjo03,lopezpa01,2022-05-18
8,SDP,PHI,6:45PM,snellbl01,wheelza01,2022-05-18
9,NYY,BAL,7:05PM,colege01,lylesjo01,2022-05-18


# Process Stats for Today's Games

## Merge test and train dfs

In [9]:
test_df['is_night_game'] = True
test_df['is_night_game'][test_df['time'].str[:1].astype('int')<5] = False
test_df['is_night_game'][test_df['time'].str[1:2].isin(['0','1'])] = True #for 10,11 PM games
test_df.drop(columns='time', inplace=True)

In [10]:
test_df['is_test'] = True
test_df['home_team_win']=np.nan
test_df['game_id'] = test_df.home_team_abbr + test_df.date.astype('str').str.replace('-','') + '0'
test_df.shape

(15, 9)

In [11]:
df = features.get_game_df()
df['is_test'] = False
df.shape

(4002, 9)

In [12]:
df = pd.concat([df,test_df])
df = df.sort_values(by='date').reset_index(drop=True)
df.shape

(4017, 9)

## Add Features

In [13]:
df = features.add_trueskill_ratings(df)
df = features.add_rest_durations(df)
df.shape

(4017, 18)

In [14]:
#datetime
date = pd.to_datetime(df['date'])
df['season'] = date.dt.year
df['month']=date.dt.month
df['week_num'] = date.dt.week
df['dow']=date.dt.weekday.astype('int')

In [15]:
df['dh_game_no'] = pd.to_numeric(df['game_id'].str[-1:],errors='coerce')
df['date'] = (pd.to_datetime(df['date']) - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s') #epoch time

### Add Stats

In [16]:
games = features.get_games()
batting = features.get_batting()
pitching = features.get_pitching()
pitchers = features.get_pitchers()

#### Rolling 10 Day Stats

In [17]:
b_stats = ['batting_avg','leverage_index_avg', 'onbase_perc', 'onbase_plus_slugging']
df = features.add_10RA_rolling(batting, df, b_stats, True, 'batting')

In [18]:
pitching['SO_batters_faced'] = pitching['SO'] / pitching['batters_faced']
pitching['H_batters_faced'] = pitching['H'] / pitching['batters_faced']
pitching['BB_batters_faced'] = pitching['BB'] / pitching['batters_faced']

# create rolling stat
b_stats = ['earned_run_avg','SO_batters_faced','H_batters_faced','BB_batters_faced']
df = features.add_10RA_rolling(pitching, df, b_stats, True, 'team_pitching')

In [19]:
pitchers['SO_batters_faced'] = pitchers['SO'] / pitchers['batters_faced']
pitchers['H_batters_faced'] = pitchers['H'] / pitchers['batters_faced']
pitchers['BB_batters_faced'] = pitchers['BB'] / pitchers['batters_faced']

# create rolling stat
b_stats = ['earned_run_avg','SO_batters_faced','H_batters_faced','BB_batters_faced']
df = features.add_10RA_rolling(pitchers, df, b_stats, False, 'pitcher')

#### Games Stats

In [20]:
import time
try:
    df = features.game_stats(games,df)
except:
    time.sleep(1)
    df = features.game_stats(games,df)

In [21]:
df.shape

(4017, 82)

#### Season Stats

In [22]:
batting_stats = ['A', 'AB', 'BB', 'H', 'PA', 'PO', 'R', 'RBI', 'SO', 'batting_avg',
             'leverage_index_avg', 'onbase_perc', 'onbase_plus_slugging', 'pitches', 
             're24_bat', 'slugging_perc', 'strikes_total', 'wpa_bat', 'wpa_bat_neg', 
             'wpa_bat_pos']
df = features.add_season_rolling(batting, df, batting_stats, True,'batting')
df.shape

(4017, 222)

In [23]:
pitching_stats = ['BB', 'ER', 'H', 'HR', 'IP', 'R', 'SO', 'batters_faced',
               'earned_run_avg', 'game_score', 'inherited_runners',
               'inherited_score', 'inplay_fb_total', 'inplay_gb_total', 'inplay_ld',
               'inplay_unk', 'leverage_index_avg', 'pitches', 're24_def',
               'strikes_contact', 'strikes_looking', 'strikes_swinging',
               'strikes_total', 'wpa_def','SO_batters_faced','H_batters_faced',
                'BB_batters_faced']
df = features.add_season_rolling(pitching, df, pitching_stats, True,'team_pitching')
df.shape

(4017, 411)

In [24]:
df = features.add_season_rolling(pitchers, df, pitching_stats, False,'pitcher')
df.shape

(4017, 600)

## Cleanup

In [25]:
df = util.fix_na(df, False)

# Generate Predictions

In [26]:
X_test = df[df.is_test].drop(columns=['is_test'])

# make columns match training data
cols = pd.read_csv(df_w_features_path).columns
X_test = X_test[cols]

X_test.drop(columns=['home_team_win','game_id'], inplace=True)

In [27]:
import pickle

encoder, model = pickle.load(open(encoder_model_path,'rb'))

X_test = encoder.transform(X_test)
proba = model.predict_proba(X_test)[:,1]
pred = model.predict(X_test)
print(pred)

[1 1 1 1 1 0 0 1 1 1 0 0 0 1 0]


In [28]:
test_df = test_df.sort_values(by=['date','game_id']).reset_index(drop=True)

pred_df = df[df.is_test][['away_pitcher', 'away_team_abbr', 'home_pitcher', 'home_team_abbr']]

pred_df['home'] = pred_df['home_team_abbr']

pred_df['away'] = pred_df['away_team_abbr']

pred_df.drop(columns=['home_team_abbr','away_team_abbr'], inplace=True)

pred_df['xgb_proba']= proba

pred_df['xgb_winner']=pred_df.home

pred_df['xgb_winner'][pred!= 1]=pred_df.away

In [29]:
# get daily odds from covers.com
import requests
from bs4 import BeautifulSoup as bs
html = requests.get('https://www.covers.com/sports/mlb/matchups').text
soup = bs(html)
games = []
for s in soup.findAll('div',{'class':'cmg_matchup_game_box cmg_game_data'}):
    g = {}
    g['home'] = s['data-home-team-shortname-search']
    g['home_odds'] = s['data-game-odd']
    
    if g['home']=='SD':g['home']='SDP'
    if g['home']=='KC':g['home']='KCR'
    if g['home']=='SF':g['home']='SFG'
    if g['home']=='WAS':g['home']='WSN'
    if g['home']=='TB':g['home']='TBR'
    
    games.append(g)
odds = pd.DataFrame(games)

# merge in the odds
pred_df = pd.merge(left=pred_df, right=odds, on='home', how='left')
pred_df['home_odds']=pd.to_numeric(pred_df['home_odds'], errors='coerce')
# pred_df['online_odds'][pred_df.xgb_probability<0.5] = -pred_df['online_odds'] #convert odds to pred winner odds (not home team odds)

In [30]:
# online proba
#https://www.bettingexpert.com/en-au/learn/understanding-betting-odds/how-to-convert-odds
pred_df['online_proba'] = -pred_df['home_odds']/(-pred_df['home_odds']+100)
pred_df['online_proba'][pred_df['home_odds']>0] = 100/(pred_df['home_odds']+100)

# Confidence
pred_df['confidence'] = np.abs(pred_df['xgb_proba']-0.5)+.5
online_conf = np.abs(pred_df['online_proba']-0.5)+.5

pred_df['conf_diff'] = pred_df['confidence'] - online_conf
pred_df['conf_diff'][(pred_df['xgb_proba']>.5)&(pred_df['online_proba']<.5)] = 'Contrary'
pred_df['conf_diff'][(pred_df['xgb_proba']<.5)&(pred_df['online_proba']>.5)] = 'Contrary'

In [31]:
def create_teams_file():
    soup = bs(requests.get(
        'https://www.baseball-reference.com/teams/').text, "html.parser")

    f = open(
        r'D:\FreelancerJobs\fix_problems\fix_mlb_outcomes\notebooks\data\teams.csv', 'w')
    f.write('Abbr,Team\n')
    tags_a = soup.find_all('a')

    for a in tags_a:
        url = a['href']
        if(url[:7] == '/teams/' and url[-1] == '/'):
            if(url[7:-1] != ''):
                f.write(url[7:-1]+","+a.get_text()+"\n")
    f.close()

In [32]:
# update teams in the link: https://www.baseball-reference.com/teams/
create_teams_file()
# merge in team names
import os
teams = pd.read_csv(teams_path)
pred_df = pd.merge(left=pred_df, right=teams,
                   left_on='xgb_winner', right_on='Abbr',
                   how='left')

pred_df['pred_winner'] = pred_df['Team']
pred_df.drop(columns=['xgb_winner', 'Abbr', 'Team'], inplace=True)
print(pred_df)


   away_pitcher home_pitcher home away  xgb_proba  home_odds  online_proba  \
0     rodried05    rasmudr01  TBR  DET   0.519196       -294      0.746193   
1     snellbl01    wheelza01  PHI  SDP   0.534701       -115      0.534884   
2     snellbl01    wheelza01  PHI  SDP   0.534701       -115      0.534884   
3      grayso01    jeffeda01  OAK  MIN   0.551036       -108      0.519231   
4      grayso01    jeffeda01  OAK  MIN   0.551036       -108      0.519231   
5      grayso01    jeffeda01  OAK  MIN   0.551036       -108      0.519231   
6      grayso01    jeffeda01  OAK  MIN   0.551036       -108      0.519231   
7     hicksjo03    scherma01  NYM  STL   0.526491        101      0.497512   
8     hicksjo03    scherma01  NYM  STL   0.526491        101      0.497512   
9     hicksjo03    scherma01  NYM  STL   0.526491       -130      0.565217   
10    hicksjo03    scherma01  NYM  STL   0.526491       -130      0.565217   
11    friedma01    burneco01  MIL  ATL   0.502494       -145    

In [33]:
from IPython.display import HTML
pd.options.display.float_format = '{:.3f}'.format

pred_df['conf'] = pred_df.confidence
HTML(pred_df.sort_values(by='confidence', ascending=False).to_html(index=False))

away_pitcher,home_pitcher,home,away,xgb_proba,home_odds,online_proba,confidence,conf_diff,pred_winner,conf
smylydr01,Unknown,CHC,PIT,0.619,-145,0.592,0.619,0.027,Chicago Cubs,0.619
smylydr01,Unknown,CHC,PIT,0.619,-145,0.592,0.619,0.027,Chicago Cubs,0.619
davieza02,buehlwa01,LAD,ARI,0.586,-238,0.704,0.586,-0.118,Los Angeles Dodgers,0.586
davieza02,buehlwa01,LAD,ARI,0.586,-238,0.704,0.586,-0.118,Los Angeles Dodgers,0.586
davieza02,buehlwa01,LAD,ARI,0.586,-238,0.704,0.586,-0.118,Los Angeles Dodgers,0.586
davieza02,buehlwa01,LAD,ARI,0.586,-204,0.671,0.586,-0.085,Los Angeles Dodgers,0.586
davieza02,buehlwa01,LAD,ARI,0.586,-204,0.671,0.586,-0.085,Los Angeles Dodgers,0.586
davieza02,buehlwa01,LAD,ARI,0.586,-204,0.671,0.586,-0.085,Los Angeles Dodgers,0.586
grayso01,jeffeda01,OAK,MIN,0.551,-108,0.519,0.551,0.032,Oakland Athletics,0.551
grayso01,jeffeda01,OAK,MIN,0.551,-108,0.519,0.551,0.032,Oakland Athletics,0.551


In [34]:
print("2-Team Parlays:")
pred_df = pred_df.sort_values(by='conf', ascending=False).reset_index(drop=True)
for i in range(5):
    t1 = pred_df.iloc[i]
    t2 = pred_df.iloc[i+1]
    print(f"- {t1.pred_winner}, {t2.pred_winner}  \t{t1.conf*t2.conf: .3f}")

2-Team Parlays:
- Chicago Cubs, Chicago Cubs  	 0.383
- Chicago Cubs, Los Angeles Dodgers  	 0.363
- Los Angeles Dodgers, Los Angeles Dodgers  	 0.344
- Los Angeles Dodgers, Los Angeles Dodgers   	 0.344
- Los Angeles Dodgers , Los Angeles Dodgers  	 0.344
