In [285]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [286]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [287]:
import sys
sys.path.append('../code')
import scrape, features
import dataframe_utilities as util

# Update DB & Get Today Games

In [288]:
last_day = pd.to_datetime(pd.read_csv('../data/game_summaries.csv')['date']).max()
get_day = last_day + pd.Timedelta(days=1)
get_day

Timestamp('2024-09-27 00:00:00')

In [289]:
#home and away
#game time
#pitchers


In [290]:
import datetime

# Example of creating a datetime object
date_obj = datetime.date.today()
while get_day.date() < date_obj:
    links = scrape.get_game_links(get_day)
    for l in links:
        scrape.process_link(l)
    get_day += + pd.Timedelta(days=1)

In [291]:
import lxml
test_df = scrape.get_today_games()

no pitcher {'away_team_abbr': 'STL', 'home_team_abbr': 'MIL', 'time': '8:10PM', 'away_pitcher': 'lynnla01'}


In [292]:
test_df

Unnamed: 0,away_team_abbr,home_team_abbr,time,away_pitcher,home_pitcher,date
0,HOU,DET,6:40PM,valdefr01,mizeca01,2024-05-10
1,CHC,PIT,6:40PM,assadja01,jonesja09,2024-05-10
2,NYY,TBR,6:50PM,schmicl01,bradlta01,2024-05-10
3,ARI,BAL,7:05PM,pfaadbr01,irvinco01,2024-05-10
4,MIN,TOR,7:07PM,ryanjo04,kikucyu01,2024-05-10
5,WSN,BOS,7:10PM,corbipa01,houckta01,2024-05-10
6,PHI,MIA,7:10PM,suarera01,rogertr01,2024-05-10
7,ATL,NYM,7:10PM,mortoch02,quintjo01,2024-05-10
8,CLE,CHW,7:40PM,carraca01,crochga01,2024-05-10
9,STL,MIL,8:10PM,lynnla01,,2024-05-10


# Process Stats for Today's Games

## Merge test and train dfs

In [293]:
test_df['is_night_game'] = True
test_df['is_night_game'][test_df['time'].str[:1].astype('int')<5] = False
test_df['is_night_game'][test_df['time'].str[1:2].isin(['0','1'])] = True #for 10,11 PM games
test_df.drop(columns='time', inplace=True)

In [294]:
test_df['is_test'] = True
test_df['home_team_win']=np.nan
test_df['game_id'] = test_df.home_team_abbr + test_df.date.astype('str').str.replace('-','') + '0'
test_df.shape

(15, 9)

In [295]:
df = features.get_game_df()
df['is_test'] = False
df.shape

(935, 9)

In [296]:
df = pd.concat([df,test_df])
df = df.sort_values(by='date').reset_index(drop=True)
df.shape

(950, 9)

## Add Features

In [297]:
df = features.add_trueskill_ratings(df)
df = features.add_rest_durations(df)
df.shape

(950, 18)

In [298]:
#datetime
date = pd.to_datetime(df['date'])
df['season'] = date.dt.year
df['month']=date.dt.month
df['week_num'] = date.dt.isocalendar().week
df['dow']=date.dt.weekday.astype('int')

In [299]:
df['dh_game_no'] = pd.to_numeric(df['game_id'].str[-1:],errors='coerce')
df['date'] = (pd.to_datetime(df['date']) - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s') #epoch time

In [300]:
test_df['game_id']

0     DET202405100
1     PIT202405100
2     TBR202405100
3     BAL202405100
4     TOR202405100
5     BOS202405100
6     MIA202405100
7     NYM202405100
8     CHW202405100
9     MIL202405100
10    COL202405100
11    LAA202405100
12    SDP202405100
13    SEA202405100
14    SFG202405100
Name: game_id, dtype: object

### Add Stats

In [301]:
games = features.get_games()
batting = features.get_batting()
pitching = features.get_pitching()
pitchers = features.get_pitchers()
print(df.columns)

Index(['game_id', 'home_team_abbr', 'away_team_abbr', 'date', 'is_night_game',
       'home_team_win', 'home_pitcher', 'away_pitcher', 'is_test',
       'home_trueskill_pre', 'away_trueskill_pre', 'ts_diff', 'home_team_rest',
       'away_team_rest', 'home_pitcher_rest', 'away_pitcher_rest',
       'team_rest_diff', 'pitcher_rest_diff', 'season', 'month', 'week_num',
       'dow', 'dh_game_no'],
      dtype='object')


#### Rolling 10 Day Stats

In [302]:
b_stats = ['batting_avg','leverage_index_avg', 'onbase_perc', 'onbase_plus_slugging']
df = features.add_10RA_rolling(batting, df, b_stats, True)
print(df.columns)

0      STL
1      NYM
2      HOU
3      STL
4      LAA
      ... 
125    DET
126    HI2
127    SFP
128    TB2
129    CWS
Name: team, Length: 130, dtype: object
Index(['game_id', 'home_team_abbr', 'away_team_abbr', 'date', 'is_night_game',
       'home_team_win', 'home_pitcher', 'away_pitcher', 'is_test',
       'home_trueskill_pre', 'away_trueskill_pre', 'ts_diff', 'home_team_rest',
       'away_team_rest', 'home_pitcher_rest', 'away_pitcher_rest',
       'team_rest_diff', 'pitcher_rest_diff', 'season', 'month', 'week_num',
       'dow', 'dh_game_no', 'batting_avg_10RA', 'leverage_index_avg_10RA',
       'onbase_perc_10RA', 'onbase_plus_slugging_10RA'],
      dtype='object')


In [303]:
pitching['SO_batters_faced'] = pitching['so'] / pitching['batters_faced']
pitching['H_batters_faced'] = pitching['h'] / pitching['batters_faced']
pitching['BB_batters_faced'] = pitching['bb'] / pitching['batters_faced']

# create rolling stat
b_stats = ['earned_run_avg','SO_batters_faced','H_batters_faced','BB_batters_faced']
df = features.add_10RA_rolling(pitching, df, b_stats, True)

0       STL
1       NYM
2       KC2
3       OAK
4       ATL
       ... 
1713    SEA
1714    ATL
1715    BOS
1716    OL2
1717    SFC
Name: team, Length: 1718, dtype: object


In [304]:
pitchers['SO_batters_faced'] = pitchers['so'] / pitchers['batters_faced']
pitchers['H_batters_faced'] = pitchers['h'] / pitchers['batters_faced']
pitchers['BB_batters_faced'] = pitchers['bb'] / pitchers['batters_faced']

# create rolling stat
b_stats = ['earned_run_avg','SO_batters_faced','H_batters_faced','BB_batters_faced']
df = features.add_10RA_rolling(pitchers, df, b_stats, False)

0       STL
1       STL
2       STL
3       STL
4       STL
       ... 
8739    OL2
8740    OL2
8741    SFC
8742    SFC
8743    SFC
Name: team, Length: 8744, dtype: object
An unexpected error occurred: No numeric types to aggregate


In [305]:
print("GAMES:", df['date'])

GAMES: 0       1709251200
1       1709251200
2       1709251200
3       1709251200
4       1709251200
           ...    
2060    1715299200
2061    1719446400
2062    1723161600
2063    1724976000
2064    1727308800
Name: date, Length: 2065, dtype: int64


#### Games Stats

In [306]:
df = features.game_stats(games,df)

GAMES DATE: 0     2024-03-01
1     2024-03-01
2     2024-03-01
3     2024-03-02
4     2024-03-02
         ...    
930   2024-05-07
931   2024-05-07
932   2024-05-07
933   2024-05-07
934   2024-05-08
Name: date, Length: 935, dtype: datetime64[ns]


In [307]:
df.shape

(2065, 54)

#### Season Stats

In [308]:
batting_stats = ['a', 'ab', 'bb', 'h', 'pa', 'po', 'r', 'rbi', 'so', 'batting_avg',
             'leverage_index_avg', 'onbase_perc', 'onbase_plus_slugging', 'pitches', 
             're24_bat', 'slugging_perc', 'strikes_total', 'wpa_bat', 'wpa_bat_neg', 
             'wpa_bat_pos']
df = features.add_season_rolling(batting, df, batting_stats, True,'batting')
df.shape

Index(['game_id', 'home_team_abbr', 'away_team_abbr', 'date', 'is_night_game',
       'home_team_win', 'home_pitcher', 'away_pitcher', 'is_test',
       'home_trueskill_pre',
       ...
       'strikes_total_skew_home_batting', 'wpa_bat_mean_home_batting',
       'wpa_bat_std_home_batting', 'wpa_bat_skew_home_batting',
       'wpa_bat_neg_mean_home_batting', 'wpa_bat_neg_std_home_batting',
       'wpa_bat_neg_skew_home_batting', 'wpa_bat_pos_mean_home_batting',
       'wpa_bat_pos_std_home_batting', 'wpa_bat_pos_skew_home_batting'],
      dtype='object', length=114)
Index(['game_id', 'home_team_abbr', 'away_team_abbr', 'date', 'is_night_game',
       'home_team_win', 'home_pitcher', 'away_pitcher', 'is_test',
       'home_trueskill_pre',
       ...
       'strikes_total_skew_away_batting', 'wpa_bat_mean_away_batting',
       'wpa_bat_std_away_batting', 'wpa_bat_skew_away_batting',
       'wpa_bat_neg_mean_away_batting', 'wpa_bat_neg_std_away_batting',
       'wpa_bat_neg_skew_away_batt

(2065, 194)

In [309]:
pitching_stats = ['bb', 'er', 'h', 'hr', 'ip', 'r', 'so', 'batters_faced',
               'earned_run_avg', 'game_score', 'inherited_runners',
               'inherited_score', 'inplay_fb_total', 'inplay_gb_total', 'inplay_ld',
               'inplay_unk', 'leverage_index_avg', 'pitches', 're24_def',
               'strikes_contact', 'strikes_looking', 'strikes_swinging',
               'strikes_total', 'wpa_def','SO_batters_faced','H_batters_faced',
                'BB_batters_faced']
df = features.add_season_rolling(pitching, df, pitching_stats, True,'team_pitching')
df.shape

Index(['game_id', 'home_team_abbr', 'away_team_abbr', 'date', 'is_night_game',
       'home_team_win', 'home_pitcher', 'away_pitcher', 'is_test',
       'home_trueskill_pre',
       ...
       'wpa_def_skew_home_team_pitching',
       'SO_batters_faced_mean_home_team_pitching',
       'SO_batters_faced_std_home_team_pitching',
       'SO_batters_faced_skew_home_team_pitching',
       'H_batters_faced_mean_home_team_pitching',
       'H_batters_faced_std_home_team_pitching',
       'H_batters_faced_skew_home_team_pitching',
       'BB_batters_faced_mean_home_team_pitching',
       'BB_batters_faced_std_home_team_pitching',
       'BB_batters_faced_skew_home_team_pitching'],
      dtype='object', length=275)
Index(['game_id', 'home_team_abbr', 'away_team_abbr', 'date', 'is_night_game',
       'home_team_win', 'home_pitcher', 'away_pitcher', 'is_test',
       'home_trueskill_pre',
       ...
       'wpa_def_skew_away_team_pitching',
       'SO_batters_faced_mean_away_team_pitching',
     

(2065, 383)

In [310]:
df = features.add_season_rolling(pitchers, df, pitching_stats, False,'pitcher')
df.shape

DataError: No numeric types to aggregate

## Cleanup

In [None]:
df = util.fix_na(df, False)

In [None]:
print(df['home_team_win'])

0       0.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
2060    1.0
2061    0.0
2062    0.0
2063    0.0
2064    0.0
Name: home_team_win, Length: 2065, dtype: float64


# Generate Predictions

In [None]:
X = df.drop(columns=['home_team_win', 'home_team_abbr','away_team_abbr', 'game_id','home_pitcher', 'away_pitcher', 'home_team_season', 'away_team_season'])  # Assuming 'target' is the name of your target column
y = df['home_team_win']

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_val_encoded = encoder.transform(y_val)




In [245]:
from sklearn.metrics import accuracy_score
model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
model.fit(X_train, y_train_encoded)

# Evaluate the model
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val_encoded, y_pred))



Validation Accuracy: 0.7191283292978208


In [313]:
filtered_df = df[df['game_id'].isin(test_df['game_id'])]
X_pred = filtered_df.drop(columns=[
    'home_team_win', 'home_team_abbr', 'away_team_abbr', 'game_id',
    'home_pitcher', 'away_pitcher', 'home_team_season', 'away_team_season'
])
y_pred = model.predict(X_pred)
print("Predictions:", y_pred)

Predictions: [1 1 0 1 1 1 1 1 1 1 1 1 1 1 1]


In [315]:
y_pred_proba = model.predict_proba(X_pred)
print("Predicted Probabilities for the Positive Class:", y_pred_proba[:, 1])

Predicted Probabilities for the Positive Class: [0.8476117  0.9281515  0.4303208  0.92766136 0.96507925 0.99340993
 0.9929612  0.9994122  0.8691642  0.99424464 0.95496744 0.97527415
 0.9312412  0.9880201  0.99775153]


In [314]:

selected_abbr = np.where(np.array(y_pred) == 1, test_df['home_team_abbr'], test_df['away_team_abbr'])

# Print the result
print(selected_abbr)

['DET' 'PIT' 'NYY' 'BAL' 'TOR' 'BOS' 'MIA' 'NYM' 'CHW' 'MIL' 'COL' 'LAA'
 'SDP' 'SEA' 'SFG']


In [317]:
import json
predictions = []

# Loop through the DataFrame and associated predictions
for index, row in test_df.iterrows():
    # Create a dictionary for each prediction following the specified structure
    prediction = {
        "id": str(index),  # Convert index to string to match 'id: string' type
        "home_team": row['home_team_abbr'],
        "away_team": row['away_team_abbr'],
        "ml_pred": selected_abbr[index],
        "ml_conf": str(y_pred_proba[index]),  # Convert float to string to match 'ml_conf: string'
        "ou_pred": "0",  # Placeholder values as specified
        "ou_conf": "0"   # Placeholder values as specified
    }
    predictions.append(prediction)

# Convert the list of dictionaries to JSON
json_output = json.dumps(predictions, indent=2)
print(json_output)

TypeError: Object of type int32 is not JSON serializable

In [187]:
import pickle
encoder, model = pickle.load(open('../data/encoder_model.pk','rb'))

X_test = encoder.transform(X_test)
proba = model.predict_proba(X_test)[:,1]
pred = model.predict(X_test)

FileNotFoundError: [Errno 2] No such file or directory: '../data/encoder_model.pk'

In [None]:
test_df = test_df.sort_values(by=['date','game_id']).reset_index(drop=True)
pred_df = df[df.is_test][['away_pitcher', 'away_team_abbr', 'home_pitcher', 'home_team_abbr']]
pred_df['home'] = pred_df['home_team_abbr']
pred_df['away'] = pred_df['away_team_abbr']
pred_df.drop(columns=['home_team_abbr','away_team_abbr'], inplace=True)
pred_df['xgb_proba']= proba
pred_df['xgb_winner']=pred_df.home
pred_df['xgb_winner'][~pred]=pred_df.away

In [None]:
# get daily odds from covers.com
import requests
from bs4 import BeautifulSoup as bs
html = requests.get('https://www.covers.com/sports/mlb/matchups').text
soup = bs(html)
games = []
for s in soup.findAll('div',{'class':'cmg_matchup_game_box cmg_game_data'}):
    g = {}
    g['home'] = s['data-home-team-shortname-search']
    g['home_odds'] = s['data-game-odd']
    
    if g['home']=='SD':g['home']='SDP'
    if g['home']=='KC':g['home']='KCR'
    if g['home']=='SF':g['home']='SFG'
    if g['home']=='WAS':g['home']='WSN'
    if g['home']=='TB':g['home']='TBR'
    
    games.append(g)
odds = pd.DataFrame(games)

# merge in the odds
pred_df = pd.merge(left=pred_df, right=odds, on='home', how='left')
pred_df['home_odds']=pd.to_numeric(pred_df['home_odds'], errors='coerce')
# pred_df['online_odds'][pred_df.xgb_probability<0.5] = -pred_df['online_odds'] #convert odds to pred winner odds (not home team odds)

In [None]:
# online proba
#https://www.bettingexpert.com/en-au/learn/understanding-betting-odds/how-to-convert-odds
pred_df['online_proba'] = -pred_df['home_odds']/(-pred_df['home_odds']+100)
pred_df['online_proba'][pred_df['home_odds']>0] = 100/(pred_df['home_odds']+100)

# Confidence
pred_df['confidence'] = np.abs(pred_df['xgb_proba']-0.5)+.5
online_conf = np.abs(pred_df['online_proba']-0.5)+.5

pred_df['conf_diff'] = pred_df['confidence'] - online_conf
pred_df['conf_diff'][(pred_df['xgb_proba']>.5)&(pred_df['online_proba']<.5)] = 'Contrary'
pred_df['conf_diff'][(pred_df['xgb_proba']<.5)&(pred_df['online_proba']>.5)] = 'Contrary'

In [None]:
# merge in team names
teams = pd.read_csv("../data/teams.csv")
pred_df = pd.merge(left=pred_df, right=teams, 
                   left_on='xgb_winner',right_on='Abbr',
                   how='left')

pred_df['pred_winner'] = pred_df['Team']
pred_df.drop(columns=['xgb_winner','Abbr','Team'], inplace=True)

In [None]:
from IPython.display import HTML
pd.options.display.float_format = '{:.3f}'.format

pred_df['conf'] = pred_df.confidence
HTML(pred_df.sort_values(by='confidence', ascending=False).to_html(index=False))

In [None]:
print("2-Team Parlays:")
pred_df = pred_df.sort_values(by='conf', ascending=False).reset_index(drop=True)
for i in range(5):
    t1 = pred_df.iloc[i]
    t2 = pred_df.iloc[i+1]
    print(f"- {t1.pred_winner}, {t2.pred_winner}  \t{t1.conf*t2.conf: .3f}")