In [1]:
#pip install torch --index-url https://download.pytorch.org/whl/cu118
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


# Load the model input table. The path is relative, so it resolves against the
# kernel's working directory.
data = pd.read_csv("../data/model_data.csv")


In [2]:
# Build pairwise (home surface) x (away surface) interaction features.
#
# Columns whose name contains '*' are interactions written back to `data` by a
# previous run of this cell. Their names contain both 'surface_' and 'away_',
# so without this guard a re-run would treat them as away-surface columns and
# multiply them again. Skipping them keeps the cell idempotent.
surface_cols = [c for c in data.columns if 'surface_' in c and '*' not in c]
home_surfaces = data[[c for c in surface_cols if 'away_' not in c]].copy()
away_surfaces = data[[c for c in surface_cols if 'away_' in c]].copy()

# Build every product in a single constructor call instead of inserting the
# columns one at a time, which fragments the frame.
surface_interactions = pd.DataFrame(
    {
        f"{col}*{acol}": home_surfaces[col] * away_surfaces[acol]
        for col in home_surfaces.columns
        for acol in away_surfaces.columns
    },
    index=data.index,
)

# Keep only interactions whose column total exceeds the threshold.
# NOTE(review): if the surface columns are 0/1 indicators this is "more than
# 30 rows with this combination" - confirm against the data dictionary.
MIN_INTERACTION_TOTAL = 30
is_frequent = surface_interactions.sum(axis=0) > MIN_INTERACTION_TOTAL
surface_interactions = surface_interactions.loc[:, is_frequent]

data[list(surface_interactions.columns)] = surface_interactions

# Binary target: 1 when coverage is strictly positive, otherwise 0
# (a NaN coverage also maps to 0 because the comparison is False).
data['coverage_flg'] = np.where(data['coverage'] > 0, 1, 0)

In [3]:
# Rows from the 2023 season are set aside; they are scored with the fitted
# model further down the notebook.
is_current_season = data['season'] == 2023
this_season = data.loc[is_current_season].copy()

# Modelling pool: seasons before 2023, excluding rows whose coverage is exactly 0.
# NOTE(review): `!= 0` also keeps rows where coverage is NaN - confirm that no
# such rows exist for past seasons.
is_past_season = data['season'] < 2023
has_nonzero_coverage = data['coverage'] != 0
model_data = data.loc[is_past_season & has_nonzero_coverage].copy()

# Columns that must never be used as predictors.
exclusions = [
    'away_team',
    'home_team',
    'away_score',
    'home_score',
    'result',
    'season',
    'coverage',
    'coverage_flg',
]

# Candidate predictors: everything that is neither excluded nor a
# '_team_id_' column.
first_features = [
    col
    for col in model_data.columns
    if col not in exclusions and '_team_id_' not in col
]

# Column names referenced by later cells.
target = 'coverage_flg'
spread = 'spread_line'
result = 'result'



In [4]:
# Initial 80/20 split over all candidate features; the training part is used
# for the correlation-based feature screen.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    model_data[first_features],
    model_data[target],
    test_size=.2,
    random_state=42,
    shuffle=True,
)

In [5]:
# Screen features by their Pearson correlation with the target, computed on
# the training split only. `corr` is sorted ascending by signed correlation,
# and that ordering carries through to `features`.
CORR_THRESHOLD = .04
corr = (
    pd.concat([Xtrain, ytrain], axis=1)
    .corr(numeric_only=True, method='pearson')[target]
    .sort_values()
)

# Keep columns with |r| above the threshold. The `< 1` bound drops the target's
# correlation with itself, and NaN correlations fail both comparisons.
abs_corr = corr.abs()
feats_used = corr[(abs_corr > CORR_THRESHOLD) & (abs_corr < 1)]
features = [i for i in feats_used.index if i != target]

In [6]:
# Show the feature names that passed the correlation screen.
features

['cumemeanresult_diff',
 'away_surface_astroturf',
 'surface_grass*away_surface_astroturf',
 'home_cumemeanresult_shift1',
 'home_attempts',
 'spread_line',
 'away_sack_fumbles_lost',
 'home_passing_air_yards',
 'away_rushing_first_downs',
 'home_rushing_2pt_conversions',
 'away_special_teams_tds',
 'home_pacr',
 'home_receiving_tds',
 'home_receiving_fumbles',
 'away_passing_2pt_conversions',
 'away_rushing_fumbles',
 'away_cumemeanresult_shift1',
 'week',
 'away_receiving_2pt_conversions']

In [7]:
# Class balance check: the target is a 0/1 flag, so its mean is the share of
# training rows labelled 1.
ytrain.mean()

0.477124183006536

In [8]:
# Disabled alternative: a hand-picked three-feature subset. It is wrapped in a
# string literal so it does not execute; the cell only echoes the text.
"""features = [
    'cumemeanresult_diff',
    'away_surface_astroturf',
    'away_cumemeanresult_shift1',
]"""

"features = [\n    'cumemeanresult_diff',\n    'away_surface_astroturf',\n    'away_cumemeanresult_shift1',\n]"

In [9]:
# Disabled exploration: one scatter plot of each candidate feature against the
# training target. It is wrapped in a string literal so it does not execute.
"""for col in first_features:
    plt.scatter(Xtrain[col],ytrain)
    plt.title(col)
    plt.show()"""

'for col in first_features:\n    plt.scatter(Xtrain[col],ytrain)\n    plt.title(col)\n    plt.show()'

In [10]:
# Re-split using only the selected features, carrying the spread line and the
# game result alongside. test_size and random_state match the earlier split.
#
# The result column is referenced by its literal name rather than through the
# `result` variable: a later cell rebinds `result` to a DataFrame, which would
# make this cell fail when re-run without restarting the kernel.
Xtrain, Xtest, ytrain, ytest, strain, stest, rtrain, rtest = train_test_split(
    model_data[features],
    model_data[target],
    model_data[spread],
    model_data['result'],
    test_size=.2,
    random_state=42,
    shuffle=True,
)



In [11]:
# Inspect the training feature matrix produced by the split above.
Xtrain

Unnamed: 0,cumemeanresult_diff,away_surface_astroturf,surface_grass*away_surface_astroturf,home_cumemeanresult_shift1,home_attempts,spread_line,away_sack_fumbles_lost,home_passing_air_yards,away_rushing_first_downs,home_rushing_2pt_conversions,away_special_teams_tds,home_pacr,home_receiving_tds,home_receiving_fumbles,away_passing_2pt_conversions,away_rushing_fumbles,away_cumemeanresult_shift1,week,away_receiving_2pt_conversions
1177,-1.400000,0,0,-7.200000,13.653782,10.0,0.190476,116.880672,1.095389,,0.600000,0.933732,0.392157,0.158333,0.142063,0.177612,-5.80,6,0.071195
826,-4.000000,0,0,-5.000000,17.316667,-5.5,1.000000,167.705556,1.205833,,0.583333,1.138762,0.333402,0.071429,0.066667,0.168254,-1.00,2,0.075000
1317,6.571429,0,0,6.571429,11.009804,10.5,0.250000,82.990605,1.127841,0.058824,0.541667,0.914010,0.264310,0.102727,,0.291667,0.00,16,0.062500
609,3.000000,0,0,0.000000,13.332315,7.0,,112.411980,0.886310,0.125000,1.000000,0.918059,0.314454,0.066964,,0.062500,-3.00,5,0.083333
1213,-1.500000,0,0,-3.250000,18.290598,-4.0,0.288462,148.615385,0.851853,,,0.926478,0.331775,0.071429,0.074176,0.279376,-1.75,9,0.065126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1119,-13.000000,0,0,3.000000,13.847912,2.5,0.125000,90.729963,0.858748,0.072917,0.333333,1.172346,0.326613,0.062663,,0.139423,16.00,2,0.113095
1156,14.500000,1,1,9.500000,18.617647,7.0,0.288462,151.316176,0.585245,0.058824,0.500000,0.977478,0.210274,0.100198,0.076923,0.071429,-5.00,5,0.200000
1325,-3.928571,0,0,-6.428571,13.653782,-3.0,0.077381,116.880672,1.780564,,,0.933732,0.392157,0.158333,0.119048,0.239601,-2.50,16,0.058824
879,12.500000,0,0,13.750000,13.075000,6.0,,106.050000,0.832006,,,1.045422,0.299297,0.125000,,0.253440,1.25,5,0.142857


In [12]:

# Disabled alternative learners (their classes are not imported in this notebook):
#   LogisticRegression(max_iter=4000)
#   MLPClassifier(hidden_layer_sizes=(100,100,), activation='logistic', solver='sgd',
#                 learning_rate_init=.0001, learning_rate='constant', max_iter=4000,
#                 early_stopping=True, validation_fraction=.1, n_iter_no_change=100,
#                 verbose=True, batch_size=1)
#   RandomForestClassifier(random_state=42, n_estimators=100, criterion='entropy',
#                          max_depth=3, min_samples_leaf=20)
#   SVC(C=1, kernel='rbf', verbose=True)
inner = XGBClassifier(random_state=41)

# The zero-fill for missing values is a pipeline step rather than a
# `.fillna(0)` at each call site. The persisted model then applies the same
# imputation wherever it is loaded, and callers cannot forget it. Passing
# already-filled data is still fine: the imputer leaves non-missing values alone.
model = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
    ('learner', inner),
])
model.fit(Xtrain, ytrain)

# Hard class predictions, re-indexed to the original row labels so they can be
# aligned with the targets later.
coverage_flg_pred = pd.Series(model.predict(Xtest), index=Xtest.index)
coverage_flg_fit = pd.Series(model.predict(Xtrain), index=Xtrain.index)

  if is_sparse(data):


In [13]:
# Confusion-style summary of the test split.
# NOTE: this rebinds `result`, which an earlier cell set to the column name
# 'result'.
pred_col = target + '_pred'

# Align predictions to the test-target index.
result = pd.DataFrame(
    {target: ytest, pred_col: coverage_flg_pred},
    index=ytest.index,
)

actual_pos = result[target] == 1
actual_neg = result[target] == 0
pred_pos = result[pred_col] == 1
pred_neg = result[pred_col] == 0

# The first matching condition wins; anything not matched falls through to
# 'FP', as in the nested np.where this replaces.
result['outcome'] = np.select(
    [actual_neg & pred_neg, actual_pos & pred_pos, actual_pos & pred_neg],
    ['TN', 'TP', 'FN'],
    default='FP',
)
# 'T' for TN/TP, 'F' for FN/FP.
result['correct'] = np.where(result['outcome'].str[0] == 'T', 'T', 'F')
# Helper column of ones so the group sums below are row counts.
result['cnt'] = 1
print(result.groupby(['outcome'])['cnt'].sum())
print(result.groupby(['correct'])['cnt'].sum())


outcome
FN    65
FP    58
TN    82
TP    63
Name: cnt, dtype: int64
correct
F    123
T    145
Name: cnt, dtype: int64


In [14]:
# Means of the numeric columns. For the two 0/1 columns these are the actual
# and predicted share of positives on the test split.
result.mean(numeric_only=True)

coverage_flg         0.477612
coverage_flg_pred    0.451493
cnt                  1.000000
dtype: float64

In [15]:
# Score the held-out season with the fitted pipeline, using the same feature
# columns and the same zero-fill for missing values as at training time.
this_seasonX = this_season.loc[:, features].copy()
this_season_coverage_flg_prediction = pd.Series(
    data=model.predict(this_seasonX.fillna(0)),
    index=this_seasonX.index,
)
this_season['coverage_flg_pred'] = this_season_coverage_flg_prediction

In [16]:

# Week 1 matchups of the held-out season with the predicted flag.
week_cols = ['away_team', 'home_team', 'coverage_flg_pred']
this_season.loc[this_season['week'] == 1, week_cols]

Unnamed: 0,away_team,home_team,coverage_flg_pred
1372,DET,KC,1
1373,CAR,ATL,0
1374,HOU,BAL,1
1375,CIN,CLE,0
1376,JAX,IND,1
1377,TB,MIN,0
1378,TEN,NO,0
1379,SF,PIT,0
1380,ARI,WAS,0
1381,GB,CHI,1


In [17]:
# Week 2 matchups of the held-out season with the predicted flag.
week_cols = ['away_team', 'home_team', 'coverage_flg_pred']
this_season.loc[this_season['week'] == 2, week_cols]

Unnamed: 0,away_team,home_team,coverage_flg_pred
1388,MIN,PHI,0
1389,GB,ATL,1
1390,LV,BUF,0
1391,BAL,CIN,0
1392,SEA,DET,0
1393,IND,HOU,1
1394,KC,JAX,0
1395,CHI,TB,0
1396,LAC,TEN,0
1397,NYG,ARI,0


In [18]:
# Week 3 matchups of the held-out season with the predicted flag.
week_cols = ['away_team', 'home_team', 'coverage_flg_pred']
this_season.loc[this_season['week'] == 3, week_cols]

Unnamed: 0,away_team,home_team,coverage_flg_pred
1404,NYG,SF,0
1405,IND,BAL,1
1406,TEN,CLE,0
1407,ATL,DET,0
1408,NO,GB,0
1409,HOU,JAX,0
1410,DEN,MIA,0
1411,LAC,MIN,1
1412,NE,NYJ,0
1413,BUF,WAS,0


In [20]:
# Week 4 matchups of the held-out season with the predicted flag.
week_cols = ['away_team', 'home_team', 'coverage_flg_pred']
this_season.loc[this_season['week'] == 4, week_cols]

Unnamed: 0,away_team,home_team,coverage_flg_pred
1420,DET,GB,1
1421,ATL,JAX,1
1422,MIA,BUF,0
1423,MIN,CAR,0
1424,DEN,CHI,0
1425,BAL,CLE,0
1426,PIT,HOU,0
1427,LA,IND,1
1428,TB,NO,0
1429,WAS,PHI,0


In [None]:
# Persist the fitted pipeline. The target directory is created first so the
# dump does not fail on a fresh checkout where it does not exist yet.
Path("../artifacts").mkdir(parents=True, exist_ok=True)
dump(model, "../artifacts/model.joblib")