In [1]:
#pip install torch --index-url https://download.pytorch.org/whl/cu118
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split


data = pd.read_csv("../data/model_data.csv")


In [2]:
home_surfaces = data[[i for i in data.columns if ('surface_' in i)&('away_' not in i)]].copy()
away_surfaces = data[[i for i in data.columns if ('surface_' in i)&('away_' in i)]].copy()

surface_interactions = pd.DataFrame()
for col in home_surfaces.columns:
    for acol in away_surfaces.columns:
        surface_interactions[f"{col}*{acol}"] = (
            home_surfaces[col]*away_surfaces[acol]
        )

sit = surface_interactions.T

surface_interactions = sit[sit.sum(axis=1)>30].T

data[list(surface_interactions.columns)] = surface_interactions

data['coverage_flg'] = np.where(data['coverage']>0,1,0)

In [3]:
this_season = data[data['season']==2023].copy()

model_data = data[data['season']<2023].copy()
model_data = model_data[model_data['coverage']!=0].copy()

exclusions = ['away_team','home_team','away_score','home_score','result','season','coverage','coverage_flg']

first_features = [
    i for i in model_data.columns if 
    (i not in exclusions)&
    ('_team_id_' not in i)&
    ('2pt' not in i)
]
target = 'coverage_flg'
spread = 'spread_line'
result = 'result'



In [4]:
Xtrain,Xtest,ytrain,ytest=train_test_split(model_data[first_features],model_data[target],
                                           test_size=.1,random_state=42,shuffle=True)

In [5]:
corr = pd.concat([Xtrain,ytrain],axis=1).corr(numeric_only=True,method='pearson')[target].sort_values()
corr.sort_values(ascending=False)
feats_used = corr[(abs(corr)>.038)&(abs(corr)<1)]
features = [i for i in feats_used.index if i != target]

In [6]:
model_data[['home_cumemeanpoints_scored_shift1','home_cumemeanpoints_allowed_shift1',
            'away_cumemeanpoints_scored_shift1','away_cumemeanpoints_allowed_shift1',
            'cumemeanpoints_allowed_diff','cumemeanpoints_scored_diff',
            'away_offense_power','home_offense_power',
            'coverage_flg'
            ]].corr()['coverage_flg']

home_cumemeanpoints_scored_shift1    -0.020936
home_cumemeanpoints_allowed_shift1    0.013423
away_cumemeanpoints_scored_shift1     0.015886
away_cumemeanpoints_allowed_shift1   -0.039493
cumemeanpoints_allowed_diff           0.056527
cumemeanpoints_scored_diff           -0.038419
away_offense_power                    0.003151
home_offense_power                    0.016625
coverage_flg                          1.000000
Name: coverage_flg, dtype: float64

In [7]:
pd.set_option("display.max_rows",164)
corr.sort_values(ascending=False)


coverage_flg                                1.000000
home_pacr                                   0.052029
week                                        0.050180
home_receiving_fumbles                      0.050001
away_cumemeanresult_shift1                  0.047333
home_receiving_tds                          0.043258
away_fantasy_points                         0.043080
surface_matrixturf                          0.042944
away_rushing_epa                            0.042569
diff_sack_fumbles_lost                      0.042173
cumemeanpoints_allowed_diff                 0.041616
surface_matrixturf*away_surface_grass       0.039669
away_rushing_fumbles                        0.038733
diff_receiving_fumbles                      0.036003
away_surface_grass                          0.034810
away_receiving_air_yards                    0.034297
home_carries                                0.033915
away_fantasy_points_ppr                     0.033844
home_rushing_yards                          0.

In [8]:
features

['away_sack_fumbles_lost',
 'cumemeanresult_diff',
 'surface_grass*away_surface_astroturf',
 'home_attempts',
 'away_surface_astroturf',
 'home_passing_air_yards',
 'cumemeanpoints_scored_diff',
 'diff_special_teams_tds',
 'away_surface_matrixturf',
 'spread_line',
 'diff_passing_air_yards',
 'home_completions',
 'away_rushing_fumbles',
 'surface_matrixturf*away_surface_grass',
 'cumemeanpoints_allowed_diff',
 'diff_sack_fumbles_lost',
 'away_rushing_epa',
 'surface_matrixturf',
 'away_fantasy_points',
 'home_receiving_tds',
 'away_cumemeanresult_shift1',
 'home_receiving_fumbles',
 'week',
 'home_pacr']

In [9]:
ytrain.mean()

0.47717842323651455

In [10]:
Xtrain,Xtest,ytrain,ytest,strain,stest=train_test_split(
    model_data[features],model_data[target],
    ((model_data['season']-min(model_data['season']))/(max(model_data['season'])-min(model_data['season'])))*0+1,
    test_size=.2,random_state=42,shuffle=True)



In [11]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SequentialFeatureSelector

from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


from sklearn.impute import KNNImputer

In [12]:


inner = SGDClassifier(loss='perceptron',penalty='l2',alpha=.0001,
                      fit_intercept=False,
                      early_stopping=True,shuffle=True,
                      validation_fraction=.1,n_iter_no_change=26,random_state=42)


inner = KNeighborsClassifier(n_neighbors=15,weights='uniform')
#inner = RandomForestClassifier(random_state=42,n_estimators=35,max_depth=6,min_samples_leaf=5)
#inner = SVC(C=1,kernel='rbf',)
model = Pipeline(steps=[
    #('imputer',KNNImputer(n_neighbors=5)),
    ('poly',PolynomialFeatures(degree=2,interaction_only=False,include_bias=True)),
    ('VarianceThreshold',VarianceThreshold()),
    ('scaler',StandardScaler()),
    ('selector',SelectKBest(k=45)),
    ('selector2',SequentialFeatureSelector(estimator=inner,n_features_to_select=5,n_jobs=-1)),
    ('learner',inner
    )
]).fit(Xtrain.fillna(0),ytrain)#,learner__sample_weight=strain)

coverage_flg_pred = pd.Series(model.predict(Xtest.fillna(0)),Xtest.index)
coverage_flg_fit = pd.Series(model.predict(Xtrain.fillna(0)),Xtrain.index)

In [13]:
print("Training: ")
result = pd.DataFrame()
result[target] = ytrain
result[target+'_pred'] = coverage_flg_fit

result['outcome'] = np.where(
    (result['coverage_flg']==0)&(result['coverage_flg_pred']==0),'TN',
    np.where(
    (result['coverage_flg']==1)&(result['coverage_flg_pred']==1),'TP',
    np.where(
    (result['coverage_flg']==1)&(result['coverage_flg_pred']==0),'FN','FP')))

result['correct'] = np.where(result['outcome'].str[0]=='T','T','F')
result['cnt']=1
conf = result.groupby(['outcome'])['cnt'].sum()
acc = result.groupby(['correct'])['cnt'].sum()

if 'FN' in conf.index:
    print(conf)
    print(acc)
    print('recall: ',conf['TP']/(conf['TP']+conf['FN']))
    print('precision: ',conf['TP']/(conf['TP']+conf['FP']))
    print('selectivity: ',conf['TN']/(conf['TN']+conf['FP']))
    print('neg pred val: ',conf['TN']/(conf['TN']+conf['FN']))
    print('accuracy: ',acc['T']/(acc['T']+acc['F']))

print("Test: ")
result = pd.DataFrame()
result[target] = ytest
result[target+'_pred'] = coverage_flg_pred

result['outcome'] = np.where(
    (result['coverage_flg']==0)&(result['coverage_flg_pred']==0),'TN',
    np.where(
    (result['coverage_flg']==1)&(result['coverage_flg_pred']==1),'TP',
    np.where(
    (result['coverage_flg']==1)&(result['coverage_flg_pred']==0),'FN','FP')))
result['correct'] = np.where(result['outcome'].str[0]=='T','T','F')
result['cnt']=1
conf = result.groupby(['outcome'])['cnt'].sum()
acc = result.groupby(['correct'])['cnt'].sum()
print(conf)
print(acc)
print('recall: ',conf['TP']/(conf['TP']+conf['FN']))
print('precision: ',conf['TP']/(conf['TP']+conf['FP']))
print('selectivity: ',conf['TN']/(conf['TN']+conf['FP']))
print('neg pred val: ',conf['TN']/(conf['TN']+conf['FN']))
print('accuracy: ',acc['T']/(acc['T']+acc['F']))

Training: 
outcome
FN    231
FP    162
TN    398
TP    280
Name: cnt, dtype: int64
correct
F    393
T    678
Name: cnt, dtype: int64
recall:  0.547945205479452
precision:  0.6334841628959276
selectivity:  0.7107142857142857
neg pred val:  0.6327503974562798
accuracy:  0.6330532212885154
Test: 
outcome
FN    63
FP    61
TN    79
TP    65
Name: cnt, dtype: int64
correct
F    124
T    144
Name: cnt, dtype: int64
recall:  0.5078125
precision:  0.5158730158730159
selectivity:  0.5642857142857143
neg pred val:  0.5563380281690141
accuracy:  0.5373134328358209


In [14]:
allY = pd.concat([ytrain,ytest])
allX = pd.concat([Xtrain,Xtest])

model.fit(allX.fillna(0),allY)

In [15]:
result.mean(numeric_only=True)

coverage_flg         0.477612
coverage_flg_pred    0.470149
cnt                  1.000000
dtype: float64

In [16]:
this_seasonX = this_season[features].copy()
this_season_coverage_flg_prediction = pd.Series(
    model.predict(this_seasonX.fillna(0)),
    this_seasonX.index)
this_season['coverage_flg_pred'] = this_season_coverage_flg_prediction
this_season['_correct']=np.where(this_season['coverage_flg']==this_season['coverage_flg_pred'],1,0)

In [17]:

this_season[this_season['week']==1][['away_team','home_team','coverage_flg_pred','coverage_flg','_correct']]

Unnamed: 0,away_team,home_team,coverage_flg_pred,coverage_flg,_correct
1372,DET,KC,0,0,1
1373,CAR,ATL,0,1,0
1374,HOU,BAL,0,1,0
1375,CIN,CLE,0,1,0
1376,JAX,IND,0,0,1
1377,TB,MIN,0,0,1
1378,TEN,NO,0,0,1
1379,SF,PIT,0,0,1
1380,ARI,WAS,0,0,1
1381,GB,CHI,0,0,1


In [18]:
this_season[this_season['week']==2][['away_team','home_team','coverage_flg_pred','coverage_flg','_correct']]

Unnamed: 0,away_team,home_team,coverage_flg_pred,coverage_flg,_correct
1388,MIN,PHI,0,0,1
1389,GB,ATL,0,0,1
1390,LV,BUF,0,1,0
1391,BAL,CIN,0,0,1
1392,SEA,DET,0,0,1
1393,IND,HOU,0,0,1
1394,KC,JAX,0,0,1
1395,CHI,TB,0,1,0
1396,LAC,TEN,0,1,0
1397,NYG,ARI,0,1,0


In [19]:
this_season[this_season['week']==3][['away_team','home_team','coverage_flg_pred','coverage_flg','_correct']]

Unnamed: 0,away_team,home_team,coverage_flg_pred,coverage_flg,_correct
1404,NYG,SF,0,1,0
1405,IND,BAL,0,0,1
1406,TEN,CLE,1,1,1
1407,ATL,DET,0,1,0
1408,NO,GB,0,1,0
1409,HOU,JAX,0,0,1
1410,DEN,MIA,0,1,0
1411,LAC,MIN,0,0,1
1412,NE,NYJ,0,0,1
1413,BUF,WAS,0,0,1


In [20]:
this_season[this_season['week']==4][['away_team','home_team','coverage_flg_pred','coverage_flg','_correct']]

Unnamed: 0,away_team,home_team,coverage_flg_pred,coverage_flg,_correct
1420,DET,GB,1,0,0
1421,ATL,JAX,0,1,0
1422,MIA,BUF,1,1,1
1423,MIN,CAR,0,0,1
1424,DEN,CHI,0,0,1
1425,BAL,CLE,1,0,0
1426,PIT,HOU,0,1,0
1427,LA,IND,1,0,0
1428,TB,NO,0,0,1
1429,WAS,PHI,0,0,1


In [21]:
this_season[this_season['week']==5][['away_team','home_team','coverage_flg_pred','spread_line','coverage_flg','_correct']]

Unnamed: 0,away_team,home_team,coverage_flg_pred,spread_line,coverage_flg,_correct
1436,CHI,WAS,0,6.0,0,1
1437,JAX,BUF,0,5.5,0,1
1438,HOU,ATL,0,2.5,0,1
1439,CAR,DET,0,9.5,1,0
1440,TEN,IND,0,-2.5,1,0
1441,NYG,MIA,0,12.5,1,0
1442,NO,NE,0,2.5,0,1
1443,BAL,PIT,0,-4.5,1,0
1444,CIN,ARI,1,-3.0,0,0
1445,PHI,LA,0,-3.5,0,1


In [31]:
this_season[this_season['week']<5]['coverage_flg_pred'].mean()

0.203125

In [32]:
this_season[this_season['week']<4]['coverage_flg'].mean()

0.3958333333333333

In [33]:
poly = model['poly']

In [34]:
feats_after_var = [poly.get_feature_names_out()[i] for i in model['VarianceThreshold'].get_support(indices=True)]

In [35]:
first_45 = [feats_after_var[i] for i in model['selector'].get_support(indices=True)]

In [36]:
last_5 = [first_45[i] for i in model['selector2'].get_support(indices=True)]

In [37]:
last_5

['surface_grass*away_surface_astroturf',
 'surface_grass*away_surface_astroturf^2',
 'surface_grass*away_surface_astroturf away_rushing_epa',
 'away_surface_astroturf diff_special_teams_tds',
 'home_completions away_cumemeanresult_shift1']

In [29]:
'''
166+ features
Downselected by correlation to 30ish
Polynomial interactions calculated to 300+
Downselected by f-value of ANOVA to 45
Forward selected to final 5 by perceptron
These are the final 5 features. All are interaction features.
cumemeanresult_diff away_fantasy_points and home_pacr each appear in multiple features
'''

'\n166+ features\nDownselected by correlation to 30ish\nPolynomial interactions calculated to 300+\nDownselected by f-value of ANOVA to 45\nForward selected to final 5 by perceptron\nThese are the final 5 features. All are interaction features.\ncumemeanresult_diff away_fantasy_points and home_pacr each appear in multiple features\n'

In [30]:
import joblib

joblib.dump(model,'../artifacts/model.joblib')

['../artifacts/model.joblib']