In [1]:
#pip install torch --index-url https://download.pytorch.org/whl/cu118
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split


data = pd.read_csv("../data/model_data.csv")


In [2]:
home_surfaces = data[[i for i in data.columns if ('surface_' in i)&('away_' not in i)]].copy()
away_surfaces = data[[i for i in data.columns if ('surface_' in i)&('away_' in i)]].copy()

surface_interactions = pd.DataFrame()
for col in home_surfaces.columns:
    for acol in away_surfaces.columns:
        surface_interactions[f"{col}*{acol}"] = (
            home_surfaces[col]*away_surfaces[acol]
        )

sit = surface_interactions.T

surface_interactions = sit[sit.sum(axis=1)>30].T

data[list(surface_interactions.columns)] = surface_interactions

data['coverage_flg'] = np.where(data['coverage']>0,1,0)

In [3]:
this_season = data[data['season']==2023].copy()

model_data = data[data['season']<2023].copy()
model_data = model_data[model_data['coverage']!=0].copy()

exclusions = ['away_team','home_team','away_score','home_score','result','season','coverage','coverage_flg']

first_features = [
    i for i in model_data.columns if 
    (i not in exclusions)&
    ('_team_id_' not in i)&
    ('2pt' not in i)
]
target = 'coverage_flg'
spread = 'spread_line'
result = 'result'



In [4]:
Xtrain,Xtest,ytrain,ytest=train_test_split(model_data[first_features],model_data[target],
                                           test_size=.2,random_state=42,shuffle=True)

In [5]:
corr = pd.concat([Xtrain,ytrain],axis=1).corr(numeric_only=True,method='pearson')[target].sort_values()
corr.sort_values(ascending=False)
feats_used = corr[(abs(corr)>.04)&(abs(corr)<1)]
features = [i for i in feats_used.index if i != target]

In [6]:
pd.set_option("display.max_rows",164)
corr.sort_values(ascending=False)


coverage_flg                                1.000000e+00
week                                        5.472462e-02
away_cumemeanresult_shift1                  5.445879e-02
away_rushing_fumbles                        5.046088e-02
home_receiving_fumbles                      4.751360e-02
home_receiving_tds                          4.650585e-02
home_pacr                                   4.369584e-02
away_special_teams_tds                      4.165177e-02
diff_passing_tds                            4.061924e-02
away_rushing_first_downs                    4.057666e-02
surface_matrixturf*away_surface_grass       3.859767e-02
away_rushing_yards                          3.857847e-02
diff_receiving_tds                          3.705187e-02
away_rushing_epa                            3.638679e-02
away_fantasy_points                         3.566433e-02
home_carries                                3.547623e-02
diff_sack_fumbles                           3.538471e-02
diff_passing_epa               

In [7]:
features

['cumemeanresult_diff',
 'diff_special_teams_tds',
 'away_surface_astroturf',
 'surface_grass*away_surface_astroturf',
 'home_cumemeanresult_shift1',
 'home_attempts',
 'spread_line',
 'away_sack_fumbles_lost',
 'home_passing_air_yards',
 'away_rushing_first_downs',
 'diff_passing_tds',
 'away_special_teams_tds',
 'home_pacr',
 'home_receiving_tds',
 'home_receiving_fumbles',
 'away_rushing_fumbles',
 'away_cumemeanresult_shift1',
 'week']

In [8]:
ytrain.mean()

0.477124183006536

In [9]:
Xtrain,Xtest,ytrain,ytest,strain,stest=train_test_split(
    model_data[features],model_data[target],
    ((model_data['season']-min(model_data['season']))/(max(model_data['season'])-min(model_data['season'])))*0+1,
    test_size=.2,random_state=42,shuffle=True)



In [10]:
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SequentialFeatureSelector

from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [11]:


inner = SGDClassifier(loss='perceptron',penalty='l2',alpha=.0001,
                      fit_intercept=False,
                      early_stopping=True,shuffle=True,
                      validation_fraction=.1,n_iter_no_change=25,random_state=42)

inner = KNeighborsClassifier(n_neighbors=60,weights='distance')
#inner = RandomForestClassifier(random_state=42,n_estimators=35,max_depth=4)
model = Pipeline(steps=[
    ('poly',PolynomialFeatures(degree=2,interaction_only=False,include_bias=True)),
    ('VarianceThreshold',VarianceThreshold()),
    ('scaler',StandardScaler()),
    ('selector',SelectKBest(k=15)),
    #('selector',SequentialFeatureSelector(estimator=inner)),
    ('learner',inner
    )
]).fit(Xtrain.fillna(0),ytrain)#,learner__sample_weight=strain)

coverage_flg_pred = pd.Series(model.predict(Xtest.fillna(0)),Xtest.index)
coverage_flg_fit = pd.Series(model.predict(Xtrain.fillna(0)),Xtrain.index)

In [12]:
print("Training: ")
result = pd.DataFrame()
result[target] = ytrain
result[target+'_pred'] = coverage_flg_fit

result['outcome'] = np.where(
    (result['coverage_flg']==0)&(result['coverage_flg_pred']==0),'TN',
    np.where(
    (result['coverage_flg']==1)&(result['coverage_flg_pred']==1),'TP',
    np.where(
    (result['coverage_flg']==1)&(result['coverage_flg_pred']==0),'FN','FP')))

result['correct'] = np.where(result['outcome'].str[0]=='T','T','F')
result['cnt']=1
conf = result.groupby(['outcome'])['cnt'].sum()
acc = result.groupby(['correct'])['cnt'].sum()

if 'FN' in conf.index:
    print(conf)
    print(acc)
    print('recall: ',conf['TP']/(conf['TP']+conf['FN']))
    print('precision: ',conf['TP']/(conf['TP']+conf['FP']))
    print('selectivity: ',conf['TN']/(conf['TN']+conf['FP']))
    print('neg pred val: ',conf['TN']/(conf['TN']+conf['FN']))
    print('accuracy: ',acc['T']/(acc['T']+acc['F']))

print("Test: ")
result = pd.DataFrame()
result[target] = ytest
result[target+'_pred'] = coverage_flg_pred

result['outcome'] = np.where(
    (result['coverage_flg']==0)&(result['coverage_flg_pred']==0),'TN',
    np.where(
    (result['coverage_flg']==1)&(result['coverage_flg_pred']==1),'TP',
    np.where(
    (result['coverage_flg']==1)&(result['coverage_flg_pred']==0),'FN','FP')))
result['correct'] = np.where(result['outcome'].str[0]=='T','T','F')
result['cnt']=1
conf = result.groupby(['outcome'])['cnt'].sum()
acc = result.groupby(['correct'])['cnt'].sum()
print(conf)
print(acc)
print('recall: ',conf['TP']/(conf['TP']+conf['FN']))
print('precision: ',conf['TP']/(conf['TP']+conf['FP']))
print('selectivity: ',conf['TN']/(conf['TN']+conf['FP']))
print('neg pred val: ',conf['TN']/(conf['TN']+conf['FN']))
print('accuracy: ',acc['T']/(acc['T']+acc['F']))

Training: 
Test: 
outcome
FN    81
FP    46
TN    94
TP    47
Name: cnt, dtype: int64
correct
F    127
T    141
Name: cnt, dtype: int64
recall:  0.3671875
precision:  0.5053763440860215
selectivity:  0.6714285714285714
neg pred val:  0.5371428571428571
accuracy:  0.5261194029850746


In [13]:
allY = pd.concat([ytrain,ytest])
allX = pd.concat([Xtrain,Xtest])

model.fit(allX.fillna(0),allY)

In [14]:
result.mean(numeric_only=True)

coverage_flg         0.477612
coverage_flg_pred    0.347015
cnt                  1.000000
dtype: float64

In [15]:
this_seasonX = this_season[features].copy()
this_season_coverage_flg_prediction = pd.Series(
    model.predict(this_seasonX.fillna(0)),
    this_seasonX.index)
this_season['coverage_flg_pred'] = this_season_coverage_flg_prediction

In [16]:

this_season[this_season['week']==1][['away_team','home_team','coverage_flg_pred']]

Unnamed: 0,away_team,home_team,coverage_flg_pred
1372,DET,KC,0
1373,CAR,ATL,0
1374,HOU,BAL,0
1375,CIN,CLE,0
1376,JAX,IND,1
1377,TB,MIN,0
1378,TEN,NO,1
1379,SF,PIT,0
1380,ARI,WAS,0
1381,GB,CHI,1


In [17]:
this_season[this_season['week']==2][['away_team','home_team','coverage_flg_pred']]

Unnamed: 0,away_team,home_team,coverage_flg_pred
1388,MIN,PHI,0
1389,GB,ATL,1
1390,LV,BUF,1
1391,BAL,CIN,1
1392,SEA,DET,0
1393,IND,HOU,0
1394,KC,JAX,0
1395,CHI,TB,0
1396,LAC,TEN,1
1397,NYG,ARI,0


In [18]:
this_season[this_season['week']==3][['away_team','home_team','coverage_flg_pred']]

Unnamed: 0,away_team,home_team,coverage_flg_pred
1404,NYG,SF,0
1405,IND,BAL,0
1406,TEN,CLE,0
1407,ATL,DET,1
1408,NO,GB,0
1409,HOU,JAX,0
1410,DEN,MIA,0
1411,LAC,MIN,0
1412,NE,NYJ,0
1413,BUF,WAS,0


In [19]:
this_season[this_season['week']==4][['away_team','home_team','coverage_flg_pred']]

Unnamed: 0,away_team,home_team,coverage_flg_pred
1420,DET,GB,0
1421,ATL,JAX,1
1422,MIA,BUF,0
1423,MIN,CAR,0
1424,DEN,CHI,1
1425,BAL,CLE,0
1426,PIT,HOU,0
1427,LA,IND,0
1428,TB,NO,0
1429,WAS,PHI,0


In [20]:
this_season[this_season['week']<4]['coverage_flg_pred'].mean()

0.3958333333333333

In [21]:
this_season[this_season['week']<4]['coverage_flg'].mean()

0.3958333333333333

In [22]:
this_season['coverage_flg']

1372    0
1373    1
1374    1
1375    1
1376    0
       ..
1639    0
1640    0
1641    0
1642    0
1643    0
Name: coverage_flg, Length: 272, dtype: int32