# Position Performance

This notebook looks at a position as a whole, and then predicts the number of fantasy points. 

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,  make_column_selector as selector


from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score

from sklearn.dummy import DummyRegressor

from sklearn.metrics import plot_roc_curve

from xgboost import XGBRegressor
from sklearn.decomposition import PCA

from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression,ElasticNet,Ridge,Lasso
from sklearn.svm import SVR

In [2]:
import math
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import MeanSquaredLogarithmicError

In [47]:
df = pd.read_csv('./QB_df.csv')

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1264 entries, 0 to 1263
Data columns (total 26 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Player    1264 non-null   object 
 1   PPR       1264 non-null   float64
 2   Day       1264 non-null   object 
 3   Week      1264 non-null   int64  
 4   Age       1264 non-null   int64  
 5   Team      1264 non-null   object 
 6   Home      1264 non-null   int64  
 7   Opp       1264 non-null   object 
 8   Pos       1264 non-null   object 
 9   Dome      1264 non-null   int64  
 10  RANK      1264 non-null   int64  
 11  PTS       1264 non-null   int64  
 12  SACKS     1264 non-null   int64  
 13  INT       1264 non-null   int64  
 14  TO        1264 non-null   int64  
 15  PTS.1     1264 non-null   int64  
 16  PASS YDS  1264 non-null   int64  
 17  PASS TD   1264 non-null   int64  
 18  RUSH YDS  1264 non-null   int64  
 19  RUSH TD   1264 non-null   int64  
 20  RZ ATT    1264 non-null   int6

In [49]:
df.head()

Unnamed: 0,Player,PPR,Day,Week,Age,Team,Home,Opp,Pos,Dome,...,PASS YDS,PASS TD,RUSH YDS,RUSH TD,RZ ATT,RZ TD,RZ TD%,1D,3D%,4D%
0,Aaron Rodgers,30.76,Sun,1,36,GNB,0,MIN,QB,1,...,4141,30,2151,19,58,34,58.6,366,39.5,70.8
1,Philip Rivers,10.86,Sun,2,38,IND,1,MIN,QB,2,...,4141,30,2151,19,58,34,58.6,366,39.5,70.8
2,Ryan Tannehill,12.74,Sun,3,32,TEN,0,MIN,QB,1,...,4141,30,2151,19,58,34,58.6,366,39.5,70.8
3,Deshaun Watson,20.9,Sun,4,25,HOU,1,MIN,QB,2,...,4141,30,2151,19,58,34,58.6,366,39.5,70.8
4,Russell Wilson,24.48,Sun,5,31,SEA,1,MIN,QB,1,...,4141,30,2151,19,58,34,58.6,366,39.5,70.8


# Profile Report for DF

This section may take a few minutes to run.

In [50]:
from pandas_profiling import ProfileReport

In [51]:
profile = ProfileReport(df, title="QBs")

In [86]:
#uncomment below to run the ProfileReport
#profile.to_notebook_iframe()

In [53]:
df.corr()

Unnamed: 0,PPR,Week,Age,Home,Dome,RANK,PTS,SACKS,INT,TO,...,PASS YDS,PASS TD,RUSH YDS,RUSH TD,RZ ATT,RZ TD,RZ TD%,1D,3D%,4D%
PPR,1.0,-0.099635,0.065807,0.013342,0.057212,0.148668,-0.067867,-0.095968,-0.136237,-0.113721,...,0.050487,0.080389,0.009999,0.072268,0.044091,0.074942,0.096432,0.042347,0.121992,0.075191
Week,-0.099635,1.0,0.026114,-0.003987,0.016491,0.006681,0.090799,0.181135,0.171281,0.204267,...,0.288048,0.244829,0.269814,0.211431,0.277822,0.255232,0.040401,0.30621,0.055536,0.031593
Age,0.065807,0.026114,1.0,-0.010546,0.084374,0.052273,-0.051362,-0.032109,-0.063724,-0.052003,...,0.021112,0.044322,-0.000895,0.012593,0.029869,0.025712,-0.019944,0.018827,0.02574,0.02627
Home,0.013342,-0.003987,-0.010546,1.0,-0.011101,0.018685,-0.011774,-0.007796,-0.023234,-0.014793,...,0.003123,0.006211,-0.007946,0.000762,0.009939,0.008455,0.004263,0.004534,0.006244,-0.002044
Dome,0.057212,0.016491,0.084374,-0.011101,1.0,0.164985,-0.097315,-0.074468,0.040319,0.034892,...,0.046459,0.096974,0.058038,0.070292,0.035927,0.101643,0.155397,0.053351,0.11423,0.048283
RANK,0.148668,0.006681,0.052273,0.018685,0.164985,1.0,-0.404927,-0.423231,-0.500096,-0.480682,...,0.243014,0.450753,0.375148,0.557119,0.41304,0.524122,0.364305,0.299069,0.587826,0.377945
PTS,-0.067867,0.090799,-0.051362,-0.011774,-0.097315,-0.404927,1.0,0.330064,0.42681,0.478748,...,0.106369,0.046176,0.021604,-0.06286,0.039724,0.018477,-0.02051,0.08977,-0.141337,-0.156678
SACKS,-0.095968,0.181135,-0.032109,-0.007796,-0.074468,-0.423231,0.330064,1.0,0.465506,0.570068,...,0.35301,0.116315,0.25533,0.104573,0.287459,0.103123,-0.255647,0.34974,-0.31113,-0.274496
INT,-0.136237,0.171281,-0.063724,-0.023234,0.040319,-0.500096,0.42681,0.465506,1.0,0.898087,...,0.352881,0.126498,0.258163,0.003079,0.226442,0.11288,-0.158114,0.315704,-0.260822,-0.116761
TO,-0.113721,0.204267,-0.052003,-0.014793,0.034892,-0.480682,0.478748,0.570068,0.898087,1.0,...,0.446518,0.259777,0.34619,0.062671,0.328464,0.213977,-0.122985,0.432737,-0.199236,-0.147132


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1264 entries, 0 to 1263
Data columns (total 26 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Player    1264 non-null   object 
 1   PPR       1264 non-null   float64
 2   Day       1264 non-null   object 
 3   Week      1264 non-null   int64  
 4   Age       1264 non-null   int64  
 5   Team      1264 non-null   object 
 6   Home      1264 non-null   int64  
 7   Opp       1264 non-null   object 
 8   Pos       1264 non-null   object 
 9   Dome      1264 non-null   int64  
 10  RANK      1264 non-null   int64  
 11  PTS       1264 non-null   int64  
 12  SACKS     1264 non-null   int64  
 13  INT       1264 non-null   int64  
 14  TO        1264 non-null   int64  
 15  PTS.1     1264 non-null   int64  
 16  PASS YDS  1264 non-null   int64  
 17  PASS TD   1264 non-null   int64  
 18  RUSH YDS  1264 non-null   int64  
 19  RUSH TD   1264 non-null   int64  
 20  RZ ATT    1264 non-null   int6

# Train Test Split of QB data

In [55]:
X = df.drop(['PPR', 'Pos'], axis=1)
y = df['PPR']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42,test_size=.2)

# Subpipe for Numericals and Categoricals

In [56]:
#subpipes that scale numeric data and use one hot encoder on categorical 
subpipe_num = Pipeline(steps=[
    ('ss', StandardScaler())
])


subpipe_cat = Pipeline(steps=[
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

In [57]:
#category features to be one hot encoded
cat_feat = ['Player', 'Team', 'Opp', 'Day']

In [58]:
ss = StandardScaler()
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

#Create a column transformer to apply the subpipes and transform the data
CT = ColumnTransformer(transformers=[
    ('ss', ss, selector(dtype_include=np.number)),
     ('ohe', ohe, cat_feat)], remainder='passthrough')

In [59]:
X_train_CT = CT.fit_transform(X_train)

In [60]:
X_test_CT = CT.transform(X_test)

In [61]:
X_train.shape

(1011, 24)

In [62]:
pd.DataFrame(CT.fit_transform(X_train)).shape

(1011, 186)

# Dummy Model

In [63]:
#Create a pipeline for dummy model
dummy_model_pipe = Pipeline(steps=[
    ('ct', CT),
    ('dum', DummyRegressor(strategy='median'))
])

In [64]:
dummy_model_pipe.fit(X_train, y_train)
dummy_model_pipe.score(X_train,y_train)

-6.967700962468015e-05

In [65]:
X_train

Unnamed: 0,Player,Day,Week,Age,Team,Home,Opp,Dome,RANK,PTS,...,PASS YDS,PASS TD,RUSH YDS,RUSH TD,RZ ATT,RZ TD,RZ TD%,1D,3D%,4D%
32,Taylor Heinicke,Sun,16,27,WAS,1,CAR,0,18,18,...,3825,28,1936,17,57,36,63.2,360,49.2,60.0
156,Kyler Murray,Sun,5,23,ARI,0,NYJ,1,26,8,...,4409,34,1792,16,60,36,60.0,381,44.6,41.7
382,Patrick Mahomes,Thu,1,24,KAN,1,HOU,1,27,6,...,4104,30,2564,24,63,40,63.5,390,47.5,52.4
1166,Tom Brady,Sun,8,44,TAM,0,NOR,1,4,12,...,3821,20,1589,12,46,20,43.5,304,37.1,42.9
365,Nick Foles,Thu,5,31,CHI,1,TAM,0,9,8,...,3945,29,1289,10,51,32,62.7,319,40.0,56.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044,Tyler Huntley,Sun,17,23,BAL,1,LAR,1,15,0,...,4109,17,1754,18,56,29,51.8,347,41.3,53.1
1095,Josh Rosen,Thu,11,24,ATL,1,NWE,1,2,18,...,3181,21,2103,9,48,23,47.9,308,36.5,53.1
1130,Joe Flacco,Sun,10,36,NYJ,1,BUF,0,1,8,...,2771,12,1866,19,45,23,51.1,285,30.8,45.7
860,Matthew Stafford,Sun,16,33,LAR,0,MIN,2,24,12,...,4300,29,2222,15,59,33,55.9,391,36.4,50.0


# Simple Regression Models

In [66]:
#These are the various regression models we will try initially
regressors = [
    
    LinearRegression(),
    Ridge(),
    Lasso(),
    SVR(),
]

In [67]:
for regressor in regressors:
    steps = [
        ('ct', CT),
        ('rg', regressor)
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)   
    print(regressor)
    print("model score: %.3f" % pipeline.score(X_train, y_train))
    

LinearRegression()
model score: 0.378
Ridge()
model score: 0.404
Lasso()
model score: 0.012
SVR()
model score: 0.132


# XGBoost

In [68]:
import xgboost as xgb

In [69]:
GB = xgb.XGBRegressor(random_state=42, max_depth=4, n_estimators=100)

In [70]:
xgb_model_pipe = Pipeline(steps=[
    ('ct', CT),
    ('xgb', GB)
])

In [71]:
xgb_model_pipe.fit(X_train, y_train)
xgb_model_pipe.score(X_train,y_train)

0.7500141871345987

In [72]:
xgb_model_pipe.score(X_test,y_test)

0.21205979677222242

In [73]:
#Looking at feature importance
GB.get_booster().get_score(importance_type='gain')

{'f66': 526.2905901999999,
 'f4': 115.88009255757576,
 'f0': 67.12087131738333,
 'f7': 244.3408587855556,
 'f119': 68.40828815,
 'f138': 483.57156185,
 'f53': 374.46563225999995,
 'f40': 529.913208,
 'f161': 159.015625,
 'f70': 314.7771311,
 'f1': 199.14098079643904,
 'f18': 129.45412142608697,
 'f17': 248.77382642000003,
 'f3': 81.88684156958001,
 'f10': 138.8399765277778,
 'f169': 278.588867,
 'f91': 356.11494895,
 'f6': 115.2795448417143,
 'f11': 166.50260875466668,
 'f74': 309.3538882125,
 'f16': 111.51375825384618,
 'f158': 86.38023082,
 'f14': 125.25689080833332,
 'f20': 340.5799782,
 'f75': 314.0736483999999,
 'f90': 309.78265242000003,
 'f19': 116.86672085730771,
 'f12': 92.06122061481481,
 'f174': 155.478516,
 'f8': 90.84303174499999,
 'f22': 338.098684,
 'f94': 220.60663591666665,
 'f13': 212.3426897,
 'f2': 86.12549502666666,
 'f180': 119.72263699999999,
 'f5': 73.7384912011111,
 'f9': 86.56994494272726,
 'f15': 155.8732691,
 'f59': 181.6513300833333,
 'f26': 166.29421446249

In [74]:
GB.feature_importances_

array([0.00418511, 0.01241679, 0.00537008, 0.00510579, 0.00722533,
       0.00459773, 0.00718788, 0.01523509, 0.00566422, 0.00539779,
       0.00865692, 0.01038173, 0.00574018, 0.01323994, 0.00780999,
       0.00971898, 0.00695308, 0.01551149, 0.00807169, 0.00728685,
       0.02123577, 0.        , 0.02108105, 0.00994354, 0.00196647,
       0.        , 0.01036874, 0.00968543, 0.        , 0.00847222,
       0.00552043, 0.00806227, 0.00483218, 0.        , 0.00675368,
       0.0082602 , 0.00165254, 0.00604296, 0.00318253, 0.01079367,
       0.03304103, 0.00634406, 0.00126111, 0.00518734, 0.01357494,
       0.        , 0.00763723, 0.00613628, 0.00758719, 0.        ,
       0.        , 0.00211854, 0.00301943, 0.0233486 , 0.00181128,
       0.        , 0.01275965, 0.        , 0.00246342, 0.01132628,
       0.        , 0.00403995, 0.00795726, 0.00515214, 0.00339486,
       0.00338248, 0.03281515, 0.00301169, 0.00294123, 0.        ,
       0.01962691, 0.00241162, 0.00566237, 0.        , 0.01928