### Read in libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Read in and subset nflfastR datasets

Note about EP vs. EPA
- EPA is the difference between the Expected Points (EP) at the beginning of the play compared to end of the play. It measures the plays impact on the score of the game.

In [5]:
YEARS = [2018, 2019, 2020]

fastR = pd.DataFrame()

for i in YEARS:
    i_data = pd.read_csv('https://github.com/guga31bb/nflfastR-data/blob/master/data/' \
                         'play_by_play_' + str(i) + '.csv.gz?raw=True',
                         compression='gzip', low_memory=False)

    fastR = fastR.append(i_data, sort=True)
    
fastR.reset_index(drop = True, inplace = True)

In [6]:
fastR.shape

(144422, 372)

In [7]:
fastR = fastR[['play_id', 'old_game_id', 'ep', 'epa', 'punt_inside_twenty', 'punt_in_endzone', 
               'punt_out_of_bounds', 'punt_downed', 'punt_fair_catch', 'play_type', 'return_team', 'weather',
               'surface', 'wind']]

fastR.head(2)

Unnamed: 0,play_id,old_game_id,season_type,ep,epa,punt_inside_twenty,punt_in_endzone,punt_out_of_bounds,punt_downed,punt_fair_catch,play_type,return_team,weather,surface,wind
0,1,2018090600,REG,,,,,,,,,,"Cloudy Temp: 81° F, Humidity: 71%, Wind: NNW 8...",grass,8.0
1,37,2018090600,REG,0.770222,-0.0,0.0,0.0,0.0,0.0,0.0,kickoff,,"Cloudy Temp: 81° F, Humidity: 71%, Wind: NNW 8...",grass,8.0


In [8]:
fastR = fastR[fastR['play_type'] == 'punt']
fastR = fastR.drop('play_type', axis = 1)

In [9]:
fastR.shape

(6526, 14)

In [10]:
# Rename columns to eventually merge with Kaggle data
fastR = fastR.rename(columns = {'play_id' : 'playId', 'old_game_id' : 'gameId'})
fastR.head(2)

Unnamed: 0,playId,gameId,season_type,ep,epa,punt_inside_twenty,punt_in_endzone,punt_out_of_bounds,punt_downed,punt_fair_catch,return_team,weather,surface,wind
16,366,2018090600,REG,-2.117271,-0.758654,0.0,0.0,0.0,0.0,0.0,ATL,"Cloudy Temp: 81° F, Humidity: 71%, Wind: NNW 8...",grass,8.0
35,872,2018090600,REG,-0.936326,0.410948,0.0,0.0,0.0,0.0,0.0,,"Cloudy Temp: 81° F, Humidity: 71%, Wind: NNW 8...",grass,8.0


### Read in and subset datasets from NFL Big Data Bowl

In [11]:
YEARS = [2018, 2019, 2020]

tracking = pd.DataFrame()

for i in YEARS:
    i_data = pd.read_csv('C:\\Users\\cmcle\\Documents\\NFL Big Data Bowl\\tracking' + str(i) + '.csv')

    tracking = tracking.append(i_data, sort=True)
    
tracking.reset_index(drop = True, inplace = True)

In [12]:
tracking = tracking.drop_duplicates(['gameId', 'playId'])
tracking = tracking[['gameId', 'playId', 'displayName', 'nflId', 'a', 's', 'dir', 'dis', 'o', 'x', 'y']]
tracking.head(2)

Unnamed: 0,gameId,playId,displayName,nflId,a,s,dir,dis,o,x,y
0,2018123000,36,Justin Tucker,39470.0,1.33,4.36,128.44,0.43,130.42,41.32,29.45
2415,2018123000,373,Sam Koch,31018.0,0.0,0.0,11.6,0.0,44.46,81.81,27.76


In [13]:
pff = pd.read_csv('C:\\Users\\cmcle\\Documents\\NFL Big Data Bowl\\PFFScoutingData.csv')
pff = pff[pff['kickType'].isin(['N', 'R', 'A'])]
pff = pff[['gameId', 'playId', 'hangTime', 'kickType']]
pff.head(2)

Unnamed: 0,gameId,playId,hangTime,kickType
1,2018090600,366,4.46,N
4,2018090600,872,4.35,N


In [14]:
games = pd.read_csv('C:\\Users\\cmcle\\Documents\\NFL Big Data Bowl\\games.csv')
games = games[['gameId', 'season', 'week', 'homeTeamAbbr', 'visitorTeamAbbr']]
games.head(2)

Unnamed: 0,gameId,season,week,homeTeamAbbr,visitorTeamAbbr
0,2018090600,2018,1,PHI,ATL
1,2018090900,2018,1,BAL,BUF


In [15]:
plays = pd.read_csv('C:\\Users\\cmcle\\Documents\\NFL Big Data Bowl\\plays.csv')
plays = plays[plays['specialTeamsPlayType'] == 'Punt']
plays = plays[['gameId', 'playId', 'playDescription', 'quarter', 'down', 'yardsToGo', 'specialTeamsPlayType',
              'kickLength', 'playResult', 'kickReturnYardage']]
plays.head(2)

Unnamed: 0,gameId,playId,playDescription,quarter,down,yardsToGo,specialTeamsPlayType,kickLength,playResult,kickReturnYardage
1,2018090600,366,"(9:20) C.Johnston punts 56 yards to ATL 36, Ce...",1,4,4,Punt,56.0,36,5.0
4,2018090600,872,"(:33) C.Johnston punts 65 yards to end zone, C...",1,4,18,Punt,65.0,45,


In [16]:
players = pd.read_csv('C:\\Users\\cmcle\\Documents\\NFL Big Data Bowl\\players.csv')
players = players[['nflId', 'collegeName', 'Position']]
players.head(2)

Unnamed: 0,nflId,collegeName,Position
0,42901,James Madison,SS
1,43501,Central Michigan,FS


### Merge datasets together

In [17]:
main = tracking.merge(players, on = ['nflId'])
main.head(2)

Unnamed: 0,gameId,playId,displayName,nflId,a,s,dir,dis,o,x,y,collegeName,Position
0,2018123000,36,Justin Tucker,39470.0,1.33,4.36,128.44,0.43,130.42,41.32,29.45,Texas,K
1,2018123000,392,Justin Tucker,39470.0,0.97,3.72,129.4,0.37,113.74,40.67,31.97,Texas,K


In [18]:
main = main.merge(plays, on = ['gameId', 'playId'])
main.head(2)

Unnamed: 0,gameId,playId,displayName,nflId,a,s,dir,dis,o,x,y,collegeName,Position,playDescription,quarter,down,yardsToGo,specialTeamsPlayType,kickLength,playResult,kickReturnYardage
0,2018123000,2165,Sam Koch,31018.0,0.0,0.0,268.97,0.0,278.65,65.77,28.85,Nebraska,P,(:42) (Punt formation) S.Koch punts 36 yards t...,2,4,17,Punt,36.0,36,
1,2018123000,2502,Sam Koch,31018.0,0.0,0.0,191.59,0.0,84.96,28.35,28.21,Nebraska,P,(12:50) (Punt formation) S.Koch punts 53 yards...,3,4,14,Punt,53.0,16,37.0


In [19]:
main = main.merge(games, on = 'gameId')
main.head(2)

Unnamed: 0,gameId,playId,displayName,nflId,a,s,dir,dis,o,x,y,collegeName,Position,playDescription,quarter,down,yardsToGo,specialTeamsPlayType,kickLength,playResult,kickReturnYardage,season,week,homeTeamAbbr,visitorTeamAbbr
0,2018123000,2165,Sam Koch,31018.0,0.0,0.0,268.97,0.0,278.65,65.77,28.85,Nebraska,P,(:42) (Punt formation) S.Koch punts 36 yards t...,2,4,17,Punt,36.0,36,,2018,17,BAL,CLE
1,2018123000,2502,Sam Koch,31018.0,0.0,0.0,191.59,0.0,84.96,28.35,28.21,Nebraska,P,(12:50) (Punt formation) S.Koch punts 53 yards...,3,4,14,Punt,53.0,16,37.0,2018,17,BAL,CLE


In [20]:
main = main.merge(pff, on = ['gameId', 'playId'])
main.head(2)

Unnamed: 0,gameId,playId,displayName,nflId,a,s,dir,dis,o,x,y,collegeName,Position,playDescription,quarter,down,yardsToGo,specialTeamsPlayType,kickLength,playResult,kickReturnYardage,season,week,homeTeamAbbr,visitorTeamAbbr,hangTime,kickType
0,2018123000,2165,Sam Koch,31018.0,0.0,0.0,268.97,0.0,278.65,65.77,28.85,Nebraska,P,(:42) (Punt formation) S.Koch punts 36 yards t...,2,4,17,Punt,36.0,36,,2018,17,BAL,CLE,4.56,A
1,2018123000,2502,Sam Koch,31018.0,0.0,0.0,191.59,0.0,84.96,28.35,28.21,Nebraska,P,(12:50) (Punt formation) S.Koch punts 53 yards...,3,4,14,Punt,53.0,16,37.0,2018,17,BAL,CLE,4.17,N


In [21]:
main = main.merge(fastR, on = ['gameId', 'playId'])
main.head(2)

Unnamed: 0,gameId,playId,displayName,nflId,a,s,dir,dis,o,x,y,collegeName,Position,playDescription,quarter,down,yardsToGo,specialTeamsPlayType,kickLength,playResult,kickReturnYardage,season,week,homeTeamAbbr,visitorTeamAbbr,hangTime,kickType,season_type,ep,epa,punt_inside_twenty,punt_in_endzone,punt_out_of_bounds,punt_downed,punt_fair_catch,return_team,weather,surface,wind
0,2018123000,2165,Sam Koch,31018.0,0.0,0.0,268.97,0.0,278.65,65.77,28.85,Nebraska,P,(:42) (Punt formation) S.Koch punts 36 yards t...,2,4,17,Punt,36.0,36,,2018,17,BAL,CLE,4.56,A,REG,0.372952,-0.331417,1.0,0.0,0.0,1.0,0.0,CLE,"Partly Cloudy Temp: 48° F, Humidity: 59%, Wind...",grass,2.0
1,2018123000,2502,Sam Koch,31018.0,0.0,0.0,191.59,0.0,84.96,28.35,28.21,Nebraska,P,(12:50) (Punt formation) S.Koch punts 53 yards...,3,4,14,Punt,53.0,16,37.0,2018,17,BAL,CLE,4.17,N,REG,-0.90379,-1.739167,0.0,0.0,0.0,0.0,0.0,CLE,"Partly Cloudy Temp: 48° F, Humidity: 59%, Wind...",grass,2.0


In [22]:
# Reorganize columns
main = main[['gameId', 'season', 'season_type', 'week', 'homeTeamAbbr', 'visitorTeamAbbr', 'playId', 
             'quarter', 'down', 'yardsToGo', 'nflId', 'displayName', 'Position', 'collegeName', 'playDescription',
             'specialTeamsPlayType', 'playResult', 'ep', 'epa', 'kickReturnYardage', 'kickType', 'hangTime', 
             'kickLength', 'punt_inside_twenty', 'punt_in_endzone', 
            'punt_out_of_bounds', 'punt_downed', 'punt_fair_catch', 'return_team', 'weather', 'surface', 'wind',
            'a', 's', 'dir', 'dis', 'o', 'x', 'y']]
main.head(2)

Unnamed: 0,gameId,season,season_type,week,homeTeamAbbr,visitorTeamAbbr,playId,quarter,down,yardsToGo,nflId,displayName,Position,collegeName,playDescription,specialTeamsPlayType,playResult,ep,epa,kickReturnYardage,kickType,hangTime,kickLength,punt_inside_twenty,punt_in_endzone,punt_out_of_bounds,punt_downed,punt_fair_catch,return_team,weather,surface,wind,a,s,dir,dis,o,x,y
0,2018123000,2018,REG,17,BAL,CLE,2165,2,4,17,31018.0,Sam Koch,P,Nebraska,(:42) (Punt formation) S.Koch punts 36 yards t...,Punt,36,0.372952,-0.331417,,A,4.56,36.0,1.0,0.0,0.0,1.0,0.0,CLE,"Partly Cloudy Temp: 48° F, Humidity: 59%, Wind...",grass,2.0,0.0,0.0,268.97,0.0,278.65,65.77,28.85
1,2018123000,2018,REG,17,BAL,CLE,2502,3,4,14,31018.0,Sam Koch,P,Nebraska,(12:50) (Punt formation) S.Koch punts 53 yards...,Punt,16,-0.90379,-1.739167,37.0,N,4.17,53.0,0.0,0.0,0.0,0.0,0.0,CLE,"Partly Cloudy Temp: 48° F, Humidity: 59%, Wind...",grass,2.0,0.0,0.0,191.59,0.0,84.96,28.35,28.21


### Preliminary Analysis and Create New Variables

In [23]:
# Correlation Matrix
main.corr().style.background_gradient(cmap = 'coolwarm')

  smin = np.nanmin(s.to_numpy()) if vmin is None else vmin
  smax = np.nanmax(s.to_numpy()) if vmax is None else vmax


Unnamed: 0,gameId,season,week,playId,quarter,down,yardsToGo,nflId,playResult,ep,epa,kickReturnYardage,hangTime,kickLength,punt_inside_twenty,punt_in_endzone,punt_out_of_bounds,punt_downed,punt_fair_catch,wind,a,s,dir,dis,o,x,y
gameId,1.0,0.989288,0.065103,-0.003136,-0.000853,,-0.01882,0.28649,0.006569,0.004951,-0.001866,0.011546,0.0297,0.035844,-0.019258,-0.010005,-0.00887,-0.020693,0.015029,0.073439,-0.025027,-0.005951,0.026803,-0.009754,-0.018638,-0.008387,-0.008433
season,0.989288,1.0,0.020661,2.9e-05,0.001052,,-0.018573,0.288742,0.00942,0.005029,-1.1e-05,0.007812,0.033714,0.036182,-0.016409,-0.010099,-0.008097,-0.019166,0.014033,0.083952,-0.027446,-0.009182,0.024575,-0.011997,-0.017325,-0.004789,-0.009015
week,0.065103,0.020661,1.0,-0.011,-0.006201,,-0.01924,0.05285,-0.020016,-0.009519,-0.018761,0.013095,-0.059444,-0.030917,-0.017619,0.008562,-0.016365,0.026166,0.023068,0.026888,-0.010645,-0.000468,-0.001819,-0.013569,-0.022138,-0.014441,0.013343
playId,-0.003136,2.9e-05,-0.011,1.0,0.959622,,0.068174,-0.008349,-0.019394,0.083674,-0.042782,-0.007744,-0.015444,-0.028076,0.007624,0.029687,0.010197,0.028205,-0.00693,0.009816,-0.004447,0.009799,-0.00894,0.012506,-0.006827,-0.022161,0.012905
quarter,-0.000853,0.001052,-0.006201,0.959622,1.0,,0.061173,-0.004357,-0.017327,0.048596,-0.033923,0.000953,-0.013031,-0.022417,0.00055,0.017432,0.007198,0.02259,-0.002569,0.00938,-0.000308,0.018427,-0.010987,0.020188,-0.009011,-0.0273,0.014186
down,,,,,,,,,,,,,,,,,,,,,,,,,,,
yardsToGo,-0.01882,-0.018573,-0.01924,0.068174,0.061173,,1.0,-0.018898,-0.032959,-0.293637,0.105073,0.019187,0.009548,-0.011906,-0.076278,-0.000393,-0.005554,-0.001622,-0.082465,-0.004191,0.007172,0.01646,0.005676,0.018164,0.007452,-0.015089,-0.000517
nflId,0.28649,0.288742,0.05285,-0.008349,-0.004357,,-0.018898,1.0,0.006701,-0.028761,0.003677,0.033592,0.01934,0.028697,-0.030778,0.00336,0.025384,-0.018127,-0.031296,0.012911,0.01406,0.022148,-0.00453,0.023973,0.009741,0.011311,0.003973
playResult,0.006569,0.00942,-0.020016,-0.019394,-0.017327,,-0.032959,0.006701,1.0,-0.224786,0.715135,-0.76413,0.235369,0.562492,0.176046,0.021446,-0.02192,0.058621,0.033344,-0.033198,-0.009114,-0.010924,0.013993,-0.004568,-0.007729,0.005359,0.022224
ep,0.004951,0.005029,-0.009519,0.083674,0.048596,,-0.293637,-0.028761,-0.224786,1.0,-0.131833,-0.057041,-0.014324,-0.33263,0.518962,0.016565,-0.001819,0.142147,0.194884,-0.006311,-0.021706,-0.023913,-0.003742,-0.034088,0.021482,0.044785,0.004444


In [24]:
main.head(2)

Unnamed: 0,gameId,season,season_type,week,homeTeamAbbr,visitorTeamAbbr,playId,quarter,down,yardsToGo,nflId,displayName,Position,collegeName,playDescription,specialTeamsPlayType,playResult,ep,epa,kickReturnYardage,kickType,hangTime,kickLength,punt_inside_twenty,punt_in_endzone,punt_out_of_bounds,punt_downed,punt_fair_catch,return_team,weather,surface,wind,a,s,dir,dis,o,x,y
0,2018123000,2018,REG,17,BAL,CLE,2165,2,4,17,31018.0,Sam Koch,P,Nebraska,(:42) (Punt formation) S.Koch punts 36 yards t...,Punt,36,0.372952,-0.331417,,A,4.56,36.0,1.0,0.0,0.0,1.0,0.0,CLE,"Partly Cloudy Temp: 48° F, Humidity: 59%, Wind...",grass,2.0,0.0,0.0,268.97,0.0,278.65,65.77,28.85
1,2018123000,2018,REG,17,BAL,CLE,2502,3,4,14,31018.0,Sam Koch,P,Nebraska,(12:50) (Punt formation) S.Koch punts 53 yards...,Punt,16,-0.90379,-1.739167,37.0,N,4.17,53.0,0.0,0.0,0.0,0.0,0.0,CLE,"Partly Cloudy Temp: 48° F, Humidity: 59%, Wind...",grass,2.0,0.0,0.0,191.59,0.0,84.96,28.35,28.21


In [25]:
# Create new variable of average EPA grouping by various items
main['avg_epa'] = main.groupby(['down', 'yardsToGo', 'kickLength', 'weather'])['epa'].transform('mean')
main.head(2)

Unnamed: 0,gameId,season,season_type,week,homeTeamAbbr,visitorTeamAbbr,playId,quarter,down,yardsToGo,nflId,displayName,Position,collegeName,playDescription,specialTeamsPlayType,playResult,ep,epa,kickReturnYardage,kickType,hangTime,kickLength,punt_inside_twenty,punt_in_endzone,punt_out_of_bounds,punt_downed,punt_fair_catch,return_team,weather,surface,wind,a,s,dir,dis,o,x,y,avg_epa
0,2018123000,2018,REG,17,BAL,CLE,2165,2,4,17,31018.0,Sam Koch,P,Nebraska,(:42) (Punt formation) S.Koch punts 36 yards t...,Punt,36,0.372952,-0.331417,,A,4.56,36.0,1.0,0.0,0.0,1.0,0.0,CLE,"Partly Cloudy Temp: 48° F, Humidity: 59%, Wind...",grass,2.0,0.0,0.0,268.97,0.0,278.65,65.77,28.85,-0.331417
1,2018123000,2018,REG,17,BAL,CLE,2502,3,4,14,31018.0,Sam Koch,P,Nebraska,(12:50) (Punt formation) S.Koch punts 53 yards...,Punt,16,-0.90379,-1.739167,37.0,N,4.17,53.0,0.0,0.0,0.0,0.0,0.0,CLE,"Partly Cloudy Temp: 48° F, Humidity: 59%, Wind...",grass,2.0,0.0,0.0,191.59,0.0,84.96,28.35,28.21,-1.739167


In [26]:
main.shape

(5854, 40)

In [27]:
main.dtypes

gameId                    int64
season                    int64
season_type              object
week                      int64
homeTeamAbbr             object
visitorTeamAbbr          object
playId                    int64
quarter                   int64
down                      int64
yardsToGo                 int64
nflId                   float64
displayName              object
Position                 object
collegeName              object
playDescription          object
specialTeamsPlayType     object
playResult                int64
ep                      float64
epa                     float64
kickReturnYardage       float64
kickType                 object
hangTime                float64
kickLength              float64
punt_inside_twenty      float64
punt_in_endzone         float64
punt_out_of_bounds      float64
punt_downed             float64
punt_fair_catch         float64
return_team              object
weather                  object
surface                  object
wind    

#### Model Building

In [28]:
# Check for missing values
main.isnull().sum()

gameId                     0
season                     0
season_type                0
week                       0
homeTeamAbbr               0
visitorTeamAbbr            0
playId                     0
quarter                    0
down                       0
yardsToGo                  0
nflId                      0
displayName                0
Position                   0
collegeName                0
playDescription            0
specialTeamsPlayType       0
playResult                 0
ep                         0
epa                        0
kickReturnYardage       3499
kickType                   0
hangTime                   6
kickLength                 0
punt_inside_twenty         0
punt_in_endzone            0
punt_out_of_bounds         0
punt_downed                0
punt_fair_catch            0
return_team                4
weather                    0
surface                    0
wind                    1630
a                          0
s                          0
dir           

In [29]:
# Sub in missing kickReturnYardage values with groupby of similar situations
main['kickReturnYardage'] = main.groupby(['down', 'yardsToGo', 'kickLength', 'weather']).transform(lambda x: x.fillna(x.mean()))
# Sub in zeros for wind since missing values most likely mean game was indoors
main['wind'] = main['wind'].replace('nan', np.nan).fillna(0)
main.head(2)

Unnamed: 0,gameId,season,season_type,week,homeTeamAbbr,visitorTeamAbbr,playId,quarter,down,yardsToGo,nflId,displayName,Position,collegeName,playDescription,specialTeamsPlayType,playResult,ep,epa,kickReturnYardage,kickType,hangTime,kickLength,punt_inside_twenty,punt_in_endzone,punt_out_of_bounds,punt_downed,punt_fair_catch,return_team,weather,surface,wind,a,s,dir,dis,o,x,y,avg_epa
0,2018123000,2018,REG,17,BAL,CLE,2165,2,4,17,31018.0,Sam Koch,P,Nebraska,(:42) (Punt formation) S.Koch punts 36 yards t...,Punt,36,0.372952,-0.331417,2018123000.0,A,4.56,36.0,1.0,0.0,0.0,1.0,0.0,CLE,"Partly Cloudy Temp: 48° F, Humidity: 59%, Wind...",grass,2.0,0.0,0.0,268.97,0.0,278.65,65.77,28.85,-0.331417
1,2018123000,2018,REG,17,BAL,CLE,2502,3,4,14,31018.0,Sam Koch,P,Nebraska,(12:50) (Punt formation) S.Koch punts 53 yards...,Punt,16,-0.90379,-1.739167,2018123000.0,N,4.17,53.0,0.0,0.0,0.0,0.0,0.0,CLE,"Partly Cloudy Temp: 48° F, Humidity: 59%, Wind...",grass,2.0,0.0,0.0,191.59,0.0,84.96,28.35,28.21,-1.739167


In [30]:
# Check for missing values to make sure substitutions worked correctly
main.isnull().sum()

gameId                  0
season                  0
season_type             0
week                    0
homeTeamAbbr            0
visitorTeamAbbr         0
playId                  0
quarter                 0
down                    0
yardsToGo               0
nflId                   0
displayName             0
Position                0
collegeName             0
playDescription         0
specialTeamsPlayType    0
playResult              0
ep                      0
epa                     0
kickReturnYardage       0
kickType                0
hangTime                6
kickLength              0
punt_inside_twenty      0
punt_in_endzone         0
punt_out_of_bounds      0
punt_downed             0
punt_fair_catch         0
return_team             4
weather                 0
surface                 0
wind                    0
a                       0
s                       0
dir                     0
dis                     0
o                       0
x                       0
y           

In [31]:
main.columns

Index(['gameId', 'season', 'season_type', 'week', 'homeTeamAbbr',
       'visitorTeamAbbr', 'playId', 'quarter', 'down', 'yardsToGo', 'nflId',
       'displayName', 'Position', 'collegeName', 'playDescription',
       'specialTeamsPlayType', 'playResult', 'ep', 'epa', 'kickReturnYardage',
       'kickType', 'hangTime', 'kickLength', 'punt_inside_twenty',
       'punt_in_endzone', 'punt_out_of_bounds', 'punt_downed',
       'punt_fair_catch', 'return_team', 'weather', 'surface', 'wind', 'a',
       's', 'dir', 'dis', 'o', 'x', 'y', 'avg_epa'],
      dtype='object')

In [64]:
# Drop columns we don't need
main = main.drop(['surface', 'weather', 'return_team', 'homeTeamAbbr', 'visitorTeamAbbr', 'nflId', 'displayName', 
                  'Position', 'collegeName', 
                  'playDescription', 'specialTeamsPlayType'], axis = 1)
main = main.dropna()
main.columns

Index(['gameId', 'season', 'season_type', 'week', 'playId', 'quarter', 'down',
       'yardsToGo', 'playResult', 'ep', 'epa', 'kickReturnYardage', 'kickType',
       'hangTime', 'kickLength', 'punt_inside_twenty', 'punt_in_endzone',
       'punt_out_of_bounds', 'punt_downed', 'punt_fair_catch', 'wind', 'a',
       's', 'dir', 'dis', 'o', 'x', 'y', 'avg_epa'],
      dtype='object')

In [65]:
# Get dummies takes a look at all unique values and creates column for those
# Allows us to put data into numerical format
main_new = pd.get_dummies(main, columns = ['season', 'season_type', 'week','quarter', 'down', 'yardsToGo',
                                          'kickType', 'punt_inside_twenty', 'punt_in_endzone',
                                          'punt_out_of_bounds', 'punt_downed', 'punt_fair_catch'])

In [66]:
# Separate target variable and rest of variables
data_y = pd.DataFrame(main_new['kickReturnYardage'])
data_x = main_new.drop(['kickReturnYardage'], axis = 1)
print(data_x.columns)
print(data_y.columns)

Index(['gameId', 'playId', 'playResult', 'ep', 'epa', 'hangTime', 'kickLength',
       'wind', 'a', 's', 'dir', 'dis', 'o', 'x', 'y', 'avg_epa', 'season_2018',
       'season_2019', 'season_2020', 'season_type_REG', 'week_1', 'week_2',
       'week_3', 'week_4', 'week_5', 'week_6', 'week_7', 'week_8', 'week_9',
       'week_10', 'week_11', 'week_12', 'week_13', 'week_14', 'week_15',
       'week_16', 'week_17', 'quarter_1', 'quarter_2', 'quarter_3',
       'quarter_4', 'quarter_5', 'down_4', 'yardsToGo_1', 'yardsToGo_2',
       'yardsToGo_3', 'yardsToGo_4', 'yardsToGo_5', 'yardsToGo_6',
       'yardsToGo_7', 'yardsToGo_8', 'yardsToGo_9', 'yardsToGo_10',
       'yardsToGo_11', 'yardsToGo_12', 'yardsToGo_13', 'yardsToGo_14',
       'yardsToGo_15', 'yardsToGo_16', 'yardsToGo_17', 'yardsToGo_18',
       'yardsToGo_19', 'yardsToGo_20', 'yardsToGo_21', 'yardsToGo_22',
       'yardsToGo_23', 'yardsToGo_24', 'yardsToGo_25', 'yardsToGo_26',
       'yardsToGo_27', 'yardsToGo_28', 'yardsToGo_29',

In [67]:
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size = 0.3, random_state = 2)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4093, 93)
(1755, 93)
(4093, 1)
(1755, 1)


In [68]:
# Logistic Regression
clf = LogisticRegression()
clf.fit(X_train, y_train)

  return f(*args, **kwargs)


LogisticRegression()

In [69]:
y_pred = clf.predict(X_test)

In [70]:
accuracy_score(y_test, y_pred)

0.0011396011396011395

In [72]:
y_pred

array([2.01910131e+09, 2.01910131e+09, 2.01910131e+09, ...,
       2.01910131e+09, 2.01910131e+09, 2.01910131e+09])