# Getting Our Data

In [None]:
import pandas as pd 

# looking at play-by-play data over a 3 year span
years = [2018, 2019, 2020]

df = pd.DataFrame()

for i in years:  
    # low_memory=False eliminates a warning
    i_df = pd.read_csv('https://github.com/guga31bb/nflfastR-data/blob/master/data/' \
                         'play_by_play_' + str(i) + '.csv.gz?raw=True',
                         compression='gzip', low_memory=False)

    # sort=True eliminates a warning and alphabetically sorts columns
    df = df.append(i_df, sort=True)

# give each row a unique index
df.reset_index(drop=True, inplace=True)

In [None]:
import pandas as pd

In [None]:
import sklearn

In [2]:
df.head()

Unnamed: 0,aborted_play,air_epa,air_wpa,air_yards,assist_tackle,assist_tackle_1_player_id,assist_tackle_1_player_name,assist_tackle_1_team,assist_tackle_2_player_id,assist_tackle_2_player_name,...,xyac_median_yardage,xyac_success,yac_epa,yac_wpa,yardline_100,yards_after_catch,yards_gained,ydsnet,ydstogo,yrdln
0,0,,,,,,,,,,...,,,,,,,,,0,PHI 35
1,0,,,,0.0,,,,,,...,,,,,35.0,,0.0,73.0,0,PHI 35
2,0,,,,0.0,,,,,,...,,,,,75.0,,0.0,73.0,10,ATL 25
3,0,0.321213,0.0,8.0,0.0,,,,,,...,2.0,0.998706,0.528905,0.038693,80.0,2.0,10.0,73.0,15,ATL 20
4,0,,,,0.0,,,,,,...,,,,,70.0,,11.0,73.0,5,ATL 30


In [3]:
df.shape

(122479, 340)

In [4]:
# only using regular season data
df = df.loc[df.season_type=='REG']

In [5]:
# the dataset labels QB scrambles as a run, when in reality they are passing plays
df.play_type.loc[df['pass']==1] = 'pass'
df.play_type.loc[df.rush==1] = 'run'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [6]:
# decide which feaures will be valuable for run/pass predictions
run_pass_df = df[['yardline_100', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 
    'drive', 'qtr', 'down', 'goal_to_go', 'time', 'ydstogo', 'play_type', 'yards_gained', 'shotgun', 'no_huddle', 
    'posteam_timeouts_remaining', 'defteam_timeouts_remaining', 'posteam_score', 'defteam_score', 'score_differential',
    'wp']]

In [7]:
# grab all plays that are run or pass
run_pass_df = run_pass_df.loc[(run_pass_df['play_type'] == 'pass') | (run_pass_df['play_type'] == 'run')]

In [8]:
# create Target column consisting of 0's and 1's for run and pass
run_pass_df['Target'] = run_pass_df.play_type.map(lambda x: 1 if x == 'pass' else 0)

In [9]:
# drop play_type once we've created our target column
run_pass_df.drop(['play_type'], axis = 1, inplace = True)

In [10]:
run_pass_df.isna().sum()

yardline_100                  126
quarter_seconds_remaining       3
half_seconds_remaining          3
game_seconds_remaining          3
drive                           1
qtr                             0
down                          348
goal_to_go                      0
time                            3
ydstogo                         0
yards_gained                  126
shotgun                         0
no_huddle                       0
posteam_timeouts_remaining    126
defteam_timeouts_remaining    126
posteam_score                 126
defteam_score                 126
score_differential            126
wp                            126
Target                          0
dtype: int64

In [11]:
# drop NaN values
run_pass_df = run_pass_df.dropna()

In [12]:
# potentially needs to be rebalanced
run_pass_df.Target.value_counts()

1    53863
0    32385
Name: Target, dtype: int64

In [13]:
import pickle

run_pass_df.to_pickle('run_pass_df.pkl')

In [14]:
run_pass_df.head()

Unnamed: 0,yardline_100,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,drive,qtr,down,goal_to_go,time,ydstogo,yards_gained,shotgun,no_huddle,posteam_timeouts_remaining,defteam_timeouts_remaining,posteam_score,defteam_score,score_differential,wp,Target
3,80.0,900.0,1800.0,3600.0,1.0,1,1.0,0,15:00,15,10.0,0,0,3.0,3.0,0.0,0.0,0.0,0.394005,1
4,70.0,862.0,1762.0,3562.0,1.0,1,2.0,0,14:22,5,11.0,0,0,3.0,3.0,0.0,0.0,0.0,0.432698,0
5,59.0,826.0,1726.0,3526.0,1.0,1,1.0,0,13:46,10,20.0,0,0,3.0,3.0,0.0,0.0,0.0,0.460501,0
6,39.0,790.0,1690.0,3490.0,1.0,1,1.0,0,13:10,10,0.0,0,0,3.0,3.0,0.0,0.0,0.0,0.50872,1
7,39.0,785.0,1685.0,3485.0,1.0,1,2.0,0,13:05,10,0.0,1,0,3.0,3.0,0.0,0.0,0.0,0.489379,1
