In [1]:
import pandas as pd
import datetime
import numpy as np
import re
from string import punctuation

### Import Parser and Engineer Class

In [2]:
import pandas as pd
import datetime
import numpy as np
import re
from string import punctuation


class DataParser:
    """

    This class implements a parser that cleans the variables of the NFL dataset
    for the 2019 NFL data competition held on Kaggle.

    Parameters
    ----------
    data: the nfl data in pandas DataFrame format

    Notes
    -----
    Might not work if some of the columns have been edited since initial
    import. In that case, either implement additional feature engineering
    methods or reload the data.

    References
    ----------
    The methods were implemented based from the one found at:
    https://www.kaggle.com/prashantkikani/nfl-starter-lgb-feature-engg
    https://www.kaggle.com/bgmello/neural-networks-feature-engineering-for-the-win

    """

    def __init__(self, data):
        self.data = data

    def cleanWindSpeed(self, x):
        x = str(x)
        x = x.lower()
        if '-' in x:
            x = (int(x.split('-')[0]) + int(x.split('-')[1])) / 2
        elif ' gusts up to 25 ' in x:
            x = (int(x.split(' gusts up tp 25 ')))
        try:
            return float(x)
        except:
            return -1

    def cleanGameWeather(self, x):
        x = str(x).lower()
        if 'sunny' in x or 'clear' in x or 'fair' in x:
            return 'sunny'
        elif 'cloud' in x or 'coudy' in x or 'clouidy' in x or 'hazy' in x or 'sun & clouds' in x or 'overcast' in x:
            return 'cloudy'
        elif 'rain' in x or 'shower' in x or 'rainy' in x:
            return 'rainy'
        elif 'controlled climate' in x or 'indoor' in x:
            return 'indoor'
        elif 'snow' in x:
            return 'snowy'
        return None

    def mapGameWeather(self):
        gameWeatherMap = {}
        for w in self.data.GameWeather.dropna().unique():
            mean = self.data[self.data.GameWeather == w]['Yards'].mean()
            norm = (mean - self.data.Yards.mean()) / self.data.Yards.std()
            gameWeatherMap[w] = norm
        self.data['GameWeather'] = self.data['GameWeather'].map(gameWeatherMap)

    def cleanStadiumType(self, txt):  # Fixes the typo
        if pd.isna(txt):
            return np.nan
        txt = txt.lower()
        txt = ''.join([c for c in txt if c not in punctuation])
        txt = re.sub(' +', ' ', txt)
        txt = txt.strip()
        txt = txt.replace('outside', 'outdoor')
        txt = txt.replace('outdor', 'outdoor')
        txt = txt.replace('outddors', 'outdoor')
        txt = txt.replace('outdoors', 'outdoor')
        txt = txt.replace('oudoor', 'outdoor')
        txt = txt.replace('indoors', 'indoor')
        txt = txt.replace('ourdoor', 'outdoor')
        txt = txt.replace('retractable', 'rtr.')
        return txt

    # Focuses only on the words: outdoor, indoor, closed and open.
    def cleanStadiumType2(self, txt):
        if pd.isna(txt):
            return np.nan
        if 'outdoor' in txt or 'open' in txt:
            return 1
        if 'indoor' in txt or 'closed' in txt:
            return 0
        return np.nan

    def cleanDefencePersonnel(self):
        arr = [[int(s[0]) for s in t.split(', ')]
               for t in self.data['DefensePersonnel']]
        self.data['DL'] = pd.Series([int(a[0]) for a in arr])
        self.data['LB'] = pd.Series([int(a[1]) for a in arr])
        self.data['DB'] = pd.Series([int(a[2]) for a in arr])
        self.data = self.data.drop(labels=["DefensePersonnel"], axis=1)

    def cleanOffencePersonnel(self):
        arr = [[int(s[0]) for s in t.split(", ")]
               for t in self.data["OffensePersonnel"]]
        self.data["RB"] = pd.Series([int(a[0]) for a in arr])
        self.data["TE"] = pd.Series([int(a[1]) for a in arr])
        self.data["WR"] = pd.Series([int(a[2]) for a in arr])
        self.data = self.data.drop(labels=["OffensePersonnel"], axis=1)

    def cleanOffenseFormation(self):
        """
        This is a function for cleaning the Offense Formation column.
        It will find the mean Yards for and normalize it.
        """

        formationMap = {}
        for f in self.data.OffenseFormation.dropna().unique():
            mean = self.data[self.data.OffenseFormation == f]['Yards'].mean()
            norm = (mean - self.data.Yards.mean()) / self.data.Yards.std()
            formationMap[f] = norm
    
        self.data['OffenseFormation'] = self.data['OffenseFormation'].map(formationMap)

    def cleanHeight(self):
        """
        Parses the PlayerHeight column and converts height into inches
        """
        self.data['PlayerHeight'] = self.data['PlayerHeight'].apply(
            lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))

    def cleanTimeHandoff(self):
        self.data['TimeHandoff'] = self.data['TimeHandoff'].apply(
            lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))

    def cleanTimeSnap(self):
        self.data['TimeSnap'] = self.data['TimeSnap'].apply(
            lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%fZ"))

    def cleanGameClock(self):
        arr = [[int(s[0]) for s in t.split(":")]
               for t in self.data["GameClock"]]
        self.data["GameHour"] = [int(a[0]) for a in arr]
        self.data["GameMinute"] = [int(a[1]) for a in arr]
        self.data = self.data.drop(labels=['GameClock'], axis=1)

    def cleanTurf(self):
        # from https://www.kaggle.com/c/nfl-big-data-bowl-2020/discussion/112681#latest-649087
        Turf = {'Field Turf': 'Artificial', 'A-Turf Titan': 'Artificial', 'Grass': 'Natural', 'UBU Sports Speed S5-M': 'Artificial',
                'Artificial': 'Artificial', 'DD GrassMaster': 'Artificial', 'Natural Grass': 'Natural',
                'UBU Speed Series-S5-M': 'Artificial', 'FieldTurf': 'Artificial', 'FieldTurf 360': 'Artificial', 'Natural grass': 'Natural', 'grass': 'Natural',
                'Natural': 'Natural', 'Artifical': 'Artificial', 'FieldTurf360': 'Artificial', 'Naturall Grass': 'Natural', 'Field turf': 'Artificial',
                'SISGrass': 'Artificial', 'Twenty-Four/Seven Turf': 'Artificial', 'natural grass': 'Natural'}

        self.data['Turf'] = self.data['Turf'].map(Turf)
        self.data['Turf'] = self.data['Turf'] == 'Natural'

    def cleanPossessionTeam(self):  # fixes problem in team name encoding
        map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
        for abb in self.data['PossessionTeam'].unique():
            map_abbr[abb] = abb
        self.data['PossessionTeam'] = self.data['PossessionTeam'].map(
            map_abbr)
        self.data['HomeTeamAbbr'] = self.data['HomeTeamAbbr'].map(map_abbr)
        self.data['VisitorTeamAbbr'] = self.data['VisitorTeamAbbr'].map(
            map_abbr)

    def cleanPlayerBirthDate(self):
        self.data['PlayerBirthDate'] = self.data['PlayerBirthDate'].apply(
            lambda x: datetime.datetime.strptime(x, "%m/%d/%Y"))

    def cleanWindDirection(self, txt):
        if pd.isna(txt):
            return np.nan
        txt = txt.lower()
        txt = ''.join([c for c in txt if c not in punctuation])
        txt = txt.replace('from', '')
        txt = txt.replace(' ', '')
        txt = txt.replace('north', 'n')
        txt = txt.replace('south', 's')
        txt = txt.replace('west', 'w')
        txt = txt.replace('east', 'e')
        return txt
    
    def mapWindDirection(self, txt):
        windDirectionMap = {
            'n': 0,'nne': 1/8,'nen': 1/8,'ne': 2/8,
            'ene': 3/8,'nee': 3/8,'e': 4/8,'ese': 5/8,
            'see': 5/8,'se': 6/8,'ses': 7/8,'sse': 7/8,
            's': 1,'ssw': 9/8,'sws': 9/8,'sw': 10/8,
            'sww': 11/8,'wsw': 11/8,'w': 12/8,'wnw': 13/8,
            'nw': 14/8,'nwn': 15/8,'nnw': 15/8
        }
        try:
            return windDirectionMap[txt]
        except:
            return np.nan

    def cleanPlayDirection(self):
        """
        1 if play direction if right, 0 if play direction is left.
        """
        self.data['PlayDirection'] = self.data['PlayDirection'].apply(
            lambda x: x.strip() == 'right')

    def cleanTeam(self):
        """
        1 if home team, 0 if away team
        """
        self.data['Team'] = self.data['Team'].apply(
            lambda x: x.strip() == 'home')

    def parse(self):
        self.data['WindSpeed'] = self.data['WindSpeed'].apply(self.cleanWindSpeed)
        self.data['GameWeather'] = self.data['GameWeather'].apply(self.cleanGameWeather)
        self.mapGameWeather()
        self.data['StadiumType'] = self.data['StadiumType'].apply(self.cleanStadiumType)
        self.data['StadiumType'] = self.data['StadiumType'].apply(self.cleanStadiumType2)
        self.data['WindDirection'] = self.data['WindDirection'].apply(self.cleanWindDirection)
        self.data['WindDirection'] = self.data['WindDirection'].apply(self.mapWindDirection)
        self.cleanOffencePersonnel()
        self.cleanDefencePersonnel()
        self.cleanHeight()
        self.cleanTimeHandoff()
        self.cleanTimeSnap()
        self.cleanTurf()
        self.cleanPossessionTeam()
        self.cleanPlayerBirthDate()
        self.cleanPlayDirection()
        self.cleanTeam()

        return self.data


In [3]:
class FeatureEngine:
    """A Feature Engineer for the NFL data

    This class implements an engine that engineers
    the variables of the NFL dataset for the 2019 NFL data competition held on Kaggle.

    Parameters
    ----------
    data: the nfl data in pandas DataFrame format

    exclude: A list of feature engineering processes to exclude

    Notes
    -----
    Might not work if some of the columns have been edited since initial
    import. In that case, either implement additional feature engineering
    methods or reload the data.

    References
    ----------
    The methods were implemented based from the one found at:
    https://www.kaggle.com/prashantkikani/nfl-starter-lgb-feature-engg
    https://www.kaggle.com/bgmello/neural-networks-feature-engineering-for-the-win
    """

    def __init__(self, data, exclude=[]):
        self.data = data  # Clean data from the parser
        self.exclude = exclude  # Pass a list of processes to exclude
        self.include = ['X',
                        'Orientation',
                        'HomeField',
                        'FieldEqPossession',
                        'isRusher',
                        'PlayerAge',
                        'HandSnapDelta',
                        'YardsLeft',
                        'BMI',
                        'DefendersInTheBox_vs_Distance']

    def engineerX(self):
        """
        Readjusts X
        """
        self.data['X'] = self.data.apply(
            lambda row: row['X'] if row['PlayDirection'] else 120-row['X'], axis=1)

    def engineerOrientation(self, angle, play_direction):
        """
        Readjusts Orientation

        References
        ----------
        #from https://www.kaggle.com/scirpus/hybrid-gp-and-nn
        """
        if play_direction == 0:
            new_angle = 360.0 - angle
            if new_angle == 360.0:
                new_angle = 0.0
            return new_angle
        else:
            return angle

    def engineerFieldEqPossession(self):
        self.data['FieldEqPossession'] = self.data['FieldPosition'] == self.data['PossessionTeam']

    def engineerHomeField(self):
        self.data['HomeField'] = self.data['FieldPosition'] == self.data['HomeTeamAbbr']

    def engineerIsRusher(self):
        self.data['isRusher'] = self.data['NflId'] == self.data['NflIdRusher']
        self.data.drop(['NflId', 'NflIdRusher'], axis=1, inplace=True)

    def engineerHandoffSnapDelta(self):
        self.data['TimeDelta'] = self.data.apply(lambda row: (
            row['TimeHandoff'] - row['TimeSnap']).total_seconds(), axis=1)
        self.data = self.data.drop(['TimeHandoff', 'TimeSnap'], axis=1)

    def engineerYardsLeft(self):
        """
        Computes yards left from end-zone

        Note
        ----
        Requires variable HomeField (must execute engineerHomeField before execution)
        """
        self.data['YardsLeft'] = self.data.apply(
            lambda row: 100-row['YardLine'] if row['HomeField'] else row['YardLine'], axis=1)
        self.data['YardsLeft'] = self.data.apply(
            lambda row: row['YardsLeft'] if row['PlayDirection'] else 100-row['YardsLeft'], axis=1)
        self.data.drop(self.data.index[(self.data['YardsLeft'] < self.data['Yards']) | (
            self.data['YardsLeft']-100 > self.data['Yards'])], inplace=True)

    def engineerBMI(self):
        """
        Computes the BMI of a player from height and weight
        """
        self.data['PlayerBMI'] = 703 * \
            (self.data['PlayerWeight']/(self.data['PlayerHeight'])**2)

    def engineerPlayerAge(self):
        """
        Computes the age of the player from TimeHandoff
        """
        seconds_in_year = 60*60*24*365.25
        self.data['PlayerAge'] = self.data.apply(lambda row: (
            row['TimeHandoff']-row['PlayerBirthDate']).total_seconds()/seconds_in_year, axis=1)
        self.data = self.data.drop(['PlayerBirthDate'], axis=1)

    def engineerDefendersInTheBox_vs_Distance(self):
        dfInBox_mode = self.data['DefendersInTheBox'].mode()
        self.data['DefendersInTheBox'].fillna(
            dfInBox_mode.iloc[0], inplace=True)
        self.data['DefendersInTheBox_vs_Distance'] = self.data['DefendersInTheBox'] / \
            self.data['Distance']

    ### Outputs clean and engineered DataFrame ###

    def engineer(self):
        for c in self.include:

            if c in self.exclude:
                continue

            elif c == 'X':
                self.engineerX

            elif c == 'Orientation':
                self.data['Orientation'] = self.data.apply(lambda row: self.engineerOrientation(
                    row['Orientation'], row['PlayDirection']), axis=1)
                self.data['Dir'] = self.data.apply(lambda row: self.engineerOrientation(
                    row['Dir'], row['PlayDirection']), axis=1)

            elif c == 'FieldEqPossession':
                self.engineerFieldEqPossession()

            elif c == 'HomeField':
                self.engineerHomeField()

            elif c == 'YardsLeft':
                self.engineerYardsLeft()

            elif c == 'isRusher':
                self.engineerIsRusher()

            elif c == 'PlayerAge':
                self.engineerPlayerAge()

            elif c == 'HandSnapDelta':
                self.engineerHandoffSnapDelta()

            elif c == 'BMI':
                self.engineerBMI()

            elif c == 'DefendersInTheBox_vs_Distance':
                self.engineerDefendersInTheBox_vs_Distance()

        return self.data

In [4]:
df = pd.read_csv('../data/train.csv')
parser = DataParser(df) # Initialize data parser
df = parser.parse() # Use the parse method to parse data
eng = FeatureEngine(df) # Initalize feature engine with parsed data
df = eng.engineer() # feature engineer the data

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df.head() # Cleaned Data

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,...,LB,DB,HomeField,FieldEqPossession,isRusher,PlayerAge,TimeDelta,YardsLeft,PlayerBMI,DefendersInTheBox_vs_Distance
0,2017090700,20170907000118,False,73.91,34.84,1.69,1.13,0.4,278.01,182.82,...,3,6,True,True,False,28.69276,1.0,35,28.749228,3.0
1,2017090700,20170907000118,False,74.67,32.64,0.42,1.35,0.01,332.39,161.3,...,3,6,True,True,False,28.457305,1.0,35,35.9936,3.0
2,2017090700,20170907000118,False,74.0,33.2,1.22,0.59,0.31,356.99,157.27,...,3,6,True,True,False,28.62979,1.0,35,33.744,3.0
3,2017090700,20170907000118,False,71.46,27.7,0.42,0.54,0.02,0.23,254.36,...,3,6,True,True,False,34.79543,1.0,35,30.619556,3.0
4,2017090700,20170907000118,False,69.32,35.42,1.82,2.43,0.16,347.37,195.69,...,3,6,True,True,False,30.061685,1.0,35,27.935571,3.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 504812 entries, 0 to 509761
Data columns (total 56 columns):
GameId                           504812 non-null int64
PlayId                           504812 non-null int64
Team                             504812 non-null bool
X                                504812 non-null float64
Y                                504812 non-null float64
S                                504812 non-null float64
A                                504812 non-null float64
Dis                              504812 non-null float64
Orientation                      504794 non-null float64
Dir                              504798 non-null float64
DisplayName                      504812 non-null object
JerseyNumber                     504812 non-null int64
Season                           504812 non-null int64
YardLine                         504812 non-null int64
Quarter                          504812 non-null int64
GameClock                        504812 non-null o

### Reshaping Data

First we are going to drop all columns with dtype == object. We have cleaned and featured all the columns that are required for the regression process so anything that is still left as dtype == object are variables we do not need.

In [7]:
# Sort data by the four features
df = df.sort_values(by=['PlayId', 'Team', 'isRusher', 'JerseyNumber']).reset_index()

In [8]:
# Now we are going to drop the following features as they are only needed for the sorting process which is done.
df.drop(['GameId', 'PlayId', 'index', 'isRusher', 'Team'], axis=1, inplace=True)

In [9]:
# Find out which columns have dtype==object
drop_col = []
for c in df.columns:
    if df[c].dtype == 'object':
        drop_col.append(c)
df.drop(drop_col, axis=1, inplace=True)
print('The following columns were dropped', drop_col)

The following columns were dropped ['DisplayName', 'GameClock', 'PossessionTeam', 'FieldPosition', 'OffenseFormation', 'PlayerCollegeName', 'Position', 'HomeTeamAbbr', 'VisitorTeamAbbr', 'Stadium', 'Location']


The input for the regression neural network requires the data to be in the shape of a 2 dimensional vector, where each row corresponds to each play. Therefore we are going the combine the data for each play in to one large vector.

In [11]:
# Fill NAs with -999
df.fillna(-999, inplace=True)

In [17]:
players_col = [] # Extract features that are different for each individual player.
for col in df.columns:
    if df[col][:22].std() != 0 and col != 'GameWeather': # if the standard deviation of the first 22 rows does not equal 0 than append the whole column
        players_col.append(col)  # this measure is taken to avoid repeating data
players_col

['X',
 'Y',
 'S',
 'A',
 'Dis',
 'Orientation',
 'Dir',
 'JerseyNumber',
 'PlayerHeight',
 'PlayerWeight',
 'PlayerAge',
 'PlayerBMI']

In [18]:
# reshape the data into a large row
X_train = np.array(df[players_col]).reshape(-1, len(players_col)*22)
X_train.shape

(22946, 264)

In [19]:
# Extract features that are the same for each play
play_col = df.drop(players_col + ['Yards'], axis=1).columns
# populate empty matrix
X_play_col = np.zeros(shape=(X_train.shape[0], len(play_col)))
for i, col in enumerate(play_col):
    X_play_col[:, i] = df[col][::22] #:: means for each 22 rows

In [21]:
# join the two matrixes
X_train = np.concatenate([X_train, X_play_col], axis=1)
y_train = df['Yards'][::22]
X_train.shape[0] == y_train.shape[0]

True

In [22]:
from sklearn.preprocessing import StandardScaler

In [24]:
# Standardize for each feature
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

### A Complete Function

In [None]:
class DataReshaper:
    def __init__(self, data):
        self.data = data

    def mold(self):
        # Sort data by the four features
        self.data = self.data.sort_values(by=['PlayId', 'Team', 'isRusher', 'JerseyNumber'].reset_index()
        self.data.drop(['GameId', 'PlayId', 'index', 'isRusher', 'Team'], axis=1, inplace=True)
        # Find out which columns have dtype==object and drop them
        drop_col = []
        for c in self.data.columns:
            if self.data[c].dtype == 'object':
                self.data.append(c)
        self.data.drop(drop_col, axis=1, inplace=True)
        print('The following columns were dropped:', drop_col)
        # Extract features that are the same for each play
        play_col = self.data.drop(players_col + ['Yards'], axis=1).columns
        # populate empty matrix
        X_play_col = np.zeros(shape=(X_train.shape[0], len(play_col)))
        for i, col in enumerate(play_col):
            X_play_col[:, i] = self.data[col][::22]
        # join the two matrixes
        X_train = np.concatenate([X_train, X_play_col], axis=1)
        y_train = self.data['Yards'][::22]
        # Standardize for each feature
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)

        return X_train, y_train

In [None]:
import random
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import gc

In [None]:
folds = 10
seed = 222
kf = KFold(n_splits=folds, shuffle = True, random_state=seed)
y_valid_pred = np.zeros(X_train.shape[0])
models = [] 
scores = []

for tr_idx, val_idx in kf.split(X_train, y_train):
    tr_x, tr_y = X_train[tr_idx,:], y_train[tr_idx]
    vl_x, vl_y = X_train[val_idx,:], y_train[val_idx]
    
    print(len(tr_x),len(vl_x))
    regr = RandomForestRegressor(random_state=0, n_estimators=100)
    regr.fit(tr_x, tr_y)
    y_valid_pred[val_idx] += regr.predict(vl_x)
    scores.append(regr.score(vl_x, vl_y))
    models.append(regr)
    
gc.collect()