In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import re
import keras
import keras.backend as K
from sklearn.model_selection import RepeatedKFold
from string import punctuation
from sklearn.preprocessing import StandardScaler

In [15]:
class DataParser:
    """
    Cleans the raw columns of the NFL dataset used in the 2019 NFL data
    competition held on Kaggle.

    Parameters
    ----------
    data: the nfl data in pandas DataFrame format

    Notes
    -----
    Might not work if some of the columns have been edited since initial
    import. In that case, either implement additional feature engineering
    methods or reload the data.

    Several methods assign columns directly on the frame passed to the
    constructor, so the caller's DataFrame can be modified. Pass a copy if
    the raw data must stay untouched.

    References
    ----------
    The methods are based on the ones found at:
    https://www.kaggle.com/prashantkikani/nfl-starter-lgb-feature-engg
    https://www.kaggle.com/bgmello/neural-networks-feature-engineering-for-the-win
    """

    # Timestamp layout shared by the TimeHandoff and TimeSnap columns.
    _TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    # (label, keywords) pairs, checked in this order; the first match wins.
    # The misspelled keywords are deliberate: they match typos in the raw data.
    _WEATHER_KEYWORDS = (
        ('sunny', ('sunny', 'clear', 'fair')),
        ('cloudy', ('cloud', 'coudy', 'clouidy', 'hazy', 'sun & clouds', 'overcast')),
        ('rainy', ('rain', 'shower', 'rainy')),
        ('indoor', ('controlled climate', 'indoor')),
        ('snowy', ('snow',)),
    )

    # (old, new) substring replacements for StadiumType, applied in this order.
    _STADIUM_REPLACEMENTS = (
        ('outside', 'outdoor'),
        ('outdor', 'outdoor'),
        ('outddors', 'outdoor'),
        ('outdoors', 'outdoor'),
        ('oudoor', 'outdoor'),
        ('indoors', 'indoor'),
        ('ourdoor', 'outdoor'),
        ('retractable', 'rtr.'),
    )

    # Compass abbreviation -> direction measured from north, where 's' maps to 1
    # (so a full turn would be 2). Includes scrambled spellings such as 'nen'.
    _WIND_DIRECTION_MAP = {
        'n': 0, 'nne': 1/8, 'nen': 1/8, 'ne': 2/8,
        'ene': 3/8, 'nee': 3/8, 'e': 4/8, 'ese': 5/8,
        'see': 5/8, 'se': 6/8, 'ses': 7/8, 'sse': 7/8,
        's': 1, 'ssw': 9/8, 'sws': 9/8, 'sw': 10/8,
        'sww': 11/8, 'wsw': 11/8, 'w': 12/8, 'wnw': 13/8,
        'nw': 14/8, 'nwn': 15/8, 'nnw': 15/8
    }

    def __init__(self, data):
        self.data = data

    def cleanWindSpeed(self, x):
        """
        Converts one raw WindSpeed value to a float.

        Ranges such as '10-20' and values such as '12 gusts up to 25' are
        replaced by the mean of the two numbers, and a 'mph' suffix is
        ignored. Missing values stay NaN; anything else that cannot be
        parsed returns -1.
        """
        if pd.isna(x):
            return np.nan
        text = str(x).lower().replace('mph', '').strip()
        try:
            if '-' in text:
                low, high = text.split('-')[:2]
                return (float(low) + float(high)) / 2
            if 'gusts up to' in text:
                words = text.split()
                return (float(words[0]) + float(words[-1])) / 2
            return float(text)
        except (ValueError, IndexError):
            # Free text such as a compass direction entered in the wrong column.
            return -1

    def cleanGameWeather(self, x):
        """
        Buckets a free-text weather description into one of 'sunny',
        'cloudy', 'rainy', 'indoor' or 'snowy'; returns None if no keyword
        matches.
        """
        text = str(x).lower()
        for label, keywords in self._WEATHER_KEYWORDS:
            if any(keyword in text for keyword in keywords):
                return label
        return None

    def mapGameWeather(self):
        """
        One-hot encodes GameWeather and removes the original column.
        """
        dummy = pd.get_dummies(self.data['GameWeather'])
        # drop() returns a new frame, so the result has to be kept.
        self.data = pd.concat([self.data, dummy], axis=1).drop(
            ['GameWeather'], axis=1)

    def cleanStadiumType(self, txt):  # Fixes the typos
        """
        Normalises one StadiumType value: lower-cases it, strips punctuation
        and repeated spaces, and rewrites known misspellings.
        """
        if pd.isna(txt):
            return np.nan
        txt = str(txt).lower()
        txt = ''.join([c for c in txt if c not in punctuation])
        txt = re.sub(' +', ' ', txt)
        txt = txt.strip()
        for old, new in self._STADIUM_REPLACEMENTS:
            txt = txt.replace(old, new)
        return txt

    # Focuses only on the words: outdoor, indoor, closed and open.
    def cleanStadiumType2(self, txt):
        """
        Maps a normalised StadiumType to 1 (outdoor/open), 0 (indoor/closed)
        or NaN. The outdoor/open test runs first, so it wins when both match.
        """
        if pd.isna(txt):
            return np.nan
        if 'outdoor' in txt or 'open' in txt:
            return 1
        if 'indoor' in txt or 'closed' in txt:
            return 0
        return np.nan

    @staticmethod
    def _countPositions(personnel, positions):
        """
        Returns the head-count of each requested position label found in a
        personnel string such as '1 RB, 1 TE, 3 WR'.

        Counts are looked up by label rather than by position in the string,
        so extra groups (e.g. '6 OL') do not shift the result. Labels that do
        not appear count as 0; a non-string value yields NaN for every label.
        """
        if not isinstance(personnel, str):
            return [np.nan] * len(positions)
        counts = {}
        for group in personnel.split(','):
            parts = group.split()
            if len(parts) == 2 and parts[0].isdigit():
                counts[parts[1]] = int(parts[0])
        return [counts.get(position, 0) for position in positions]

    def cleanDefencePersonnel(self):
        """
        Splits DefensePersonnel into the numeric columns DL, LB and DB and
        drops the original column.
        """
        counts = [self._countPositions(t, ('DL', 'LB', 'DB'))
                  for t in self.data['DefensePersonnel']]
        # Plain lists are assigned positionally, whatever the frame's index is.
        self.data['DL'] = [c[0] for c in counts]
        self.data['LB'] = [c[1] for c in counts]
        self.data['DB'] = [c[2] for c in counts]
        self.data = self.data.drop(labels=["DefensePersonnel"], axis=1)

    def cleanOffencePersonnel(self):
        """
        Splits OffensePersonnel into the numeric columns RB, TE and WR and
        drops the original column.
        """
        counts = [self._countPositions(t, ('RB', 'TE', 'WR'))
                  for t in self.data["OffensePersonnel"]]
        self.data["RB"] = [c[0] for c in counts]
        self.data["TE"] = [c[1] for c in counts]
        self.data["WR"] = [c[2] for c in counts]
        self.data = self.data.drop(labels=["OffensePersonnel"], axis=1)

    def cleanOffenseFormation(self):
        """
        Appends one-hot columns for OffenseFormation (the original column is
        kept).
        """
        dummy = pd.get_dummies(self.data['OffenseFormation'])
        self.data = pd.concat([self.data, dummy], axis=1)

    def cleanHeight(self):
        """
        Parses the PlayerHeight column ('feet-inches') and converts height
        into inches
        """
        self.data['PlayerHeight'] = self.data['PlayerHeight'].apply(
            lambda x: 12*int(x.split('-')[0])+int(x.split('-')[1]))

    def cleanTimeHandoff(self):
        """
        Parses the TimeHandoff strings into datetime objects.
        """
        self.data['TimeHandoff'] = self.data['TimeHandoff'].apply(
            lambda x: datetime.datetime.strptime(x, self._TIMESTAMP_FORMAT))

    def cleanTimeSnap(self):
        """
        Parses the TimeSnap strings into datetime objects.
        """
        self.data['TimeSnap'] = self.data['TimeSnap'].apply(
            lambda x: datetime.datetime.strptime(x, self._TIMESTAMP_FORMAT))

    def cleanGameClock(self):
        """
        Splits GameClock on ':' and stores the first field as GameHour and
        the second as GameMinute, then drops GameClock.

        Not called by parse().
        """
        # int(s) parses the whole field; int(s[0]) would keep only one digit.
        fields = [[int(s) for s in t.split(":")]
                  for t in self.data["GameClock"]]
        self.data["GameHour"] = [f[0] for f in fields]
        self.data["GameMinute"] = [f[1] for f in fields]
        self.data = self.data.drop(labels=['GameClock'], axis=1)

    def cleanTurf(self):
        """
        Replaces Turf by a boolean: True for natural surfaces, False for
        artificial ones and for any value missing from the lookup table.
        """
        # from https://www.kaggle.com/c/nfl-big-data-bowl-2020/discussion/112681#latest-649087
        Turf = {'Field Turf': 'Artificial', 'A-Turf Titan': 'Artificial', 'Grass': 'Natural', 'UBU Sports Speed S5-M': 'Artificial',
                'Artificial': 'Artificial', 'DD GrassMaster': 'Artificial', 'Natural Grass': 'Natural',
                'UBU Speed Series-S5-M': 'Artificial', 'FieldTurf': 'Artificial', 'FieldTurf 360': 'Artificial', 'Natural grass': 'Natural', 'grass': 'Natural',
                'Natural': 'Natural', 'Artifical': 'Artificial', 'FieldTurf360': 'Artificial', 'Naturall Grass': 'Natural', 'Field turf': 'Artificial',
                'SISGrass': 'Artificial', 'Twenty-Four/Seven Turf': 'Artificial', 'natural grass': 'Natural'}

        self.data['Turf'] = self.data['Turf'].map(Turf)
        self.data['Turf'] = self.data['Turf'] == 'Natural'

    def cleanPossessionTeam(self):  # fixes problem in team name encoding
        """
        Rewrites PossessionTeam, HomeTeamAbbr and VisitorTeamAbbr so that all
        three columns use the abbreviations found in PossessionTeam.
        """
        map_abbr = {'ARI': 'ARZ', 'BAL': 'BLT', 'CLE': 'CLV', 'HOU': 'HST'}
        # Every abbreviation already used by PossessionTeam maps to itself.
        for abb in self.data['PossessionTeam'].unique():
            map_abbr[abb] = abb
        self.data['PossessionTeam'] = self.data['PossessionTeam'].map(
            map_abbr)
        self.data['HomeTeamAbbr'] = self.data['HomeTeamAbbr'].map(map_abbr)
        self.data['VisitorTeamAbbr'] = self.data['VisitorTeamAbbr'].map(
            map_abbr)

    def cleanPlayerBirthDate(self):
        """
        Parses the PlayerBirthDate strings (month/day/year) into datetimes.
        """
        self.data['PlayerBirthDate'] = self.data['PlayerBirthDate'].apply(
            lambda x: datetime.datetime.strptime(x, "%m/%d/%Y"))

    def cleanWindDirection(self, txt):
        """
        Normalises one WindDirection value to a compact compass abbreviation,
        e.g. 'From SW' -> 'sw' and 'North East' -> 'ne'.
        """
        if pd.isna(txt):
            return np.nan
        txt = str(txt).lower()
        txt = ''.join([c for c in txt if c not in punctuation])
        txt = txt.replace('from', '')
        txt = txt.replace(' ', '')
        txt = txt.replace('north', 'n')
        txt = txt.replace('south', 's')
        txt = txt.replace('west', 'w')
        txt = txt.replace('east', 'e')
        return txt

    def mapWindDirection(self, txt):
        """
        Converts a normalised compass abbreviation to its numeric direction;
        returns NaN for anything not in the lookup table.
        """
        return self._WIND_DIRECTION_MAP.get(txt, np.nan)

    def cleanPlayDirection(self):
        """
        True if play direction is right, False otherwise.
        """
        self.data['PlayDirection'] = self.data['PlayDirection'].apply(
            lambda x: x.strip() == 'right')

    def cleanTeam(self):
        """
        True if home team, False otherwise.
        """
        self.data['Team'] = self.data['Team'].apply(
            lambda x: x.strip() == 'home')

    def parse(self):
        """
        Runs the cleaning steps in order and returns the cleaned DataFrame.
        """
        self.data['WindSpeed'] = self.data['WindSpeed'].apply(self.cleanWindSpeed)
        self.data['GameWeather'] = self.data['GameWeather'].apply(self.cleanGameWeather)
        self.mapGameWeather()
        self.data['StadiumType'] = self.data['StadiumType'].apply(self.cleanStadiumType)
        self.data['StadiumType'] = self.data['StadiumType'].apply(self.cleanStadiumType2)
        self.data['WindDirection'] = self.data['WindDirection'].apply(self.cleanWindDirection)
        self.data['WindDirection'] = self.data['WindDirection'].apply(self.mapWindDirection)
        self.cleanOffenseFormation()
        self.cleanOffencePersonnel()
        self.cleanDefencePersonnel()
        self.cleanHeight()
        self.cleanTimeHandoff()
        self.cleanTimeSnap()
        self.cleanTurf()
        self.cleanPossessionTeam()
        self.cleanPlayerBirthDate()
        self.cleanPlayDirection()
        self.cleanTeam()

        return self.data

In [None]:
class FeatureEngine:
    """A Feature Engineer for the NFL data

    This class implements an engine that engineers
    the variables of the NFL dataset for the 2019 NFL data competition held on Kaggle.

    Parameters
    ----------
    data: the nfl data in pandas DataFrame format
    predict: must be True in the prediction stage (stored, but not used by
        any of the engineering steps)
    exclude: A list of feature engineering processes to exclude; the valid
        names are the entries of ``self.include``

    Notes
    -----
    Might not work if some of the columns have been edited since initial
    import. In that case, either implement additional feature engineering
    methods or reload the data.

    The steps run in the order of ``self.include``. 'YardsLeft' needs the
    HomeField column and 'PlayerAge' needs TimeHandoff, which 'HandSnapDelta'
    removes, so excluding or reordering steps can break later ones.

    References
    ----------
    The methods are based on the ones found at:
    https://www.kaggle.com/prashantkikani/nfl-starter-lgb-feature-engg
    https://www.kaggle.com/bgmello/neural-networks-feature-engineering-for-the-win
    """

    def __init__(self, data, predict=False, exclude=None):
        self.data = data  # Clean data from the parser
        self.predict = predict
        # A None default avoids sharing one list object between instances.
        self.exclude = list(exclude) if exclude is not None else []
        self.include = ['X',
                        'Orientation',
                        'HomeField',
                        'FieldEqPossession',
                        'isRusher',
                        'PlayerAge',
                        'HandSnapDelta',
                        'YardsLeft',
                        'BMI',
                        'DefendersInTheBox_vs_Distance']

    def engineerX(self):
        """
        Readjusts X: kept as-is when PlayDirection is truthy, otherwise
        mirrored to 120 - X.
        """
        moving_right = self.data['PlayDirection'].astype(bool)
        self.data['X'] = self.data['X'].where(
            moving_right, 120 - self.data['X'])

    def engineerOrientation(self, angle, play_direction):
        """
        Readjusts a single angle: returned unchanged unless play_direction
        equals 0, in which case it becomes 360 - angle (with 360 folded back
        to 0).

        References
        ----------
        #from https://www.kaggle.com/scirpus/hybrid-gp-and-nn
        """
        if play_direction == 0:
            new_angle = 360.0 - angle
            if new_angle == 360.0:
                new_angle = 0.0
            return new_angle
        else:
            return angle

    def _engineerAngles(self):
        """
        Applies engineerOrientation to every row of Orientation and Dir.
        """
        directions = self.data['PlayDirection']
        for column in ('Orientation', 'Dir'):
            self.data[column] = [
                self.engineerOrientation(angle, direction)
                for angle, direction in zip(self.data[column], directions)]

    def engineerFieldEqPossession(self):
        """
        Flags rows where FieldPosition equals PossessionTeam.
        """
        self.data['FieldEqPossession'] = self.data['FieldPosition'] == self.data['PossessionTeam']

    def engineerHomeField(self):
        """
        Flags rows where FieldPosition equals HomeTeamAbbr.
        """
        self.data['HomeField'] = self.data['FieldPosition'] == self.data['HomeTeamAbbr']

    def engineerIsRusher(self):
        """
        Flags the row of the rusher (NflId equals NflIdRusher), then drops
        both id columns.
        """
        self.data['isRusher'] = self.data['NflId'] == self.data['NflIdRusher']
        self.data = self.data.drop(['NflId', 'NflIdRusher'], axis=1)

    def engineerHandoffSnapDelta(self):
        """
        Stores TimeHandoff - TimeSnap in seconds as TimeDelta, then drops
        both timestamp columns.
        """
        elapsed = pd.to_datetime(self.data['TimeHandoff']) - \
            pd.to_datetime(self.data['TimeSnap'])
        self.data['TimeDelta'] = elapsed.dt.total_seconds()
        self.data = self.data.drop(['TimeHandoff', 'TimeSnap'], axis=1)

    def engineerYardsLeft(self):
        """
        Computes yards left from end-zone

        Note
        ----
        Requires variable HomeField (must execute engineerHomeField before execution)
        """
        yard_line = self.data['YardLine']
        on_home_side = self.data['HomeField'].astype(bool)
        moving_right = self.data['PlayDirection'].astype(bool)
        # Step 1: 100 - YardLine where HomeField is set, YardLine elsewhere.
        yards_left = (100 - yard_line).where(on_home_side, yard_line)
        # Step 2: mirror the result where PlayDirection is falsy.
        self.data['YardsLeft'] = yards_left.where(
            moving_right, 100 - yards_left)

    def engineerBMI(self):
        """
        Computes the BMI of a player from height and weight and stores it as
        PlayerBMI.
        """
        # 703 looks like the imperial-unit BMI factor (pounds, inches) --
        # TODO confirm the units of PlayerWeight.
        self.data['PlayerBMI'] = 703 * \
            (self.data['PlayerWeight']/(self.data['PlayerHeight'])**2)

    def engineerPlayerAge(self):
        """
        Computes the age of the player in years at TimeHandoff, then drops
        PlayerBirthDate.
        """
        seconds_in_year = 60*60*24*365.25
        lifetime = pd.to_datetime(self.data['TimeHandoff']) - \
            pd.to_datetime(self.data['PlayerBirthDate'])
        self.data['PlayerAge'] = lifetime.dt.total_seconds() / seconds_in_year
        self.data = self.data.drop(['PlayerBirthDate'], axis=1)

    def engineerDefendersInTheBox_vs_Distance(self):
        """
        Fills missing DefendersInTheBox with the column mode and stores the
        ratio DefendersInTheBox / Distance.
        """
        dfInBox_mode = self.data['DefendersInTheBox'].mode()
        if not dfInBox_mode.empty:  # mode() is empty when every value is NaN
            # Assign back: an inplace fillna on a column selection may only
            # change a temporary copy.
            self.data['DefendersInTheBox'] = self.data['DefendersInTheBox'].fillna(
                dfInBox_mode.iloc[0])
        self.data['DefendersInTheBox_vs_Distance'] = self.data['DefendersInTheBox'] / \
            self.data['Distance']

    ### Outputs clean and engineered DataFrame ###

    def engineer(self):
        """
        Runs every step listed in ``self.include`` that is not in
        ``self.exclude`` and returns the engineered DataFrame.
        """
        steps = {
            'X': self.engineerX,
            'Orientation': self._engineerAngles,
            'FieldEqPossession': self.engineerFieldEqPossession,
            'HomeField': self.engineerHomeField,
            'YardsLeft': self.engineerYardsLeft,
            'isRusher': self.engineerIsRusher,
            'PlayerAge': self.engineerPlayerAge,
            'HandSnapDelta': self.engineerHandoffSnapDelta,
            'BMI': self.engineerBMI,
            'DefendersInTheBox_vs_Distance': self.engineerDefendersInTheBox_vs_Distance,
        }
        for name in self.include:
            if name in self.exclude:
                continue
            step = steps.get(name)
            if step is not None:
                step()

        return self.data

In [None]:
def DataReshaper2(data, predict=False, playersCol=None, scaler=None):
    """
    Takes the parsed and feature engineered data and outputs X_train and y_train
    arrays that are compatible with neural networks and machine learning algorithms

    Parameters:
    -----------
    data: parsed and feature engineered data (pandas dataframe format);
        every play must contribute exactly 22 rows
    predict: must be true in the prediction stage
    playersCol: pass the players_col created in the training stage
    scaler: optional scaler object. In the training stage it is fitted with
        ``fit_transform``; in the prediction stage it is applied with
        ``transform`` only, so pass the scaler used for training. When
        omitted, a new StandardScaler is fitted on ``data`` in both stages.

    Returns:
    --------
    X_train: a 2 dimensional array with one row per play, housing all predictor variables
    y_train: a 2 dimensional array of shape (plays, 199); row i is 0 before
        index Yards + 99 and 1 from there on (training stage only)
    players_col: The names of variables that are unique to each player
        (ex: height and weight) (training stage only)

    Note:
    -----
    Requires StandardScaler from the scikit-learn library unless a scaler is passed

    References
    ----------
    https://www.kaggle.com/bgmello/neural-networks-feature-engineering-for-the-win
    """

    ### Dropping Unnecessary Columns and Filling NAs ###
    data = data.sort_values(
        by=['PlayId', 'Team', 'isRusher', 'JerseyNumber']).reset_index(drop=True)
    data = data.drop(['GameId', 'PlayId', 'isRusher', 'Team'], axis=1)

    drop_col = [c for c in data.columns if data[c].dtype == 'object']
    data = data.drop(drop_col, axis=1)

    data = data.fillna(-999)

    if len(data) % 22 != 0:
        raise ValueError('data must contain exactly 22 rows per play')

    ### Creating One Large Row ###
    # Work on a copy so neither the caller's list nor a shared default grows.
    players_col = list(playersCol) if playersCol else []
    if(not predict):
        for col in data.columns:
            # A column that varies within the first play (22 rows) is treated as
            # per-player; constant ones are per-play and are stored only once.
            if data[col][:22].std() != 0 and col != 'GameWeather':
                players_col.append(col)

    if not players_col:
        raise ValueError(
            'no player columns: pass playersCol from the training stage')

    X_train = np.array(data[players_col]).reshape(-1, len(players_col)*22)

    if(not predict):
        play_col = data.drop(players_col + ['Yards'], axis=1).columns
    else:
        play_col = data.drop(players_col, axis=1).columns

    X_play_col = np.zeros(shape=(X_train.shape[0], len(play_col)))
    for i, col in enumerate(play_col):
        X_play_col[:, i] = data[col].to_numpy()[::22]

    X_train = np.concatenate([X_train, X_play_col], axis=1)

    ### Reshaping y_train(only for training stage) ###
    if(not predict):
        y_train = np.zeros(shape=(X_train.shape[0], 199))
        # Read the targets from the sorted `data`, so that row i of y_train
        # belongs to the same play as row i of X_train.
        for i, yard in enumerate(data['Yards'].to_numpy()[::22]):
            start = min(max(int(yard) + 99, 0), 199)
            y_train[i, start:] = 1

    if scaler is None:
        X_train = StandardScaler().fit_transform(X_train)
    elif predict:
        X_train = scaler.transform(X_train)
    else:
        X_train = scaler.fit_transform(X_train)

    if(not predict):
        return X_train, y_train, players_col
    return X_train

In [17]:
# Load the raw training data; the relative path expects train.csv two
# directories above the notebook's working directory.
train = pd.read_csv('../../train.csv')

In [None]:
# Parse a copy so the raw `train` frame stays untouched and this cell can be
# re-run without reloading the CSV.
parser = DataParser(train.copy())
train_parsed = parser.parse()

engine = FeatureEngine(train_parsed)
train_features = engine.engineer()

# In the training stage DataReshaper2 returns three values; players_col must
# be kept and passed back as `playersCol` when reshaping data for prediction.
X_train, y_train, players_col = DataReshaper2(train_features)

### Neural Network

In [None]:
#from https://www.kaggle.com/davidcairuz/nfl-neural-network-w-softmax
def crps(y_true, y_pred):
    """
    Loss used to train the network.

    Accumulates ``y_pred`` along axis 1, subtracts the result from
    ``y_true``, squares the difference and averages it along axis 1, giving
    one loss value per sample.
    """
    cumulative_pred = K.cumsum(y_pred, axis=1)
    squared_error = K.square(y_true - cumulative_pred)
    return K.mean(squared_error, axis=1)

In [None]:
def build_model(input_dim=None):
    """
    Builds and compiles the network: two 32-unit ReLU layers followed by a
    199-unit softmax layer, trained with rmsprop on the `crps` loss.

    Parameters
    ----------
    input_dim: number of input features. When omitted, the width of the
        global ``X_train`` is used, as before.

    Returns
    -------
    The compiled keras Sequential model.
    """
    if input_dim is None:
        # Backward-compatible fallback: read the width from the notebook's
        # global training matrix.
        input_dim = X_train.shape[1]

    model = keras.models.Sequential()
    model.add(keras.layers.Dense(32, activation='relu', input_shape=(input_dim,)))
    model.add(keras.layers.Dense(32, activation='relu'))
    model.add(keras.layers.Dense(199, activation='softmax'))
    model.compile(optimizer='rmsprop', loss=crps)
    return model

In [None]:
# Callbacks handed to every `fit` call made by `train_model`.
callbacks_list = [
    # Stop once val_loss has gone `patience` epochs without improving.
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=2),
    # Keep only the best-scoring weights (by val_loss) in my_model.h5.
    keras.callbacks.ModelCheckpoint(
        filepath='my_model.h5', monitor='val_loss', save_best_only=True),
    # Write TensorBoard logs to my_log_dir.
    keras.callbacks.TensorBoard(log_dir='my_log_dir'),
]

In [None]:
def train_model(X_train, y_train, X_val, y_val):
    """
    Builds a fresh model with `build_model` and fits it on the training
    split, validating on (X_val, y_val).

    Training runs for at most 200 epochs with batch size 64, silently
    (verbose=0), using the module-level `callbacks_list`.

    Returns
    -------
    The fitted model.
    """
    fit_options = dict(
        validation_data=(X_val, y_val),
        epochs=200,
        batch_size=64,
        callbacks=callbacks_list,
        verbose=0,
    )
    network = build_model()
    network.fit(X_train, y_train, **fit_options)
    return network

In [None]:
# 5 folds repeated 5 times -> 25 models. The fixed random_state makes the fold
# assignment identical on every run (network weight initialisation is still
# random).
rkf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

models = []

for tr_idx, vl_idx in rkf.split(X_train, y_train):

    x_tr, y_tr = X_train[tr_idx], y_train[tr_idx]
    x_vl, y_vl = X_train[vl_idx], y_train[vl_idx]

    model = train_model(x_tr, y_tr, x_vl, y_vl)
    models.append(model)

In [None]:
# `train_model` does not return the History object produced by `fit`, so take
# it from the most recently trained fold model, where Keras stores it as
# `model.history`.
history = models[-1].history

history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)

plt.plot(epochs, loss_values, 'bo', label='Training loss')
plt.plot(epochs, val_loss_values, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()