In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import math
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras import *
import tensorflow

In [26]:
DATA_PATH = '../raw_data'

In [27]:
train = pd.read_csv(f'{DATA_PATH}/train.csv')
X = train.drop(columns='Yards')
y = train.Yards

In [28]:
train.dtypes.value_counts()

object     24
int64      15
float64    10
dtype: int64

# Categorical Features

In [29]:
train_cat = train.select_dtypes(include=['object'])

In [30]:
train_cat.nunique()

Team                     2
DisplayName           2568
GameClock              901
PossessionTeam          32
FieldPosition           32
OffenseFormation         8
OffensePersonnel        61
DefensePersonnel        45
PlayDirection            2
TimeHandoff          30709
TimeSnap             30721
PlayerHeight            16
PlayerBirthDate       1897
PlayerCollegeName      314
Position                25
HomeTeamAbbr            32
VisitorTeamAbbr         32
Stadium                 61
Location                67
StadiumType             33
Turf                    23
GameWeather             73
WindSpeed               69
WindDirection           58
dtype: int64

In [31]:
train_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682154 entries, 0 to 682153
Data columns (total 24 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Team               682154 non-null  object
 1   DisplayName        682154 non-null  object
 2   GameClock          682154 non-null  object
 3   PossessionTeam     682154 non-null  object
 4   FieldPosition      673552 non-null  object
 5   OffenseFormation   682066 non-null  object
 6   OffensePersonnel   682154 non-null  object
 7   DefensePersonnel   682154 non-null  object
 8   PlayDirection      682154 non-null  object
 9   TimeHandoff        682154 non-null  object
 10  TimeSnap           682154 non-null  object
 11  PlayerHeight       682154 non-null  object
 12  PlayerBirthDate    682154 non-null  object
 13  PlayerCollegeName  682154 non-null  object
 14  Position           682154 non-null  object
 15  HomeTeamAbbr       682154 non-null  object
 16  VisitorTeamAbbr    6

## Team

In [32]:
train_cat.Team.unique()

array(['away', 'home'], dtype=object)

In [33]:
def proc_team(x):
    return 1 if x == 'home' else 0

## DisplayName

In [34]:
train_cat.DisplayName.nunique()

2568

## GameClock

In [35]:
train_cat.GameClock.unique()

array(['14:14:00', '13:52:00', '13:02:00', '12:12:00', '12:08:00',
       '11:21:00', '10:34:00', '09:34:00', '09:25:00', '06:34:00',
       '03:47:00', '03:03:00', '01:36:00', '00:37:00', '00:23:00',
       '14:44:00', '13:17:00', '12:49:00', '11:23:00', '09:57:00',
       '09:17:00', '08:38:00', '08:15:00', '07:31:00', '07:22:00',
       '06:11:00', '05:29:00', '03:44:00', '02:29:00', '15:00:00',
       '10:53:00', '10:23:00', '09:13:00', '07:55:00', '07:08:00',
       '05:35:00', '05:03:00', '04:21:00', '02:31:00', '02:00:00',
       '00:57:00', '00:04:00', '12:29:00', '10:28:00', '10:00:00',
       '08:20:00', '05:19:00', '04:15:00', '04:05:00', '02:44:00',
       '02:37:00', '02:32:00', '14:52:00', '12:48:00', '12:15:00',
       '11:35:00', '10:33:00', '09:40:00', '06:13:00', '01:57:00',
       '00:20:00', '14:30:00', '13:22:00', '06:16:00', '05:48:00',
       '04:44:00', '03:55:00', '02:38:00', '02:10:00', '14:54:00',
       '12:00:00', '10:41:00', '09:05:00', '08:05:00', '06:52:

In [36]:
# transform game clock time in minutes
def proc_gameclock(x):
    min,sec,msec = x.split(':')
    return int(min) + int(sec)/60 + int(msec)/3600

## PossessionTeam

In [37]:
train_cat.PossessionTeam.unique()

array(['NE', 'KC', 'BUF', 'NYJ', 'ATL', 'CHI', 'CIN', 'BLT', 'CLV', 'PIT',
       'ARZ', 'DET', 'JAX', 'HST', 'OAK', 'TEN', 'WAS', 'PHI', 'LA',
       'IND', 'SEA', 'GB', 'CAR', 'SF', 'DAL', 'NYG', 'NO', 'MIN', 'DEN',
       'LAC', 'TB', 'MIA'], dtype=object)

In [38]:
train_cat[
    (train_cat.PossessionTeam != train_cat.HomeTeamAbbr) & 
    (train_cat.PossessionTeam != train_cat.VisitorTeamAbbr)][['PossessionTeam','HomeTeamAbbr','VisitorTeamAbbr']]

Unnamed: 0,PossessionTeam,HomeTeamAbbr,VisitorTeamAbbr
2992,BLT,CIN,BAL
2993,BLT,CIN,BAL
2994,BLT,CIN,BAL
2995,BLT,CIN,BAL
2996,BLT,CIN,BAL
...,...,...,...
682149,BLT,LA,BAL
682150,BLT,LA,BAL
682151,BLT,LA,BAL
682152,BLT,LA,BAL


## FieldPosition

In [39]:
train_cat.FieldPosition.unique()

array(['NE', 'KC', nan, 'BUF', 'NYJ', 'ATL', 'CHI', 'CIN', 'BLT', 'CLV',
       'PIT', 'ARZ', 'DET', 'JAX', 'HST', 'TEN', 'OAK', 'WAS', 'PHI',
       'LA', 'IND', 'GB', 'SEA', 'CAR', 'SF', 'DAL', 'NYG', 'NO', 'MIN',
       'DEN', 'LAC', 'TB', 'MIA'], dtype=object)

In [40]:
def possession_in_fieldPosition(df):
    df['PossessionInFieldPosition'] = df.FieldPosition == df.PossessionTeam
    df['PossessionInFieldPosition'] = df['PossessionInFieldPosition'].apply(lambda x : 1 if x else 0)
    return df.drop(columns = ['FieldPosition','PossessionTeam'])

## OffenseFormation

In [41]:
train_cat.OffenseFormation.unique()

array(['SHOTGUN', 'SINGLEBACK', 'JUMBO', 'PISTOL', 'I_FORM', 'ACE',
       'WILDCAT', nan, 'EMPTY'], dtype=object)

In [42]:
# One Hot Encoding of the offense formation values
def oneHotEncoding_offense_formation(df):
    offense_formation_dummies = pd.get_dummies(df.OffenseFormation, columns=df.OffenseFormation.unique())
    return pd.concat([df.drop('OffenseFormation', axis=1),offense_formation_dummies], axis=1)

## OffensePersonnel

In [43]:
# feature is droped because it is already contained in Position feature

## DefensePersonnel

In [44]:
# feature is droped because it is already contained in Position feature

## Position

In [45]:
train_cat.Position.unique()

array(['SS', 'DE', 'ILB', 'FS', 'CB', 'DT', 'WR', 'TE', 'T', 'QB', 'RB',
       'G', 'C', 'OLB', 'NT', 'FB', 'MLB', 'LB', 'OT', 'S', 'OG', 'HB',
       'DB', 'DL', 'SAF'], dtype=object)

In [46]:
# One Hot Encoding of the positions
def oneHotEncoding_position(df):
    position_dummies = pd.get_dummies(df.Position, columns=df.Position.unique())
    return pd.concat([df.drop('Position', axis=1),position_dummies], axis=1)

## PlayDirection

In [47]:
train_cat.PlayDirection.unique()

array(['left', 'right'], dtype=object)

In [48]:
def proc_play_direction(x):
    return 1 if x == 'left' else 0

## TimeHandoff / TimeSnap / PlayerBithDate

In [49]:
train_cat.TimeHandoff.unique()

array(['2017-09-08T00:44:06.000Z', '2017-09-08T00:44:27.000Z',
       '2017-09-08T00:45:17.000Z', ..., '2019-11-26T03:46:54.000Z',
       '2019-11-26T03:58:22.000Z', '2019-11-26T03:59:06.000Z'],
      dtype=object)

In [50]:
train_cat.TimeSnap.unique()

array(['2017-09-08T00:44:05.000Z', '2017-09-08T00:44:26.000Z',
       '2017-09-08T00:45:15.000Z', ..., '2019-11-26T03:46:53.000Z',
       '2019-11-26T03:58:21.000Z', '2019-11-26T03:59:05.000Z'],
      dtype=object)

In [51]:
train_cat.PlayerBirthDate.unique()

array(['12/29/1988', '03/25/1989', '01/21/1989', ..., '11/12/1995',
       '07/01/1996', '08/28/1995'], dtype=object)

In [52]:
def proc_time_handoff_snap_and_player_age(df):
    seconds_in_year = 3600*24*365.25
    df['TimeDeltaHandoff'] = (df.TimeHandoff.apply(lambda x : datetime.strptime(x,'%Y-%m-%dT%H:%M:%S.%fZ'))
                          - df.TimeSnap.apply(lambda x : datetime.strptime(x,'%Y-%m-%dT%H:%M:%S.%fZ')))
    df['TimeDeltaHandoff'] = df['TimeDeltaHandoff'].apply(lambda x : x.total_seconds()/seconds_in_year)
    df['PlayerAge'] = (df.TimeHandoff.apply(lambda x : datetime.strptime(x,'%Y-%m-%dT%H:%M:%S.%fZ'))
                          - df.PlayerBirthDate.apply(lambda x : datetime.strptime(x,'%m/%d/%Y')))
    df['PlayerAge'] = df['PlayerAge'].apply(lambda x : x.total_seconds()/seconds_in_year)
    return df.drop(columns=['TimeHandoff','TimeSnap','PlayerBirthDate'])

## PlayerHeight

In [53]:
train_cat.PlayerHeight.unique()

array(['6-0', '6-3', '6-2', '5-11', '6-6', '6-9', '6-4', '6-1', '5-10',
       '6-5', '6-7', '5-9', '5-8', '5-7', '6-8', '5-6'], dtype=object)

In [54]:
def proc_player_height(x):
    return float(f"{x.split('-')[0]}.{x.split('-')[1]}") * 30.48

## PlayerCollegeName

In [55]:
train_cat.PlayerCollegeName.unique()

array(['Tennessee', 'Miami', 'Georgia', 'Texas', 'Newberry', 'Auburn',
       'Oregon', 'Rice', 'Brigham Young', 'Washington',
       'Mississippi State', 'Texas Tech', 'Arizona', 'Texas Christian',
       'Colorado', 'Michigan', 'Monmouth, N.J.', 'Oregon State',
       'Wisconsin', 'Georgia Tech', 'North Carolina State',
       'Central Michigan', 'LSU', 'Southern Mississippi', 'Brown',
       'Clemson', 'Florida', 'Utah', 'California', 'Cincinnati',
       'Wisconsin-Milwaukee', 'McGill', 'South Dakota State', 'Missouri',
       'Penn State', 'Toledo', 'Rutgers', 'Alabama', 'South Carolina',
       'West Alabama', 'Arkansas', 'Stanford', 'Nebraska', 'Connecticut',
       'Georgia State', 'Arizona State', 'Houston', 'Pittsburgh', 'UCLA',
       'Minnesota', 'Vanderbilt', 'Abilene Christian', 'Chattanooga',
       'Temple', 'Troy', 'Arkansas State', 'Louisiana State',
       'Florida State', 'Southern California', 'Ohio State', 'Louisville',
       'Tulsa', 'Virginia Tech', 'Louisiana 

## HomeTeamAbbr

In [56]:
train_cat.HomeTeamAbbr.unique()

array(['NE', 'BUF', 'CHI', 'CIN', 'CLE', 'DET', 'HOU', 'TEN', 'WAS', 'LA',
       'GB', 'SF', 'DAL', 'MIN', 'DEN', 'BAL', 'CAR', 'IND', 'JAX', 'KC',
       'NO', 'PIT', 'TB', 'LAC', 'OAK', 'SEA', 'ATL', 'NYG', 'NYJ', 'PHI',
       'ARI', 'MIA'], dtype=object)

## Stadium

In [57]:
train_cat.Stadium.unique()

array(['Gillette Stadium', 'New Era Field', 'Soldier Field',
       'Paul Brown Stadium', 'FirstEnergy', 'Ford Field', 'NRG Stadium',
       'Nissan Stadium', 'FedExField', 'Los Angeles Memorial Coliseum',
       'Lambeau Field', 'Levis Stadium', 'AT&T Stadium',
       'U.S. Bank Stadium', 'Sports Authority Field at Mile High',
       'M&T Bank Stadium', 'Bank of America Stadium', 'Lucas Oil Stadium',
       'Everbank Field', 'Arrowhead Stadium', 'Mercedes-Benz Superdome',
       'Heinz Field', 'Raymond James Stadium', 'StubHub Center',
       'Oakland-Alameda County Coliseum', 'CenturyLink Field',
       'Mercedes-Benz Dome', 'MetLife Stadium', 'Wembley Stadium',
       'Lincoln Financial Field', 'University of Phoenix Stadium',
       'Mercedes-Benz Stadium', 'M&T Stadium', 'First Energy Stadium',
       'NRG', 'MetLife', 'CenturyLink', 'FirstEnergy Stadium',
       'Hard Rock Stadium', 'EverBank Field', 'Twickenham',
       'Twickenham Stadium', 'Estadio Azteca', 'M & T Bank Stadium

## Location

In [58]:
train_cat.Location.unique()

array(['Foxborough, MA', 'Orchard Park NY', 'Chicago. IL',
       'Cincinnati, Ohio', 'Cleveland, Ohio', 'Detroit, MI',
       'Houston, Texas', 'Nashville, TN', 'Landover, MD',
       'Los Angeles, Calif.', 'Green Bay, WI', 'Santa Clara, CA',
       'Arlington, Texas', 'Minneapolis, MN', 'Denver, CO',
       'Baltimore, Md.', 'Charlotte, North Carolina',
       'Indianapolis, Ind.', 'Jacksonville, FL', 'Kansas City, MO',
       'New Orleans, LA', 'Pittsburgh', 'Tampa, FL', 'Carson, CA',
       'Oakland, CA', 'Seattle, WA', 'Atlanta, GA', 'East Rutherford, NJ',
       'London, England', 'Chicago, IL', 'Detroit', 'Philadelphia, Pa.',
       'Glendale, AZ', 'Cleveland, OH', 'Foxborough, Ma',
       'E. Rutherford, NJ', 'Miami Gardens, Fla.', 'Houston, TX',
       'London', 'New Orleans, La.', 'Mexico City', 'Baltimore, Maryland',
       'Arlington, TX', 'Jacksonville, Fl', 'Jacksonville, Florida',
       'Pittsburgh, PA', 'Charlotte, NC', 'Cleveland,Ohio',
       'East Rutherford, N.J.',

## StadiumType

In [59]:
train_cat.StadiumType.value_counts()

Outdoor                      362516
Outdoors                      92708
Indoors                       56826
Dome                          23122
Indoor                        19140
Retractable Roof              18766
Open                          11308
Retr. Roof-Closed             11044
Domed, closed                  6908
Retr. Roof - Closed            6446
Domed, open                    3696
Retr. Roof-Open                3014
Retractable Roof - Closed      2222
Closed Dome                    2134
Dome, closed                   1826
Domed                          1826
Domed, Open                    1760
OUTDOOR                        1254
Oudoor                         1188
indoor                         1166
Retr. Roof Closed              1056
Indoor, Roof Closed            1056
Bowl                            968
Outddors                        968
Heinz Field                     902
Outdoor Retr Roof-Open          880
Retr. Roof - Open               880
Ourdoor                     

In [60]:
def convert_stadium_type_to_dict(text):
    stadium_type_dict = {}
    if str(text)=='nan':
        return stadium_type_dict
    if 'outdoor' in text.lower() or 'open' in text.lower() or 'heinz' in text.lower() \
    or 'ourdoor' in text.lower() or 'outdor' in text.lower():
        stadium_type_dict['outdoor'] = 1
    if ('indoor' in text.lower() and 'open' not in text.lower()) or 'closed' in text.lower():
        stadium_type_dict['indoor'] = 1
    if 'retr' in text.lower():
        stadium_type_dict['retractable'] = 1
    return stadium_type_dict

In [61]:
def oneHotEncoding_stadium_type(df):
    bow_stadium_type = df.StadiumType.apply(lambda x : convert_stadium_type_to_dict(x))
    vect = DictVectorizer(sparse=False)
    vectors_stadium_types = vect.fit_transform(bow_stadium_type)
    stadium_type_dummies = pd.DataFrame(vectors_stadium_types, columns=vect.get_feature_names())
    return pd.concat([df.drop('StadiumType', axis=1), stadium_type_dummies], axis=1)

## Turf

In [62]:
train_cat.Turf.unique()

array(['Field Turf', 'A-Turf Titan', 'Grass', 'UBU Sports Speed S5-M',
       'Artificial', 'DD GrassMaster', 'Natural Grass',
       'UBU Speed Series-S5-M', 'FieldTurf', 'FieldTurf 360',
       'Natural grass', 'grass', 'Natural', 'Artifical', 'FieldTurf360',
       'Naturall Grass', 'Field turf', 'SISGrass',
       'Twenty-Four/Seven Turf', 'natural grass', 'UBU-Speed Series-S5-M',
       'Twenty Four/Seven Turf', 'Turf'], dtype=object)

In [63]:
def convert_turf(x):
    return 'Natural' if x.lower() in ['grass','natural grass','natural','naturall grass'] else 'Artificial'

In [64]:
def process_turf(df):
    df['IsTurfNatural'] = df.Turf.apply(lambda x : 1 if convert_turf(x)=='Natural' else 0)
    return df.drop('Turf', axis=1)

## GameWeather

In [65]:
train_cat.GameWeather.unique()

array(['Clear and warm', 'Sun & clouds', 'Sunny', 'Controlled Climate',
       'Mostly Sunny', 'Clear', nan, 'Indoor', 'Mostly Cloudy',
       'Mostly Coudy', 'Partly sunny', 'Partly Cloudy', 'Cloudy',
       'Sunny, highs to upper 80s', 'Indoors', 'Light Rain', 'Showers',
       'Partly cloudy', 'Partly Sunny', '30% Chance of Rain',
       'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
       'Rain', 'Cloudy, fog started developing in 2nd quarter', 'Coudy',
       'Rain likely, temps in low 40s.', 'Cold', 'N/A (Indoors)',
       'Clear skies', 'cloudy', 'Fair', 'Mostly cloudy',
       'Cloudy, chance of rain', 'Heavy lake effect snow', 'Party Cloudy',
       'Cloudy, light snow accumulating 1-3"', 'Cloudy and cold', 'Snow',
       'Hazy', 'Scattered Showers', 'Cloudy and Cool', 'N/A Indoor',
       'Rain Chance 40%', 'Clear and sunny', 'Mostly sunny',
       'Sunny and warm', 'Partly clear', 'Cloudy, 50% change of rain',
       'Clear and Sunny', '

In [66]:
def convert_game_weather_to_dict(text):
    game_weather_dict = {}
    if str(text)=='nan':
        return game_weather_dict
    if 'clear' in text.lower(): 
        game_weather_dict['clear'] = 1
    if 'warm' in text.lower():
        game_weather_dict['warm'] = 1
    if 'sun' in text.lower():
        game_weather_dict['sunny'] = 1
    if 'cloud' in text.lower() or 'coudy' in text.lower() or 'clouidy' in text.lower() or 'overcast' in text.lower():
        game_weather_dict['cloud'] = 1
    if 'indoor' in text.lower():
        game_weather_dict['indoor'] = 1
    if 'rain' in text.lower():
        game_weather_dict['rain'] = 1
    if 'shower' in text.lower():
        game_weather_dict['shower'] = 1
    if 'snow' in text.lower():
        game_weather_dict['snow'] = 1
    if 'cold' in text.lower():
        game_weather_dict['cold'] = 1
    if 'cool' in text.lower():
        game_weather_dict['cool'] = 1
    return game_weather_dict

In [67]:
def oneHotEncoding_game_weather(df):
    bow_game_weather = df.GameWeather.apply(lambda x : convert_game_weather_to_dict(x))
    vect = DictVectorizer(sparse=False)
    vectors_game_weather = vect.fit_transform(bow_game_weather)
    game_weather_dummies = pd.DataFrame(vectors_game_weather, columns=vect.get_feature_names())
    return pd.concat([df.drop('GameWeather', axis=1), game_weather_dummies], axis=1)

## WindSpeed

In [68]:
train_cat.WindSpeed.unique()

array([8.0, 6.0, 10.0, 9.0, 11.0, nan, 7.0, 5.0, 2.0, 12.0, 1, 3, 4, 13,
       '10', '5', '6', '4', '8', '0', 'SSW', 14.0, 0.0, 15.0, 17.0, 18.0,
       16.0, '11-17', '16', '14', '13', '12', '23', '7', '9', '3', '17',
       '14-23', '1', '13 MPH', 24.0, '15', '12-22', '2', '4 MPh',
       '15 gusts up to 25', '11', '10MPH', '10mph', '22', 'E', '7 MPH',
       'Calm', '6 mph', '19', 'SE', '20', '10-20', '12mph', '6mph',
       '9mph', 'SSE', '14 Gusting to 24', '6 mph, Gusts to 10',
       '2 mph, gusts to 5', 23.0, 19, '12 mph', '9 mph, gusts to 13',
       '10 mph, gusts to 15'], dtype=object)

In [69]:
def process_wind_speed(x):
    digits = [int(i) for i in str(x).lower().replace('mph','').replace('.0','').replace('-',' ').split(' ') if i.isnumeric()]
    return sum(digits)/len(digits) if len(digits)>0 else 0

## WindDirection

In [70]:
train_cat.WindDirection.unique()

array(['SW', 'NNE', 'SE', 'East', nan, 'NE', 'North', 'S', 'Northwest',
       'SouthWest', 'ENE', 'ESE', 'SSW', 'NW', 'Northeast', 'From S', 'W',
       'South', 'West-Southwest', 'E', '13', 'N', 'NNW',
       'South Southeast', 'SSE', 'West', 'WSW', 'From SW', 'WNW', 's',
       'NorthEast', 'from W', 'W-NW', 'South Southwest', 'Southeast',
       'From WSW', 'West Northwest', 'Calm', 'From SSE', 'From W',
       'East North East', 'From ESE', 'EAST', 'East Southeast',
       'From SSW', '8', 'North East', 'Southwest', 'North/Northwest',
       'From NNE', '1', 'N-NE', 'W-SW', 'From NNW', 'S-SW', 'From NE',
       'South west', 'South, Southeast', 'Southerly'], dtype=object)

In [71]:
dict_wind_direction = {
    'east': 'e', 
    'north': 'n', 
    'northwest': 'nw',
    'southwest': 'sw', 
    'northeast': 'ne', 
    'south': 's', 
    'west-southwest': 'wsw', 
    'south southeast': 'sse', 
    'west': 'w', 
    'northeast': 'ne', 
    'w-nw': 'wnw', 
    'south southwest': 'ssw', 
    'southeast': 'se',
    'west northwest': 'wnw',
    'east north east': 'ene', 
    'east southeast': 'ese',
    'north east': 'ne', 
    'north/northwest': 'nnw',
    'n-ne': 'nne', 
    'w-sw': 'wsw', 
    's-sw': 'ssw', 
    'south west': 'sw', 
    'south, southeast': 'sse', 
    'southerly': 's'
}

In [72]:
def process_wind_direction(x):
    if isinstance(x,float) or x.isnumeric() or x.lower() in ['calm']:
        return ''
    return dict_wind_direction.get(x.lower().replace('from ',''), x.lower().replace('from ',''))

In [73]:
def oneHotEncoding_wind_direction(df):
    wind_direction_dummies = pd.get_dummies(df.WindDirection.apply(lambda x : process_wind_direction(x)), columns=df.WindDirection.unique())
    return pd.concat([df.drop('WindDirection', axis=1), wind_direction_dummies], axis=1)

## Drop categorical features

In [74]:
def drop_categorical_features(df):
    return df.drop(columns=['DisplayName','OffensePersonnel','DefensePersonnel', 'PlayerCollegeName','HomeTeamAbbr','VisitorTeamAbbr','Stadium','Location'], axis=1)

# Numerical Features

In [75]:
train_num = train.select_dtypes(include=['int64','float64'])

In [76]:
train_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682154 entries, 0 to 682153
Data columns (total 25 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   GameId                  682154 non-null  int64  
 1   PlayId                  682154 non-null  int64  
 2   X                       682154 non-null  float64
 3   Y                       682154 non-null  float64
 4   S                       682154 non-null  float64
 5   A                       682154 non-null  float64
 6   Dis                     682154 non-null  float64
 7   Orientation             682131 non-null  float64
 8   Dir                     682126 non-null  float64
 9   NflId                   682154 non-null  int64  
 10  JerseyNumber            682154 non-null  int64  
 11  Season                  682154 non-null  int64  
 12  YardLine                682154 non-null  int64  
 13  Quarter                 682154 non-null  int64  
 14  Down                

## Droped numerical features

In [77]:
def drop_numerical_features(df):
    return df.drop(columns=['GameId', 'PlayId', 'JerseyNumber','Season'], axis=1)

## NflId & NflIdRusher

In [78]:
def process_is_rusher(df):
    df['IsRusher'] = df.NflId == df.NflIdRusher
    df['IsRusher'] = df['IsRusher'].apply(lambda x : 1 if x else 0)
    return df.drop(columns=['NflId','NflIdRusher'])

## Orientation

In [126]:
# replace null values with 180 (happens to be the average)
def proc_orientation(x):
    return x if x==True else 180

## Dir

In [133]:
# replace null values with 180 (happens to be the average)
def proc_dir(x):
    return x if x==True else 180

## DefendersInTheBox

In [133]:
# replace null values with 180 (happens to be the average)
def proc_dir(x):
    return x if x==True else 180

In [135]:
test = train_num[train_num.DefendersInTheBox.notnull()]
sum(test.DefendersInTheBox)/len(test.DefendersInTheBox)

6.910339934206283

# Processing of all features

In [127]:
def process_features(df_source):
    df = df_source.copy()
    df.Team = df.Team.apply(lambda x : proc_team(x))
    df.GameClock = df.GameClock.apply(lambda x : proc_gameclock(x))
    df = possession_in_fieldPosition(df)
    df = oneHotEncoding_offense_formation(df)
    df = oneHotEncoding_position(df)
    df.PlayDirection = df.PlayDirection.apply(lambda x : proc_play_direction(x))
    df = proc_time_handoff_snap_and_player_age(df)
    df.PlayerHeight = df.PlayerHeight.apply(lambda x : proc_player_height(x))
    df = oneHotEncoding_stadium_type(df)
    df = process_turf(df)
    df = oneHotEncoding_game_weather(df)
    df.WindSpeed = df.WindSpeed.apply(lambda x : process_wind_speed(x))
    df = oneHotEncoding_wind_direction(df)
    df = process_is_rusher(df)
    df = drop_categorical_features(df)
    df.Orientation = df.Orientation.apply(lambda x : proc_orientation(x))
    df.Dir = df.Dir.apply(lambda x : proc_dir(x))
    df = drop_numerical_features(df)
    return df

# Save processed data

In [80]:
train_proc = process_features(train)

In [91]:
for col in train_proc.columns:
    if np.any(np.isnan(train_proc[col])):
        print(col)

Orientation
Dir
DefendersInTheBox
Temperature
Humidity


In [139]:
train_proc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682154 entries, 0 to 682153
Data columns (total 92 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Team                       682154 non-null  int64  
 1   X                          682154 non-null  float64
 2   Y                          682154 non-null  float64
 3   S                          682154 non-null  float64
 4   A                          682154 non-null  float64
 5   Dis                        682154 non-null  float64
 6   Orientation                682131 non-null  float64
 7   Dir                        682126 non-null  float64
 8   YardLine                   682154 non-null  int64  
 9   Quarter                    682154 non-null  int64  
 10  GameClock                  682154 non-null  float64
 11  Down                       682154 non-null  int64  
 12  Distance                   682154 non-null  int64  
 13  HomeScoreBeforePlay        68

In [170]:
X_train, X_test, y_train, y_test = train_test_split(train_proc, y, test_size=0.3, random_state=2)

In [171]:
X_train

Unnamed: 0,Team,X,Y,S,A,Dis,Orientation,Dir,YardLine,Quarter,...,nw,s,se,sse,ssw,sw,w,wnw,wsw,IsRusher
605093,0,83.46,24.50,1.96,0.79,0.20,80.87,276.82,4,4,...,0,0,0,0,0,1,0,0,0,0
524681,0,59.55,28.58,0.70,1.14,0.07,343.53,202.67,50,1,...,0,0,0,0,0,0,0,0,0,0
674288,0,32.90,34.49,2.41,0.44,0.26,58.21,317.77,25,1,...,0,0,0,0,0,0,0,0,0,0
443826,1,10.66,18.54,3.25,1.67,0.32,307.06,299.70,1,1,...,0,0,0,0,0,0,0,1,0,0
159363,1,57.44,30.63,3.34,2.02,0.31,167.20,212.90,44,2,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84434,1,51.05,42.58,3.26,2.80,0.57,143.49,159.34,39,3,...,0,0,1,0,0,0,0,0,0,0
437782,0,96.21,36.82,2.15,1.90,0.24,134.02,45.89,15,3,...,0,0,0,0,0,1,0,0,0,0
620104,1,17.47,23.38,3.60,0.30,0.39,251.50,158.87,6,2,...,0,0,0,0,1,0,0,0,0,0
203245,0,84.64,28.54,1.52,1.17,0.16,313.99,347.07,25,4,...,1,0,0,0,0,0,0,0,0,0


In [183]:
model = Sequential()

In [184]:
model.add(layers.Dense(5, input_dim=92, activation='relu')) 
model.add(layers.Dense(5, activation='tanh'))
model.add(layers.Dense(1, activation='linear'))

In [185]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_24 (Dense)             (None, 5)                 465       
_________________________________________________________________
dense_25 (Dense)             (None, 5)                 30        
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 6         
Total params: 501
Trainable params: 501
Non-trainable params: 0
_________________________________________________________________


In [186]:
model.compile(loss='mse', 
              optimizer=tensorflow.keras.optimizers.Adam(
    learning_rate=0.01, beta_1=0.9, beta_2=0.99
)
)

In [187]:
model.fit(X_train, y_train, batch_size=32, epochs=1)



<keras.callbacks.History at 0x7fb814cc19a0>

In [188]:
model.evaluate(X_test, y_test)



nan

In [196]:
np.any(np.isnan(X_train))

True