# Feature Engineering Notebook
<p>Davis Data Science Club</p>
<p>Author: Shozen Dan</p>
<p>This is the notebook covering the data cleaning and feature engineering process.</p>

In [1]:
# Import Necessary Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
nfl = pd.read_csv('./train.csv')
nfl.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,...,Week,Stadium,Location,StadiumType,Turf,GameWeather,Temperature,Humidity,WindSpeed,WindDirection
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.4,81.99,177.18,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.7,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
2,2017090700,20170907000118,away,74.0,33.2,1.22,0.59,0.31,3.01,202.73,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
3,2017090700,20170907000118,away,71.46,27.7,0.42,0.54,0.02,359.77,105.64,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,12.63,164.31,...,1,Gillette Stadium,"Foxborough, MA",Outdoor,Field Turf,Clear and warm,63.0,77.0,8,SW


In [3]:
nfl.columns

Index(['GameId', 'PlayId', 'Team', 'X', 'Y', 'S', 'A', 'Dis', 'Orientation',
       'Dir', 'NflId', 'DisplayName', 'JerseyNumber', 'Season', 'YardLine',
       'Quarter', 'GameClock', 'PossessionTeam', 'Down', 'Distance',
       'FieldPosition', 'HomeScoreBeforePlay', 'VisitorScoreBeforePlay',
       'NflIdRusher', 'OffenseFormation', 'OffensePersonnel',
       'DefendersInTheBox', 'DefensePersonnel', 'PlayDirection', 'TimeHandoff',
       'TimeSnap', 'Yards', 'PlayerHeight', 'PlayerWeight', 'PlayerBirthDate',
       'PlayerCollegeName', 'Position', 'HomeTeamAbbr', 'VisitorTeamAbbr',
       'Week', 'Stadium', 'Location', 'StadiumType', 'Turf', 'GameWeather',
       'Temperature', 'Humidity', 'WindSpeed', 'WindDirection'],
      dtype='object')

### Wind Speed

In [4]:
nfl['WindSpeed'].unique()

array([8.0, 6.0, 10.0, 9.0, 11.0, nan, 7.0, 5.0, 2.0, 12.0, 1, 3, 4, 13,
       '10', '5', '6', '4', '8', '0', 'SSW', 14.0, 0.0, 15.0, 17.0, 18.0,
       16.0, '11-17', '16', '14', '13', '12', '23', '7', '9', '3', '17',
       '14-23', '1', '13 MPH', 24.0, '15', '12-22', '2', '4 MPh',
       '15 gusts up to 25', '11', '10MPH', '10mph', '22', 'E', '7 MPH',
       'Calm', '6 mph', '19', 'SE', '20', '10-20', '12mph'], dtype=object)

As we can see from the unique method on the WindSpeed column, there are many different data types. Some are a mixture of integers and strings. We need to convert all of this in to floats.

In [5]:
def cleanWindSpeed(x):
    x = str(x) # convert all values to string
    x = x.lower() # to lowercase
    if '-' in x:
        x = (int(x.split('-')[0]) + int(x.split('-')[1])) / 2
    elif ' gusts up to 25 ' in x:
        x = (int(x.split(' gusts up tp 25 ')))
    try: 
        return float(x)
    except:
        return -1

In [6]:
nfl['WindSpeed'].apply(lambda p: cleanWindSpeed(p)).unique()

array([ 8. ,  6. , 10. ,  9. , 11. ,  nan,  7. ,  5. ,  2. , 12. ,  1. ,
        3. ,  4. , 13. ,  0. , -1. , 14. , 15. , 17. , 18. , 16. , 23. ,
       18.5, 24. , 22. , 19. , 20. ])

### Game Weather

In [7]:
nfl['GameWeather'].unique()

array(['Clear and warm', 'Sun & clouds', 'Sunny', 'Controlled Climate',
       'Mostly Sunny', 'Clear', nan, 'Indoor', 'Mostly Cloudy',
       'Mostly Coudy', 'Partly sunny', 'Partly Cloudy', 'Cloudy',
       'Sunny, highs to upper 80s', 'Indoors', 'Light Rain', 'Showers',
       'Partly cloudy', 'Partly Sunny', '30% Chance of Rain',
       'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.',
       'Rain', 'Cloudy, fog started developing in 2nd quarter', 'Coudy',
       'Rain likely, temps in low 40s.', 'Cold', 'N/A (Indoors)',
       'Clear skies', 'cloudy', 'Fair', 'Mostly cloudy',
       'Cloudy, chance of rain', 'Heavy lake effect snow', 'Party Cloudy',
       'Cloudy, light snow accumulating 1-3"', 'Cloudy and cold', 'Snow',
       'Hazy', 'Scattered Showers', 'Cloudy and Cool', 'N/A Indoor',
       'Rain Chance 40%', 'Clear and sunny', 'Mostly sunny',
       'Sunny and warm', 'Partly clear', 'Cloudy, 50% change of rain',
       'Clear and Sunny', '

The weather data extremely qualitative and seem to have no fixed format. We will attmept to all the weather conditions in to five basic groups: sunny, cloudy, rainy, snow, and indoors.

In [8]:
def cleanGameWeather(x):
    x = str(x).lower()
    if 'sunny' in x or 'clear' in x or 'fair' in x:
        return 'sunny'
    elif 'cloud' in x or 'coudy' in x or 'clouidy' in x or 'hazy' in x or 'sun & clouds' in x or 'overcast' in x:
        return 'cloudy'
    elif 'rain' in x or 'shower' in x or 'rainy' in x:
        return 'rainy'
    elif 'controlled climate' in x or 'indoor' in x:
        return 'indoors'
    elif 'snow' in x:
        return 'snow'
    return None

In [9]:
nfl['GameWeather'].apply(lambda p: cleanGameWeather(p)).unique()

array(['sunny', 'cloudy', 'indoors', None, 'rainy', 'snow'], dtype=object)

### Field Position and Possession Team
True if the side of the field where play occurs and the team in posession is the same. 

In [10]:
# from https://www.kaggle.com/ryches/model-free-benchmark
def fieldEqPosition():
    nfl['FieldEqPosition'] = nfl['FieldPosition'] == nfl['PossessionTeam']

In [11]:
fieldEqPosition()
nfl['FieldEqPosition']

0          True
1          True
2          True
3          True
4          True
          ...  
509757    False
509758    False
509759    False
509760    False
509761    False
Name: FieldEqPosition, Length: 509762, dtype: bool

### NFL id and Rusher id
True if the unique identifier of the player and the rusher id is the same. (Identifies the player making the rushing play)

In [12]:
def isRusher():
    nfl['isRusher'] = nfl['NflId'] == nfl['NflIdRusher']

In [13]:
isRusher()
nfl['isRusher']

0         False
1         False
2         False
3         False
4         False
          ...  
509757    False
509758    False
509759    False
509760    False
509761     True
Name: isRusher, Length: 509762, dtype: bool

### Handoff and Snap Time

In [14]:
nfl['TimeHandoff']

0         2017-09-08T00:44:06.000Z
1         2017-09-08T00:44:06.000Z
2         2017-09-08T00:44:06.000Z
3         2017-09-08T00:44:06.000Z
4         2017-09-08T00:44:06.000Z
                    ...           
509757    2018-12-31T00:24:51.000Z
509758    2018-12-31T00:24:51.000Z
509759    2018-12-31T00:24:51.000Z
509760    2018-12-31T00:24:51.000Z
509761    2018-12-31T00:24:51.000Z
Name: TimeHandoff, Length: 509762, dtype: object

The TimeHandoff column and the TimeSnap contains the time when the handoff/snap play happend. We only need the minutes and seconds. 

In [15]:
nfl['TimeHandoff_min'] = [int(x[-7:-5]) for x in nfl['TimeHandoff']]
nfl['TimeHandoff_sec'] = [int(x[-4:-2]) for x in nfl['TimeHandoff']]
nfl['TimeHandoff'] = [x[11:-1] for x in nfl['TimeHandoff']]

In [16]:
nfl['TimeSnap_min'] = [int(x[-7:-5]) for x in nfl['TimeSnap']]
nfl['TimeSnap_sec'] = [int(x[-4:-2]) for x in nfl['TimeSnap']]
nfl['TimeSnap'] = [x[11:-1] for x in nfl['TimeSnap']]

In [17]:
nfl['HandoffSnapDiff_min'] = nfl['TimeHandoff_min'] - nfl['TimeSnap_min']
nfl['HandoffSnapDiss_sec'] = nfl['HandoffSnapDiff_min'] * 60 + nfl['TimeHandoff_sec'] - nfl['TimeSnap_sec']

### BMI

In [18]:
nfl.PlayerHeight

0          6-0
1          6-3
2          6-3
3          6-3
4          6-0
          ... 
509757     6-6
509758     6-5
509759     6-5
509760     6-6
509761    5-11
Name: PlayerHeight, Length: 509762, dtype: object

Need to convert to intergers. We will also compute BMI.

In [19]:
nfl['height_1'] = [int(x[0]) for x in nfl['PlayerHeight']]
nfl['height_2'] = [int(x[2]) for x in nfl['PlayerHeight']]
nfl['height_3'] = nfl['height_1'] * 12 + nfl['height_2'] # convert feet to inches
nfl['BMI'] = (nfl['PlayerWeight'] * 703) / (nfl['height_1'] * 12 + nfl['height_2'] ** 2) # Condence height and weight into BMI

### Defence and Offence Personnel

In [20]:
nfl.DefensePersonnel

0         2 DL, 3 LB, 6 DB
1         2 DL, 3 LB, 6 DB
2         2 DL, 3 LB, 6 DB
3         2 DL, 3 LB, 6 DB
4         2 DL, 3 LB, 6 DB
                ...       
509757    4 DL, 3 LB, 4 DB
509758    4 DL, 3 LB, 4 DB
509759    4 DL, 3 LB, 4 DB
509760    4 DL, 3 LB, 4 DB
509761    4 DL, 3 LB, 4 DB
Name: DefensePersonnel, Length: 509762, dtype: object

Need to extract the numbers.

In [21]:
arr = [[int(s[0]) for s in t.split(', ')] for t in nfl.DefensePersonnel]
nfl['DL'] = [a[0] for a in arr]
nfl['LB'] = [a[1] for a in arr]
nfl['DB'] = [a[2] for a in arr]

In [22]:
nfl.OffensePersonnel

0               1 RB, 1 TE, 3 WR
1               1 RB, 1 TE, 3 WR
2               1 RB, 1 TE, 3 WR
3               1 RB, 1 TE, 3 WR
4               1 RB, 1 TE, 3 WR
                   ...          
509757    6 OL, 1 RB, 2 TE, 1 WR
509758    6 OL, 1 RB, 2 TE, 1 WR
509759    6 OL, 1 RB, 2 TE, 1 WR
509760    6 OL, 1 RB, 2 TE, 1 WR
509761    6 OL, 1 RB, 2 TE, 1 WR
Name: OffensePersonnel, Length: 509762, dtype: object

In [23]:
arr = [[int(s[0]) for s in t.split(', ')] for t in nfl.OffensePersonnel]
nfl['RB'] = [a[0] for a in arr]
nfl['TE'] = [a[1] for a in arr]
nfl['WR'] = [a[2] for a in arr]

### Game Clock

In [24]:
nfl.GameClock

0         14:14:00
1         14:14:00
2         14:14:00
3         14:14:00
4         14:14:00
            ...   
509757    00:16:00
509758    00:16:00
509759    00:16:00
509760    00:16:00
509761    00:16:00
Name: GameClock, Length: 509762, dtype: object

Extract hour and minute. 

In [25]:
arr = [[int(s[0]) for s in t.split(":")] for t in nfl.GameClock]
nfl['GameHour'] = [a[0] for a in arr]
nfl['GameMinute'] = [a[1] for a in arr]

### Player Birth Date

In [26]:
nfl.PlayerBirthDate

0         12/29/1988
1         03/25/1989
2         01/21/1989
3         11/22/1982
4         08/17/1987
             ...    
509757    03/06/1993
509758    06/02/1994
509759    07/19/1992
509760    08/05/1995
509761    09/16/1994
Name: PlayerBirthDate, Length: 509762, dtype: object

Compute player age

In [27]:
nfl['Season'] = [int(x) for x in nfl['Season']]
nfl['BirthY'] = [int(t.split('/')[2]) for t in nfl["PlayerBirthDate"]]
nfl['Age'] = nfl['Season'] - nfl['BirthY']

In [28]:
class NflFeatureEng:
    """A Feature Engineer for the NFL data
    
    This class implements a feature engineer that cleans and engineers 
    the variables of the NFL dataset for the 2019 NFL data competition held on Kaggle.
    
    Parameters
    ----------
    data: the nfl data in pandas DataFrame format
    
    exclude: A list of feature engineering processes to exclude
    
    Notes
    -----
    Might not work if some of the columns have been edited since initial
    import. In that case, either implement additional feature engineering 
    methods or reload the data.
    
    References
    ----------
    The methods were implemented based from the one found at:
    https://www.kaggle.com/prashantkikani/nfl-starter-lgb-feature-engg
    """
    
    def __init__(self, data, exclude = []):
        self.data = data
        self.exclude = exclude
        self.include = ['WindSpeed', 
                        'GameWeather', 
                        'FieldEqPossession', 
                        'isRusher', 
                        'TimeHandoff', 
                        'TimeSnap', 
                        'HandSnapDiff', 
                        'BMI', 
                        'DefencePersonnel', 
                        'OffencePersonnel', 
                        'GameClock', 
                        'PlayerAge']
    
    def windSpeed(self, x):
        x = str(x) # convert all values to string
        x = x.lower() # convert all upper case to lowercase
        if '-' in x:
            x = (int(x.split('-')[0]) + int(x.split('-')[1])) / 2
        elif ' gusts up to 25 ' in x:
            x = (int(x.split(' gusts up tp 25 ')))
        try: 
            return float(x)
        except:
            return -1
        
    def gameWeather(self, x):
        x = str(x).lower()
        if 'sunny' in x or 'clear' in x or 'fair' in x:
            return 'sunny'
        elif 'cloud' in x or 'coudy' in x or 'clouidy' in x or 'hazy' in x or 'sun & clouds' in x or 'overcast' in x:
            return 'cloudy'
        elif 'rain' in x or 'shower' in x or 'rainy' in x:
            return 'rainy'
        elif 'controlled climate' in x or 'indoor' in x:
            return 'indoors'
        elif 'snow' in x:
            return 'snow'
        return None
    
    def fieldEqPossession(self):
        self.data['FieldEqPossession'] = self.data['FieldPosition'] == self.data['PossessionTeam']
        
    def isRusher(self):
        self.data['isRusher'] = self.data['NflId'] == self.data['NflIdRusher']
    
    def timeHandoff(self):
        self.data['TimeHandoff_min'] = pd.Series([int(x[-7:-5]) for x in self.data['TimeHandoff']])
        self.data['TimeHandoff_sec'] = pd.Series([int(x[-4:-2]) for x in self.data['TimeHandoff']])
        self.data['TimeHandoff'] = pd.Series([x[11:-1] for x in self.data['TimeHandoff']])
        
    def timeSnap(self):
        self.data['TimeSnap_min'] = pd.Series([int(x[-7:-5]) for x in self.data['TimeSnap']])
        self.data['TimeSnap_sec'] = pd.Series([int(x[-7:-5]) for x in self.data['TimeSnap']])
        self.data['TimeSnap'] = pd.Series([x[11:-1] for x in self.data['TimeSnap']])
    
    def handSnapDiff(self):
        self.data['HandSnapDiff_min'] = self.data['TimeHandoff_min'] - self.data['TimeSnap_min']
        self.data['handoff_snap_diff_sec'] = self.data['HandSnapDiff_min'] * 60 + self.data['TimeHandoff_sec'] - self.data['TimeSnap_sec']
    
    def BMI(self):
        self.data['height_1'] = pd.Series([int(x[0]) for x in self.data['PlayerHeight']])
        self.data['height_2'] = pd.Series([int(x[2]) for x in self.data['PlayerHeight']])
        self.data['height_3'] = self.data['height_1'] * 12 + self.data['height_2']
        self.data['BMI'] = (self.data['PlayerWeight'] * 703) / (self.data['height_1'] * 12 + self.data['height_2'] ** 2)
        
    def defencePersonnel(self):
        arr = [[int(s[0]) for s in t.split(', ')] for t in self.data['DefensePersonnel']]
        self.data['DL'] = pd.Series([int(a[0]) for a in arr])
        self.data['LB'] = pd.Series([int(a[1]) for a in arr])
        self.data['DB'] = pd.Series([int(a[2]) for a in arr])
    
    def offencePersonnel(self):
        arr = [[int(s[0]) for s in t.split(", ")] for t in self.data["OffensePersonnel"]]
        self.data["RB"] = pd.Series([int(a[0]) for a in arr])
        self.data["TE"] = pd.Series([int(a[1]) for a in arr])
        self.data["WR"] = pd.Series([int(a[2]) for a in arr])
    
    def gameClock(self):
        arr = [[int(s[0]) for s in t.split(":")] for t in self.data["GameClock"]]
        self.data["GameHour"] = pd.Series([int(a[0]) for a in arr])
        self.data["GameMinute"] = pd.Series([int(a[1]) for a in arr])
        
    def playerAge(self):
        self.data['Season'] = pd.Series([int(x) for x in self.data['Season']])
        self.data["BirthY"] = pd.Series([int(t.split('/')[2]) for t in self.data["PlayerBirthDate"]])
        self.data['age'] = self.data['Season'] - self.data['BirthY']
        
    def engineer(self):
        for c in self.include:
            
            if c in self.exclude: continue
                
            elif c == 'WindSpeed':
                self.data['WindSpeed'] = self.data['WindSpeed'].apply(lambda p: self.windSpeed(p))
            
            elif c == 'GameWeather':
                self.data['GameWeather'] = self.data['GameWeather'].apply(lambda p: self.gameWeather(p))

            elif c == 'FieldEqPossession':
                self.fieldEqPossession()
            
            elif c == 'isRusher':
                self.isRusher()
            
            elif c == 'TimeHandoff':
                self.timeHandoff()
            
            elif c == 'TimeSnap':
                self.timeSnap()
            
            elif c == 'HandSnapDiff':
                self.handSnapDiff()
            
            elif c == 'BMI':
                self.BMI()
            
            elif c == 'DefencePersonnel':
                self.defencePersonnel()
                
            elif c == 'OffencePersonnel':
                self.offencePersonnel()
                
            elif c == 'GameClock':
                self.gameClock()
                
            elif c == 'PlayerAge':
                self.playerAge()
        
        return self.data

In [29]:
nfl = pd.read_csv('./train.csv')

In [30]:
eng = NflFeatureEng(nfl)

In [31]:
eng.engineer()

Unnamed: 0,GameId,PlayId,Team,X,Y,S,A,Dis,Orientation,Dir,...,DL,LB,DB,RB,TE,WR,GameHour,GameMinute,BirthY,age
0,2017090700,20170907000118,away,73.91,34.84,1.69,1.13,0.40,81.99,177.18,...,2,3,6,1,1,3,1,1,1988,29
1,2017090700,20170907000118,away,74.67,32.64,0.42,1.35,0.01,27.61,198.70,...,2,3,6,1,1,3,1,1,1989,28
2,2017090700,20170907000118,away,74.00,33.20,1.22,0.59,0.31,3.01,202.73,...,2,3,6,1,1,3,1,1,1989,28
3,2017090700,20170907000118,away,71.46,27.70,0.42,0.54,0.02,359.77,105.64,...,2,3,6,1,1,3,1,1,1982,35
4,2017090700,20170907000118,away,69.32,35.42,1.82,2.43,0.16,12.63,164.31,...,2,3,6,1,1,3,1,1,1987,30
5,2017090700,20170907000118,away,75.06,24.00,1.01,0.32,0.18,308.34,95.01,...,2,3,6,1,1,3,1,1,1991,26
6,2017090700,20170907000118,away,74.11,16.64,1.11,0.83,0.02,357.23,322.59,...,2,3,6,1,1,3,1,1,1992,25
7,2017090700,20170907000118,away,73.37,18.73,1.24,0.74,0.13,328.52,270.04,...,2,3,6,1,1,3,1,1,1991,26
8,2017090700,20170907000118,away,56.63,26.90,0.26,1.86,0.28,344.70,55.31,...,2,3,6,1,1,3,1,1,1990,27
9,2017090700,20170907000118,away,73.35,38.83,4.55,0.76,0.51,75.47,190.84,...,2,3,6,1,1,3,1,1,1993,24
