# **Loading the data**

In [1]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/nfl-playing-surface-analytics/InjuryRecord.csv
/kaggle/input/nfl-playing-surface-analytics/PlayerTrackData.csv
/kaggle/input/nfl-playing-surface-analytics/PlayList.csv


In [2]:
# Load the data files
# Directory holding the three competition CSVs; change it here when running outside Kaggle.
DATA_DIR = '/kaggle/input/nfl-playing-surface-analytics'
# Only the first TRACKING_NROWS rows of the tracking file are read, so `tracking`
# holds a leading slice of PlayerTrackData.csv rather than the whole file.
TRACKING_NROWS = 1_000_000

playlist = pd.read_csv(f'{DATA_DIR}/PlayList.csv')
injuries = pd.read_csv(f'{DATA_DIR}/InjuryRecord.csv')
tracking = pd.read_csv(f'{DATA_DIR}/PlayerTrackData.csv', nrows=TRACKING_NROWS)

In [3]:
# Preview the top rows of each raw table, one rich display per frame.
for raw_frame in (playlist, injuries, tracking):
    display(raw_frame.head())

Unnamed: 0,PlayerKey,GameID,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624,26624-1,26624-1-1,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,1,QB,QB
1,26624,26624-1,26624-1-2,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,2,QB,QB
2,26624,26624-1,26624-1-3,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,3,QB,QB
3,26624,26624-1,26624-1-4,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,4,QB,QB
4,26624,26624-1,26624-1-5,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,5,QB,QB


Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,DM_M1,DM_M7,DM_M28,DM_M42
0,39873,39873-4,39873-4-32,Knee,Synthetic,1,1,1,1
1,46074,46074-7,46074-7-26,Knee,Natural,1,1,0,0
2,36557,36557-1,36557-1-70,Ankle,Synthetic,1,1,1,1
3,46646,46646-3,46646-3-30,Ankle,Natural,1,0,0,0
4,43532,43532-5,43532-5-69,Ankle,Synthetic,1,1,1,1


Unnamed: 0,PlayKey,time,event,x,y,dir,dis,o,s
0,26624-1-1,0.0,huddle_start_offense,87.46,28.93,288.24,0.01,262.33,0.13
1,26624-1-1,0.1,,87.45,28.92,283.91,0.01,261.69,0.12
2,26624-1-1,0.2,,87.44,28.92,280.4,0.01,261.17,0.12
3,26624-1-1,0.3,,87.44,28.92,278.79,0.01,260.66,0.1
4,26624-1-1,0.4,,87.44,28.92,275.44,0.01,260.27,0.09


In [4]:
# Converting to titlecase for uniformity

def titlecase_strings(x):
    """Return ``x.title()`` when ``x`` is a ``str``; return any other value unchanged.

    Non-string cells (numbers, NaN, None) pass through untouched, which lets the
    function be applied to every cell of a mixed-type DataFrame.
    """
    return x.title() if isinstance(x, str) else x

# Apply titlecase_strings to every cell of each DataFrame.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map, so use
# DataFrame.map when the installed pandas provides it and fall back to applymap otherwise.
_map_cells = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap

playlist2 = _map_cells(playlist, titlecase_strings)
injuries2 = _map_cells(injuries, titlecase_strings)
tracking2 = _map_cells(tracking, titlecase_strings)

display(playlist2.head(), injuries2.head(), tracking2.head())

Unnamed: 0,PlayerKey,GameID,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624,26624-1,26624-1-1,Quarterback,1,1,Outdoor,Synthetic,63,Clear And Warm,Pass,1,Qb,Qb
1,26624,26624-1,26624-1-2,Quarterback,1,1,Outdoor,Synthetic,63,Clear And Warm,Pass,2,Qb,Qb
2,26624,26624-1,26624-1-3,Quarterback,1,1,Outdoor,Synthetic,63,Clear And Warm,Rush,3,Qb,Qb
3,26624,26624-1,26624-1-4,Quarterback,1,1,Outdoor,Synthetic,63,Clear And Warm,Rush,4,Qb,Qb
4,26624,26624-1,26624-1-5,Quarterback,1,1,Outdoor,Synthetic,63,Clear And Warm,Pass,5,Qb,Qb


Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,DM_M1,DM_M7,DM_M28,DM_M42
0,39873,39873-4,39873-4-32,Knee,Synthetic,1,1,1,1
1,46074,46074-7,46074-7-26,Knee,Natural,1,1,0,0
2,36557,36557-1,36557-1-70,Ankle,Synthetic,1,1,1,1
3,46646,46646-3,46646-3-30,Ankle,Natural,1,0,0,0
4,43532,43532-5,43532-5-69,Ankle,Synthetic,1,1,1,1


Unnamed: 0,PlayKey,time,event,x,y,dir,dis,o,s
0,26624-1-1,0.0,Huddle_Start_Offense,87.46,28.93,288.24,0.01,262.33,0.13
1,26624-1-1,0.1,,87.45,28.92,283.91,0.01,261.69,0.12
2,26624-1-1,0.2,,87.44,28.92,280.4,0.01,261.17,0.12
3,26624-1-1,0.3,,87.44,28.92,278.79,0.01,260.66,0.1
4,26624-1-1,0.4,,87.44,28.92,275.44,0.01,260.27,0.09


In [5]:
# Compare column dtypes before and after title-casing.
# Each printed Series is True for the columns whose dtype is unchanged.
dtype_checks = [
    before.dtypes == after.dtypes
    for before, after in ((playlist, playlist2), (injuries, injuries2), (tracking, tracking2))
]
print(*dtype_checks, sep='\n\n')

PlayerKey         True
GameID            True
PlayKey           True
RosterPosition    True
PlayerDay         True
PlayerGame        True
StadiumType       True
FieldType         True
Temperature       True
Weather           True
PlayType          True
PlayerGamePlay    True
Position          True
PositionGroup     True
dtype: bool

PlayerKey    True
GameID       True
PlayKey      True
BodyPart     True
Surface      True
DM_M1        True
DM_M7        True
DM_M28       True
DM_M42       True
dtype: bool

PlayKey    True
time       True
event      True
x          True
y          True
dir        True
dis        True
o          True
s          True
dtype: bool


# **Exploratory Data Analysis**

## Guide to reassignment:  
  
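The goal is to consolidate the many free-text weather descriptions into a small set of categories. Temperature qualifiers attached to descriptions such as **_'Sunny'_** and **_'Clear'_** (e.g. 'Sunny And Cold') are dropped, because the **_'Temperature'_** column already carries that information. This streamlines the labels while avoiding redundancy.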

#### Reassignment preference order:
1. Indoors
2. Rainy
3. Showers
4. Cloudy
5. Chance of rain
6. Partly cloudy
7. Clear
8. Sunny/snow  

**_'Temperature'_** column gives information about the heat

i.e., lower temperature = chance of rain/snow, higher temperature = sunny

***
##### ex. 1: 
**_'Sun & Clouds'_** is reassigned to **_'Cloudy_Part'_**  
Logic: Having both the sun and the clouds means that it is partly cloudy (it fits completely under neither Sunny nor Cloudy)

##### ex. 2: 
**_'Sunny And Cold'_** is reassigned to **_'Sunny'_**  
Logic: 'Cold' is redundant, since it can be determined from low 'Temperature' values

##### ex. 3: 
**_'Cold'_** and **_'Fair'_** are reassigned to **_'Clear'_**  
Logic: No other information about the sky is given, and how cold it was can be determined from the 'Temperature' column

***

"Cloudy, Light Snow Accumulating 1-3" (in 'Cloudy') -> ??

In [6]:
# Number of distinct raw weather labels, followed by how often each one occurs.
weather_raw = playlist2['Weather']
print(weather_raw.nunique(), weather_raw.value_counts(), sep='\n\n')

56

Cloudy                                                                              61692
Sunny                                                                               51728
Partly Cloudy                                                                       29774
Clear                                                                               25923
Mostly Cloudy                                                                       13565
Rain                                                                                 8349
Controlled Climate                                                                   6355
N/A (Indoors)                                                                        5517
Mostly Sunny                                                                         4270
Indoors                                                                              3460
Indoor                                                                               3038
Partly

In [7]:
# Display all rows since Kaggle hides intermediate rows when number of rows exceed the pandas limit
pd.set_option('display.max_rows', None)

# Raw (title-cased) weather descriptions, grouped by the category each one is collapsed into.
Sunny = ['Sunny And Cold','Sunny, Highs To Upper 80S','Sunny','Sunny And Warm','Sunny Skies','Heat Index 95']
Clear = ['Fair','Cold','Sunny And Clear','Clear And Sunny','Clear','Clear And Cold','Clear And Warm','Clear And Cool','Clear Skies']
Cloudy_Part = ['Sun & Clouds','Mostly Sunny Skies','Clear To Partly Cloudy','Partly Clouidy','Party Cloudy','Mostly Sunny','Partly Cloudy',]
Cloudy = ['Partly Sunny','Partly Clear','Overcast','Coudy','Cloudy, Light Snow Accumulating 1-3"','Mostly Cloudy','Cloudy, Fog Started Developing In 2Nd Quarter','Cloudy, 50% Change Of Rain','Cloudy, Chance Of Rain','Mostly Coudy','Cloudy','Mostly Cloudy','Cloudy And Cold','Cloudy And Cool']
Rainy = ['Cloudy With Periods Of Rain, Thunder Possible. Winds Shifting To Wnw, 10-20 Mph.','Rain','Cloudy, Rain','Rainy']
Showers = ['Rain Shower','Showers','Scattered Showers','Light Rain']
Rain_Chance = ['10% Chance Of Rain','30% Chance Of Rain','Rain Chance 40%','Rain Likely, Temps In Low 40s.','Rain Likely, Temps In Low 40S.']
Snow = ['Snow','Heavy Lake Effect Snow']
Hazy = ['Hazy']
Windy = ['Windy','Sunny, Windy',]
Indoor = ['Indoors','Indoor','Controlled Climate','N/A (Indoors)','N/A Indoor']

# Category label -> raw descriptions collapsed into it.
# 'Hazy' is listed for completeness; its only raw label is already 'Hazy'.
value_mapping = {
    'Sunny': Sunny,
    'Clear': Clear,
    'Cloudy_Part': Cloudy_Part,
    'Cloudy': Cloudy,
    'Rainy': Rainy,
    'Showers': Showers,
    'Rain_Chance': Rain_Chance,
    'Snow': Snow,
    'Hazy': Hazy,
    'Windy': Windy,
    'Indoor': Indoor
}

# Invert the grouping into a flat raw-description -> category lookup so the column
# is rewritten in a single pass with constant-time lookups, instead of one full
# pass (with a linear list search per row) for every category.
# setdefault keeps the first category listed for a description, should one ever
# be placed in two lists.
weather_lookup = {}
for category, raw_labels in value_mapping.items():
    for raw_label in raw_labels:
        weather_lookup.setdefault(raw_label, category)

# Descriptions missing from the lookup (including NaN) are left unchanged.
playlist2['Weather'] = playlist2['Weather'].map(lambda label: weather_lookup.get(label, label))



In [8]:
# Checking whether the reclassification of Weather values was successful:
# distinct category count, followed by the number of rows per category.
weather_categories = playlist2['Weather']
print(weather_categories.nunique(), weather_categories.value_counts(), sep='\n\n')

11

Cloudy         84944
Sunny          54655
Cloudy_Part    35870
Clear          34791
Indoor         20276
Rainy           9478
Showers         3450
Rain_Chance     2011
Snow            1329
Hazy            1043
Windy            467
Name: Weather, dtype: int64


In [9]:
# Number of distinct raw stadium-type labels, followed by how often each one occurs.
stadium_raw = playlist2['StadiumType']
print(stadium_raw.nunique(), stadium_raw.value_counts(), sep='\n\n')

28

Outdoor                   145032
Outdoors                   32956
Indoors                    22805
Dome                        9376
Retractable Roof            8914
Indoor                      6892
Open                        4124
Domed, Closed               3076
Domed, Open                 2586
Retr. Roof - Closed         2235
Retr. Roof-Closed           2015
Dome, Closed                1059
Closed Dome                 1011
Domed                        985
Oudoor                       914
Ourdoor                      778
Outdoor Retr Roof-Open       601
Outddors                     595
Indoor, Roof Closed          547
Retr. Roof-Open              486
Retr. Roof - Open            486
Indoor, Open Roof            479
Bowl                         465
Retr. Roof Closed            414
Heinz Field                  389
Outdor                       356
Outside                      341
Cloudy                       178
Name: StadiumType, dtype: int64


In [10]:
# Reclassifying 'StadiumType' column
# Target categories: 'Open', 'Enclosed' and 'Retr_Roof' (retractable roof).
# NOTE(review): 'Bowl' is in neither list below, so it is left as its own value - confirm this is intended.
# NOTE(review): 'Cloudy' is grouped with the enclosed stadiums - verify against the data.
# NOTE(review): the lower-case variants ('Domed, open', 'Domed, closed', 'Dome, closed')
# presumably never match, since string cells were title-cased in an earlier cell - confirm.

Open = ['Outdoor','Oudoor','Heinz Field','Outdoors', 'Open', 'Open Roof','Retr. Roof - Open','Outdoor Retr Roof-Open','Outddors','Ourdoor','Retr. Roof-Open', 'Domed, Open', 'Domed, open','Outdor', 'Outside']
Enclosed = ['Indoors','Dome, Closed','Domed, Closed','Indoor, Open Roof','Closed Dome','Domed, closed','Dome', 'Indoor', 'Domed','Retr. Roof-Closed','Indoor, Roof Closed','Retr. Roof - Closed', 'Dome, closed','Cloudy','Retr. Roof Closed']

# Apply the three replacements in order: open labels, enclosed labels, then the retractable-roof label.
stadium_type = playlist2['StadiumType']
for raw_labels, category in ((Open, 'Open'), (Enclosed, 'Enclosed'), ('Retractable Roof', 'Retr_Roof')):
    stadium_type = stadium_type.replace(raw_labels, category)
playlist2['StadiumType'] = stadium_type

In [11]:
# Checking whether the reclassification of 'StadiumType' values was successful:
# distinct category count, followed by the number of rows per category.
stadium_categories = playlist2['StadiumType']
print(stadium_categories.nunique(), stadium_categories.value_counts(), sep='\n\n')

4

Open         189644
Enclosed      51072
Retr_Roof      8914
Bowl            465
Name: StadiumType, dtype: int64
