# **Explore Data**

This notebook contains code for a first exploration of the NFL play-by-play dataset (2009–2018) from Kaggle.

This project requires Python 3.7 or above:

In [None]:
import sys
# Stop right away on interpreters older than Python 3.7.
assert sys.version_info[:2] >= (3, 7)

It also requires Scikit-Learn ≥ 1.0.1:

In [None]:
from packaging import version
import sklearn
# Stop right away if the installed Scikit-Learn is older than 1.0.1.
assert version.parse("1.0.1") <= version.parse(sklearn.__version__)

## Download the Data

In [1]:
# Load the Kaggle dataset from the local `datasets/` directory, downloading it first if necessary
import os
import subprocess
import zipfile
from pathlib import Path

import pandas as pd


def load_kaggle_nfl_dataset(dataset_owner_name: str,
                            dataset_name: str, filename: str,
                            chunksize: int = 100000
                            ) -> pd.DataFrame:
    '''
    Load one CSV file of a Kaggle dataset into a pandas DataFrame.

    The archive is looked up at ``datasets/<dataset_name>.zip`` and its
    extracted contents under ``datasets/<dataset_name>/`` (both relative to
    the current working directory). Work is only done when the requested CSV
    is not already on disk: a missing archive is downloaded with the
    ``kaggle`` command-line tool, and the archive is then unpacked.

    No filtering is applied; every row of the CSV is returned.

    Parameters
    ----------
    dataset_owner_name : str
        Kaggle account that owns the dataset.
    dataset_name : str
        Kaggle dataset slug; also used for the local archive and folder names.
    filename : str
        Name of the CSV file inside the extracted dataset folder.
    chunksize : int, default 100000
        Number of rows parsed per chunk while reading the CSV.

    Returns
    -------
    pd.DataFrame
        The complete contents of the CSV file.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``kaggle`` download command exits with a non-zero status.
    FileNotFoundError
        If the ``kaggle`` executable is not installed, or if ``filename`` is
        not present after the archive has been extracted.
    zipfile.BadZipFile
        If the archive on disk is not a valid zip file.
    '''
    datasets_dir = Path("datasets")
    archive_path = datasets_dir / f"{dataset_name}.zip"
    extract_dir = datasets_dir / dataset_name
    csv_path = extract_dir / filename

    if not csv_path.is_file():
        if not archive_path.is_file():
            # Argument list (no shell) avoids quoting/injection problems, and
            # check=True surfaces a failed download here instead of as a
            # confusing "file not found" when the CSV is read below.
            subprocess.run(
                ["kaggle", "datasets", "download",
                 "-d", f"{dataset_owner_name}/{dataset_name}",
                 "-p", str(datasets_dir)],
                check=True,
            )
        # Extract whenever the CSV is missing, not only right after a download,
        # so an archive that was fetched but never unpacked still works.
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(extract_dir)

    # Parse in chunks and stitch the pieces back into a single frame.
    df_chunks = pd.read_csv(csv_path, chunksize=chunksize, encoding='utf-8')
    return pd.concat(df_chunks)

# Kaggle coordinates of the dataset and the CSV file inside its archive.
dataset_owner_name = 'maxhorowitz'
dataset_name = 'nflplaybyplay2009to2016'
filename = 'NFL Play by Play 2009-2018 (v5).csv'

# Load the whole file, then keep only the first 1000 rows for quick exploration.
nfl_df = load_kaggle_nfl_dataset(
    dataset_owner_name=dataset_owner_name,
    dataset_name=dataset_name,
    filename=filename,
).head(1000)

  df = pd.concat(df_chunks)
  df = pd.concat(df_chunks)
  df = pd.concat(df_chunks)
  df = pd.concat(df_chunks)
  df = pd.concat(df_chunks)


## Take a Quick Look at the Data Structure

In [2]:
# First five rows of the loaded sample.
nfl_df.head(n=5)

Unnamed: 0,play_id,game_id,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,game_date,...,penalty_player_id,penalty_player_name,penalty_yards,replay_or_challenge,replay_or_challenge_result,penalty_type,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv
0,46,2009091000,PIT,TEN,PIT,home,TEN,TEN,30.0,2009-09-10,...,,,,0,,,0.0,0.0,0.0,0.0
1,68,2009091000,PIT,TEN,PIT,home,TEN,PIT,58.0,2009-09-10,...,,,,0,,,0.0,0.0,0.0,0.0
2,92,2009091000,PIT,TEN,PIT,home,TEN,PIT,53.0,2009-09-10,...,,,,0,,,0.0,0.0,0.0,0.0
3,113,2009091000,PIT,TEN,PIT,home,TEN,PIT,56.0,2009-09-10,...,,,,0,,,0.0,0.0,0.0,0.0
4,139,2009091000,PIT,TEN,PIT,home,TEN,PIT,56.0,2009-09-10,...,,,,0,,,0.0,0.0,0.0,0.0


In [3]:
# Last five rows of the loaded sample.
nfl_df.tail(n=5)

Unnamed: 0,play_id,game_id,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,game_date,...,penalty_player_id,penalty_player_name,penalty_yards,replay_or_challenge,replay_or_challenge_result,penalty_type,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv
995,2979,2009091306,IND,JAC,JAC,away,IND,IND,7.0,2009-09-13,...,,,,0,,,0.0,0.0,0.0,0.0
996,2996,2009091306,IND,JAC,JAC,away,IND,IND,2.0,2009-09-13,...,,,,0,,,0.0,0.0,0.0,0.0
997,3014,2009091306,IND,JAC,IND,home,JAC,JAC,30.0,2009-09-13,...,,,,0,,,0.0,0.0,0.0,0.0
998,3036,2009091306,IND,JAC,IND,home,JAC,IND,80.0,2009-09-13,...,,,,0,,,0.0,0.0,0.0,0.0
999,3057,2009091306,IND,JAC,IND,home,JAC,IND,80.0,2009-09-13,...,,,,0,,,0.0,0.0,0.0,0.0


In [4]:
# Print a structural summary of the frame: index range, column count,
# per-dtype column counts and memory usage.
nfl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 255 entries, play_id to defensive_extra_point_conv
dtypes: float64(135), int64(18), object(102)
memory usage: 1.9+ MB


In [5]:
# Summary statistics over a random sample of 200 rows. The fixed random_state
# makes the sample, and therefore the table below, identical on every
# Restart & Run All.
nfl_df.sample(200, random_state=42).describe()

Unnamed: 0,play_id,game_id,yardline_100,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,quarter_end,drive,sp,qtr,...,assist_tackle_4_team,fumble_recovery_1_yards,fumble_recovery_2_yards,return_yards,penalty_yards,replay_or_challenge,defensive_two_point_attempt,defensive_two_point_conv,defensive_extra_point_attempt,defensive_extra_point_conv
count,200.0,200.0,195.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,...,0.0,3.0,0.0,200.0,8.0,200.0,195.0,195.0,195.0,195.0
mean,2007.71,2009091000.0,53.697436,413.3,818.3,1727.3,0.025,12.145,0.055,2.55,...,,1.666667,,1.845,5.75,0.0,0.0,0.0,0.0,0.0
std,1175.35756,131.9634,25.40691,292.063668,541.167248,1019.147974,0.156517,7.207895,0.228552,1.141937,...,,2.886751,,7.904644,3.150964,0.0,0.0,0.0,0.0,0.0
min,68.0,2009091000.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,,0.0,,-3.0,2.0,0.0,0.0,0.0,0.0,0.0
25%,902.0,2009091000.0,34.0,115.75,357.0,835.0,0.0,5.0,0.0,2.0,...,,0.0,,0.0,4.5,0.0,0.0,0.0,0.0,0.0
50%,2052.0,2009091000.0,58.0,415.5,822.5,1800.5,0.0,11.0,0.0,2.0,...,,0.0,,0.0,5.0,0.0,0.0,0.0,0.0,0.0
75%,2996.5,2009091000.0,73.0,675.5,1245.0,2630.75,0.0,18.0,0.0,4.0,...,,2.5,,0.0,6.25,0.0,0.0,0.0,0.0,0.0
max,4209.0,2009091000.0,96.0,900.0,1800.0,3593.0,1.0,26.0,1.0,5.0,...,,5.0,,80.0,11.0,0.0,0.0,0.0,0.0,0.0
