# NFL Database Project: WebScraping & Data Cleaning
<br>
CIS 761 Database Management Systems – Term Project<br>
Kansas State University
<br><br>
Vishnu Bondalakunta<br>
Charles Zumbaugh<br>
James Chapman<br>
<br>

#### This notebook uses the public ESPN API. The URLs and endpoints were found in the following GitHub link. ESPN does not provide official documentation.

* ## [List of NFL API Endpoints](https://gist.github.com/nntrn/ee26cb2a0716de0947a0a4e9a157bc1c#event-competitions-api)

#### 8 tables are collected and saved as CSV files, which is used to populate the NFL database.
* games
* season_dates
* linescores
* rosters
* athletes
* venues
* teams
* positions

In [1]:
import pandas as pd
import numpy as np
import requests
import re

In [2]:
years = range(2013,2024) # ! Keep track of calendar year and NFL season year

ESPN_EVENTS = 'https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard?limit=1000&dates={}' #.format(year)
ESPN_ROSTERS = 'https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/events/{}/competitions/{}/competitors/{}/roster' #.format(game_id,game_id,team_id)
ESPN_ATHLETE_INFO = 'https://site.web.api.espn.com/apis/common/v3/sports/football/nfl/athletes/{}' #.format(player_id)
ESPN_VENUE_INFO = 'http://sports.core.api.espn.com/v2/sports/football/leagues/nfl/venues/{}' #.format(venue_id)
ESPN_TEAM_INFO = 'https://site.api.espn.com/apis/site/v2/sports/football/nfl/teams/{}' #.format(team_id)
ESPN_POSITION_INFO = 'http://sports.core.api.espn.com/v2/sports/football/leagues/nfl/positions/{}' #.format(position_id)

ESPN_STATISTICS = 'https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/events/{}/competitions/{}/competitors/{}/roster/{}/statistics/0' #.format(game_id,game_id,team_id,player_id)
ESPN_PLAY_BY_PLAY = 'https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/events/{}/competitions/{}/plays?limit=3000' #.format(game_id,game_id)

# Games

In [3]:
games = pd.DataFrame()
for year in years:
    try:
        events = requests.get(ESPN_EVENTS.format(year)).json()
        yearlyEvents = pd.json_normalize(events['events'])
        games = pd.concat([games, yearlyEvents], ignore_index=True)
    except: 
        print(year)

# games['competitions'] is a list of dictionaries 
games['attendance'] = games['competitions'].apply(lambda x : x[0]['attendance'])
games['venue_id'] = games['competitions'].apply(lambda x : x[0]['venue']['id'])

# games['competitions'][0]['competitors'] is a list of 2 dictionaries (home and away).
games['competitors'] = games['competitions'].apply(lambda x : x[0]['competitors'])

games['home_team_id'] = games['competitors'].apply(lambda x : x[0]['id'])
games['home_score'] = games['competitors'].apply(lambda x : x[0]['score'])
games['away_team_id'] = games['competitors'].apply(lambda x : x[1]['id'])
games['away_score'] = games['competitors'].apply(lambda x : x[1]['score'])

# games['competitors']['winner'] is a Boolean win/lose, but there are ties! 
def tietest(x):
    try: return x[0]['winner']
    except KeyError: return None # tie! 
games['home_win_bool'] = games['competitors'].apply(tietest)

# Drop pre-season and off-season
games = games.drop(games[(games['season.type']== 1)].index)
games = games.drop(games[(games['season.type']== 4)].index)

# Split datetime into date/time 
# decompose date/week/season year/season type
games['datetime'] = pd.to_datetime(games['date'])
games['utc_time'] = games['datetime'].dt.strftime("%H:%M")
games['date'] = games['datetime'].dt.strftime("%d.%m.%Y")
season_dates = games[['date','season.year','season.type','week.number']].copy()

#Keep around For linescores
gameLinescores = games[['id','competitors']].copy()

# Drop rows
games = games.rename(columns={'id':'game_id'})
games = games[['game_id',
               'name',
               'shortName',
               'date',
               'attendance',
               'venue_id',
               'home_team_id',
               'away_team_id',
               'utc_time']] 
                #'home_score',
                #'away_score',
               #'home_win_bool'

games.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2986 entries, 0 to 3584
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   game_id       2986 non-null   object
 1   name          2986 non-null   object
 2   shortName     2986 non-null   object
 3   date          2986 non-null   object
 4   attendance    2986 non-null   int64 
 5   venue_id      2986 non-null   object
 6   home_team_id  2986 non-null   object
 7   away_team_id  2986 non-null   object
 8   utc_time      2986 non-null   object
dtypes: int64(1), object(8)
memory usage: 233.3+ KB


# Season_Dates

In [4]:
season_dates = season_dates.rename(columns={'season.year':'season_year',
                                          'season.type':'season_type',
                                          'week.number':'week'})
# season_types names
season_dates.loc[(season_dates['season_type']== 2),'season_type'] = 'Regular Season'
season_dates.loc[(season_dates['season_type']== 3),'season_type'] = 'Post Season'

season_dates.drop_duplicates(inplace=True)
season_dates.info(verbose=True) 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 837 entries, 0 to 3584
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         837 non-null    object
 1   season_year  837 non-null    int64 
 2   season_type  837 non-null    object
 3   week         837 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 32.7+ KB


# Linescores 


In [5]:
linescores = pd.DataFrame(columns = ["game_id","team_id","quarter","score"])
overtimecount = 0
def eachrow(game):
    try:
        for competitor in game['competitors']: # competitors[0] = home team
            quarter = 0
            for linescore in competitor['linescores']:
                quarter += 1
                linescores.loc[len(linescores.index)] = [game['id'], 
                                                         competitor['id'],
                                                         quarter, 
                                                         linescore['value']]
    except:
        print('Game canceled-', game['id'])

junk = gameLinescores.apply(eachrow, axis=1)

linescores['game_id'] = linescores['game_id'].astype('int64')
linescores['team_id'] = linescores['team_id'].astype('int64')
linescores['score'] = linescores['score'].astype('int64')
linescores.info(verbose=True)

Game canceled- 400554331
Game canceled- 400951581
Game canceled- 401492629
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24212 entries, 0 to 24211
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   game_id  24212 non-null  int64
 1   team_id  24212 non-null  int64
 2   quarter  24212 non-null  int64
 3   score    24212 non-null  int64
dtypes: int64(4)
memory usage: 945.8 KB


# Rosters 

The rosters URL lists all players under 'entries'. But the position is something like this…<br>
$ref : "http://sports.core.api.espn.com/v2/sports/football/leagues/nfl/positions/46?lang=en&region=us" <br>
This just splits the link at "positions" giving "/46?lang=en&region=us"
and then takes only numbers "46"




In [6]:
rosters = pd.DataFrame(columns = ["game_id", "team_id", "athlete_id","position_id","active","didNotPlay"])
def eachrow(game):
    temp_game_id = game['game_id']
    temp_home_team_id = game['home_team_id']
    temp_away_team_id = game['away_team_id']

    try:
        homeRoster = requests.get(ESPN_ROSTERS.format(temp_game_id,temp_game_id,temp_home_team_id)).json()
        for player in homeRoster['entries']:
            split = re.split("positions", player['position']['$ref'])
            position_id = re.sub("[^0-9.]", "", split[1])
            rosters.loc[len(rosters.index)] = [temp_game_id, 
                                                temp_home_team_id,
                                                player['playerId'], 
                                                position_id, 
                                                player['active'], 
                                                player['didNotPlay']]
    
        awayRoster = requests.get(ESPN_ROSTERS.format(temp_game_id,temp_game_id,temp_away_team_id)).json()
        # print(temp_game_id, temp_home_team_id, temp_away_team_id, len(homeRoster['entries']), len(awayRoster['entries']))
        for player in awayRoster['entries']:
            split = re.split("positions", player['position']['$ref'])
            position_id = re.sub("[^0-9.]", "", split[1])
            rosters.loc[len(rosters.index)] = [temp_game_id, 
                                                temp_home_team_id,
                                                player['playerId'], 
                                                position_id, 
                                                player['active'], 
                                                player['didNotPlay']]
    except:
        print('Game canceled or Pro bowl -', game['game_id'])

games.apply(eachrow, axis=1)
rosters.info(verbose=True)

Game canceled or Pro bowl - 330127032
Game canceled or Pro bowl - 340126035
Game canceled or Pro bowl - 400554331
Game canceled or Pro bowl - 400874729
<class 'pandas.core.frame.DataFrame'>
Int64Index: 364111 entries, 0 to 364110
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   game_id      364111 non-null  object
 1   team_id      364111 non-null  object
 2   athlete_id   364111 non-null  int64 
 3   position_id  364111 non-null  object
 4   active       364111 non-null  bool  
 5   didNotPlay   364111 non-null  bool  
dtypes: bool(2), int64(1), object(3)
memory usage: 14.6+ MB


# Athletes 

In [7]:
athletes = pd.DataFrame()
for athlete_id in pd.unique(rosters['athlete_id']):
    try:
        athlete_info = requests.get(ESPN_ATHLETE_INFO.format(athlete_id)).json()
        athlete_info = pd.json_normalize(athlete_info['athlete'])
        athletes = pd.concat([athletes, athlete_info], ignore_index=True)
    except: 
        print(athlete_id)

# Drop rows
athletes = athletes.rename(columns={'id':'athlete_id',
                                     'displayBirthPlace':'birth_place',
                                     'displayDraft':'drafted_bool',
                                     'displayHeight':'heightInches',
                                     'displayWeight':'weight',
                                     'displayDOB':'dob'})
athletes = athletes[['athlete_id',
                     'firstName',
                     'lastName',
                     #'debutYear',
                     'birth_place',
                     'drafted_bool',
                     'jersey',
                     'heightInches',
                     'weight',
                     'dob']]
                     #'displayDraft']]
                     #'college.id',
                     #'jersey',
                     #'displayJersey',
                     #'displayExperience',
                     #'position.id',
                     #'team.id',
                     #'collegeTeam.id',
                     #'collegeAthlete.id'

#athletes['debutYear'] = athletes['debutYear'].fillna(0).astype('int64')
athletes.info(verbose=True)

4568981
3929824
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7165 entries, 0 to 7164
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   athlete_id    7165 non-null   object
 1   firstName     7165 non-null   object
 2   lastName      7165 non-null   object
 3   birth_place   6743 non-null   object
 4   drafted_bool  4010 non-null   object
 5   jersey        6713 non-null   object
 6   heightInches  7157 non-null   object
 7   weight        7157 non-null   object
 8   dob           7120 non-null   object
dtypes: object(9)
memory usage: 503.9+ KB


In [9]:
def extract_number(s: str) -> str:
    try:
        return re.match(r'[0-9]+', s).group(0)
    except:
        return 0#'NULL'


def get_height_inches(height_str: str) -> int:
    # pattern of display height in [0-9]'[0-9]+" where the first
    # value is height in feet and the second is height in inches.
    try:
        matches = re.findall(r'[0-9]+', height_str)
        feet = matches[0]
        inches = matches[1]
        return int(feet) * 12 + int(inches)
    except:
        return pd.NA

def clean_date(date_string: str) -> str:
    # The date is formatted in one of several ways:
    # %M-%d-%Y
    # %M-%d-%y
    # %m-%d-%Y
    # %m-%d-%y
    try:
        date_values = re.findall(r'[0-9]+', date_string)
        day = date_values[0]
        month = date_values[1]
        year = date_values[2]
        day = '0' + day if len(day) < 2 else day
        month = '0' + month if len(month) < 2 else month
        if len(year) < 4:
            if int(year) > 24:
                year = '19' + year
            else:
                year = '20' + year
        return f'{month}-{day}-{year}'
    except:
        return ''

athletes['weight'] = athletes['weight'].apply(extract_number)
athletes['heightInches'] = athletes['heightInches'].apply(get_height_inches)
athletes['dob'] = athletes['dob'].apply(clean_date)

# Venues

In [23]:
venues = pd.DataFrame()
for venue_id in pd.unique(games['venue_id']):
    try:
        venue_info = requests.get(ESPN_VENUE_INFO.format(venue_id)).json()
        venue_info = pd.json_normalize(venue_info)
        venues = pd.concat([venues, venue_info], ignore_index=True)
    except: 
        print(venue_id)

# # Drop rows
venues = venues[['id',
                 'fullName',
                 #'capacity',
                 'grass',
                 'indoor',
                 'address.city',
                 'address.state',
                 'address.zipCode']]
venues = venues.rename(columns={'id':'venue_id',
                                'address.city':'city',
                                'address.state':'state',
                                'address.zipCode':'zipCode'})
venues.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   venue_id  48 non-null     object
 1   fullName  48 non-null     object
 2   grass     48 non-null     bool  
 3   indoor    48 non-null     bool  
 4   city      48 non-null     object
 5   state     42 non-null     object
 6   zipCode   38 non-null     object
dtypes: bool(2), object(5)
memory usage: 2.1+ KB


# Teams

In [11]:
teams = pd.DataFrame()
for team_id in pd.unique(games['home_team_id']):
    try:
        team_info = requests.get(ESPN_TEAM_INFO.format(team_id)).json()
        team_info = pd.json_normalize(team_info)
        teams = pd.concat([teams, team_info], ignore_index=True)
    except: 
        print(team_id)

# # Drop rows
teams = teams[['team.id',
                 'team.location',
                 'team.name',
                 'team.abbreviation',
                 'team.franchise.venue.id',
                 'team.color',
                 'team.alternateColor']]
teams = teams.rename(columns={'team.id':'team_id',
                             'team.location':'location',
                             'team.name':'name',
                             'team.abbreviation':'abbreviation',
                             'team.franchise.venue.id':'venue_id',
                             'team.color':'primary_color',
                             'team.alternateColor':'secondary_color'})

teams.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   team_id          36 non-null     object
 1   location         36 non-null     object
 2   name             34 non-null     object
 3   abbreviation     36 non-null     object
 4   venue_id         32 non-null     object
 5   primary_color    34 non-null     object
 6   secondary_color  32 non-null     object
dtypes: object(7)
memory usage: 2.1+ KB


# Positions

In [12]:
positions = pd.DataFrame()
for position_id in pd.unique(rosters['position_id']):
    try:
        position_info = requests.get(ESPN_POSITION_INFO.format(position_id)).json()
        position_info = pd.json_normalize(position_info)
        positions = pd.concat([positions, position_info], ignore_index=True)
    except: 
        print(position_info)

# Drop rows
positions = positions[['id', 'name', 'abbreviation']]
positions = positions.rename(columns={'id':'position_id'})

positions.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   position_id   24 non-null     object
 1   name          24 non-null     object
 2   abbreviation  24 non-null     object
dtypes: object(3)
memory usage: 704.0+ bytes


# Review

In [13]:
games.info(verbose=True)
for column in games.columns.tolist():
    try:
        print(column,'--------',pd.unique(games[column]).size)
    except:
        print(column)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2986 entries, 0 to 3584
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   game_id       2986 non-null   object
 1   name          2986 non-null   object
 2   shortName     2986 non-null   object
 3   date          2986 non-null   object
 4   attendance    2986 non-null   int64 
 5   venue_id      2986 non-null   object
 6   home_team_id  2986 non-null   object
 7   away_team_id  2986 non-null   object
 8   utc_time      2986 non-null   object
dtypes: int64(1), object(8)
memory usage: 233.3+ KB
game_id -------- 2986
name -------- 1160
shortName -------- 1144
date -------- 837
attendance -------- 2315
venue_id -------- 48
home_team_id -------- 36
away_team_id -------- 36
utc_time -------- 49


In [14]:
season_dates.info(verbose=True)
for column in season_dates.columns.tolist():
    try:
        print(column,'--------',pd.unique(season_dates[column]).size)
    except:
        print(column)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 837 entries, 0 to 3584
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         837 non-null    object
 1   season_year  837 non-null    int64 
 2   season_type  837 non-null    object
 3   week         837 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 32.7+ KB
date -------- 837
season_year -------- 12
season_type -------- 2
week -------- 18


In [15]:
linescores.info(verbose=True)
for column in linescores.columns.tolist():
    try:
        print(column,'--------',pd.unique(linescores[column]).size)
    except:
        print(column)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24212 entries, 0 to 24211
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   game_id  24212 non-null  int64
 1   team_id  24212 non-null  int64
 2   quarter  24212 non-null  int64
 3   score    24212 non-null  int64
dtypes: int64(4)
memory usage: 945.8 KB
game_id -------- 2983
team_id -------- 36
quarter -------- 6
score -------- 30


In [16]:
rosters.info(verbose=True)
for column in rosters.columns.tolist():
    try:
        print(column,'--------',pd.unique(rosters[column]).size)
    except:
        print(column)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 364111 entries, 0 to 364110
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   game_id      364111 non-null  object
 1   team_id      364111 non-null  object
 2   athlete_id   364111 non-null  int64 
 3   position_id  364111 non-null  object
 4   active       364111 non-null  bool  
 5   didNotPlay   364111 non-null  bool  
dtypes: bool(2), int64(1), object(3)
memory usage: 14.6+ MB
game_id -------- 2982
team_id -------- 36
athlete_id -------- 7167
position_id -------- 24
active -------- 2
didNotPlay -------- 2


In [28]:
athletes.info(verbose=True)
for column in athletes.columns.tolist():
    try:
        print(column,'--------',pd.unique(athletes[column]).size)
    except:
        print(column)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7165 entries, 0 to 7164
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   athlete_id    7165 non-null   object
 1   firstName     7165 non-null   object
 2   lastName      7165 non-null   object
 3   birth_place   6743 non-null   object
 4   drafted_bool  4010 non-null   object
 5   jersey        6713 non-null   object
 6   heightInches  7157 non-null   object
 7   weight        7165 non-null   int64 
 8   dob           7165 non-null   object
dtypes: int64(1), object(8)
memory usage: 503.9+ KB
athlete_id -------- 7165
firstName -------- 2363
lastName -------- 3861
birth_place -------- 2469
drafted_bool -------- 4007
jersey -------- 107
heightInches -------- 18
weight -------- 209
dob -------- 4308


In [24]:
venues.info(verbose=True)
for column in venues.columns.tolist():
    try:
        print(column,'--------',pd.unique(venues[column]).size)
    except:
        print(column)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   venue_id  48 non-null     object
 1   fullName  48 non-null     object
 2   grass     48 non-null     bool  
 3   indoor    48 non-null     bool  
 4   city      48 non-null     object
 5   state     42 non-null     object
 6   zipCode   38 non-null     object
dtypes: bool(2), object(5)
memory usage: 2.1+ KB
venue_id -------- 48
fullName -------- 48
grass -------- 2
indoor -------- 2
city -------- 43
state -------- 26
zipCode -------- 38


In [19]:
teams.info(verbose=True)
for column in teams.columns.tolist():
    try:
        print(column,'--------',pd.unique(teams[column]).size)
    except:
        print(column)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   team_id          36 non-null     object
 1   location         36 non-null     object
 2   name             34 non-null     object
 3   abbreviation     36 non-null     object
 4   venue_id         32 non-null     object
 5   primary_color    34 non-null     object
 6   secondary_color  32 non-null     object
dtypes: object(7)
memory usage: 2.1+ KB
team_id -------- 36
location -------- 34
name -------- 35
abbreviation -------- 36
venue_id -------- 32
primary_color -------- 31
secondary_color -------- 22


In [20]:
positions.info(verbose=True)
for column in positions.columns.tolist():
    try:
        print(column,'--------',pd.unique(positions[column]).size)
    except:
        print(column)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   position_id   24 non-null     object
 1   name          24 non-null     object
 2   abbreviation  24 non-null     object
dtypes: object(3)
memory usage: 704.0+ bytes
position_id -------- 24
name -------- 24
abbreviation -------- 24


In [3]:
# Get play-by-play for each game
def get_data(url, key = None):
    """
    Get data from a given endpoint and return it in JSON. If a key is provided, the value at that key
    will be returned
    :param url: The URL to get the data from
    :param key: An optional key at the top level of the object.
    :return: dict
    """
    response = requests.get(url).json()
    if key:
        return response[key]
    return requests.get(url).json()

PLAYS_URL = 'https://sports.core.api.espn.com/v2/sports/football/leagues/nfl/events/{event_id}/competitions/{event_id}/plays?limit=300'

games = pd.read_csv('data/games.csv')
# games = games[games['year'] == 2020]
plays = pd.DataFrame()
player_plays = pd.DataFrame()
player_stats = pd.DataFrame()
for game_id in games['game_id']:

    # For each game id, pull the play by play data
    play_data = get_data(PLAYS_URL.format(event_id=game_id), key='items')
    for play in play_data:
        # Get the play data if it exists, otherwise continue
        try:
            play_dict = {
                'play_id': play['id'],
                'game_id': game_id,
                'play_type': play['type']['text'],
                'text': play['text'],
                'short_text': play['shortText'],
                'quarter': play['period']['number'],
                'seconds_remaining': play['clock']['value'],
                'scoring_play': play['scoringPlay'],
                'score_value': play['scoreValue'],
                'yards': play['statYardage'],
                'start_down': play['start']['down'],
                'start_first_down_distance': play['start']['distance'],
                'start_yard_line': play['start']['yardLine'],
                'start_yards_to_endzone': play['start']['yardsToEndzone'],
                'end_down': play['end']['down'],
                'end_first_down_distance': play['end']['distance'],
                'end_yard_line': play['end']['yardLine'],
                'end_yards_to_endzone': play['end']['yardsToEndzone']
            }
            df_plays = pd.DataFrame(play_dict, index=[0])
            plays = plays.append(play_dict, ignore_index=True)
            # Loop through each participant and pull the player data
            for player in play['participants']:
                player_url = player['athlete']['$ref']
                player_id = re.search(r'(?<=athletes/)[0-9]+(?=\?)', player_url).group(0)
                player_plays_dict = {
                    'player_id': player_id,
                    'play_id': play['id'],
                    'game_id': game_id,
                    'type': player['type']
                }
                player_play_df = pd.DataFrame(player_plays_dict, index=[0])
                player_plays = pd.concat([player_plays, player_play_df], ignore_index=True)
                for stat in player['stats']:
                    stats = {
                        'player_id': player_id,
                        'game_id': game_id,
                        'play_id': play['id'],
                        'stat_name': stat['name'],
                        'display_name': stat['displayName'],
                        'value': stat['value'],
                        'display_value': stat['displayValue']
                    }
                    stat_df = pd.DataFrame(stats, index=[0])
                    player_stats = pd.concat([player_stats, stat_df], ignore_index=True)
        except Exception as e:
            continue

KeyboardInterrupt: 

# Save

In [21]:
#######################################
games.to_csv('games.csv', index=False)
season_dates.to_csv('season_dates.csv', index=False)
linescores.to_csv('linescores.csv', index=False)
rosters.to_csv('rosters.csv', index=False)
athletes.to_csv('athletes.csv', index=False)
venues.to_csv('venues.csv', index=False)
teams.to_csv('teams.csv', index=False)
positions.to_csv('positions.csv', index=False)
#######################################

In [22]:
: )

SyntaxError: unmatched ')' (2155285666.py, line 1)