# Scraping NFL data


In [37]:
# Importing libraries
import random
import time

import numpy as np
import pandas as pd

In [38]:
# Create list of seasons as stings to scrape
seasons = [str(season) for season in range(2021, 2022)]
print(f"Scraping data for {len(seasons)} seasons")

# Create list of team abbreviations used by pro-football-reference
teams = [
    "crd",
    "atl",
    "rav",
    "buf",
    "car",
    "chi",
    "cin",
    "cle",
    "dal",
    "den",
    "det",
    "gnb",
    "htx",
    "clt",
    "jax",
    "kan",
    "rai",
    "sdg",
    "ram",
    "mia",
    "min",
    "nwe",
    "nor",
    "nyg",
    "nyj",
    "phi",
    "pit",
    "sfo",
    "sea",
    "tam",
    "oti",
    "was",
]
print(f"Scraping data for {len(teams)} teams")

Scraping data for 1 seasons
Scraping data for 32 teams


In [39]:
# Empty list to store dataframes
all_dfs = []

# Loop through each season for each team
for season in seasons:
    for team in teams:
        # 1. Create url
        url = f"https://www.pro-football-reference.com/teams/{team}/{season}/gamelog/"
        print(url)
        # 2. Scrape data
        df = pd.read_html(url, header=1, attrs={"id": "gamelog" + season})[0]

        # 3. Insert Team and Season into dataframe
        df.insert(loc=0, column="Season", value=season)
        df.insert(loc=2, column="Team", value=team.upper())

        # 4. Append to list
        all_dfs.append(df)

        # Limit requests to 20 requests per minute
        time.sleep(random.randint(3, 5))

# Concatenate all dataframes in list
gamelog_df = pd.concat(all_dfs, ignore_index=True)
gamelog_df.to_csv(f"../data/nfl_gamelog_{seasons[0]}-{seasons[-1]}.csv", index=False)

https://www.pro-football-reference.com/teams/crd/2021/gamelog/
https://www.pro-football-reference.com/teams/atl/2021/gamelog/
https://www.pro-football-reference.com/teams/rav/2021/gamelog/
https://www.pro-football-reference.com/teams/buf/2021/gamelog/
https://www.pro-football-reference.com/teams/car/2021/gamelog/
https://www.pro-football-reference.com/teams/chi/2021/gamelog/
https://www.pro-football-reference.com/teams/cin/2021/gamelog/
https://www.pro-football-reference.com/teams/cle/2021/gamelog/
https://www.pro-football-reference.com/teams/dal/2021/gamelog/
https://www.pro-football-reference.com/teams/den/2021/gamelog/
https://www.pro-football-reference.com/teams/det/2021/gamelog/
https://www.pro-football-reference.com/teams/gnb/2021/gamelog/
https://www.pro-football-reference.com/teams/htx/2021/gamelog/
https://www.pro-football-reference.com/teams/clt/2021/gamelog/
https://www.pro-football-reference.com/teams/jax/2021/gamelog/
https://www.pro-football-reference.com/teams/kan/2021/g

In [40]:
np.set_printoptions(linewidth=120)

print(gamelog_df.info())
print(gamelog_df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 544 entries, 0 to 543
Data columns (total 38 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Season      544 non-null    object 
 1   Week        544 non-null    int64  
 2   Team        544 non-null    object 
 3   Day         544 non-null    object 
 4   Date        544 non-null    object 
 5   Unnamed: 3  544 non-null    object 
 6   Unnamed: 4  544 non-null    object 
 7   OT          42 non-null     object 
 8   Unnamed: 6  272 non-null    object 
 9   Opp         544 non-null    object 
 10  Tm          544 non-null    int64  
 11  Opp.1       544 non-null    int64  
 12  Cmp         544 non-null    int64  
 13  Att         544 non-null    int64  
 14  Yds         544 non-null    int64  
 15  TD          544 non-null    int64  
 16  Int         544 non-null    int64  
 17  Sk          544 non-null    int64  
 18  Yds.1       544 non-null    int64  
 19  Y/A         544 non-null    f

## Clean Gamelog Data


In [41]:
# Drop Game stats columns
gamelog_clean_df = gamelog_df.drop(gamelog_df.columns[12:], axis=1)
# Drop boxscore link column
gamelog_clean_df = gamelog_clean_df.drop(gamelog_clean_df.columns[5:6], axis=1)

# Rename columns
gamelog_clean_df = gamelog_clean_df.rename(
    columns={
        "Unnamed: 4": "Win",
        "Unnamed: 6": "Home",
        "Tm": "Team_points",
        "Opp.1": "Opp_points",
    }
)

# Map Opp to team abbreviation
team_dict = {
    "Arizona Cardinals": "CRD",
    "Atlanta Falcons": "ATL",
    "Baltimore Ravens": "RAV",
    "Buffalo Bills": "BUF",
    "Carolina Panthers": "CAR",
    "Chicago Bears": "CHI",
    "Cincinnati Bengals": "CIN",
    "Cleveland Browns": "CLE",
    "Dallas Cowboys": "DAL",
    "Denver Broncos": "DEN",
    "Detroit Lions": "DET",
    "Green Bay Packers": "GNB",
    "Houston Texans": "HTX",
    "Indianapolis Colts": "CLT",
    "Jacksonville Jaguars": "JAX",
    "Kansas City Chiefs": "KAN",
    "Los Angeles Chargers": "SDG",
    "Los Angeles Rams": "RAM",
    "Miami Dolphins": "MIA",
    "Minnesota Vikings": "MIN",
    "New England Patriots": "NWE",
    "New Orleans Saints": "NOR",
    "New York Giants": "NYG",
    "New York Jets": "NYJ",
    "Philadelphia Eagles": "PHI",
    "Pittsburgh Steelers": "PIT",
    "San Francisco 49ers": "SFO",
    "Seattle Seahawks": "SEA",
    "Tampa Bay Buccaneers": "TAM",
    "Tennessee Titans": "OTI",
    "Washington Commanders": "WAS",
    "Washington Football Team": "WAS",
    "Washington Redskins": "WAS",
}
gamelog_clean_df["Opp"] = gamelog_clean_df["Opp"].map(team_dict)

# Convert 'Win' column to boolean (W = 1, L = 0)
gamelog_clean_df["Win"] = gamelog_clean_df["Win"].apply(lambda x: 1 if x == "W" else 0)
# Convert 'OT' column to boolean (OT = 1, other = 0)
gamelog_clean_df["OT"] = gamelog_clean_df["OT"].apply(lambda x: 1 if x == "OT" else 0)
# Convert 'Home' column to boolean (H = 1, A = 0)
gamelog_clean_df["Home"] = gamelog_clean_df["Home"].apply(lambda x: 0 if x == "@" else 0)

with pd.option_context("display.max_colwidth", None, "display.max_columns", None):
    display(gamelog_clean_df)

Unnamed: 0,Season,Week,Team,Day,Date,Win,OT,Home,Opp,Team_points,Opp_points
0,2021,1,CRD,Sun,September 12,1,0,0,OTI,38,13
1,2021,2,CRD,Sun,September 19,1,0,0,MIN,34,33
2,2021,3,CRD,Sun,September 26,1,0,0,JAX,31,19
3,2021,4,CRD,Sun,October 3,1,0,0,RAM,37,20
4,2021,5,CRD,Sun,October 10,1,0,0,SFO,17,10
...,...,...,...,...,...,...,...,...,...,...,...
539,2021,14,WAS,Sun,December 12,0,0,0,DAL,20,27
540,2021,15,WAS,Tue,December 21,0,0,0,PHI,17,27
541,2021,16,WAS,Sun,December 26,0,0,0,DAL,14,56
542,2021,17,WAS,Sun,January 2,0,0,0,PHI,16,20
