# Scraping NFL data


In [37]:
# Importing libraries
import random
import time

import numpy as np
import pandas as pd

In [38]:
# Seasons to scrape, stored as strings (range end is exclusive, so this is 2021 only)
seasons = list(map(str, range(2021, 2022)))
print(f"Scraping data for {len(seasons)} seasons")

# Team abbreviations used by pro-football-reference, one per franchise
teams = (
    "crd atl rav buf car chi cin cle "
    "dal den det gnb htx clt jax kan "
    "rai sdg ram mia min nwe nor nyg "
    "nyj phi pit sfo sea tam oti was"
).split()
print(f"Scraping data for {len(teams)} teams")

Scraping data for 1 seasons
Scraping data for 32 teams


In [None]:
# Collects one gamelog dataframe per (season, team) page
all_dfs = []

# Visit every team's gamelog page for every season
for season in seasons:
    # The gamelog table's HTML id is "gamelog" followed by the season
    table_id = "gamelog" + season

    for team in teams:
        # Build the page address and echo it so progress is visible
        url = f"https://www.pro-football-reference.com/teams/{team}/{season}/gamelog/"
        print(url)

        # Parse the page and keep the first table matching the id
        # (header=1 takes the second header row as the column names)
        tables = pd.read_html(url, header=1, attrs={"id": table_id})
        df = tables[0]

        # Tag every row with the season and the upper-cased team abbreviation
        df.insert(loc=0, column="Season", value=season)
        df.insert(loc=2, column="Team", value=team.upper())

        all_dfs.append(df)

        # Pause 3-5 seconds between requests (at most 20 requests per minute)
        time.sleep(random.randint(3, 5))

# Stack all pages into one frame and persist it as CSV
gamelog_df = pd.concat(all_dfs, ignore_index=True)
gamelog_csv_path = f"../data/nfl_gamelog_{seasons[0]}-{seasons[-1]}.csv"
gamelog_df.to_csv(gamelog_csv_path, index=False)

In [58]:
# Widen NumPy's printed line width to 120 characters
np.set_printoptions(linewidth=120)

# Reload the scraped gamelog from the CSV written by the scraping cell
gamelog_df = pd.read_csv(f"../data/nfl_gamelog_{seasons[0]}-{seasons[-1]}.csv")

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
gamelog_df.info()
print(gamelog_df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 544 entries, 0 to 543
Data columns (total 38 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Season      544 non-null    int64  
 1   Week        544 non-null    int64  
 2   Team        544 non-null    object 
 3   Day         544 non-null    object 
 4   Date        544 non-null    object 
 5   Unnamed: 3  544 non-null    object 
 6   Unnamed: 4  544 non-null    object 
 7   OT          42 non-null     object 
 8   Unnamed: 6  272 non-null    object 
 9   Opp         544 non-null    object 
 10  Tm          544 non-null    int64  
 11  Opp.1       544 non-null    int64  
 12  Cmp         544 non-null    int64  
 13  Att         544 non-null    int64  
 14  Yds         544 non-null    int64  
 15  TD          544 non-null    int64  
 16  Int         544 non-null    int64  
 17  Sk          544 non-null    int64  
 18  Yds.1       544 non-null    int64  
 19  Y/A         544 non-null    f

## Clean Gamelog Data


In [124]:
# Build the cleaned gamelog from the raw frame (the raw frame is left untouched,
# so this cell can be re-run safely).

# Drop the game stats columns: everything from column position 12 onward
gamelog_clean_df = gamelog_df.drop(gamelog_df.columns[12:], axis=1)

# Drop the boxscore link ("Unnamed: 3"), "Day" and "OT" columns
gamelog_clean_df = gamelog_clean_df.drop(columns=["Unnamed: 3", "Day", "OT"])

# Give the remaining unnamed / ambiguous columns descriptive names
gamelog_clean_df = gamelog_clean_df.rename(
    columns={
        "Unnamed: 4": "Win",
        "Unnamed: 6": "Home",
        "Tm": "Team_points",
        "Opp.1": "Opp_points",
    }
)

# Map full opponent names to the upper-cased pro-football-reference team
# abbreviations used in the "Team" column. Former franchise names are included
# so that earlier seasons map to the same abbreviation as the current name.
team_dict = {
    "Arizona Cardinals": "CRD",
    "Phoenix Cardinals": "CRD",
    "St. Louis Cardinals": "CRD",
    "Atlanta Falcons": "ATL",
    "Baltimore Ravens": "RAV",
    "Buffalo Bills": "BUF",
    "Carolina Panthers": "CAR",
    "Chicago Bears": "CHI",
    "Cincinnati Bengals": "CIN",
    "Cleveland Browns": "CLE",
    "Dallas Cowboys": "DAL",
    "Denver Broncos": "DEN",
    "Detroit Lions": "DET",
    "Green Bay Packers": "GNB",
    "Houston Texans": "HTX",
    "Indianapolis Colts": "CLT",
    "Baltimore Colts": "CLT",
    "Jacksonville Jaguars": "JAX",
    "Kansas City Chiefs": "KAN",
    "Los Angeles Chargers": "SDG",
    "San Diego Chargers": "SDG",
    "Los Angeles Rams": "RAM",
    "St. Louis Rams": "RAM",
    "Las Vegas Raiders": "RAI",
    "Oakland Raiders": "RAI",
    "Los Angeles Raiders": "RAI",
    "Miami Dolphins": "MIA",
    "Minnesota Vikings": "MIN",
    "New England Patriots": "NWE",
    "New Orleans Saints": "NOR",
    "New York Giants": "NYG",
    "New York Jets": "NYJ",
    "Philadelphia Eagles": "PHI",
    "Pittsburgh Steelers": "PIT",
    "San Francisco 49ers": "SFO",
    "Seattle Seahawks": "SEA",
    "Tampa Bay Buccaneers": "TAM",
    "Tennessee Titans": "OTI",
    "Tennessee Oilers": "OTI",
    "Houston Oilers": "OTI",
    "Washington Commanders": "WAS",
    "Washington Football Team": "WAS",
    "Washington Redskins": "WAS",
}
opp_names = gamelog_clean_df["Opp"]
opp_abbr = opp_names.map(team_dict)

# Series.map turns names missing from team_dict into NaN without any warning;
# fail loudly instead so such games are not lost when the frames are merged.
unmapped_names = opp_names[opp_abbr.isna() & opp_names.notna()].unique()
if len(unmapped_names) > 0:
    raise ValueError(f"Opponent names missing from team_dict: {list(unmapped_names)}")
gamelog_clean_df["Opp"] = opp_abbr

# Encode 'Win' as 1 for "W" and 0 for anything else (so a non-win such as a
# loss is 0)
gamelog_clean_df["Win"] = gamelog_clean_df["Win"].eq("W").astype("int64")
# Encode 'Home' as 0 when the location marker is "@" (away) and 1 otherwise
# (the marker is missing for the remaining games)
gamelog_clean_df["Home"] = gamelog_clean_df["Home"].ne("@").astype("int64")

with pd.option_context("display.max_colwidth", None, "display.max_columns", None):
    display(gamelog_clean_df)

Unnamed: 0,Season,Week,Team,Date,Win,Home,Opp,Team_points,Opp_points
0,2021,1,CRD,September 12,1,0,OTI,38,13
1,2021,2,CRD,September 19,1,1,MIN,34,33
2,2021,3,CRD,September 26,1,0,JAX,31,19
3,2021,4,CRD,October 3,1,0,RAM,37,20
4,2021,5,CRD,October 10,1,1,SFO,17,10
...,...,...,...,...,...,...,...,...,...
539,2021,14,WAS,December 12,0,1,DAL,20,27
540,2021,15,WAS,December 21,0,0,PHI,17,27
541,2021,16,WAS,December 26,0,0,DAL,14,56
542,2021,17,WAS,January 2,0,1,PHI,16,20


## Scrape Vegas Lines Data


In [None]:
# Collects one Vegas-lines dataframe per (season, team) page
all_lines_dfs = []

# Visit every team's lines page for every season
for season in seasons:
    for team in teams:
        # Build the page address and echo it so progress is visible
        url = f"https://www.pro-football-reference.com/teams/{team}/{season}_lines.htm"
        print(url)

        # Parse the page and keep the first table whose id is "vegas_lines"
        # (header=0 takes the first row as the column names)
        lines_tables = pd.read_html(url, header=0, attrs={"id": "vegas_lines"})
        team_lines_df = lines_tables[0]

        # Tag every row with the season and the upper-cased team abbreviation
        team_lines_df.insert(loc=0, column="Season", value=season)
        team_lines_df.insert(loc=2, column="Team", value=team.upper())

        all_lines_dfs.append(team_lines_df)

        # Pause 3-5 seconds between requests (at most 20 requests per minute)
        time.sleep(random.randint(3, 5))

# Stack all pages into one frame and persist it as CSV
lines_df = pd.concat(all_lines_dfs, ignore_index=True)
lines_csv_path = f"../data/nfl_lines_{seasons[0]}-{seasons[-1]}.csv"
lines_df.to_csv(lines_csv_path, index=False)

In [131]:
# Reload the scraped Vegas lines from the CSV written by the scraping cell
lines_df = pd.read_csv(f"../data/nfl_lines_{seasons[0]}-{seasons[-1]}.csv")

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
lines_df.info()
print(lines_df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 570 entries, 0 to 569
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Season      570 non-null    int64  
 1   G#          570 non-null    int64  
 2   Team        570 non-null    object 
 3   Opp         570 non-null    object 
 4   Spread      570 non-null    float64
 5   Over/Under  570 non-null    float64
 6   Result      570 non-null    object 
 7   vs. Line    570 non-null    object 
 8   OU Result   570 non-null    object 
dtypes: float64(2), int64(2), object(5)
memory usage: 40.2+ KB
None
(570, 9)


## Clean Vegas Lines Data


In [127]:
# Build the cleaned lines frame from the raw frame (the raw frame is left
# untouched, so this cell can be re-run safely).

# Drop every column from position 6 onward
lines_clean_df = lines_df.drop(lines_df.columns[6:], axis=1)

# Rename columns
lines_clean_df = lines_clean_df.rename(
    columns={
        "G#": "G",
        "Over/Under": "Total",
    }
)

# Remove playoff games: keep a row only when its game number 'G' does not
# exceed the number of regular season games each team played that season.
#   - 2021 onward: 17 games
#   - 2020 and earlier: 16 games, except the strike-shortened 1982 (9 games)
#     and 1987 (15 games) seasons
short_seasons = {1982: 9, 1987: 15}
regular_season_games = lines_clean_df["Season"].map(
    lambda yr: short_seasons.get(yr, 16 if yr <= 2020 else 17)
)
is_regular_season = lines_clean_df["G"] <= regular_season_games

# Keep regular season rows, then remove the no-longer-needed 'G' column
lines_clean_df = lines_clean_df.loc[is_regular_season].drop(columns=["G"])

# Create 'Home' column: an 'Opp' value starting with "@" is an away game (0),
# anything else is 1. na=False keeps a missing 'Opp' from raising.
is_away = lines_clean_df["Opp"].str.startswith("@", na=False)
lines_clean_df = lines_clean_df.assign(Home=(~is_away).astype("int64"))

# Map team abbreviations in vegas lines dataframe to team abbreviations in gamelog dataframe
abbr_dict = {
    "OAK": "RAI",
    "LVR": "RAI",
    "STL": "RAM",
    "LAR": "RAM",
    "LAC": "SDG",
    "IND": "CLT",
    "HOU": "HTX",
    "BAL": "RAV",
    "ARI": "CRD",
    "TEN": "OTI",
}

# Remove '@' from 'Opp' (regex=False: treat "@" as a literal string on every
# pandas version), then translate the abbreviations listed in abbr_dict;
# values not in abbr_dict are left as they are.
lines_clean_df = lines_clean_df.assign(
    Opp=lines_clean_df["Opp"].str.replace("@", "", regex=False).replace(abbr_dict)
)

with pd.option_context("display.max_colwidth", None, "display.max_columns", None):
    display(lines_clean_df)

Unnamed: 0,Season,Team,Opp,Spread,Total,Home
0,2021,CRD,OTI,3.0,53.5,0
1,2021,CRD,MIN,-3.5,50.5,1
2,2021,CRD,JAX,-7.5,51.5,0
3,2021,CRD,RAM,4.0,49.5,0
4,2021,CRD,SFO,-5.0,48.5,1
...,...,...,...,...,...,...
565,2021,WAS,DAL,4.5,48.0,1
566,2021,WAS,PHI,6.5,42.0,0
567,2021,WAS,DAL,10.0,46.5,0
568,2021,WAS,PHI,4.5,44.5,1


## Merge Vegas Lines and Gamelog Data


In [128]:
# Check Both Dataframes
print(gamelog_clean_df.shape)
print(lines_clean_df.shape)

(544, 9)
(544, 6)
    Season Team  Opp  Spread  Total  Home
35    2021  RAV  RAI    -4.0   50.5     0
    Season  Week Team          Date  Win  Home  Opp  Team_points  Opp_points
34    2021     1  RAV  September 13    0     0  RAI           27          33


In [130]:
# Merge Dataframes using 'Season', 'Team', 'Opp', and 'Home' columns
merged_df = pd.merge(
    gamelog_clean_df, lines_clean_df, on=["Season", "Team", "Opp", "Home"]
)

# Check Merged Dataframe
print(merged_df.shape)
with pd.option_context("display.max_colwidth", None, "display.max_columns", None):
    display(merged_df)

(544, 11)


Unnamed: 0,Season,Week,Team,Date,Win,Home,Opp,Team_points,Opp_points,Spread,Total
0,2021,1,CRD,September 12,1,0,OTI,38,13,3.0,53.5
1,2021,2,CRD,September 19,1,1,MIN,34,33,-3.5,50.5
2,2021,3,CRD,September 26,1,0,JAX,31,19,-7.5,51.5
3,2021,4,CRD,October 3,1,0,RAM,37,20,4.0,49.5
4,2021,5,CRD,October 10,1,1,SFO,17,10,-5.0,48.5
...,...,...,...,...,...,...,...,...,...,...,...
539,2021,14,WAS,December 12,0,1,DAL,20,27,4.5,48.0
540,2021,15,WAS,December 21,0,0,PHI,17,27,6.5,42.0
541,2021,16,WAS,December 26,0,0,DAL,14,56,10.0,46.5
542,2021,17,WAS,January 2,0,1,PHI,16,20,4.5,44.5


    Season  Week Team          Date  Win  Home  Opp  Team_points  Opp_points  \
34    2021     1  RAV  September 13    0     0  RAI           27          33   

    Spread  Total  
34    -4.0   50.5  
