In [None]:
!python3 -m pip install bs4 requests

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


### Scraping NFL Weather Data

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import time

In [None]:
# Show every column when a DataFrame is rendered (None removes the column limit).
pd.set_option("display.max_columns", None)

In [None]:
# Scrape one row per game from nflweather.com for every week of the listed seasons.
# Each row is: season, week, date, away team, home team, away points, home points,
# the two weather fields (temperature, conditions) and the wind speed.
rowData = []
skipped_games = []  # (season, week, reason) for each game box that could not be parsed

for season in [2021, 2022, 2023, 2024]:
    for i in range(1, 19):
        week_url = f"https://www.nflweather.com/week/{season}/week-{i}"

        # A timeout keeps one stalled connection from hanging the whole scrape.
        try:
            page = requests.get(week_url, timeout=30)
        except requests.RequestException as exc:
            print(f"Request failed for {week_url}: {exc!r}")
            time.sleep(1)
            continue

        # An error page contains no game boxes; report it instead of silently adding nothing.
        if not page.ok:
            print(f"Skipping {week_url}: HTTP {page.status_code}")
            time.sleep(1)
            continue

        # scrape webpage
        soup = BeautifulSoup(page.content, 'html.parser')

        divs = soup.find_all('div', class_='game-box w-100 d-flex flex-column flex-lg-row align-items-center shadow-box rounded my-2 py-1')

        for div in divs:
            # A missing element (AttributeError on None), a non-numeric score (ValueError)
            # or an unexpected wind string (IndexError) skips only this game, and the reason
            # is recorded, rather than aborting the run or being swallowed by a bare except.
            try:
                date = div.find('div', class_='fw-bold text-wrap').getText().strip()
                awayTeam = div.find('div', class_='flex-centered flex-column me-1 ms-xxl-auto').getText().strip()
                homeTeam = div.find('div', class_='flex-centered flex-column me-xxl-auto').getText().strip()
                awayPoints = int(div.find('div', class_='game-points ps-1 pe-2').getText().strip())
                homePoints = int(div.find('div', class_='game-points pe-1 ps-2').getText().strip())

                weather = [w.find('span').getText() for w in div.find_all('div', class_='mx-2')]
                # Exactly two values are expected (the 'Temperature' and 'Weather' columns);
                # any other count would shift the wind speed into the wrong column.
                if len(weather) != 2:
                    raise ValueError(f"expected 2 weather fields, found {len(weather)}")

                wind = div.find('div', class_='text-break col-md-2 mb-1 px-1 flex-centered')
                # The second whitespace-separated token of the wind text is kept as the speed.
                wind_speed = wind.getText().strip().split()[1]
            except (AttributeError, ValueError, IndexError) as exc:
                skipped_games.append((season, i, repr(exc)))
                continue

            rowData.append([season, i, date, awayTeam, homeTeam, awayPoints, homePoints, *weather, wind_speed])

        # Pause between page requests.
        time.sleep(1)

if skipped_games:
    print(f"Skipped {len(skipped_games)} game(s) that could not be parsed; see skipped_games")

df = pd.DataFrame(rowData, columns = ['Season', 'Week_Number', 'Date', 'Away_Team', 'Home_Team', 'Away_Points', 'Home_Points', 'Temperature', 'Weather', 'Wind Speed'])

df

KeyboardInterrupt: 

In [None]:
# Write the weather table to disk; the positional index is left out of the file.
weather_csv_path = 'weather_data.csv'
df.to_csv(weather_csv_path, index=False)

### Scraping NFL Boxscore Data

In [None]:
# scrape every single boxscore link from the 2024 NFL season
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.pro-football-reference.com/years/2024/games.htm"
# Fail fast on a stalled connection or an HTTP error page instead of
# hitting an AttributeError further down when the table is missing.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

table = soup.find("table", {"id": "games"})
if table is None or table.tbody is None:
  raise ValueError(f"No games table found at {url}")
rows = table.tbody.find_all("tr")

links = []
for tr in rows:
  # `string=` is the current name of the argument; `text=` is deprecated in bs4.
  link_tag = tr.find("a", string="boxscore")
  # .get() tolerates an anchor without an href attribute.
  if link_tag and link_tag.get('href', '').startswith("/boxscores/"):
    links.append(f"https://www.pro-football-reference.com{link_tag['href']}")

print(f"Found {len(links)} boxscores")

Found 285 boxscores


  link_tag = tr.find("a", text="boxscore")


In [None]:
# TODO: scrape each game from pro-football reference
# https://www.pro-football-reference.com/boxscores/202502090phi.htm#all_team_stats

# Reminder: scrape homeTeam, awayTeam, homeScore, awayScore
# then scrape all of the variables in "Team Stats"

import requests
from bs4 import BeautifulSoup
from bs4 import Comment
import pandas as pd
import time
import warnings
# Silence only FutureWarning (the category pandas uses for its deprecation notices)
# rather than every warning, so other warning categories remain visible.
warnings.filterwarnings('ignore', category=FutureWarning)


# Rows of the "Team Stats" table whose cell packs several dash-separated numbers:
# row label -> output names of the components, in the order they appear in the cell.
_COMPOUND_STATS = {
  'Rush-Yds-TDs': ('Rushes', 'Rush Yards', 'Rush TDs'),
  'Cmp-Att-Yd-TD-INT': ('Completions', 'Pass Attempts', 'Pass Yards', 'Pass TDs', 'Interceptions'),
  'Sacked-Yards': ('Sacks', 'Sack Yards'),
  'Fumbles-Lost': ('Fumbles', 'Fumbles Lost'),
  'Penalties-Yards': ('Penalties', 'Penalty Yards'),
  'Third Down Conv.': ('3rd Down Conversions', '3rd Down Attempts'),
  'Fourth Down Conv.': ('4th Down Conversions', '4th Down Attempts'),
}


def _split_compound_stat(value, expected_parts):
  """Split a dash-separated stat such as '45-135-1' into a list of ints.

  A component may itself be negative: '11--3-0' parses as [11, -3, 0]. A plain
  split('-') yields an empty token in front of a negative number, so an empty
  token is read as the sign of the token that follows it.

  Raises:
    ValueError: if the value is malformed or does not contain exactly
      `expected_parts` numbers.
  """
  numbers = []
  negate_next = False
  for token in str(value).split('-'):
    if token == '':
      if negate_next:
        raise ValueError(f"Malformed stat value: {value!r}")
      negate_next = True
      continue
    number = int(token)
    numbers.append(-number if negate_next else number)
    negate_next = False
  if negate_next or len(numbers) != expected_parts:
    raise ValueError(f"Expected {expected_parts} dash-separated numbers, got {value!r}")
  return numbers


def parse_data_from_table(table):
  """Flatten a three-column team-stats table into one labelled value per stat.

  Args:
    table: DataFrame with exactly three columns: the stat label, then one column
      per team. The first team column is treated as the away team and the second
      as the home team (assumes the source table lists the visitor first; TODO
      confirm against the site). Columns are read by position, so the caller's
      DataFrame is not modified.

  Returns:
    A single-column DataFrame (column 0) indexed by "Away Team <stat>" /
    "Home Team <stat>". Compound rows listed in _COMPOUND_STATS are split into
    integer components; every other row is passed through unchanged.

  Raises:
    ValueError: if the table does not have three columns or a compound cell
      cannot be parsed.
  """
  if table.shape[1] != 3:
    raise ValueError(f"Expected 3 columns (category, away, home), got {table.shape[1]}")

  stats_by_team = {}
  for category, away_value, home_value in table.itertuples(index=False, name=None):
    component_names = _COMPOUND_STATS.get(category)
    # Away entries are written before home entries for each row.
    for side, value in (('Away', away_value), ('Home', home_value)):
      if component_names is None:
        stats_by_team[f"{side} Team {category}"] = value
        continue
      numbers = _split_compound_stat(value, len(component_names))
      for component, number in zip(component_names, numbers):
        stats_by_team[f"{side} Team {component}"] = number

  stats_by_team_df = pd.DataFrame.from_dict(stats_by_team, orient='index')
  return stats_by_team_df


def scrape_team_stats_table(url):
  """Download a boxscore page and return its "Team Stats" table as a DataFrame.

  The table is looked up, in order: inside an HTML comment within the
  #all_team_stats wrapper, directly inside that wrapper, then anywhere on the
  page as <table id="team_stats">.

  Args:
    url: address of the boxscore page.

  Returns:
    The first DataFrame pandas parses from the located table.

  Raises:
    requests.HTTPError: if the server answers with an error status.
    ValueError: if no Team Stats table can be found on the page.
  """
  # Local import keeps this function self-contained; pandas deprecates passing
  # a literal HTML string to read_html and asks for a file-like object instead.
  from io import StringIO

  resp = requests.get(url, timeout=30)
  resp.raise_for_status()
  soup = BeautifulSoup(resp.text, "html.parser")

  # Find the "Team Stats" table by its surrounding comment/wrapper or table id
  table = None
  div = soup.find(id="all_team_stats")
  if div:
    # Pro-Football-Reference sometimes wraps tables in <!-- --> comments
    commented_html = div.find(string=lambda text: isinstance(text, Comment))
    if commented_html is not None:
      table = BeautifulSoup(commented_html, "html.parser").find("table")
    if table is None:
      # No comment node (or no table inside it): the table may be plain markup.
      table = div.find("table")
  if table is None:
    table = soup.find("table", {"id": "team_stats"})
  if table is None:
    raise ValueError(f"No Team Stats table found at {url}")

  df = pd.read_html(StringIO(str(table)))[0]
  return df

# Scrape and parse every boxscore into a one-row frame, then stack them once at the
# end (concatenating inside the loop re-copies all earlier rows on every iteration).
game_frames = []
for link in links:
  # Named team_stats_raw rather than df so the weather DataFrame `df` is not overwritten.
  team_stats_raw = scrape_team_stats_table(link)
  stats_df = parse_data_from_table(team_stats_raw).transpose()
  # Label the row with this game's id taken from `link` (e.g. ".../202502090phi.htm" ->
  # "202502090phi"); `url` is the season schedule page and would label every row "games".
  stats_df.index = [link.split('/')[-1].split('.')[0]]
  game_frames.append(stats_df)
  # Pause between requests (presumably to stay under the site's rate limit; confirm).
  time.sleep(3.1)

# pd.concat rejects an empty list, so fall back to an empty frame when nothing was scraped.
full_data = pd.concat(game_frames, axis=0) if game_frames else pd.DataFrame()

# NOTE: [command/control] + D to multi-select

# off_first_downs
# off_rushes
# off_rushing_yds
# off_rushing_tds
# off_completions
# off_pass_attempts
# off_pass_yds
# off_pass_tds
# off_ints
# off_sacks
# off_sack_yds
# off_net_pass_yds
# off_total_yds
# off_fumbles
# off_fumbles_lost
# off_turnovers
# off_penalties
# off_penalty_yds
# off_third_down_conversions
# off_fourth_down_conversions
# off_possession_time
# def_first_downs
# def_rushes
# def_rushing_yds
# def_rushing_tds
# def_completions
# def_pass_attempts
# def_pass_yds
# def_pass_tds
# def_ints
# def_sacks
# def_sack_yds
# def_net_pass_yds
# def_total_yds
# def_fumbles
# def_fumbles_lost
# def_turnovers
# def_penalties
# def_penalty_yds
# def_third_down_conversions
# def_fourth_down_conversions
# def_possession_time

In [None]:
# Write the boxscore table to disk, keeping the row labels as the first CSV column.
boxscore_csv_path = 'boxscore_data.csv'
full_data.to_csv(boxscore_csv_path, index=True)