In [13]:
!pip install python-dotenv
!pip install html5lib



In [14]:
# 3rd Party Libraries
from bs4 import BeautifulSoup
from colored import Fore, Back, Style
from dotenv import load_dotenv
import pandas as pd
import requests

# Built in libraries
from time import gmtime, strftime
from typing import List, Set, Dict, Tuple, Optional

In [15]:
BASE_URL = 'https://www.pro-football-reference.com'

# Complete URL for the 2023 year page
standings_stats_url = f"{BASE_URL}/years/2023/"

response = requests.get(standings_stats_url)

# Return status code if request is successful (200)
def request_status(url: str) -> bool:
    if response.status_code == 200:
        return response.status_code
    else:
        print(f'{Fore.white}{Back.red}False{Style.reset}')
        return response.status_code

In [16]:
# Function to create a BeautifulSoup object if the request is successful
def make_soup(url: str) -> BeautifulSoup:
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
    return None

soup = make_soup(standings_stats_url)

In [17]:
# Function to extract team URLs from a specific division
def get_team_urls(division_id):
    # Select all anchor tags within the division's section
    teams = soup.select(f"#{division_id} a")
    # Construct full URLs for each team and return as a list
    return [f"{BASE_URL}{l.get('href')}" for l in teams]
    
afc_teams_urls = get_team_urls('AFC')
nfc_teams_urls = get_team_urls('NFC')

In [18]:
afc_teams_urls

['https://www.pro-football-reference.com/teams/buf/2023.htm',
 'https://www.pro-football-reference.com/teams/mia/2023.htm',
 'https://www.pro-football-reference.com/teams/nyj/2023.htm',
 'https://www.pro-football-reference.com/teams/nwe/2023.htm',
 'https://www.pro-football-reference.com/teams/rav/2023.htm',
 'https://www.pro-football-reference.com/teams/cle/2023.htm',
 'https://www.pro-football-reference.com/teams/pit/2023.htm',
 'https://www.pro-football-reference.com/teams/cin/2023.htm',
 'https://www.pro-football-reference.com/teams/htx/2023.htm',
 'https://www.pro-football-reference.com/teams/jax/2023.htm',
 'https://www.pro-football-reference.com/teams/clt/2023.htm',
 'https://www.pro-football-reference.com/teams/oti/2023.htm',
 'https://www.pro-football-reference.com/teams/kan/2023.htm',
 'https://www.pro-football-reference.com/teams/rai/2023.htm',
 'https://www.pro-football-reference.com/teams/den/2023.htm',
 'https://www.pro-football-reference.com/teams/sdg/2023.htm']

In [50]:
def gather_team_stats(teams: List[str]) -> None:
    for t in teams:
        soup = make_soup(t)
        if soup:
            table = soup.find('table', id='team_stats')
            if table:
                # Convert the table HTML to a DataFrame
                stats_df = pd.read_html(str(table))[0]  # [0] because pd.read_html returns a list of DataFrames
                return stats_df
            else:
                print(f"Table not found in {t}")
        else:
            print(f"Failed to get data for {t}")

afc_team_stats_df = gather_team_stats(afc_teams_urls[:1])

  stats_df = pd.read_html(str(table))[0]  # [0] because pd.read_html returns a list of DataFrames


In [51]:
afc_team_stats_df

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Tot Yds & TO,Tot Yds & TO,Tot Yds & TO,Unnamed: 6_level_0,Unnamed: 7_level_0,Passing,Passing,...,Penalties,Penalties,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Average Drive,Average Drive,Average Drive,Average Drive,Average Drive
Unnamed: 0_level_1,Player,PF,Yds,Ply,Y/P,TO,FL,1stD,Cmp,Att,...,Yds,1stPy,#Dr,Sc%,TO%,Start,Time,Plays,Yds,Pts
0,Team Stats,451,6366,1115.0,5.7,28,10,381,385.0,579,...,883.0,24.0,181.0,41.4,14.9,Own 29.0,3:01,6.32,35.1,2.37
1,Opp. Stats,311,5222,1015.0,5.1,30,12,313,363.0,552,...,753.0,34.0,181.0,32.0,16.0,Own 27.1,2:40,5.8,28.8,1.67
2,Lg Rank Offense,6,4,,,23,16,3,,16,...,,,,6.0,10.0,13,3,6.0,6.0,6.0
3,Lg Rank Defense,4,9,,,3,5,11,,8,...,,,,26.0,2.0,6,12,11.0,11.0,7.0


In [52]:
def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [' '.join(col).strip() for col in df.columns.values]
    return df

afc_team_stats_df = flatten_columns(afc_team_stats_df)

In [54]:
afc_team_stats_df.columns

Index(['Unnamed: 0_level_0 Player', 'Unnamed: 1_level_0 PF',
       'Unnamed: 2_level_0 Yds', 'Tot Yds & TO Ply', 'Tot Yds & TO Y/P',
       'Tot Yds & TO TO', 'Unnamed: 6_level_0 FL', 'Unnamed: 7_level_0 1stD',
       'Passing Cmp', 'Passing Att', 'Passing Yds', 'Passing TD',
       'Passing Int', 'Passing NY/A', 'Passing 1stD', 'Rushing Att',
       'Rushing Yds', 'Rushing TD', 'Rushing Y/A', 'Rushing 1stD',
       'Penalties Pen', 'Penalties Yds', 'Penalties 1stPy',
       'Unnamed: 23_level_0 #Dr', 'Unnamed: 24_level_0 Sc%',
       'Unnamed: 25_level_0 TO%', 'Average Drive Start', 'Average Drive Time',
       'Average Drive Plays', 'Average Drive Yds', 'Average Drive Pts'],
      dtype='object')

In [55]:
afc_team_stats_df['p_n_a'] = afc_team_stats_df['Passing NY/A']

In [56]:
afc_team_stats_df.columns

Index(['Unnamed: 0_level_0 Player', 'Unnamed: 1_level_0 PF',
       'Unnamed: 2_level_0 Yds', 'Tot Yds & TO Ply', 'Tot Yds & TO Y/P',
       'Tot Yds & TO TO', 'Unnamed: 6_level_0 FL', 'Unnamed: 7_level_0 1stD',
       'Passing Cmp', 'Passing Att', 'Passing Yds', 'Passing TD',
       'Passing Int', 'Passing NY/A', 'Passing 1stD', 'Rushing Att',
       'Rushing Yds', 'Rushing TD', 'Rushing Y/A', 'Rushing 1stD',
       'Penalties Pen', 'Penalties Yds', 'Penalties 1stPy',
       'Unnamed: 23_level_0 #Dr', 'Unnamed: 24_level_0 Sc%',
       'Unnamed: 25_level_0 TO%', 'Average Drive Start', 'Average Drive Time',
       'Average Drive Plays', 'Average Drive Yds', 'Average Drive Pts',
       'p_n_a'],
      dtype='object')