In [1]:
# 3rd Party Libraries
from bs4 import BeautifulSoup
from colored import Fore, Back, Style
from dotenv import load_dotenv
import pandas as pd
import requests

# Built in libraries
from time import gmtime, strftime
from typing import List, Set, Dict, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

In [2]:
BASE_URL = 'https://www.pro-football-reference.com'

# Complete URL for the 2023 year page
standings_stats_url = f"{BASE_URL}/years/2023/"

response = requests.get(standings_stats_url)

# Return status code if request is successful (200)
def request_status(url: str) -> bool:
    if response.status_code == 200:
        return response.status_code
    else:
        print(f'{Fore.white}{Back.red}False{Style.reset}')
        return response.status_code

In [3]:
# Function to create a BeautifulSoup object if the request is successful
def make_soup(url: str) -> BeautifulSoup:
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
    return None

soup = make_soup(standings_stats_url)

In [4]:
# Function to extract team URLs from a specific division
def get_team_urls(division_id):
    # Select all anchor tags within the division's section
    teams = soup.select(f"#{division_id} a")
    # Construct full URLs for each team and return as a list
    return [f"{BASE_URL}{l.get('href')}" for l in teams]
    
afc_teams_urls = get_team_urls('AFC')
nfc_teams_urls = get_team_urls('NFC')

In [20]:
nfc_teams_urls

['https://www.pro-football-reference.com/teams/dal/2023.htm',
 'https://www.pro-football-reference.com/teams/phi/2023.htm',
 'https://www.pro-football-reference.com/teams/nyg/2023.htm',
 'https://www.pro-football-reference.com/teams/was/2023.htm',
 'https://www.pro-football-reference.com/teams/det/2023.htm',
 'https://www.pro-football-reference.com/teams/gnb/2023.htm',
 'https://www.pro-football-reference.com/teams/min/2023.htm',
 'https://www.pro-football-reference.com/teams/chi/2023.htm',
 'https://www.pro-football-reference.com/teams/tam/2023.htm',
 'https://www.pro-football-reference.com/teams/nor/2023.htm',
 'https://www.pro-football-reference.com/teams/atl/2023.htm',
 'https://www.pro-football-reference.com/teams/car/2023.htm',
 'https://www.pro-football-reference.com/teams/sfo/2023.htm',
 'https://www.pro-football-reference.com/teams/ram/2023.htm',
 'https://www.pro-football-reference.com/teams/sea/2023.htm',
 'https://www.pro-football-reference.com/teams/crd/2023.htm']

In [6]:
def gather_team_stats(teams: List[str]) -> None:
    for t in teams:
        team_init = 
        soup = make_soup(t)
        if soup:
            table = soup.find('table', id='team_stats')
            if table:
                # Convert the table HTML to a DataFrame
                stats_df = pd.read_html(str(table))[0]  # [0] because pd.read_html returns a list of DataFrames
                return stats_df
            else:
                print(f"Table not found in {t}")
        else:
            print(f"Failed to get data for {t}")

afc_team_stats_df = gather_team_stats(afc_teams_urls[:1])

In [7]:
def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = [' '.join(col).strip() for col in df.columns.values]
    return df

afc_team_stats_df = flatten_columns(afc_team_stats_df)

In [15]:
afc_team_stats_df = afc_team_stats_df.rename(
    {
     'Unnamed: 0_level_0 Player': 'player',
     'Unnamed: 1_level_0 PF': 'total_points',
     'Unnamed: 2_level_0 Yds': 'total_yards',
     'Tot Yds & TO Ply': 'total_off_plays',
     'Tot Yds & TO Y/P': 'total_yds_per_play',
     'Tot Yds & TO TO': 'total_to_lost',
     'Unnamed: 6_level_0 FL': 'total_fum_lost',
     'Unnamed: 7_level_0 1stD': 'total_first_downs',
     'Passing Cmp': 'pass_cmp',
     'Passing Att': 'pass_att',
     'Passing Yds': 'pass_yds', 
     'Passing TD': 'pass_td',
     'Passing Int': 'pass_int', 
     'Passing NY/A': 'pass_net_yds_per_att', 
     'Passing 1stD': 'pass_first_downs', 
     'Rushing Att': 'rush_att',
     'Rushing Yds': 'rush_yds', 
     'Rushing TD': 'rush_td', 
     'Rushing Y/A': 'rush_yds_per_att', 
     'Rushing 1stD': 'rush_first_downs',
     'Penalties Pen': 'penalties', 
     'Penalties Yds': 'penalty_yds', 
     'Penalties 1stPy': 'penalty_first_downs',
     'Unnamed: 23_level_0 #Dr': 'num_drives', 
     'Unnamed: 24_level_0 Sc%': 'scoring_pct',
     'Unnamed: 25_level_0 TO%': 'turnover_pct', 
     'Average Drive Start': 'avg_drive_start', 
     'Average Drive Time': 'avg_drive_time',
     'Average Drive Plays': 'avg_drive_plays', 
     'Average Drive Yds': 'avg_drive_yds', 
     'Average Drive Pts': 'avg_drive_pts'
     }, axis=1)

In [19]:
afc_team_stats_df.fillna(value=0)

Unnamed: 0,player,total_points,total_yards,total_off_plays,total_yds_per_play,total_to_lost,total_fum_lost,total_first_downs,pass_cmp,pass_att,...,penalty_yds,penalty_first_downs,num_drives,scoring_pct,turnover_pct,avg_drive_start,avg_drive_time,avg_drive_plays,avg_drive_yds,avg_drive_pts
0,Team Stats,451,6366,1115.0,5.7,28,10,381,385.0,579,...,883.0,24.0,181.0,41.4,14.9,Own 29.0,3:01,6.32,35.1,2.37
1,Opp. Stats,311,5222,1015.0,5.1,30,12,313,363.0,552,...,753.0,34.0,181.0,32.0,16.0,Own 27.1,2:40,5.8,28.8,1.67
2,Lg Rank Offense,6,4,0.0,0.0,23,16,3,0.0,16,...,0.0,0.0,0.0,6.0,5.0,13,2,3.0,5.0,6.0
3,Lg Rank Defense,4,9,0.0,0.0,3,5,11,0.0,8,...,0.0,0.0,0.0,25.0,1.0,4,10,10.0,10.0,7.0
