<a href="https://colab.research.google.com/github/MLDS-UT-Austin/FantasyFootball/blob/main/NFL_Stats_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import os
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Root of the site being scraped and the stat categories whose yearly pages
# are requested.
base_url = "https://www.pro-football-reference.com"
stat_categories = ["passing", "scrimmage", "kicking"]

# Output folders for the CSV exports. exist_ok=True keeps the cell safe to
# re-run and avoids the check-then-create race of a separate exists() test.
for output_dir in ("./yearly_stats/", "./player_gamelogs/"):
    os.makedirs(output_dir, exist_ok=True)

# Seasons to scrape: 2015 through 2021 inclusive (range excludes its stop value).
years = list(range(2015, 2022))

In [3]:
def add_urls(soup, urls, limit=50):
    """Collect non-team link targets from a parsed HTML fragment into ``urls``.

    Args:
        soup: Object exposing ``find_all('a')`` (e.g. a BeautifulSoup tag)
            whose anchors are scanned.
        urls: Set that is mutated in place; each qualifying ``href`` is added.
        limit: Only the first ``limit * 2`` anchors are examined. The factor
            of two presumably budgets one player link plus one team link per
            table row -- TODO confirm against the page layout.

    Returns:
        None. Results are delivered through the ``urls`` argument.
    """
    for link in soup.find_all('a')[:limit * 2]:
        # Anchors without a usable href carry no target; skip them rather
        # than raising KeyError or storing an empty string.
        href = link.get('href')
        if not href:
            continue
        # Any href containing "team" is excluded.
        if 'team' not in href:
            urls.add(href)

In [18]:
# Relative URLs of every player link seen on the yearly pages. A set is used
# so a player appearing in several seasons/categories is stored once.
player_urls = set()

for year in years:
    for stat in stat_categories:

        # Download the yearly page for this stat category.
        url = f"{base_url}/years/{year}/{stat}.htm"
        webpage = requests.get(url)
        # Fail loudly on an HTTP error instead of parsing an error page.
        webpage.raise_for_status()
        page_soup = BeautifulSoup(webpage.content, 'html.parser')

        # The table's ID is slightly different when the category is
        # "scrimmage": that page uses "receiving_and_rushing", while the
        # other pages use the category name itself.
        is_scrimmage = stat == "scrimmage"
        table_id = "receiving_and_rushing" if is_scrimmage else stat

        table_soup = page_soup.find('table', {'id': table_id})
        if table_soup is None:
            raise ValueError(f"Table '{table_id}' not found at {url}")

        # The scrimmage table is scanned with a larger anchor budget (100)
        # than the default used for the other categories.
        if is_scrimmage:
            add_urls(table_soup, player_urls, 100)
        else:
            add_urls(table_soup, player_urls)

        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas releases.
        stat_df = pd.read_html(StringIO(str(table_soup)))[0]

        # Clean dataset: for every category except passing, flatten the
        # two-level header. Columns whose top label contains "level"
        # (pandas' placeholder for an unnamed header cell) keep only the
        # lower label; the others become "<top>_<bottom>".
        if stat != "passing":
            stat_df.columns = [
                col[1] if "level" in col[0] else '_'.join(col).strip('_')
                for col in stat_df.columns.to_flat_index()
            ]

        # Removes duplicate headers repeated throughout the table body
        stat_df = stat_df[stat_df.Rk != "Rk"].set_index("Rk")

        # Converts award symbols appended to the player name into boolean
        # columns ('*' -> ProBowl, '+' -> AllPro), then strips the symbols.
        # na=False treats a missing name as "no award" instead of raising.
        stat_df['ProBowl'] = stat_df.Player.str.contains('*', regex=False, na=False)
        stat_df['AllPro'] = stat_df.Player.str.contains('+', regex=False, na=False)
        stat_df.Player = stat_df.Player.str.strip('*+')

        # Export dataset; index=False means the Rk index is not written.
        stat_df.to_csv(f"./yearly_stats/{year}_{stat}.csv", index=False)

In [20]:
# Number of distinct hrefs held in player_urls (a set, so repeats are already collapsed).
len(player_urls)

536

In [21]:
# Auto-generated name of a header-less gamelog column; its non-null cells are
# treated as away games below.
# NOTE(review): the name is position-dependent -- verify if the table layout changes.
away_marker_col = "Unnamed: 7_level_1"

for url in player_urls:
    # Drop the last four characters of the href (presumably the ".htm"
    # extension -- TODO confirm) and append the gamelog path.
    full_url = f"{base_url}{url[:-4]}/gamelog/"
    webpage = requests.get(full_url)
    # Fail loudly on an HTTP error instead of parsing an error page.
    webpage.raise_for_status()
    page_soup = BeautifulSoup(webpage.content, 'html.parser')

    table_soup = page_soup.find('table', {'id': 'stats'})
    name_tag = page_soup.find('h1', {'itemprop': 'name'})
    if table_soup is None or name_tag is None:
        raise ValueError(f"Gamelog table or player name not found at {full_url}")
    player_name = name_tag.text.strip()

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    stat_df = pd.read_html(StringIO(str(table_soup)))[0]

    # Flatten the two-level header. Columns whose top label contains "level"
    # (pandas' placeholder for an unnamed header cell) keep only the lower
    # label; the others become "<top>_<bottom>".
    stat_df.columns = [
        col[1] if "level" in col[0] else '_'.join(col).strip('_')
        for col in stat_df.columns.to_flat_index()
    ]

    # Removes duplicate headers repeated throughout the table body
    stat_df = stat_df[stat_df.Rk != "Rk"].set_index("Rk")

    # Convert the away marker and the GS column to booleans: True wherever
    # the original cell is non-null.
    # NOTE(review): the original cell had a no-op `stat_df.drop([])` here,
    # possibly meant to drop the raw marker column. The column is kept so the
    # exported schema is unchanged.
    stat_df['Away_status'] = stat_df[away_marker_col].notna()
    stat_df['GS'] = stat_df['GS'].notna()
    stat_df['Player'] = player_name

    # NOTE(review): the file name depends only on the display name, so two
    # players sharing a name overwrite each other's CSV.
    # index=False means the Rk index is not written.
    stat_df.to_csv(f"./player_gamelogs/{player_name.replace(' ', '_')}.csv", index=False)