<a href="https://colab.research.google.com/github/MLDS-UT-Austin/FantasyFootball/blob/main/NFL_Stats_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import io
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Root of the site being scraped and the yearly stat pages to pull from it
base_url = "https://www.pro-football-reference.com"
stat_categories = ["passing", "scrimmage", "kicking"]

# Output folders for the exported CSVs; exist_ok keeps this cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists test.
for output_dir in ("./yearly_stats/", "./player_gamelogs/"):
    os.makedirs(output_dir, exist_ok=True)

# Seasons to scrape: 2015 through 2021 inclusive (range's upper bound is exclusive)
years = list(range(2015, 2022))

In [3]:
def process_result(result):
    """Split a row's 'Result' string into a win flag followed by the score tokens.

    The text in ``result['Result']`` is split on spaces and hyphens. The first
    token is replaced by ``True`` when it equals "W" (``False`` otherwise); the
    remaining tokens are returned unchanged, as strings.

    Args:
        result: Mapping-like row exposing a 'Result' string.

    Returns:
        list: ``[won, *remaining_tokens]``.
    """
    outcome, *score_tokens = re.split(' |-', result['Result'])
    return [outcome == "W", *score_tokens]

In [4]:
def add_urls(soup, urls, limit=50):
    """Add the hrefs of non-team links found in ``soup`` to ``urls`` in place.

    Only the first ``limit * 2`` anchors are inspected. Anchors that carry no
    ``href`` attribute are skipped instead of raising ``KeyError``.

    Args:
        soup: Parsed element whose ``find_all('a')`` yields the anchors to scan.
        urls: Set that receives the matching href strings (mutated in place).
        limit: Half the number of anchors to scan. Presumably the intended
            player count, assuming each row holds one player link and one
            team link -- TODO confirm against the page layout.

    Returns:
        None.
    """
    for link in soup.find_all('a')[:limit * 2]:
        href = link.get('href')
        # Keep everything except missing hrefs and hrefs that mention 'team'
        if href is not None and 'team' not in href:
            urls.add(href)

In [None]:
# Initialize set for storing player URLs
player_urls = set()

for year in years:
    for stat in stat_categories:

        # Get webpage contents and begin parsing with bs4
        url = f"{base_url}/years/{year}/{stat}.htm"
        # Bound the wait so one stalled request cannot hang the whole run
        webpage = requests.get(url, timeout=30)
        # Fail loudly on an error response rather than parsing an error page
        webpage.raise_for_status()
        page_soup = BeautifulSoup(webpage.content, 'html.parser')

        # The scrimmage page's table id differs from the stat name in the URL;
        # the other categories use the stat name itself as the table id.
        is_scrimmage = stat == "scrimmage"
        table_id = "receiving_and_rushing" if is_scrimmage else stat

        table_soup = page_soup.find('table', {'id': table_id})
        if table_soup is None:
            raise ValueError(f"No table with id '{table_id}' found at {url}")

        # The scrimmage table is scanned for twice as many links as the others
        if is_scrimmage:
            add_urls(table_soup, player_urls, 100)
        else:
            add_urls(table_soup, player_urls)

        # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated
        stat_df = pd.read_html(io.StringIO(str(table_soup)))[0]

        # Clean dataset: flatten multi-level column headers (skipped for passing)
        if stat != "passing":
            stat_df.columns = ['_'.join(col).strip('_') if "level" not in col[0] else col[1] for col in stat_df.columns.to_flat_index()]

        # Removes duplicate headers throughout table
        stat_df = stat_df[stat_df.Rk != "Rk"].set_index("Rk")

        # Converts emblemized award recognition to boolean column features:
        # '*' in the player name sets ProBowl, '+' sets AllPro.
        stat_df['ProBowl'] = stat_df.Player.str.contains('*', regex=False, na=False)
        stat_df['AllPro'] = stat_df.Player.str.contains('+', regex=False, na=False)
        stat_df.Player = stat_df.Player.str.strip('*+')

        # Export dataset (index=False, so the Rk index is not written)
        stat_df.to_csv(f"./yearly_stats/{year}_{stat}.csv", index=False)

In [None]:
# Number of distinct URLs collected by add_urls across every scraped year and
# stat category (player_urls is a set, so repeats are counted once).
len(player_urls)

In [None]:
# Reload the URLs recorded by a previous run for comparison. On a fresh run
# ./player_urls.csv does not exist yet (it is written at the end of the
# notebook), so start from an empty set instead of raising FileNotFoundError.
if os.path.exists("./player_urls.csv"):
    completed_logs = set(pd.read_csv("./player_urls.csv").to_numpy().flatten())
else:
    completed_logs = set()

In [None]:
# Set difference: URLs collected in this run that are not in the completed set
len(player_urls - completed_logs)

In [None]:
# We now grab all of the career logs for those we don't already have.
new_logs = player_urls.difference(completed_logs)
if not new_logs:
    print("Completed all logs already")
else:
    # Reuse the difference computed above instead of recomputing it
    for url in new_logs:

        # Drop the last four characters of the href (assumed to be ".htm")
        # before appending the gamelog path
        full_url = f"{base_url}{url[:-4]}/gamelog/"
        # Bound the wait so one stalled request cannot hang the whole run
        webpage = requests.get(full_url, timeout=30)
        # Fail loudly on an error response rather than parsing an error page
        webpage.raise_for_status()
        page_soup = BeautifulSoup(webpage.content, 'html.parser')
        table_soup = page_soup.find('table', {'id': 'stats'})
        name_tag = page_soup.find('h1', {'itemprop': 'name'})
        if table_soup is None or name_tag is None:
            raise ValueError(f"Unexpected page layout at {full_url}: missing gamelog table or player name")
        player_name = name_tag.text.strip()

        # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated
        stat_df = pd.read_html(io.StringIO(str(table_soup)))[0]
        # Flatten the multi-level column headers into single strings
        stat_df.columns = ['_'.join(col).strip('_') if "level" not in col[0] else col[1] for col in stat_df.columns.to_flat_index()]

        # Removes duplicate headers throughout table
        stat_df = stat_df[stat_df.Rk != "Rk"].set_index("Rk")

        # Convert Away to bool: any non-null value in the unnamed column counts as away
        stat_df['Away_status'] = stat_df["Unnamed: 7_level_1"].notna().to_numpy()

        # Data Cleaning: split Result into a win flag and the two scores
        stat_df[['Win', 'Score', 'Opp Score']] = stat_df.apply(process_result, result_type='expand', axis=1)
        stat_df = stat_df.drop(["Unnamed: 7_level_1", "Result"], axis=1)
        # GS becomes True wherever the original cell held any value
        stat_df['GS'] = stat_df['GS'].notna().to_numpy()
        stat_df['Player'] = player_name

        # Export to a new CSV
        # NOTE(review): the file name is derived from the display name only, so
        # two players sharing a name overwrite each other's file -- confirm
        # whether downstream readers can accept a player id in the name.
        stat_df.to_csv(f"./player_gamelogs/{player_name.replace(' ', '_')}.csv", index=False)

In [None]:
# Persist every URL seen so far so the next run can skip them. Sorting gives the
# file a stable row order; iterating the set directly can order rows
# differently from one run to the next.
all_urls = sorted(player_urls.union(completed_logs))
pd.DataFrame(all_urls, columns=['URLs']).to_csv("./player_urls.csv", index=False)