## Import Libraries

In [None]:
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import time

### Set parameters

In [None]:
# Root URL of the NFL site; format_links passes it to get_links, which
# prefixes it to every relative href it scrapes.
base_url = "https://www.nfl.com"
# Week number that get_stats appends to 'week' to name the output folder
# used when the requested season equals current_season.
current_week = 1


## Player Stats

#### Define function to get links

In [None]:
def _first_href(element):
    """Return the href of the first <a> inside *element*, or None if absent."""
    anchor = element.find('a')
    if anchor is None:
        return None
    return anchor.get('href')


def _apply_season(link, season):
    """Return *link* with every four-digit path segment replaced by *season*."""
    return '/'.join(
        season if (len(part) == 4 and part.isdigit()) else part
        for part in link.split('/')
    )


def get_links(level, season, base_url="https://www.nfl.com"):
    """Collect the stat-category page URLs for one level and season.

    The landing page ``<base_url>/stats/<level>-stats/`` is fetched first to
    discover the tab hrefs. Each tab page is then fetched and the links of
    all its ``li.d3-o-tabs__list-item`` entries are collected.

    Args:
        level: "player" or "team".
        season: Season year to put into the returned links (e.g. "1994").
        base_url: Site root that is prefixed to every relative href.

    Returns:
        A list of absolute URLs whose year path segment is set to *season*.

    Raises:
        ValueError: If *level* is neither "player" nor "team".
    """
    # Fail fast with a clear message instead of hitting an unbound local later.
    if level not in ("player", "team"):
        raise ValueError(
            f"Invalid level {level!r}; expected 'player' or 'team'."
        )
    season = str(season)

    # Request and parse the landing page for the requested level
    landing = requests.get(f"{base_url}/stats/{level}-stats/")
    soup = BeautifulSoup(landing.content, 'html.parser')

    if level == "player":
        # Player pages list their categories directly as tab items
        li_elements = soup.find_all('li', class_='d3-o-tabs__list-item')
    else:
        # Team pages first expose a selector list; its entries are the tabs
        # whose pages hold the category links.
        ul_element = soup.find('ul', class_='d3-o-tabbed-controls-selector__list')
        # A missing selector list yields no tabs rather than an AttributeError
        li_elements = ul_element.find_all('li') if ul_element is not None else []

    # Extract the href of every tab that actually carries a link
    href_values = []
    for li in li_elements:
        href = _first_href(li)
        if href:
            href_values.append(href)

    # Visit every tab page and gather its category links
    all_links = []
    for href in href_values:
        page = requests.get(base_url + href)
        tab_soup = BeautifulSoup(page.content, "html.parser")
        for item in tab_soup.find_all('li', class_='d3-o-tabs__list-item'):
            category_href = _first_href(item)
            if category_href:
                # Swap whatever year the site links to for the requested season
                # (a literal '2023' replacement breaks once the site moves on).
                all_links.append(_apply_season(base_url + category_href, season))

    return all_links


#### Define Function to Format Links

In [122]:
# Updated format_links function
def format_links(level, season):
    """Group the scraped category links by unit and category.

    Links are obtained from ``get_links(level, season, base_url)`` using the
    module-level ``base_url``. Each link is split on "/"; segment 6 is used as
    the category. For the "team" level segment 5 is used as the unit (e.g.
    offense, defense, special-teams); for the "player" level every link is
    filed under the single unit "individual".

    Args:
        level: "player" or "team".
        season: Season year forwarded to ``get_links``.

    Returns:
        A dict of the form ``{unit: {category: url}}``. An unrecognised
        *level* yields an empty dict and no links are fetched.
    """
    # Unknown levels produce no links (previously an unbound-local crash)
    if level not in ("player", "team"):
        return {}

    unit_links = {}

    for link in get_links(level, season, base_url):
        parts = link.split('/')
        # Links without unit/category segments cannot be classified
        if len(parts) < 7:
            continue

        unit = "individual" if level == "player" else parts[5]
        category = parts[6]  # e.g., passing, rushing etc.
        unit_links.setdefault(unit, {})[category] = link

    return unit_links


### Define Function to Check/Create Directory

In [None]:
# Function to create directories if they don't exist
def create_directory_if_not_exists(directory_path):
    """Ensure *directory_path* exists and print what happened.

    When the path is already present a notice is printed and nothing else
    is done. Otherwise the directory (including missing parents) is created;
    an OSError raised during creation is reported on stdout, not propagated.

    Args:
        directory_path: Path of the directory to check or create.
    """
    already_there = os.path.exists(directory_path)
    if already_there:
        print(f'Directory "{directory_path}" already exists.')
        return

    try:
        os.makedirs(directory_path)
        print(f'Directory "{directory_path}" has been created.')
    except OSError as error:
        print(f'Error: Failed to create directory "{directory_path}".')
        print(error)


### Define Function to Scrape and Process Data

In [None]:
# Function to scrape and process data
def _strip_duplicated_half(text):
    """Return the first half of *text* when it is one string written twice."""
    if not isinstance(text, str):
        return text
    half, remainder = divmod(len(text), 2)
    if remainder == 0 and text[:half] == text[half:]:
        return text[:half]
    return text


def scrape_and_process_data(link, unit, category, level, unit_directory_path):
    """Scrape one stats page and export its table to ``<category>.csv``.

    The page at *link* is fetched. Every element carrying the class
    ``d3-o-<level>-stats--detailed`` is searched for ``<th>`` cells (column
    names) and ``<td>`` cells (values). The flat list of values is cut into
    rows as wide as the number of column names, loaded into a DataFrame and
    written without the index to *unit_directory_path*.

    Args:
        link: URL of the stats page to scrape.
        unit: Unit name; only used in the printed messages.
        category: Category name; used as the CSV file name and in messages.
        level: Inserted into the CSS class that identifies the stats table.
        unit_directory_path: Directory that receives the CSV file; it is
            created if it does not exist.

    Returns:
        None. Outcomes are reported with ``print``.
    """
    # Request raw HTML for the current page
    response = requests.get(link)

    if response.status_code != 200:
        print(f"Error: Unable to fetch the page for category '{category}' in unit '{unit}'.")
        return

    soup = BeautifulSoup(response.content, "html.parser")

    # Find all elements with the class 'd3-o-<level>-stats--detailed'
    tables = soup.find_all(attrs={"class": f'd3-o-{level}-stats--detailed'})

    stat_col = []  # column names, taken from <th> cells
    stat_val = []  # flat list of values, taken from <td> cells
    for table in tables:
        stat_col.extend(cell.get_text(strip=True) for cell in table.find_all('th'))
        stat_val.extend(cell.get_text(strip=True) for cell in table.find_all('td'))

    # Values without any header cannot be shaped into a table; building the
    # DataFrame would raise, so report the problem and skip this page.
    if stat_val and not stat_col:
        print(f"Error: No column headers found for category '{category}' in unit '{unit}'.")
        return

    # Row width equals the number of headers (1 keeps range() valid when empty)
    num_columns = max(len(stat_col), 1)
    rows = [stat_val[i:i + num_columns] for i in range(0, len(stat_val), num_columns)]

    df = pd.DataFrame(rows, columns=stat_col)

    if 'Team' in df.columns:
        # The scraped team cell repeats the name twice; keep one copy, but only
        # when the value really is duplicated so other names are not truncated.
        df['Team'] = df['Team'].apply(_strip_duplicated_half)

    # Create the directory if it doesn't exist
    create_directory_if_not_exists(unit_directory_path)

    csv_file_path = os.path.join(unit_directory_path, category + '.csv')

    # index=False keeps the DataFrame index out of the file
    df.to_csv(csv_file_path, index=False)

    print(f'DataFrame for category "{category}" in unit "{unit}" has been exported to {csv_file_path}')

### Define Function to Initiate Scraping Process

In [136]:
def get_stats(level, season):
    """Scrape every stat category of one level and season into CSV files.

    Output goes below ``data/<season>/<level>/``. When *season* equals the
    module-level ``current_season`` the next folder is
    ``week<current_week>``; for any other season it is ``reg``. Team-level
    files are additionally placed in one sub-folder per unit, while
    player-level files are written directly into that folder.

    Reads the module-level names ``current_season`` and ``current_week``,
    which must be defined before a valid level is processed.

    Args:
        level: "player" or "team"; any other value prints a message and
            returns without scraping.
        season: Season year as a string (it is used as a path component).
    """
    # Reject unknown levels before any links are fetched
    if level not in ("player", "team"):
        print("Invalid level specified.")
        return

    # Current season is stored per week, every other season under 'reg'
    if season == current_season:
        leg_folder = 'week' + str(current_week)
    else:
        leg_folder = 'reg'
    directory_path = os.path.join('data', season, level, leg_folder)

    unit_links = format_links(level, season)

    for unit, categories in unit_links.items():
        # Only team-level data gets a sub-folder per unit
        if level == "team":
            unit_directory_path = os.path.join(directory_path, unit)
        else:
            unit_directory_path = directory_path

        for category, link in categories.items():
            scrape_and_process_data(link, unit, category, level, unit_directory_path)


### Get Historic Data

In [None]:
# Season that get_stats treats as the current one: a matching season is
# written to a 'week<current_week>' folder, any other season to 'reg'.
# NOTE(review): "1993" looks like a placeholder/test value - confirm.
current_season = "1993"

In [None]:
# hist - Example usage:
levels = ["player", "team"]
# Season labels "1970" through "2022", as strings
seasons = list(map(str, range(1970, 2023)))

# Scrape both levels for each historic season in turn
for season in seasons:
    for level in levels:
        get_stats(level=level, season=season)

### Get Current Season Data

In [137]:
# Example usage:
levels = ["player", "team"]

# Scrape player and team stats for the one season given below
for level in levels:
    get_stats(level=level, season="1994")

['https://www.nfl.com/stats/player-stats/category/passing/1994/reg/all/passingyards/desc', 'https://www.nfl.com/stats/player-stats/category/rushing/1994/reg/all/rushingyards/desc', 'https://www.nfl.com/stats/player-stats/category/receiving/1994/reg/all/receivingreceptions/desc', 'https://www.nfl.com/stats/player-stats/category/fumbles/1994/reg/all/defensiveforcedfumble/desc', 'https://www.nfl.com/stats/player-stats/category/tackles/1994/reg/all/defensivecombinetackles/desc', 'https://www.nfl.com/stats/player-stats/category/interceptions/1994/reg/all/defensiveinterceptions/desc', 'https://www.nfl.com/stats/player-stats/category/field-goals/1994/reg/all/kickingfgmade/desc', 'https://www.nfl.com/stats/player-stats/category/kickoffs/1994/reg/all/kickofftotal/desc', 'https://www.nfl.com/stats/player-stats/category/kickoff-returns/1994/reg/all/kickreturnsaverageyards/desc', 'https://www.nfl.com/stats/player-stats/category/punts/1994/reg/all/puntingaverageyards/desc', 'https://www.nfl.com/sta

In [135]:
# Scrape only the player-level stats for the 2023 season.
get_stats(level="player", season = "2023")

['https://www.nfl.com/stats/player-stats/category/passing/1994/reg/all/passingyards/desc', 'https://www.nfl.com/stats/player-stats/category/rushing/1994/reg/all/rushingyards/desc', 'https://www.nfl.com/stats/player-stats/category/receiving/1994/reg/all/receivingreceptions/desc', 'https://www.nfl.com/stats/player-stats/category/fumbles/1994/reg/all/defensiveforcedfumble/desc', 'https://www.nfl.com/stats/player-stats/category/tackles/1994/reg/all/defensivecombinetackles/desc', 'https://www.nfl.com/stats/player-stats/category/interceptions/1994/reg/all/defensiveinterceptions/desc', 'https://www.nfl.com/stats/player-stats/category/field-goals/1994/reg/all/kickingfgmade/desc', 'https://www.nfl.com/stats/player-stats/category/kickoffs/1994/reg/all/kickofftotal/desc', 'https://www.nfl.com/stats/player-stats/category/kickoff-returns/1994/reg/all/kickreturnsaverageyards/desc', 'https://www.nfl.com/stats/player-stats/category/punts/1994/reg/all/puntingaverageyards/desc', 'https://www.nfl.com/sta

TypeError: scrape_and_process_data() missing 1 required positional argument: 'unit_directory_path'