## Import Libraries

In [1]:
#pip install -v pandas

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

### Set parameters

In [3]:
# Root URL of the NFL website; format_links reads this global and passes it to get_links.
base_url = "https://www.nfl.com"
# Season treated as the current one; get_stats reads this global when building the output path.
current_season  = "2023"

## Player Stats

#### Define function to get links

This function retrieves the links for player or team statistics based on the specified level (either "player" or "team") and season. It scrapes the NFL website for the relevant links.

In [4]:
# Define a function to get statistics links
def get_links(level, season, base_url="https://www.nfl.com"):
    """
    Retrieves links for player or team statistics based on the specified level and season.

    The landing page for the level is fetched first to discover its top-level
    tabs; each tab page is then fetched to collect the category links it lists.

    Args:
        level (str): The statistics level, either "player" or "team".
        season (str or int): The NFL season for which statistics are retrieved.
        base_url (str, optional): The base URL for the NFL website. Defaults to "https://www.nfl.com".

    Returns:
        list: A list of links to statistics pages, with the four-digit season
            segment of each link replaced by `season`.

    Raises:
        ValueError: If `level` is neither "player" nor "team".
    """
    import re  # Local import so the function does not rely on a new file-level import

    landing_paths = {
        "player": "/stats/player-stats/",
        "team": "/stats/team-stats/",
    }
    # Fail early and clearly instead of hitting an unbound variable further down
    if level not in landing_paths:
        raise ValueError(f'Invalid level "{level}": expected "player" or "team".')

    # Honour base_url for every request (avoid a doubled slash when joining paths)
    root = base_url.rstrip('/')
    landing_url = root + landing_paths[level]

    # Request and parse the landing page for the requested level
    html = requests.get(landing_url)
    soup = BeautifulSoup(html.content, 'html.parser')

    if level == "player":
        # Player tabs are list items with class 'd3-o-tabs__list-item'
        li_elements = soup.find_all('li', class_='d3-o-tabs__list-item')
    else:
        # Team tabs live inside the 'd3-o-tabbed-controls-selector__list' unordered list
        ul_element = soup.find('ul', class_='d3-o-tabbed-controls-selector__list')
        li_elements = ul_element.find_all('li') if ul_element is not None else []

    # Extract href values from the anchor tags, skipping items without a usable link
    href_values = []
    for li in li_elements:
        a_tag = li.find('a')
        href = a_tag.get('href') if a_tag is not None else None
        if href:
            href_values.append(href)

    if not href_values:
        print(f"Warning: no statistics tabs found at {landing_url}.")

    # Visit every tab page and collect the category links it exposes
    all_links = []
    for href in href_values:
        html = requests.get(root + href)
        soup = BeautifulSoup(html.content, "html.parser")
        for element in soup.find_all('li', class_='d3-o-tabs__list-item'):
            a_tag = element.find('a')
            link_href = a_tag.get('href') if a_tag is not None else None
            if link_href:
                all_links.append(root + link_href)

    # Swap the four-digit season path segment for the requested season. Matching
    # any year (rather than the literal "2023") keeps this working whichever
    # season the site currently serves by default.
    season = str(season)
    year_segment = re.compile(r'/\d{4}(?=/|$)')
    return [
        year_segment.sub(lambda _match: '/' + season, link, count=1)
        for link in all_links
    ]


This function collects sub-pages for different statistics categories based on the provided unit_links. It navigates through pagination to gather all relevant pages.

In [5]:
# Define a function to get sub-pages for different categories
def get_sub_pages(unit_links):
    """
    Follows the "next page" links of every category and records each page URL.

    Args:
        unit_links (dict): Mapping of unit name -> {category name: first page URL}.

    Returns:
        dict: Mapping of unit name -> {category name: {page index: page URL}},
            where index 0 is the URL supplied in `unit_links`.
    """
    site_root = "https://www.nfl.com"  # Pagination hrefs are appended to this root
    collected = {}

    for unit_name, categories in unit_links.items():
        unit_pages = {}
        collected[unit_name] = unit_pages

        for stat_name, first_url in categories.items():
            # Page 0 is always the link we were given
            pages = {0: first_url}
            unit_pages[stat_name] = pages

            url = first_url
            index = 0

            # Keep following the pagination until it runs out or a request fails
            while True:
                reply = requests.get(url)

                if reply.status_code != 200:
                    print(f"Error: Unable to fetch data from {url} for {unit_name} - {stat_name}.")
                    break

                parsed = BeautifulSoup(reply.content, "html.parser")
                next_anchor = parsed.find('a', class_='nfl-o-table-pagination__next')

                if not next_anchor:
                    print(f"No more pages to scrape for {unit_name} - {stat_name}.")
                    break

                # Record the next page and continue from there
                url = site_root + next_anchor['href']
                index += 1
                pages[index] = url

    # Print a summary of every page that was discovered
    for unit_name, categories in collected.items():
        for stat_name, pages in categories.items():
            print(f"{unit_name} - {stat_name} Sub-Pages:")
            for page_index, page_url in pages.items():
                print(f"Page {page_index}: {page_url}")

    return collected

#### Define Function to format links

The format_links function organizes and formats the links for player or team statistics into a dictionary structure, making it easier to access specific categories and sub-pages.

In [6]:
# Define a function to format and organize links
def format_links(level, season):
    """
    Formats and organizes statistics links into a dictionary structure for easy access.

    The links returned by get_links are grouped by unit and category (both taken
    from the URL path) and then expanded by get_sub_pages into one entry per page.

    Args:
        level (str): The statistics level, either "player" or "team".
        season (str): The NFL season for which statistics are retrieved.

    Returns:
        dict: Mapping of unit -> {category: {page index: page URL}}.

    Raises:
        ValueError: If `level` is neither "player" nor "team".
    """
    # Reject unknown levels up front; previously this surfaced as an unbound variable
    if level not in ("player", "team"):
        raise ValueError(f'Invalid level "{level}": expected "player" or "team".')

    all_links = get_links(level, season, base_url)

    # unit -> {category: url}
    grouped_links = {}

    for link in all_links:
        # Split the link by "/"; index 5 and 6 hold the unit/category path segments
        parts = link.split('/')
        if len(parts) < 7:
            print(f"Warning: skipping link with unexpected structure: {link}")
            continue

        # Player links all belong to a single "individual" unit; team links
        # carry the unit (e.g., offense, defense, special-teams) in the path
        unit = "individual" if level == "player" else parts[5]
        category = parts[6]  # e.g., passing, rushing etc.

        grouped_links.setdefault(unit, {})[category] = link

    # Expand every category link into its paginated sub-pages
    return get_sub_pages(grouped_links)


### Define Function to Check/Create Directory

This function checks if a directory exists at the specified path and creates it if it doesn't. It's used to ensure data storage directories are in place.

In [7]:
# Define a function to create a directory if it doesn't exist
def create_directory_if_not_exists(directory_path):
    """
    Ensures that `directory_path` exists, creating it (and parents) when missing.

    The outcome is reported on stdout; a creation failure is printed rather
    than raised.

    Args:
        directory_path (str): The path of the directory to be checked/created.
    """
    # Nothing to do when the path is already present
    if os.path.exists(directory_path):
        print(f'Directory "{directory_path}" already exists.')
        return

    try:
        os.makedirs(directory_path)
        print(f'Directory "{directory_path}" has been created.')
    except OSError as err:
        print(f'Error: Failed to create directory "{directory_path}".')
        print(err)


### Define Function to Scrape and Process Data

The scrape_and_process_data function is responsible for scraping data for a specific category, processing it, and storing it in the appropriate directory. It iterates through sub-pages to gather data.

In [8]:
# Define a function to scrape and process data
def scrape_and_process_data(unit, category, level, unit_directory_path, unit_links):
    """
    Scrapes data for a specific category, processes it, and stores it in the appropriate directory.

    Every sub-page of the category is fetched, its table cells are turned into a
    DataFrame, and the per-page frames are concatenated and written to
    "<unit_directory_path>/<category>.csv". Pages that cannot be fetched, or that
    contain data cells without any header cells, are reported and skipped.

    Args:
        unit (str): The unit (e.g., offense, defense) for which data is scraped.
        category (str): The specific statistics category (e.g., passing, rushing).
        level (str): The statistics level, either "player" or "team".
        unit_directory_path (str): The path of the directory where data is stored.
        unit_links (dict): Mapping of unit -> {category: {page index: page URL}}.

    Returns:
        None
    """
    # Create a list to store DataFrames for the current category
    category_dfs = []

    # Loop through the sub-pages for the current category
    for page_url in unit_links[unit][category].values():
        # Request raw HTML for the current page
        response = requests.get(page_url)

        if response.status_code != 200:
            print(f"Error: Unable to fetch data from {page_url} for {category}.")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # The class name depends on the level, e.g. 'd3-o-player-stats--detailed'
        # or 'd3-o-team-stats--detailed'
        stats = soup.find_all(attrs={"class": f'd3-o-{level}-stats--detailed'})

        # Collect header texts (<th>) and data texts (<td>) from every match
        stat_col = []
        stat_val = []
        for table in stats:
            stat_col.extend(cell.get_text(strip=True) for cell in table.find_all('th'))
            stat_val.extend(cell.get_text(strip=True) for cell in table.find_all('td'))

        # Without headers the values cannot be labelled, and building the
        # DataFrame would fail on a column-count mismatch, so skip the page
        if stat_val and not stat_col:
            print(f"Error: No column headers found on {page_url} for {category}.")
            continue

        # Split the flat list of values into rows of one value per column
        num_columns = len(stat_col) if stat_col else 1  # Use 1 if stat_col is empty
        rows = [stat_val[i:i + num_columns] for i in range(0, len(stat_val), num_columns)]

        # Create a DataFrame for the current category and page
        category_dfs.append(pd.DataFrame(rows, columns=stat_col))

    if not category_dfs:
        print(f"No data found for category '{category}' in unit '{unit}'")
        return

    # Concatenate dataframes for the current category into one
    merged_df = pd.concat(category_dfs, ignore_index=True)

    if 'Team' in merged_df.columns:
        # The scraped "Team" text holds the name twice back to back; keep the
        # first half. Non-string cells (e.g. padding from a short row) are left as is.
        merged_df['Team'] = merged_df['Team'].apply(
            lambda x: x[:len(x) // 2] if isinstance(x, str) else x
        )

    # Create the directory if it doesn't exist
    create_directory_if_not_exists(unit_directory_path)

    # Specify the file path within the unit's directory
    csv_file_path = os.path.join(unit_directory_path, category + '.csv')

    # Export the DataFrame to a CSV file
    merged_df.to_csv(csv_file_path, index=False)  # Set index=False to exclude the index column

    print(f'DataFrame for category "{category}" in unit "{unit}" has been exported to {csv_file_path}')


### Custom Season Error

In [None]:
class ValidSeasonError(Exception):
    """Error describing a season that lies outside the supported range.

    Attributes are documented inline; the message is produced by __str__.
    """

    def __init__(self, season):
        # Keep the offending season so it can be shown in the message
        self.season = season

    def __str__(self):
        return f'The {self.season!s} Season is not within the database ranging from 1970 to current season. '

### Define Function to Initiate Scraping Process

This function serves as the entry point for collecting statistics data. It validates the requested season, builds the data storage directory path (which depends on the current season and week), and initiates the scraping process.

In [10]:
def get_stats(level, season):
    """
    Initiates the data scraping process for player or team statistics.

    Data is written below "data/<season>/<level>". For the current season a
    "week<current_week>" sub-directory is appended when the global
    `current_week` is defined. Team data gets one further sub-directory per unit.

    Args:
        level (str): The statistics level, either "player" or "team".
        season (str or int): The NFL season for which statistics are retrieved.

    Raises:
        ValidSeasonError: If the season is earlier than 1970.
    """
    # Normalise the season once: an int for comparisons, a str for paths/URLs.
    # (Previously a str season was turned into an int and then broke os.path.join.)
    try:
        season_year = int(season)
    except (TypeError, ValueError):
        print('The season cannot be converted to integer.')
        return
    if isinstance(season, str):
        print("Converted season to integer.")

    if season_year < 1970:
        raise ValidSeasonError(season_year)

    # Validate the level before any directory or network work is attempted
    if level not in ("player", "team"):
        print("Invalid level specified.")
        return

    season = str(season_year)
    directory_path = os.path.join('data', season, level)

    # Compare as integers: the global current_season is stored as a string.
    # current_week may not be defined; fall back to the season-level directory then.
    week = globals().get('current_week')
    if season_year == int(current_season) and week is not None:
        directory_path = os.path.join(directory_path, f'week{week}')

    unit_links = format_links(level, season)

    for unit, categories in unit_links.items():
        # Team data is split into one sub-directory per unit; player data is
        # stored directly in the level directory
        if level == "team":
            unit_directory_path = os.path.join(directory_path, unit)
        else:
            unit_directory_path = directory_path

        for category in categories:
            scrape_and_process_data(unit, category, level, unit_directory_path, unit_links)