## Import Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import time

### Set parameters

In [2]:
base_url = "https://www.nfl.com"
current_week = 1

## Player Stats

#### Define function to get links

In [3]:
def get_links(level, base_url="https://www.nfl.com"):
    if level == "player":
        # URL for player-level stats
        url = "https://www.nfl.com/stats/player-stats"
    elif level == "team":
        unit_list = ["offense", "defense", "special-teams"]
        category_links = {}
        stat_cols = {
            "offense": ["passing", "rushing", "receiving", "scoring", "downs"],
            "defense": ["passing", "rushing", "receiving", "scoring", "tackles", "downs", "fumbles", "interceptions"],
            "special-teams": ["field-goals", "scoring", "kickoffs", "kickoff-returns", "punting", "punt-returns"]
        }

        base_team_stats_url = "https://www.nfl.com/stats/team-stats/"

        for unit in unit_list:
            unit_data = {}
            for stat_col in stat_cols[unit]:
                # Construct the URL for each category in stat_cols
                url = f"{base_team_stats_url}{unit}/2023/{stat_col}"
                unit_data[stat_col.capitalize()] = url

            # Add the unit's data to the main dictionary
            category_links[unit.capitalize()] = unit_data
        return category_links

    # For all other cases (invalid level), fetch general category links
    html = requests.get(url)
    soup = BeautifulSoup(html.content, "html.parser")
    a_elements = soup.find_all('li', class_='d3-o-tabs__list-item')
    tabs = [element.find('a').get_text() for element in a_elements]
    links = [base_url + element.find('a')['href'] for element in a_elements]
    print(links)

    return dict(zip(tabs, links))


#### Define Function to format links

In [5]:
def format_links_old(level):
    # Create a dictionary to store the links for each unit and its categories
    unit_links = {}
    lst = ["offense", "defense", "special-teams"]

    # Process the URL and build the unit_links dictionary
    if level == "player":
        # Define the URL for the player level
        url = "https://www.nfl.com"
        unit_links = get_links(level, url)  # Pass the URL as an argument

    if level == "team":
        for unit in lst:
            # Initialize an empty dictionary for the current unit
            unit_data = {}
            
            # Define the URL for the current unit
            url = f"https://www.nfl.com/stats/team-stats/{unit.lower()}/2023"
            category_links = get_links(level, url)  # Pass the URL as an argument

            # Assuming category_links is your nested dictionary
            for unit, categories in category_links.items():
                for category, link in categories.items():
                    # Remove '/2023' from the link first
                    link = link.replace('/2023', '')
                    # Append '/2023/reg/all' to the modified link
                    category_links[unit][category] = f"{link}/2023/reg/all"


            print('category links are: ', category_links) 
            # Add the category links to the unit_data dictionary
            #unit_data.update(category_links)
            
            # Add the unit_data dictionary to the unit_links dictionary
            unit_links = category_links
            #unit_links[unit.capitalize()] = unit_data

    return unit_links


In [6]:
# Updated format_links function
def format_links(level):
    # Create a dictionary to store the links for each unit and its categories
    unit_links = {}
    base_url = "https://www.nfl.com"  # Replace with your base URL

    if level == "player":
        # Define the URL for the player level
        url = "https://www.nfl.com"
        unit_links = get_links(level, url)  # Pass the URL as an argument

    if level == "team":
        # First get the base links for offense, defense, and special-teams tab
        # Request raw HTML
        html = requests.get("https://www.nfl.com/stats/team-stats/")

        # Assuming your HTML is stored in a variable named html_content
        soup = BeautifulSoup(html.content, 'html.parser')

        # Find the ul element with class 'd3-o-tabbed-controls-selector__list'
        ul_element = soup.find('ul', class_='d3-o-tabbed-controls-selector__list')

        # Find all li elements within the ul element
        li_elements = ul_element.find_all('li')

        # Initialize a list to store the href values
        href_values = []

        # Iterate through the li elements and extract the href values from the a tags
        for li in li_elements:
            a_tag = li.find('a')
            if a_tag:
                href = a_tag['href']
                href_values.append(href)

        # Initialize a list to store the links
        all_links = []

        # Loop through href_values and fetch links for each URL
        for href in href_values:
            url = base_url + href
            html = requests.get(url)
            soup = BeautifulSoup(html.content, "html.parser")
            a_elements = soup.find_all('li', class_='d3-o-tabs__list-item')
            links = [base_url + element.find('a')['href'] for element in a_elements]
            all_links.extend(links)  # Append the links to the all_links list

        # Create a dictionary to store the links
        team_stats_dict = {}

        for link in all_links:
            # Split the link by "/"
            parts = link.split('/')
            # Get the keys and values
            unit = parts[5] # e.g., offense, defense, special-teams
            category = parts[6] # e.g., passing, rushing etc.
            year = parts[7] # 2023
            leg = parts[8] # reg of playoffs
            url = link
            # Add to the dictionary
            if unit not in team_stats_dict:
                team_stats_dict[unit] = {}
            team_stats_dict[unit][category] = url

        # Update unit_links with the fetched links
        unit_links = team_stats_dict

    return unit_links


In [7]:
### Scrape and export to DataFrame

### Define Function to Check/Create Directory

In [8]:
# Function to create directories if they don't exist
def create_directory_if_not_exists(directory_path):
    if not os.path.exists(directory_path):
        try:
            os.makedirs(directory_path)
            print(f'Directory "{directory_path}" has been created.')
        except OSError as e:
            print(f'Error: Failed to create directory "{directory_path}".')
            print(e)
    else:
        print(f'Directory "{directory_path}" already exists.')


### Define Function to Scrape and Process Data

In [9]:
# Function to scrape and process data
def scrape_and_process_data(link, unit, category, level, unit_directory_path):
    # Request raw HTML for the current page
    response = requests.get(link)

    # Check if the request was successful
    if response.status_code == 200:
        # Create a BeautifulSoup object to parse the HTML
        soup = BeautifulSoup(response.content, "html.parser")

        # Find all elements with the class 'd3-o-player-stats--detailed'
        stats = soup.find_all(attrs={"class": f'd3-o-{level}-stats--detailed'})

        # Initialize lists to collect data
        stat_val = []
        stat_col = []

        # Loop through each <tr> element to extract and collect the text from <td> elements
        for row in stats:
            # This gets the stat names
            header_cells = row.find_all('th')

            if len(header_cells) > 0:
                for cell in header_cells:
                    stat_col.append(cell.get_text(strip=True))

            # This gets the stats
            data_cells = row.find_all('td')

            if len(data_cells) > 0:
                for cell in data_cells:
                    stat_val.append(cell.get_text(strip=True))

        # Determine the number of columns in each row
        num_columns = len(stat_col) if stat_col else 1  # Use 1 if stat_col is empty
        
        # Split the list into rows
        rows = [stat_val[i:i + num_columns] for i in range(0, len(stat_val), num_columns)]

        # Create a DataFrame for the current category
        df = pd.DataFrame(rows, columns=stat_col)
        # Convert all columns except "Team" to numeric
        #numeric_columns = df.columns.difference(['Team'])
        #df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
        
        # Check if the DataFrame has a "Team" column before attempting to remove the duplicated part
        if 'Team' in df.columns:
            # Remove duplicated part from the "Team" column
            df['Team'] = df['Team'].apply(lambda x: x[:len(x)//2])

        # Create the directory if it doesn't exist
        create_directory_if_not_exists(unit_directory_path)
    
        
        # Specify the file path within the unit's directory
        csv_file_path = os.path.join(unit_directory_path, category + '.csv')

        # Export the DataFrame to a CSV file
        df.to_csv(csv_file_path, index=False)  # Set index=False to exclude the index column

        print(f'DataFrame for category "{category}" in unit "{unit}" has been exported to {csv_file_path}')
    else:
        print(f"Error: Unable to fetch the page for category '{category}' in unit '{unit}'.")

#### Initiate Scraping Process

In [10]:
def get_stats(level):
    lst = ["offense", "defense", "special-teams"]

    # Combine the base directory path with the current week
    directory_path = os.path.join('data', level, 'week' + str(current_week))
    unit_links = format_links(level)

    if level == "team":
        for unit, categories in unit_links.items(): ## change to be same with player
            for category, link in categories.items():

                # Create a subdirectory for the current unit
                unit_directory_path = os.path.join(directory_path, unit)
                create_directory_if_not_exists(unit_directory_path)

                # Call scrape_and_process_data with individual category URLs
                scrape_and_process_data(link, unit, category, level, unit_directory_path)
                
    elif level == "player":
        # Directly use the week1 directory for player-level data
        unit_directory_path = directory_path

        # Call scrape_and_process_data with the unit_links dictionary

        for category, link in unit_links.items():
            scrape_and_process_data(link, level, category, level, unit_directory_path)
    else:
        print("Invalid level specified.")


In [12]:
# Example usage:
level = "player"  # Replace with "team" or "player" as needed
get_stats(level)

['https://www.nfl.com/stats/player-stats/category/passing/2023/reg/all/passingyards/desc', 'https://www.nfl.com/stats/player-stats/category/rushing/2023/reg/all/rushingyards/desc', 'https://www.nfl.com/stats/player-stats/category/receiving/2023/reg/all/receivingreceptions/desc', 'https://www.nfl.com/stats/player-stats/category/fumbles/2023/reg/all/defensiveforcedfumble/desc', 'https://www.nfl.com/stats/player-stats/category/tackles/2023/reg/all/defensivecombinetackles/desc', 'https://www.nfl.com/stats/player-stats/category/interceptions/2023/reg/all/defensiveinterceptions/desc', 'https://www.nfl.com/stats/player-stats/category/field-goals/2023/reg/all/kickingfgmade/desc', 'https://www.nfl.com/stats/player-stats/category/kickoffs/2023/reg/all/kickofftotal/desc', 'https://www.nfl.com/stats/player-stats/category/kickoff-returns/2023/reg/all/kickreturnsaverageyards/desc', 'https://www.nfl.com/stats/player-stats/category/punts/2023/reg/all/puntingaverageyards/desc', 'https://www.nfl.com/sta

In [13]:
level = "team"  # Replace with "player" or "team" as needed
get_stats(level)

Directory "data\team\week1\offense" has been created.
Directory "data\team\week1\offense" already exists.
DataFrame for category "passing" in unit "offense" has been exported to data\team\week1\offense\passing.csv
Directory "data\team\week1\offense" already exists.
Directory "data\team\week1\offense" already exists.
DataFrame for category "rushing" in unit "offense" has been exported to data\team\week1\offense\rushing.csv
Directory "data\team\week1\offense" already exists.
Directory "data\team\week1\offense" already exists.
DataFrame for category "receiving" in unit "offense" has been exported to data\team\week1\offense\receiving.csv
Directory "data\team\week1\offense" already exists.
Directory "data\team\week1\offense" already exists.
DataFrame for category "scoring" in unit "offense" has been exported to data\team\week1\offense\scoring.csv
Directory "data\team\week1\offense" already exists.
Directory "data\team\week1\offense" already exists.
DataFrame for category "downs" in unit "of