In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# pip install lxml

## Import Libraries

In [3]:
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

## Player Stats

#### Define function to get links

In [4]:
# Function to process HTML content and return category links
def get_links(level):
    # Request raw HTML
    html = requests.get(url)

    # Create a BeautifulSoup object called `soup` to traverse the combined HTML
    soup = BeautifulSoup(html.content, "html.parser")

    # Find all <a> elements within <li> elements
    a_elements = soup.find_all('li', class_='d3-o-tabs__list-item')

    # Extract the text from the <a> elements
    tabs = [element.find('a').get_text() for element in a_elements]

    # Extract the links (href attributes) from the <a> elements
    links = [base_url + element.find('a')['href'] for element in a_elements]

    # Create a dictionary to store the category names and their corresponding links
    category_links = dict(zip(tabs, links))

    return category_links

#### Define Function to format links

In [5]:
def format_links(level):
    # Initialize URL to None
    url = None

    # Define the URL to scrape based on the specified level
    if level == "player":
        url = f"{base_url}/stats/player-stats"
    elif level == "team":
        lst = ["offense", "defense", "special-teams"]
    else:
        print("Invalid level specified.")
        return None

    # Create a dictionary to store the links for each unit and its categories
    unit_links = {}

    # Process the URL and build the unit_links dictionary
    if url:
        unit_links = get_links(url)
        
        # Print the dictionary as a table-like format using Pandas
        stat_pages_df = pd.DataFrame(list(unit_links.items()), columns=['Category', 'Link'])
        #print(stat_pages_df)

    if level == "team":
        for unit in lst:
            # Define the URL for the current unit
            url = f"{base_url}/stats/team-stats/{unit}/passing/2023/reg/all"
            category_links = get_links(url)

            # Add the category links to the unit_links dictionary
            unit_links[unit.capitalize()] = category_links

        # Print the nested dictionary
        #print(unit_links)

    return unit_links

In [6]:
### Scrape and export to DataFrame

### Set parameters

### Define Function to Check/Create Directory

In [7]:
# Function to create directories if they don't exist
def create_directory_if_not_exists(directory_path):
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
        print(f'Directory "{directory_path}" has been created.')
    else:
        print(f'Directory "{directory_path}" already exists.')

### Define Function to Scrape and Process Data

In [8]:
# Function to scrape and process data
def scrape_and_process_data(link, unit, category, level):
    # Request raw HTML for the current page
    response = requests.get(link)

    # Check if the request was successful
    if response.status_code == 200:
        # Create a BeautifulSoup object to parse the HTML
        soup = BeautifulSoup(response.content, "html.parser")

        # Find all elements with the class 'd3-o-player-stats--detailed'
        stats = soup.find_all(attrs={"class": f'd3-o-{level}-stats--detailed'})

        # Initialize lists to collect data
        stat_val = []
        stat_col = []

        # Loop through each <tr> element to extract and collect the text from <td> elements
        for row in stats:
            # This gets the stat names
            header_cells = row.find_all('th')

            if len(header_cells) > 0:
                for cell in header_cells:
                    stat_col.append(cell.get_text(strip=True))

            # This gets the stats
            data_cells = row.find_all('td')

            if len(data_cells) > 0:
                for cell in data_cells:
                    stat_val.append(cell.get_text(strip=True))

        # Determine the number of columns in each row
        num_columns = len(stat_col) if stat_col else 1  # Use 1 if stat_col is empty
        
        # Split the list into rows
        rows = [stat_val[i:i + num_columns] for i in range(0, len(stat_val), num_columns)]

        # Create a DataFrame for the current category
        df = pd.DataFrame(rows, columns=stat_col)

        # Convert all columns except "Team" to numeric
        numeric_columns = df.columns.difference(['Team'])
        df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

        # Check if the DataFrame has a "Team" column before attempting to remove the duplicated part
        if 'Team' in df.columns:
            # Remove duplicated part from the "Team" column
            df['Team'] = df['Team'].apply(lambda x: x[:len(x)//2])

        # Specify the file path within the unit's directory
        csv_file_path = os.path.join(unit_directory_path, category + '.csv')

        # Export the DataFrame to a CSV file
        df.to_csv(csv_file_path, index=False)  # Set index=False to exclude the index column

        print(f'DataFrame for category "{category}" in unit "{unit}" has been exported to {csv_file_path}')
    else:
        print(f"Error: Unable to fetch the page for category '{category}' in unit '{unit}'.")


#### Iniate Scraping Process

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "https://www.nfl.com"
current_week = 1  # Replace with the current week number

# Define level we want stats for
level = "player"  # Replace with "team" or "player" as needed

# Define unit
lst = ["offense", "defense", "special-teams"]

# Initialize URL to None
url = None 

# Define the URL to scrape based on the specified level
if level == "player":
    url = f"{base_url}/stats/player-stats"
elif level == "team":
    lst = ["offense", "defense", "special-teams"]
else:
    print("Invalid level specified.")

In [17]:
# Example usage:
unit_links = format_links(level)
unit_links

{'Passing': 'https://www.nfl.com/stats/player-stats/category/passing/2023/reg/all/passingyards/desc',
 'Rushing': 'https://www.nfl.com/stats/player-stats/category/rushing/2023/reg/all/rushingyards/desc',
 'Receiving': 'https://www.nfl.com/stats/player-stats/category/receiving/2023/reg/all/receivingreceptions/desc',
 'Fumbles': 'https://www.nfl.com/stats/player-stats/category/fumbles/2023/reg/all/defensiveforcedfumble/desc',
 'Tackles': 'https://www.nfl.com/stats/player-stats/category/tackles/2023/reg/all/defensivecombinetackles/desc',
 'Interceptions': 'https://www.nfl.com/stats/player-stats/category/interceptions/2023/reg/all/defensiveinterceptions/desc',
 'Field Goals': 'https://www.nfl.com/stats/player-stats/category/field-goals/2023/reg/all/kickingfgmade/desc',
 'Kickoffs': 'https://www.nfl.com/stats/player-stats/category/kickoffs/2023/reg/all/kickofftotal/desc',
 'Kickoff Returns': 'https://www.nfl.com/stats/player-stats/category/kickoff-returns/2023/reg/all/kickreturnsaverageyard

In [18]:
# Combine the base directory path with the current week
directory_path = os.path.join('data', level, 'week' + str(current_week))
create_directory_if_not_exists(directory_path)

if level == "team":
    # Loop through each unit in unit_links
    for unit, categories in unit_links.items():
        # Create a subdirectory for the current unit
        unit_directory_path = os.path.join(directory_path, unit)
        create_directory_if_not_exists(unit_directory_path)

        # Loop through the categories for the current unit
        for category, link in categories.items():
            scrape_and_process_data(link, unit, category, level)
elif level == "player":
    # Directly use the week1 directory for player-level data
    unit_directory_path = directory_path

    # Loop through the categories for the 'player' level
    for category, link in unit_links.items():
        scrape_and_process_data(link, level, category, level)
else:
    print("Invalid level specified.")


Directory "data\player\week1" has been created.
DataFrame for category "Passing" in unit "player" has been exported to data\player\week1\Passing.csv
DataFrame for category "Rushing" in unit "player" has been exported to data\player\week1\Rushing.csv
DataFrame for category "Receiving" in unit "player" has been exported to data\player\week1\Receiving.csv
DataFrame for category "Fumbles" in unit "player" has been exported to data\player\week1\Fumbles.csv
DataFrame for category "Tackles" in unit "player" has been exported to data\player\week1\Tackles.csv
DataFrame for category "Interceptions" in unit "player" has been exported to data\player\week1\Interceptions.csv
DataFrame for category "Field Goals" in unit "player" has been exported to data\player\week1\Field Goals.csv
DataFrame for category "Kickoffs" in unit "player" has been exported to data\player\week1\Kickoffs.csv
DataFrame for category "Kickoff Returns" in unit "player" has been exported to data\player\week1\Kickoff Returns.csv
D

In [13]:
# Example usage:
level = "team"  # Replace with "player" or "team" as needed
unit_links = format_links(level)
unit_links

{'Offense': {'Passing': 'https://www.nfl.com/stats/player-stats/category/passing/2023/reg/all/passingyards/desc',
  'Rushing': 'https://www.nfl.com/stats/player-stats/category/rushing/2023/reg/all/rushingyards/desc',
  'Receiving': 'https://www.nfl.com/stats/player-stats/category/receiving/2023/reg/all/receivingreceptions/desc',
  'Fumbles': 'https://www.nfl.com/stats/player-stats/category/fumbles/2023/reg/all/defensiveforcedfumble/desc',
  'Tackles': 'https://www.nfl.com/stats/player-stats/category/tackles/2023/reg/all/defensivecombinetackles/desc',
  'Interceptions': 'https://www.nfl.com/stats/player-stats/category/interceptions/2023/reg/all/defensiveinterceptions/desc',
  'Field Goals': 'https://www.nfl.com/stats/player-stats/category/field-goals/2023/reg/all/kickingfgmade/desc',
  'Kickoffs': 'https://www.nfl.com/stats/player-stats/category/kickoffs/2023/reg/all/kickofftotal/desc',
  'Kickoff Returns': 'https://www.nfl.com/stats/player-stats/category/kickoff-returns/2023/reg/all/ki

In [14]:
# Combine the base directory path with the current week
directory_path = os.path.join('data', level, 'week' + str(current_week))
create_directory_if_not_exists(directory_path)

if level == "team":
    # Loop through each unit in unit_links
    for unit, categories in unit_links.items():
        # Create a subdirectory for the current unit
        unit_directory_path = os.path.join(directory_path, unit)
        create_directory_if_not_exists(unit_directory_path)

        # Loop through the categories for the current unit
        for category, link in categories.items():
            scrape_and_process_data(link, unit, category, level)
elif level == "player":
    # Directly use the week1 directory for player-level data
    unit_directory_path = directory_path

    # Loop through the categories for the 'player' level
    for category, link in unit_links.items():
        scrape_and_process_data(link, level, category, level)
else:
    print("Invalid level specified.")


Directory "data\team\week1" has been created.
Directory "data\team\week1\Offense" has been created.
DataFrame for category "Passing" in unit "Offense" has been exported to data\team\week1\Offense\Passing.csv
DataFrame for category "Rushing" in unit "Offense" has been exported to data\team\week1\Offense\Rushing.csv
DataFrame for category "Receiving" in unit "Offense" has been exported to data\team\week1\Offense\Receiving.csv
DataFrame for category "Fumbles" in unit "Offense" has been exported to data\team\week1\Offense\Fumbles.csv
DataFrame for category "Tackles" in unit "Offense" has been exported to data\team\week1\Offense\Tackles.csv
DataFrame for category "Interceptions" in unit "Offense" has been exported to data\team\week1\Offense\Interceptions.csv
DataFrame for category "Field Goals" in unit "Offense" has been exported to data\team\week1\Offense\Field Goals.csv
DataFrame for category "Kickoffs" in unit "Offense" has been exported to data\team\week1\Offense\Kickoffs.csv
DataFrame 