## Import Libraries

In [1]:
#pip install -v pandas

In [2]:
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import time

### Set parameters

In [3]:
# Root of the NFL website; relative hrefs scraped from its pages are appended to it.
base_url = "https://www.nfl.com"
# NOTE(review): `week` is not referenced by any code visible in this notebook
# (the scraping functions read `current_week` instead) — confirm whether it is still needed.
week = 2

## Player Stats

#### Define function to get links

In [4]:
def get_links(level, season, base_url="https://www.nfl.com"):
    """Collect the stat-category page links for one level and season.

    The landing page of the requested level is fetched first to discover its
    tab links. Each tab page is then fetched and the links of its
    ``d3-o-tabs__list-item`` entries are collected.

    Args:
        level: Either "player" or "team".
        season: Season year to place into the returned URLs, e.g. "2023".
        base_url: Site root; it prefixes the landing-page path and every
            relative href found on the pages.

    Returns:
        list[str]: Absolute category URLs whose first four-digit path segment
        has been replaced by ``season``.

    Raises:
        ValueError: If ``level`` is neither "player" nor "team".
    """
    # Local import: the notebook's import cell does not provide `re`.
    import re

    landing_paths = {
        "player": "/stats/player-stats/",
        "team": "/stats/team-stats/",
    }
    if level not in landing_paths:
        # Fail early with a clear message instead of hitting an unbound local below.
        raise ValueError(f'level must be "player" or "team", got {level!r}')

    # Request and parse the landing page (honouring base_url rather than a hard-coded host).
    html = requests.get(base_url + landing_paths[level])
    soup = BeautifulSoup(html.content, 'html.parser')

    if level == "player":
        li_elements = soup.find_all('li', class_='d3-o-tabs__list-item')
    else:
        # The team page lists its offense/defense/special-teams tabs in a dedicated <ul>.
        ul_element = soup.find('ul', class_='d3-o-tabbed-controls-selector__list')
        li_elements = ul_element.find_all('li') if ul_element else []

    # Extract the href of each tab, skipping entries without a usable anchor.
    href_values = []
    for li in li_elements:
        a_tag = li.find('a')
        if a_tag and a_tag.get('href'):
            href_values.append(a_tag['href'])

    # Visit each tab page and gather the links of its category tabs.
    all_links = []
    for href in href_values:
        html = requests.get(base_url + href)
        soup = BeautifulSoup(html.content, "html.parser")
        for element in soup.find_all('li', class_='d3-o-tabs__list-item'):
            a_tag = element.find('a')
            if a_tag and a_tag.get('href'):
                all_links.append(base_url + a_tag['href'])

    # Point every link at the requested season. Matching the year as a path
    # segment (instead of the literal "2023") keeps this working whichever
    # season the site currently links to by default.
    season_segment = re.compile(r'/\d{4}(?=/|$)')
    return [season_segment.sub('/' + str(season), link, count=1) for link in all_links]

In [5]:
def get_sub_pages(unit_links):
    """Follow the "next page" links of every category and record each page URL.

    Args:
        unit_links: Mapping ``{unit: {category: first_page_url}}``.

    Returns:
        dict: ``{unit: {category: {page_index: page_url}}}``. Index 0 is the
        provided first page; every further index is a page reached through the
        ``nfl-o-table-pagination__next`` anchor of the previous page.
    """
    # Local import: the notebook's import cell does not provide urllib.
    from urllib.parse import urljoin

    sub_pages = {}

    for unit, category_links in unit_links.items():
        sub_pages[unit] = {}

        for category, link in category_links.items():
            pages = {0: link}
            sub_pages[unit][category] = pages

            # Remember visited URLs so a pagination link that points back to an
            # earlier page cannot keep the loop running forever.
            visited = {link}
            current_link = link

            while True:
                response = requests.get(current_link)

                if response.status_code != 200:
                    print(f"Error: Unable to fetch data from {current_link} for {unit} - {category}.")
                    break

                soup = BeautifulSoup(response.content, "html.parser")

                # Locate the pagination anchor; it may be absent or lack an href.
                next_page_link = soup.find('a', class_='nfl-o-table-pagination__next')
                href = next_page_link.get('href') if next_page_link else None
                # Resolve against the current page so relative and absolute hrefs both work.
                next_link = urljoin(current_link, href) if href else None

                if not next_link or next_link in visited:
                    print(f"No more pages to scrape for {unit} - {category}.")
                    break

                visited.add(next_link)
                current_link = next_link
                pages[len(pages)] = current_link

    # Display the collected category pages and their links
    for unit, categories in sub_pages.items():
        for category, pages in categories.items():
            print(f"{unit} - {category} Sub-Pages:")
            for page_num, page_link in pages.items():
                print(f"Page {page_num}: {page_link}")
    return sub_pages

#### Define Function to format links

In [6]:
# Updated format_links function
def format_links(level, season):
    """Group the category links by unit and expand each one into its sub-pages.

    Links are obtained from ``get_links`` (using the notebook-level
    ``base_url``), grouped into ``{unit: {category: url}}`` and then passed to
    ``get_sub_pages``.

    Args:
        level: Either "player" or "team".
        season: Season year forwarded to ``get_links``.

    Returns:
        The value returned by ``get_sub_pages`` for the grouped links.

    Raises:
        ValueError: If ``level`` is neither "player" nor "team".
    """
    if level not in ("player", "team"):
        # Fail early instead of reaching the grouping step with nothing defined.
        raise ValueError(f'level must be "player" or "team", got {level!r}')

    all_links = get_links(level, season, base_url)

    team_stats_dict = {}
    for link in all_links:
        # Split the link by "/"; the 6th and 7th pieces carry unit and category.
        parts = link.split('/')
        if len(parts) < 7:
            print(f"Skipping link with unexpected format: {link}")
            continue

        # Player stats have no unit segment in the URL, so they share one fixed unit;
        # team stats take it from the URL (e.g. offense, defense, special-teams).
        unit = "individual" if level == "player" else parts[5]
        category = parts[6]  # e.g. passing, rushing
        team_stats_dict.setdefault(unit, {})[category] = link

    print("these are the old unit links", team_stats_dict)
    unit_links = get_sub_pages(team_stats_dict)
    print("these are the new unit links", unit_links)
    return unit_links


### Define Function to Check/Create Directory

In [7]:
# Function to create directories if they don't exist
def create_directory_if_not_exists(directory_path):
    """Ensure ``directory_path`` exists, reporting the outcome on stdout.

    Missing parent directories are created as well. A failure to create the
    directory is printed rather than raised.
    """
    # Nothing to do when the path is already present.
    if os.path.exists(directory_path):
        print(f'Directory "{directory_path}" already exists.')
        return

    try:
        os.makedirs(directory_path)
        print(f'Directory "{directory_path}" has been created.')
    except OSError as e:
        print(f'Error: Failed to create directory "{directory_path}".')
        print(e)


### Define Function to Scrape and Process Data

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_dataframes(unit_links, level):
    """Scrape every page of every category and merge the pages per category.

    Args:
        unit_links: Mapping ``{category: {page_num: page_url}}``, i.e. the
            categories of a single unit (not the full unit -> category mapping).
        level: Inserted into the CSS class ``d3-o-{level}-stats--detailed``
            used to locate the stats elements on each page.

    Returns:
        dict: ``{category: DataFrame}`` with the pages of a category
        concatenated in iteration order. Categories for which no page could be
        fetched are left out.
    """
    dataframes = {}

    for category, category_links in unit_links.items():
        category_dfs = []

        for page_url in category_links.values():
            response = requests.get(page_url)

            if response.status_code != 200:
                print(f"Error: Unable to fetch data from {page_url} for {category}.")
                continue

            soup = BeautifulSoup(response.content, "html.parser")

            # Find all elements carrying the detailed-stats class for this level.
            stats = soup.find_all(attrs={"class": f'd3-o-{level}-stats--detailed'})

            stat_col = []  # header texts (<th>)
            stat_val = []  # cell texts (<td>), flattened in document order
            for row in stats:
                stat_col.extend(cell.get_text(strip=True) for cell in row.find_all('th'))
                stat_val.extend(cell.get_text(strip=True) for cell in row.find_all('td'))

            # Without a header, fall back to one value per row.
            num_columns = len(stat_col) if stat_col else 1

            # Split the flat value list into rows of num_columns entries.
            rows = [stat_val[i:i + num_columns] for i in range(0, len(stat_val), num_columns)]

            # Pass columns=None when no header was found: an empty column list
            # would make pandas reject the single-column fallback rows.
            df = pd.DataFrame(rows, columns=stat_col or None)
            category_dfs.append(df)

        if category_dfs:
            dataframes[category] = pd.concat(category_dfs, ignore_index=True)

    return dataframes


In [9]:
# Function to scrape and process data
def scrape_and_process_data(link, unit, category, level, unit_directory_path, unit_links):
    """Scrape all pages of one category and export them as ``<category>.csv``.

    NOTE(review): a later cell of this notebook defines a five-argument
    function with the same name; once that cell has run, this definition is
    replaced.

    Args:
        link: Not used by this function; kept so six-argument callers keep working.
        unit: Key of the unit in ``unit_links``.
        category: Key of the category inside ``unit_links[unit]``; also the CSV file name.
        level: Forwarded to ``scrape_dataframes``.
        unit_directory_path: Directory the CSV file is written to (created if missing).
        unit_links: Mapping ``{unit: {category: {page_num: page_url}}}``.

    Returns:
        None. A CSV file is written, or a message is printed when nothing was scraped.
    """
    # scrape_dataframes iterates {category: {page_num: url}} and returns
    # {category: DataFrame}. Hand it only this category's pages and pick the
    # matching frame, instead of treating the returned dict as a DataFrame.
    category_pages = {category: unit_links[unit][category]}
    df = scrape_dataframes(category_pages, level).get(category)

    if df is None:
        print(f"No data found for category '{category}' in unit '{unit}'")
        return

    # Check if the DataFrame has a "Team" column before attempting to remove the duplicated part
    if 'Team' in df.columns:
        # Keep the first half of each Team string (drops the duplicated part);
        # non-string cells are left untouched.
        df['Team'] = df['Team'].apply(lambda x: x[:len(x) // 2] if isinstance(x, str) else x)

    # Create the directory if it doesn't exist
    create_directory_if_not_exists(unit_directory_path)

    # Specify the file path within the unit's directory
    csv_file_path = os.path.join(unit_directory_path, category + '.csv')

    # Export the DataFrame to a CSV file without the index column
    df.to_csv(csv_file_path, index=False)

    print(f'DataFrame for category "{category}" in unit "{unit}" has been exported to {csv_file_path}')


In [12]:
def scrape_and_process_data(unit, category, level, unit_directory_path, unit_links):
    """Scrape all sub-pages of one category and export them as ``<category>.csv``.

    Args:
        unit: Key of the unit in ``unit_links``.
        category: Key of the category inside ``unit_links[unit]``; also the CSV file name.
        level: Inserted into the CSS class ``d3-o-{level}-stats--detailed``
            used to locate the stats elements on each page.
        unit_directory_path: Directory the CSV file is written to (created if missing).
        unit_links: Mapping ``{unit: {category: {page_num: page_url}}}``.

    Returns:
        None. A CSV file is written, or a message is printed when no page
        could be scraped.
    """
    category_dfs = []

    # Loop through the sub-pages for the current category
    for page_url in unit_links[unit][category].values():
        response = requests.get(page_url)

        if response.status_code != 200:
            print(f"Error: Unable to fetch data from {page_url} for {category}.")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # Find all elements carrying the detailed-stats class for this level.
        stats = soup.find_all(attrs={"class": f'd3-o-{level}-stats--detailed'})

        stat_col = []  # header texts (<th>)
        stat_val = []  # cell texts (<td>), flattened in document order
        for row in stats:
            stat_col.extend(cell.get_text(strip=True) for cell in row.find_all('th'))
            stat_val.extend(cell.get_text(strip=True) for cell in row.find_all('td'))

        # Without a header, fall back to one value per row.
        num_columns = len(stat_col) if stat_col else 1

        # Split the flat value list into rows of num_columns entries.
        rows = [stat_val[i:i + num_columns] for i in range(0, len(stat_val), num_columns)]

        # Pass columns=None when no header was found: an empty column list
        # would make pandas reject the single-column fallback rows.
        category_dfs.append(pd.DataFrame(rows, columns=stat_col or None))

    if not category_dfs:
        print(f"No data found for category '{category}' in unit '{unit}'")
        return

    # Concatenate the per-page frames for the current category into one
    merged_df = pd.concat(category_dfs, ignore_index=True)

    # Check if the DataFrame has a "Team" column before attempting to remove the duplicated part
    if 'Team' in merged_df.columns:
        # Keep the first half of each Team string (drops the duplicated part);
        # non-string cells, e.g. padding from a short final row, are left untouched.
        merged_df['Team'] = merged_df['Team'].apply(
            lambda x: x[:len(x) // 2] if isinstance(x, str) else x
        )

    # Create the directory if it doesn't exist
    create_directory_if_not_exists(unit_directory_path)

    # Specify the file path within the unit's directory
    csv_file_path = os.path.join(unit_directory_path, category + '.csv')

    # Export the DataFrame to a CSV file without the index column
    merged_df.to_csv(csv_file_path, index=False)

    print(f'DataFrame for category "{category}" in unit "{unit}" has been exported to {csv_file_path}')





### Define Function to Initiate Scraping Process

In [10]:
# Define Function to  Initiate Scraping Process
def get_stats(level, season):
    """Scrape every category of the given level and season into CSV files.

    Output goes below ``data/<season>/<level>/week<current_week>`` when
    ``season`` equals the notebook-level ``current_season``, otherwise below
    ``data/<season>/<level>/reg``. Team-level files are additionally grouped
    into one sub-directory per unit.

    NOTE(review): this version calls ``scrape_and_process_data`` with six
    arguments; later cells of this notebook redefine both that function (with
    five parameters) and ``get_stats`` itself.

    Args:
        level: Either "team" or "player"; any other value only prints a message.
        season: Season year as a string, e.g. "2023".
    """
    # Validate first: format_links cannot handle an unknown level, so checking
    # afterwards would never reach the message below.
    if level not in ("team", "player"):
        print("Invalid level specified.")
        return

    # Current season data is stored per week, other seasons under 'reg' (regular season).
    if season == current_season:
        directory_path = os.path.join('data', season, level, 'week' + str(current_week))
    else:
        directory_path = os.path.join('data', season, level, 'reg')

    unit_links = format_links(level, season)
    print("hihger level uni links", unit_links)

    for unit, categories in unit_links.items():
        for category, link in categories.items():
            if level == "team":
                # Team data gets a sub-directory per unit.
                unit_directory_path = os.path.join(directory_path, unit)
            else:
                # Player data is written directly into the level directory.
                unit_directory_path = directory_path
                print("lower level uni links", unit_links)

            scrape_and_process_data(link, unit, category, level, unit_directory_path, unit_links)




In [13]:
# Update the get_stats function
def get_stats(level, season):
    """Scrape every category of the given level and season into CSV files.

    Output goes below ``data/<season>/<level>/week<current_week>`` when
    ``season`` equals the notebook-level ``current_season``, otherwise below
    ``data/<season>/<level>/reg``. Team-level files are additionally grouped
    into one sub-directory per unit.

    Args:
        level: Either "team" or "player"; any other value only prints a message.
        season: Season year as a string, e.g. "2023".
    """
    # Validate first: format_links cannot handle an unknown level, so checking
    # afterwards would never reach the message below.
    if level not in ("team", "player"):
        print("Invalid level specified.")
        return

    # Current season data is stored per week, other seasons under 'reg' (regular season).
    if season == current_season:
        directory_path = os.path.join('data', season, level, 'week' + str(current_week))
    else:
        directory_path = os.path.join('data', season, level, 'reg')

    unit_links = format_links(level, season)
    print("Higher level unit links", unit_links)

    for unit, categories in unit_links.items():
        for category in categories:
            if level == "team":
                # Team data gets a sub-directory per unit.
                unit_directory_path = os.path.join(directory_path, unit)
            else:
                # Player data is written directly into the level directory.
                unit_directory_path = directory_path
                print("Lower level unit links", unit_links)

            scrape_and_process_data(unit, category, level, unit_directory_path, unit_links)

In [14]:
# Set current week (only relevant for current season data).
# get_stats reads this global to name the 'week<N>' output directory.
current_week = 2

# Set current season (will adjust output folders, as past seasons only offer aggregate, not weekly, results).
# get_stats compares its `season` argument against this global, so both must be strings.
current_season = "2023"

# Scrape the player-level stats for the 2023 season and write them to CSV files.
get_stats(level = "player", season = "2023")

these are the old unit links {'individual': {'passing': 'https://www.nfl.com/stats/player-stats/category/passing/2023/reg/all/passingyards/desc', 'rushing': 'https://www.nfl.com/stats/player-stats/category/rushing/2023/reg/all/rushingyards/desc', 'receiving': 'https://www.nfl.com/stats/player-stats/category/receiving/2023/reg/all/receivingreceptions/desc', 'fumbles': 'https://www.nfl.com/stats/player-stats/category/fumbles/2023/reg/all/defensiveforcedfumble/desc', 'tackles': 'https://www.nfl.com/stats/player-stats/category/tackles/2023/reg/all/defensivecombinetackles/desc', 'interceptions': 'https://www.nfl.com/stats/player-stats/category/interceptions/2023/reg/all/defensiveinterceptions/desc', 'field-goals': 'https://www.nfl.com/stats/player-stats/category/field-goals/2023/reg/all/kickingfgmade/desc', 'kickoffs': 'https://www.nfl.com/stats/player-stats/category/kickoffs/2023/reg/all/kickofftotal/desc', 'kickoff-returns': 'https://www.nfl.com/stats/player-stats/category/kickoff-returns

In [None]:
# TODO: the scraper doesn't properly loop through sub_pages.
# Observed:
#   passing page 1
#   rushing page 2
#
# Expected instead:
#   rushing pages 1, 2 and 3