In [1]:
# Display the value of every top-level expression in a cell, not only the last one
# (IPython's default for ast_node_interactivity is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Optional: pip install lxml -- not needed as written, since the cells below parse with the built-in "html.parser"

## Import Libraries

In [3]:
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

In [4]:
# Define Base Url
# Site root; the relative hrefs scraped from tab and pagination links are appended to it.
base_url = "https://www.nfl.com"

# Define Week
# Used only to name the output folder (data/week<N>) in the export cell;
# it does not appear in any URL requested by the cells shown here.
current_week = 1

### Get Links to All Stat Category Tabs

In [5]:
# Request raw HTML of the passing stats page; the category tabs are read from it.
passing1 = requests.get(
    base_url + "/stats/player-stats/category/passing/2023/reg/all/passingyards/desc",
    timeout=30,  # seconds; without a timeout requests can wait indefinitely
)

# Fail loudly on an HTTP error instead of parsing an error page into an empty table
passing1.raise_for_status()

# Create a BeautifulSoup object called `soup` to traverse the HTML
soup = BeautifulSoup(passing1.content, "html.parser")

# Each tab is an <li class="d3-o-tabs__list-item"> element wrapping an <a>
a_elements = soup.find_all('li', class_='d3-o-tabs__list-item')

# Keep only the anchors that actually exist and carry an href, so a stray
# <li> without a link cannot raise AttributeError/KeyError below
tab_anchors = [
    anchor
    for anchor in (element.find('a') for element in a_elements)
    if anchor is not None and anchor.get('href')
]

# Extract the tab captions and the absolute links (hrefs are site-relative)
tabs = [anchor.get_text() for anchor in tab_anchors]
links = [base_url + anchor['href'] for anchor in tab_anchors]

# Map each category name to the first page of its stats table
stat_pages = dict(zip(tabs, links))

# Tabular view of the mapping; later cells iterate over this frame
stat_pages_df = pd.DataFrame(list(stat_pages.items()), columns=['Category', 'Link'])

print(stat_pages_df)

           Category                                               Link
0           Passing  https://www.nfl.com/stats/player-stats/categor...
1           Rushing  https://www.nfl.com/stats/player-stats/categor...
2         Receiving  https://www.nfl.com/stats/player-stats/categor...
3           Fumbles  https://www.nfl.com/stats/player-stats/categor...
4           Tackles  https://www.nfl.com/stats/player-stats/categor...
5     Interceptions  https://www.nfl.com/stats/player-stats/categor...
6       Field Goals  https://www.nfl.com/stats/player-stats/categor...
7          Kickoffs  https://www.nfl.com/stats/player-stats/categor...
8   Kickoff Returns  https://www.nfl.com/stats/player-stats/categor...
9           Punting  https://www.nfl.com/stats/player-stats/categor...
10     Punt Returns  https://www.nfl.com/stats/player-stats/categor...


In [6]:
# Create a dictionary to store the category names and their corresponding page links
# Shape: {category name: {page number (0-based): page URL}}
category_pages = {}

# Follow every category's "next page" chain and record each page URL
for index, row in stat_pages_df.iterrows():
    current_link = row['Link']
    current_stat = row['Category']
    page_count = 0  # Page 0 is the category's landing page
    category_pages[current_stat] = {page_count: current_link}

    # URLs already queued for this category; protects the loop below from
    # cycling forever if a "next" link ever points back to a visited page
    seen_links = {current_link}

    while current_link:
        # Request raw HTML for the current page
        try:
            response = requests.get(current_link, timeout=30)  # seconds
        except requests.RequestException as exc:
            # Connection problems and timeouts are reported like HTTP errors:
            # stop this category, keep going with the next one
            print(f"Error: Unable to fetch the page. ({current_link}: {exc})")
            break

        if response.status_code != 200:
            print(f"Error: Unable to fetch the page. (HTTP {response.status_code}: {current_link})")
            break

        # Create a BeautifulSoup object to parse the HTML
        soup = BeautifulSoup(response.content, "html.parser")

        # Find the "Next Page" link; a missing anchor or href means last page
        next_page_link = soup.find('a', class_='nfl-o-table-pagination__next')
        href = next_page_link.get('href') if next_page_link else None
        if not href:
            break

        next_link = base_url + href
        if next_link in seen_links:
            # Pagination looped back on itself; stop instead of spinning
            break
        seen_links.add(next_link)

        # Advance to the next page and record its URL under the new page number
        current_link = next_link
        page_count += 1
        category_pages[current_stat][page_count] = current_link

In [7]:
# Print the dictionary of category pages
# Shape: {category name: {page number (0-based): page URL}}
print(category_pages)
# Example lookup (left disabled): category_pages["Passing"][0] is the first page link of the Passing tab

{'Passing': {0: 'https://www.nfl.com/stats/player-stats/category/passing/2023/reg/all/passingyards/desc', 1: 'https://www.nfl.com/stats/player-stats/category/passing/2023/REG/all/passingyards/DESC?aftercursor=AAAAGQAAABlAYeAAAAAAADFleUp6WldGeVkyaEJablJsY2lJNld5SXhORE1pTENJek1qQXdOVEExTWkwME5USTRMVFUzTWpNdFpERmlNaTA1Tm1VNU1tVmlZekV5TkRFaUxDSXlNREl6SWwxOQ=='}, 'Rushing': {0: 'https://www.nfl.com/stats/player-stats/category/rushing/2023/reg/all/rushingyards/desc', 1: 'https://www.nfl.com/stats/player-stats/category/rushing/2023/REG/all/rushingyards/DESC?aftercursor=AAAAGQAAABdARoAAAAAAADFleUp6WldGeVkyaEJablJsY2lJNld5STBOU0lzSWpNeU1EQTFOelE1TFRSak16SXRPREkyT1MwMFlURTVMV1E0TlRGaFl6azVZbUV6WXlJc0lqSXdNak1pWFgwPQ==', 2: 'https://www.nfl.com/stats/player-stats/category/rushing/2023/REG/all/rushingyards/DESC?aftercursor=AAAAMgAAADBANgAAAAAAADFleUp6WldGeVkyaEJablJsY2lJNld5SXlNaUlzSWpNeU1EQTFORFF4TFRVNU5qRXRNRFUxTXkwNU1UZGpMVE5oTlRabE5XRTBZekkyWXlJc0lqSXdNak1pWFgwPQ==', 3: 'https://www.nfl.com/st

#### Scrape Each Category and Export to CSV

In [8]:
# Combine the base directory path with the current week
directory_path = os.path.join('data', 'week' + str(current_week))

# Create the output directory on first use and report which case applied
if not os.path.exists(directory_path):
    # exist_ok covers the directory appearing between the check and this call
    os.makedirs(directory_path, exist_ok=True)
    print(f'Directory "{directory_path}" has been created.')
else:
    print(f'Directory "{directory_path}" already exists.')


def _stats_page_to_frame(page_soup):
    """Build a DataFrame from the stats table markup of one parsed page.

    Collects the text of every <th> (column names) and every <td> (values)
    found inside elements carrying the ``d3-o-player-stats--detailed`` class,
    then folds the flat list of values into rows of ``len(column names)``.

    Parameters
    ----------
    page_soup : BeautifulSoup
        Parsed HTML of a single stats page.

    Returns
    -------
    pandas.DataFrame or None
        One row per group of ``len(column names)`` cells, with every column
        except "Player" converted to numeric (values that cannot be parsed
        become NaN). ``None`` when no header cells were found, because the
        values cannot be shaped into columns without them.
    """
    stat_col = []
    stat_val = []

    for table in page_soup.find_all(attrs={"class": 'd3-o-player-stats--detailed'}):
        # <th> cells hold the stat names, <td> cells hold the values
        stat_col.extend(cell.get_text(strip=True) for cell in table.find_all('th'))
        stat_val.extend(cell.get_text(strip=True) for cell in table.find_all('td'))

    if not stat_col:
        return None

    # Split the flat list of values into rows of one value per column
    num_columns = len(stat_col)
    rows = [stat_val[i:i + num_columns] for i in range(0, len(stat_val), num_columns)]
    df = pd.DataFrame(rows, columns=stat_col)

    # Convert all columns except "Player" to numeric
    numeric_columns = df.columns.difference(['Player'])
    if len(numeric_columns) > 0:
        df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

    return df


# Loop through each category in category_pages ({category: {page number: URL}})
for category, page_links in category_pages.items():
    # One DataFrame per successfully scraped page
    dataframes = []

    for page, link in page_links.items():
        # Request raw HTML for the current page
        try:
            response = requests.get(link, timeout=30)  # seconds
        except requests.RequestException as exc:
            print(f"Error: Unable to fetch the page. ({link}: {exc})")
            break

        if response.status_code != 200:
            print(f"Error: Unable to fetch the page. (HTTP {response.status_code}: {link})")
            break

        # Create a BeautifulSoup object to parse the HTML
        soup = BeautifulSoup(response.content, "html.parser")

        df = _stats_page_to_frame(soup)
        if df is None:
            print(f'Warning: no stats table found on page {page} of "{category}"; skipping it.')
            continue

        dataframes.append(df)

    # pd.concat raises ValueError on an empty list, so skip categories for
    # which nothing could be scraped instead of aborting the whole export
    if not dataframes:
        print(f'No data collected for category "{category}"; nothing exported.')
        continue

    # Combine all DataFrames into one final DataFrame
    final_df = pd.concat(dataframes, ignore_index=True)

    # Specify the file path where you want to save the CSV file
    csv_file_path = os.path.join(directory_path, category + '.csv')

    # Export the DataFrame to a CSV file
    final_df.to_csv(csv_file_path, index=False)  # Set index=False to exclude the index column

    print(f'DataFrame for category "{category}" has been exported to {csv_file_path}')


Directory "data\week1" has been created.
DataFrame for category "Passing" has been exported to data\week1\Passing.csv
DataFrame for category "Rushing" has been exported to data\week1\Rushing.csv
DataFrame for category "Receiving" has been exported to data\week1\Receiving.csv
DataFrame for category "Fumbles" has been exported to data\week1\Fumbles.csv
DataFrame for category "Tackles" has been exported to data\week1\Tackles.csv
DataFrame for category "Interceptions" has been exported to data\week1\Interceptions.csv
DataFrame for category "Field Goals" has been exported to data\week1\Field Goals.csv
DataFrame for category "Kickoffs" has been exported to data\week1\Kickoffs.csv
DataFrame for category "Kickoff Returns" has been exported to data\week1\Kickoff Returns.csv
DataFrame for category "Punting" has been exported to data\week1\Punting.csv
DataFrame for category "Punt Returns" has been exported to data\week1\Punt Returns.csv
