NFLVerse

In [None]:
import nfl_data_py as nfl

# Get play-by-play data for the 2023 NFL season
pbp_data = nfl.import_pbp_data([2023])

# Display the first few rows of the data
print(pbp_data.head())


FootballDB Pull

In [7]:
import requests
from bs4 import BeautifulSoup
import os

# Base URL for the site
base_url = "https://www.footballdb.com"

# Categories of stats we need to scrape
categories = [
    "passing", "rushing", "receiving", "scoring",
    "kickoffreturns", "puntreturns", "punting",
    "fieldgoals", "interceptions", "sacks"
]

# Create a directory to store the HTML files
if not os.path.exists("stat_tables"):
    os.makedirs("stat_tables")

def scrape_stats(category):
    # Construct the URL for the given category for the year 2024 with the limit parameter
    url = f"{base_url}/statistics/nfl/player-stats/{category}/2024/regular-season?sort=defsack&limit=all"
    
    # Include headers to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find the table with the class `statistics scrollable`
        table = soup.find('table', class_='statistics scrollable')
        
        if table:
            table_html = str(table)  # Get the full HTML for the table
            
            # Save the HTML to a file named after the category
            file_name = f"stat_tables/{category}_stats.html"
            with open(file_name, 'w', encoding='utf-8') as file:
                file.write(table_html)
            print(f"Successfully saved stats for {category} to {file_name}")
        else:
            print(f"No table found for {category}.")
    else:
        print(f"Failed to fetch {category}: Status code {response.status_code}")

def main():
    for category in categories:
        scrape_stats(category)

if __name__ == "__main__":
    main()


Successfully saved stats for passing to stat_tables/passing_stats.html
Successfully saved stats for rushing to stat_tables/rushing_stats.html
Successfully saved stats for receiving to stat_tables/receiving_stats.html
Successfully saved stats for scoring to stat_tables/scoring_stats.html
Successfully saved stats for kickoffreturns to stat_tables/kickoffreturns_stats.html
Successfully saved stats for puntreturns to stat_tables/puntreturns_stats.html
Successfully saved stats for punting to stat_tables/punting_stats.html
Successfully saved stats for fieldgoals to stat_tables/fieldgoals_stats.html
Successfully saved stats for interceptions to stat_tables/interceptions_stats.html
Successfully saved stats for sacks to stat_tables/sacks_stats.html


Table Data to CSV

In [10]:
import os
import pandas as pd
from bs4 import BeautifulSoup

def html_to_csv(html_file, csv_file):
    # Read the HTML file
    with open(html_file, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the table
    table = soup.find('table', class_='statistics scrollable')

    # Extract headers
    headers = [th.text.strip() for th in table.find_all('th')]

    # Extract rows
    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip the header row
        row = [td.text.strip() for td in tr.find_all('td')]
        # Pad the row with empty strings if it's shorter than the header
        row += [''] * (len(headers) - len(row))
        rows.append(row[:len(headers)])  # Truncate if longer than header

    # Create a DataFrame
    df = pd.DataFrame(rows, columns=headers)

    # Write to CSV
    df.to_csv(csv_file, index=False)
    print(f"Created {csv_file}")

def process_all_files(input_dir, output_dir):
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Process each HTML file
    for filename in os.listdir(input_dir):
        if filename.endswith('.html'):
            html_file = os.path.join(input_dir, filename)
            csv_file = os.path.join(output_dir, filename.replace('.html', '.csv'))
            try:
                html_to_csv(html_file, csv_file)
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

# Usage
input_directory = '/home/jose/Code/NFLData/stat_tables'  # Adjust as needed
output_directory = './csv_output'
process_all_files(input_directory, output_directory)

Created ./csv_output/punting_stats.csv
Created ./csv_output/scoring_stats.csv
Created ./csv_output/passing_stats.csv
Created ./csv_output/puntreturns_stats.csv
Created ./csv_output/fieldgoals_stats.csv
Created ./csv_output/kickoffreturns_stats.csv
Created ./csv_output/sacks_stats.csv
Created ./csv_output/interceptions_stats.csv
Created ./csv_output/receiving_stats.csv
Created ./csv_output/rushing_stats.csv


Collects Keys and Their Values

In [13]:
import requests
from bs4 import BeautifulSoup

# Base URL for the site
base_url = "https://www.footballdb.com"

# Categories of stats we need to scrape
categories = [
    "passing", "rushing", "receiving", "scoring",
    "kickoffreturns", "puntreturns", "punting",
    "fieldgoals", "interceptions", "sacks"
]

def fetch_html(url):
    """
    Fetches HTML content from the given URL.
    """
    # Include headers to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)\
                       AppleWebKit/537.36 (KHTML, like Gecko)\
                       Chrome/58.0.3029.110 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {str(e)}")
        return None

def extract_keys(soup):
    """
    Extracts only the key-value pairs for passing statistics from the HTML content.
    """
    key_b_tag = soup.find('b', text='KEY:')
    key_value_pairs = {}
    
    if key_b_tag:
        key_parent_tag = key_b_tag.parent
        key_text = key_parent_tag.get_text(separator=' ')
        
        # Extract the text between 'KEY:' and the first occurrence of a newline character
        key_values_text = key_text.split('KEY:')[1].split('\n')[0].strip()
        pairs = key_values_text.split(',')

        for pair in pairs:
            if '=' in pair:
                key, value = pair.split('=', 1)
                key = key.strip().strip('\u200b')  # Remove any zero-width spaces
                value = value.strip()
                key_value_pairs[key] = value
    else:
        print("KEY: not found in the HTML content.")
    
    return key_value_pairs

def scrape_keys_for_category(category):
    """
    Fetches the page for a category and extracts the key-value pairs.
    """
    print(f"Processing category: {category}")
    # Construct the URL for the given category for the year 2024 with the limit parameter
    url = f"{base_url}/statistics/nfl/player-stats/{category}/2024/regular-season?sort=defsack&limit=all"

    html_content = fetch_html(url)
    if not html_content:
        print(f"Failed to retrieve content for {category}")
        return None

    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract keys and their descriptions
    key_value_pairs = extract_keys(soup)

    if key_value_pairs:
        print(f"Extracted keys for {category}:")
        for key, value in key_value_pairs.items():
            print(f"{key}: {value}")
        print("\n")
    else:
        print(f"No keys found for {category}\n")

    return key_value_pairs

def main():
    all_keys = {}
    for category in categories:
        keys = scrape_keys_for_category(category)
        if keys:
            all_keys[category] = keys

    # Now all_keys dictionary contains keys for all categories
    # You can use this data as needed
    # For example, print all keys
    # print(all_keys)

if __name__ == "__main__":
    main()


Processing category: passing


  key_b_tag = soup.find('b', text='KEY:')


Extracted keys for passing:
Gms: Games Played
Att: Pass Attempts
Cmp: Pass Completions
Pct: Pass Completion Percentage
Yds: Passing Yards
YPA: Yards Per Pass Attempt
TD: Touchdown Passes
TD%: Touchdown Pass Percentage
Int: Intercepted Passes
Int%: Pass Interception Percentage
Lg: Longest Pass Completion
Sack: Passing Sacks
Loss: Sack Yards Lost
Rate: Passer Rating


Processing category: rushing
Extracted keys for rushing:
Gms: Games Played
Att: Rushing Attempts
Yds: Rushing Attempts
Avg: Rushing Average
YPG: Rushing Yards Per Game
Lg: Longest Rush
TD: Rushing Touchdowns
FD: Rushing First Downs


Processing category: receiving
Extracted keys for receiving:
Gms: Games Played
Rec: Receptions
Yds: Receiving Yards
Avg: Receiving Average
YPG: Receiving Yards Per Game
Lg: Longest Reception
TD: Touchdown Receptions
FD: First Down Receptions
Tar: Receiving Targets
YAC: Yards After Catch


Processing category: scoring
Extracted keys for scoring:
Tot: Total Touchdowns
R: Rushing Touchdowns
P: Tou