In [1]:
# 1. Import libraries and config
import pandas as pd
import numpy as np
import os
import sys

#!pip install --upgrade nflreadpy

# --- Install/Import nflreadpy ---
# Only a genuinely missing package triggers the install. Any other failure
# raised while importing (e.g. a broken dependency) is allowed to surface
# instead of being masked by a blanket handler.
try:
    import nflreadpy as nfl
except ImportError:
    import importlib
    import subprocess

    # Install with the interpreter running this kernel so the package lands in
    # the same environment that the import below searches.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nflreadpy"])
    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()
    import nflreadpy as nfl

# --- Add Project Root to Python Path ---
# The project's `src` package lives under this directory; it must be on
# sys.path before `src.utils` can be imported below.
PROJECT_ROOT = '/content/drive/MyDrive/NFL_Prediction_System'  # Adjust if needed
if PROJECT_ROOT not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
# ---------------------------------------

from src.utils import config  # Import our config file

print("Libraries and config paths set.")
print("Using NEW library: nflreadpy")
print(f"Project Root: {PROJECT_ROOT}")

# --- Seasons to Ingest ---
# range() excludes its stop value, hence the "+ 1" to include 2023.
SEASONS = range(2002, 2023 + 1)

Collecting nflreadpy
  Downloading nflreadpy-0.1.5-py3-none-any.whl.metadata (7.5 kB)
Downloading nflreadpy-0.1.5-py3-none-any.whl (30 kB)
Installing collected packages: nflreadpy
Successfully installed nflreadpy-0.1.5
Libraries and config paths set.
Using NEW library: nflreadpy
Project Root: /content/drive/MyDrive/NFL_Prediction_System


In [2]:
# 2. Ingest Play-by-Play (PBP) Data
# Downloads play-by-play for every season in SEASONS, keeps only the columns
# listed in `pbp_cols`, and writes the result to a parquet file under
# config.RAW_GAMES_PATH.
print("--- 1. Ingesting Play-by-Play Data (2002-2023) ---")

# Columns retained from the full play-by-play table.
pbp_cols = [
    'game_id', 'season', 'season_type',
    'week', 'posteam', 'defteam', 'play_type',
    'epa', 'success', 'sp',
    'interception', 'fumble_lost',
    'home_team', 'away_team', 'spread_line', 'home_score', 'away_score',
    'roof', 'temp', 'wind'
]

# Ensure the directory exists
os.makedirs(config.RAW_GAMES_PATH, exist_ok=True)
pbp_path = config.RAW_GAMES_PATH / 'pbp_2002_2023.parquet'

print("Loading full PBP dataset (this may take a few minutes)...")

# 1. Load the Polars DataFrame
pbp_df_full_polars = nfl.load_pbp(seasons=SEASONS)

# 2. Work out which requested columns are actually present, and say so when
#    any are missing instead of dropping them without a trace.
available_cols = set(pbp_df_full_polars.columns)
cols_to_keep = [col for col in pbp_cols if col in available_cols]
missing_cols = [col for col in pbp_cols if col not in available_cols]
if missing_cols:
    print(f"WARNING: requested PBP columns not found and skipped: {missing_cols}")

# 3. Select the columns while still in Polars, *then* convert to Pandas.
#    Converting the full-width table first would materialise every column of
#    every play in pandas only to discard most of them immediately.
print(f"Filtering to {len(cols_to_keep)} columns...")
pbp_df = pbp_df_full_polars.select(cols_to_keep).to_pandas()
print("Converted Polars DataFrame to Pandas DataFrame.")

# 4. Save the filtered Pandas DataFrame
pbp_df.to_parquet(pbp_path, index=False)
print(f"PBP data saved to: {pbp_path}")
print(f"PBP data shape: {pbp_df.shape}")

--- 1. Ingesting Play-by-Play Data (2002-2023) ---
Loading full PBP dataset (this may take a few minutes)...
Converted Polars DataFrame to Pandas DataFrame.
Filtering to 20 columns...
PBP data saved to: /content/drive/MyDrive/NFL_Prediction_System/data/raw/games/pbp_2002_2023.parquet
PBP data shape: (1044769, 20)


In [3]:
# 3. Ingest Schedules and Rosters
# Writes the schedule table as CSV under config.RAW_GAMES_PATH and the roster
# table as parquet under config.RAW_PLAYERS_PATH, both for SEASONS.
print("\n--- 2. Ingesting Schedules & Rosters (2002-2023) ---")
# Create both output directories here so this cell does not depend on an
# earlier cell having created the games directory.
os.makedirs(config.RAW_GAMES_PATH, exist_ok=True)
os.makedirs(config.RAW_PLAYERS_PATH, exist_ok=True)

# Ingest and save schedules
schedule_path = config.RAW_GAMES_PATH / 'schedule_2002_2023.csv'
schedule_df_polars = nfl.load_schedules(seasons=SEASONS)
schedule_df = schedule_df_polars.to_pandas()  # nflreadpy returns Polars; convert to Pandas
schedule_df.to_csv(schedule_path, index=False)
print(f"Schedule data saved to: {schedule_path}")

# Ingest and save rosters
roster_path = config.RAW_PLAYERS_PATH / 'rosters_2002_2023.parquet'
roster_df_polars = nfl.load_rosters(seasons=SEASONS)
roster_df = roster_df_polars.to_pandas()  # nflreadpy returns Polars; convert to Pandas

# Store these identifier-like columns as text before saving. The nullable
# 'string' dtype is used instead of plain `str` so that missing values stay
# missing rather than becoming the literal text 'nan' / 'None'.
for text_col in ('jersey_number', 'draft_number'):
    if text_col in roster_df.columns:
        roster_df[text_col] = roster_df[text_col].astype('string')

roster_df.to_parquet(roster_path, index=False)
print(f"Roster data saved to: {roster_path}")


--- 2. Ingesting Schedules & Rosters (2002-2023) ---
Schedule data saved to: /content/drive/MyDrive/NFL_Prediction_System/data/raw/games/schedule_2002_2023.csv
Roster data saved to: /content/drive/MyDrive/NFL_Prediction_System/data/raw/players/rosters_2002_2023.parquet


In [4]:
# 4. Advanced data: injury reports
print("\n--- 3. Ingesting NEW Advanced Data (Injuries) ---")

# PFF data is proprietary and not served by this library, so it is skipped.
print("Skipping PFF data (proprietary).")

# --- Injury Reports ---
# Injury data only exists from the 2009 season onward; the stop value 2024 is
# exclusive, so the last season requested is 2023.
INJURY_SEASONS = range(2009, 2024)

# Fetch as a Polars frame, then hand over to Pandas for saving.
injury_df_polars = nfl.load_injuries(seasons=INJURY_SEASONS)
injury_df = injury_df_polars.to_pandas()

# Persist next to the other game-level raw files.
injury_path = config.RAW_GAMES_PATH / 'injuries_2009_2023.parquet'
injury_df.to_parquet(injury_path, index=False)
print(f"Injury data (2009-2023) saved to: {injury_path}")


--- 3. Ingesting NEW Advanced Data (Injuries) ---
Skipping PFF data (proprietary).
Injury data (2009-2023) saved to: /content/drive/MyDrive/NFL_Prediction_System/data/raw/games/injuries_2009_2023.parquet


In [5]:
# 5. Ingest Special Teams Data (REMOVED)
# No separate special-teams download is performed; this cell only records why.
# The saved play-by-play file keeps 'play_type', which is what identifies
# special-teams plays. The 'sp' column is NOT a special-teams flag: in the
# nflverse play-by-play schema it marks plays on which a score occurred.
print("\n--- 4. Ingesting Special Teams Data (REMOVED) ---")
print("NOTE: Separate ST file is not needed.")
print("All ST plays are already in the PBP data "
      "(filter 'play_type' on punt, kickoff, field_goal, extra_point).")


--- 4. Ingesting Special Teams Data (REMOVED) ---
NOTE: Separate ST file is not needed.
All ST plays are already in the PBP data (where 'sp' == 1).


In [6]:
# 6. Team descriptions
print("\n--- 5. Loading Team Descriptions ---")

# Destination: a single CSV inside the team-stats raw directory.
os.makedirs(config.RAW_TEAM_STATS_PATH, exist_ok=True)
save_path = config.RAW_TEAM_STATS_PATH / 'team_descriptions.csv'

# load_teams() takes no season argument; the Polars result is converted to
# Pandas before being written out.
team_desc_df_polars = nfl.load_teams()
team_desc_df = team_desc_df_polars.to_pandas()

team_desc_df.to_csv(save_path, index=False)
print(f"Successfully loaded and saved Team Descriptions to {save_path}.")


--- 5. Loading Team Descriptions ---
Successfully loaded and saved Team Descriptions to /content/drive/MyDrive/NFL_Prediction_System/data/raw/team_stats/team_descriptions.csv.


In [7]:
# 7. Ingest Next Gen Stats (NGS)
# Downloads NGS passing and rushing tables for NGS_SEASONS and writes each to
# its own parquet file under config.RAW_PLAYERS_PATH.
print("\n--- 6. Ingesting Next Gen Stats (2016-2023) ---")
NGS_SEASONS = range(2016, 2023 + 1)
# The files below are written to RAW_PLAYERS_PATH, so that is the directory
# that has to exist. The team-stats directory is still ensured as before.
os.makedirs(config.RAW_PLAYERS_PATH, exist_ok=True)
os.makedirs(config.RAW_TEAM_STATS_PATH, exist_ok=True)

# Best-effort: a failure here is reported and the notebook carries on.
try:
    ngs_pass_df_polars = nfl.load_nextgen_stats(stat_type='passing', seasons=NGS_SEASONS)
    ngs_pass_df = ngs_pass_df_polars.to_pandas()  # Polars -> Pandas

    ngs_rush_df_polars = nfl.load_nextgen_stats(stat_type='rushing', seasons=NGS_SEASONS)
    ngs_rush_df = ngs_rush_df_polars.to_pandas()  # Polars -> Pandas

    ngs_pass_df.to_parquet(config.RAW_PLAYERS_PATH / 'ngs_passing_2016_2023.parquet', index=False)
    ngs_rush_df.to_parquet(config.RAW_PLAYERS_PATH / 'ngs_rushing_2016_2023.parquet', index=False)

    print("Successfully ingested and saved NGS passing and rushing data.")
except Exception as e:
    print(f"Could not download NGS data: {e}")


--- 6. Ingesting Next Gen Stats (2016-2023) ---
Successfully ingested and saved NGS passing and rushing data.


In [8]:
# 8. Snap counts
print("\n--- 7. Ingesting Snap Counts (2012-2023) ---")
# Stop value is exclusive, so this covers 2012 through 2023.
SNAP_SEASONS = range(2012, 2024)

# Best-effort download: any failure (fetch, conversion or save) is reported
# and execution continues.
try:
    snap_counts_df_polars = nfl.load_snap_counts(seasons=SNAP_SEASONS)
    # Library returns Polars; downstream storage goes through Pandas.
    snap_counts_df = snap_counts_df_polars.to_pandas()

    snap_counts_df.to_parquet(
        config.RAW_PLAYERS_PATH / 'snap_counts_2012_2023.parquet',
        index=False,
    )
    print("Successfully ingested and saved snap counts.")
except Exception as err:
    print(f"Could not download snap counts: {err}")


--- 7. Ingesting Snap Counts (2012-2023) ---
Successfully ingested and saved snap counts.


In [9]:
# 9. Ingest Seasonal Data
# Downloads team stats and writes them to a parquet file under
# config.RAW_TEAM_STATS_PATH.
print("\n--- 8. Ingesting Seasonal Team Data (2002-2023) ---")
# Ensure the output directory exists without relying on an earlier cell.
os.makedirs(config.RAW_TEAM_STATS_PATH, exist_ok=True)

# Best-effort: a failure here is reported and the notebook carries on.
try:
    # Request exactly the seasons named in the heading and in the output
    # filename. `seasons=True` asks for every available season, which would
    # put data outside 2002-2023 into a file labelled 2002_2023.
    # NOTE(review): the summary level is left at the library default; confirm
    # that it yields the per-season aggregation this file is meant to hold.
    seasonal_data_df_polars = nfl.load_team_stats(seasons=list(SEASONS))
    seasonal_data_df = seasonal_data_df_polars.to_pandas()  # Polars -> Pandas

    seasonal_data_df.to_parquet(config.RAW_TEAM_STATS_PATH / 'seasonal_team_data_2002_2023.parquet', index=False)
    print("Successfully ingested and saved seasonal team data.")
except Exception as e:
    print(f"Could not download seasonal data: {e}")


--- 8. Ingesting Seasonal Team Data (2002-2023) ---
Successfully ingested and saved seasonal team data.


In [10]:
# 10. Ingest Officials Data
# Downloads game-officials data for OFFICIAL_SEASONS and writes it to a parquet
# file under config.RAW_GAMES_PATH.
# The heading states the range that is actually requested (first season 2015).
print("\n--- 9. Ingesting Game Officials Data (2015-2023) ---")
OFFICIAL_SEASONS = range(2015, 2023 + 1)

# Best-effort: a failure here is reported and the notebook carries on.
try:
    officials_df_polars = nfl.load_officials(seasons=OFFICIAL_SEASONS)
    officials_df = officials_df_polars.to_pandas()  # Polars -> Pandas

    # The filename still says 2014 although the data starts in 2015; it is kept
    # unchanged so that anything already reading this path keeps working.
    officials_df.to_parquet(config.RAW_GAMES_PATH / 'officials_2014_2023.parquet', index=False)
    print("Successfully ingested and saved officials data.")
except Exception as e:
    print(f"Could not download officials data: {e}")


--- 9. Ingesting Game Officials Data (2014-2023) ---
Successfully ingested and saved officials data.


In [11]:
# 11. Final report
# Lists the three raw-data directories configured in `config`, one per line.
print("\n--- Ingestion Complete ---")
print("All raw data files have been downloaded and saved to:")
for raw_dir in (
    config.RAW_GAMES_PATH,
    config.RAW_PLAYERS_PATH,
    config.RAW_TEAM_STATS_PATH,
):
    print(f"- {raw_dir}")
print("\nYou can now delete '06_advanced_features.ipynb'.")


--- Ingestion Complete ---
All raw data files have been downloaded and saved to:
- /content/drive/MyDrive/NFL_Prediction_System/data/raw/games
- /content/drive/MyDrive/NFL_Prediction_System/data/raw/players
- /content/drive/MyDrive/NFL_Prediction_System/data/raw/team_stats

You can now delete '06_advanced_features.ipynb'.
