In [46]:
# 1. Imports and Config
import pandas as pd
import numpy as np
import os
import sys
from tqdm import tqdm

# --- Add Project Root to Python Path ---
try:
    PROJECT_ROOT = '/content/drive/MyDrive/NFL_Prediction_System'
    if PROJECT_ROOT not in sys.path:
        sys.path.append(str(PROJECT_ROOT))
except:
    pass

from src.utils import config

# Ensure the output directory exists
os.makedirs(config.FEATURES_DIR, exist_ok=True)

# Settings
pd.set_option('display.max_columns', None)
ROLL_WINDOW_LIST = [3, 5, 8]  # The windows we want to test

print("Libraries set up.")

Libraries set up.


In [47]:
# 2. Load All Datasets
print("Loading datasets...")

# A. Load Cleaned Game Data (The Stats)
try:
    game_data = pd.read_parquet(config.CLEANED_GAMES_PATH / 'game_data_2002_2023.parquet')
    print(f"✅ Loaded Cleaned Game Data: {game_data.shape}")

    # Quick Sanity Check
    if 'home_off_epa_per_play' in game_data.columns:
        print("   -> Confirmed: EPA stats are present.")
    else:
        raise ValueError("CRITICAL: Cleaned data is missing EPA stats!")

except FileNotFoundError:
    print("❌ Error: Cleaned game data not found. Please run 02_cleaning.ipynb.")

# B. Load Helper Files (for Rest, Rookies, Injuries)
try:
    schedule_df = pd.read_csv(config.RAW_GAMES_PATH / 'schedule_2002_2023.csv')
    roster_df = pd.read_parquet(config.RAW_PLAYERS_PATH / 'rosters_2002_2023.parquet')
    injury_df = pd.read_parquet(config.RAW_GAMES_PATH / 'injuries_2009_2023.parquet')
    seasonal_df = pd.read_parquet(config.RAW_TEAM_STATS_PATH / 'seasonal_team_data_2002_2023.parquet')
    print(f"✅ Loaded Helper Data: Schedule, Rosters, Injuries, Seasonal Stats.")
except Exception as e:
    print(f"⚠️ Warning: Some helper files could not be loaded ({e}). Some features may be skipped.")

Loading datasets...
✅ Loaded Cleaned Game Data: (5679, 37)
   -> Confirmed: EPA stats are present.
✅ Loaded Helper Data: Schedule, Rosters, Injuries, Seasonal Stats.


# Part 1: Rolling Feature Functions (The "Engine")

In [48]:
# 3. Create Rolling Features
print("Generating rolling features...")

# Step A: Convert to 'Long' format (Team-Game level)
# We take the clean game_data and stack home/away rows
stats_cols = [
    'off_epa_per_play', 'off_success_rate', 'off_pass_epa', 'off_run_epa', 'off_turnovers',
    'def_epa_per_play', 'def_success_rate', 'def_pass_epa', 'def_run_epa', 'def_turnovers_forced'
]

# Home Perspective
home_df = game_data[['game_id', 'season', 'week', 'home_team']].rename(columns={'home_team': 'team'})
home_df['opponent'] = game_data['away_team']
home_df['is_home'] = 1
for col in stats_cols:
    home_df[col] = game_data[f'home_{col}']

# Away Perspective
away_df = game_data[['game_id', 'season', 'week', 'away_team']].rename(columns={'away_team': 'team'})
away_df['opponent'] = game_data['home_team']
away_df['is_home'] = 0
for col in stats_cols:
    away_df[col] = game_data[f'away_{col}']

# Combine
long_df = pd.concat([home_df, away_df]).sort_values(by=['team', 'season', 'week'])

# Step B: Calculate Rolling Averages
# Group by Team and Season to prevent leakage across years
grouped = long_df.groupby(['team', 'season'])
all_rolling = [long_df]

for window in ROLL_WINDOW_LIST:
    # Shift(1) is CRITICAL to prevent data leakage (using current game to predict current game)
    rolled = grouped[stats_cols].shift(1).rolling(window=window, min_periods=1).mean()
    rolled.columns = [f'{col}_roll{window}' for col in stats_cols]
    all_rolling.append(rolled)

# Concatenate features back to long_df
long_with_features = pd.concat(all_rolling, axis=1)

# Step C: Merge back to 'Wide' format (Game level)
# Split back into Home and Away to merge on game_id
home_feats = long_with_features[long_with_features['is_home'] == 1]
away_feats = long_with_features[long_with_features['is_home'] == 0]

# Keep only the ID columns and the new rolling columns
keep_cols = ['game_id'] + [c for c in home_feats.columns if '_roll' in c]

features_df = pd.merge(
    game_data, # Start with original game data
    home_feats[keep_cols].rename(columns={c: f'home_{c}' for c in keep_cols if c != 'game_id'}),
    on='game_id', how='left'
)
features_df = pd.merge(
    features_df,
    away_feats[keep_cols].rename(columns={c: f'away_{c}' for c in keep_cols if c != 'game_id'}),
    on='game_id', how='left'
)

# Step D: Create Matchup Differentials (Home - Away)
# --- FIX: Explicit mapping for offense vs defense stats ---
stat_map = {
    'off_epa_per_play': 'def_epa_per_play',
    'off_success_rate': 'def_success_rate',
    'off_pass_epa': 'def_pass_epa',
    'off_run_epa': 'def_run_epa',
    'off_turnovers': 'def_turnovers_forced' # This was the one breaking it!
}

for window in ROLL_WINDOW_LIST:
    for off_stat, def_stat in stat_map.items():
        home_off_col = f'home_{off_stat}_roll{window}'
        away_def_col = f'away_{def_stat}_roll{window}'

        away_off_col = f'away_{off_stat}_roll{window}'
        home_def_col = f'home_{def_stat}_roll{window}'

        # Matchup: Home Offense vs Away Defense
        features_df[f'home_{off_stat}_matchup_roll{window}'] = features_df[home_off_col] - features_df[away_def_col]

        # Matchup: Away Offense vs Home Defense
        features_df[f'away_{off_stat}_matchup_roll{window}'] = features_df[away_off_col] - features_df[home_def_col]

print(f"Rolling features created. Current shape: {features_df.shape}")

Generating rolling features...
Rolling features created. Current shape: (5679, 127)


In [49]:
# 4. Add Context Features (Rest, Rookies, Injuries)
print("Adding context features...")

# --- A. Rest Days ---
# Requires 'schedule_df' (raw) because it has the dates
def calculate_rest(df):
    df['gameday'] = pd.to_datetime(df['gameday'])
    # Stack to get one row per team-game
    h = df[['season', 'week', 'gameday', 'home_team']].rename(columns={'home_team': 'team'})
    a = df[['season', 'week', 'gameday', 'away_team']].rename(columns={'away_team': 'team'})
    combined = pd.concat([h, a]).sort_values(['team', 'season', 'week'])

    # Calculate days since last game
    combined['prev_game'] = combined.groupby(['team', 'season'])['gameday'].shift(1)
    combined['rest_days'] = (combined['gameday'] - combined['prev_game']).dt.days.fillna(7) # Default 7
    combined['rest_days'] = combined['rest_days'].clip(upper=15) # Cap at 15
    return combined[['season', 'week', 'team', 'rest_days']]

try:
    rest_data = calculate_rest(schedule_df)
    # Merge Rest
    features_df = features_df.merge(rest_data, left_on=['season', 'week', 'home_team'], right_on=['season', 'week', 'team'], how='left').rename(columns={'rest_days': 'home_rest'}).drop(columns='team')
    features_df = features_df.merge(rest_data, left_on=['season', 'week', 'away_team'], right_on=['season', 'week', 'team'], how='left').rename(columns={'rest_days': 'away_rest'}).drop(columns='team')
    features_df['rest_advantage'] = features_df['home_rest'] - features_df['away_rest']
    print(" -> Rest days added.")
except Exception as e:
    print(f" -> Skipped Rest Days: {e}")

# --- B. Rookie QBs ---
try:
    # Identify rookie QBs from roster data
    rookie_qbs = roster_df[(roster_df['position'] == 'QB') & (roster_df['season'] == roster_df['draft_year'])]
    # (Simplified logic: check if the 'qb_name' or 'qb_id' matches a rookie.
    #  For this version, we will check if the team *drafted* a QB that year and assume they might play.)
    #  A more precise way requires snap counts, but this is a good proxy.
    rookie_map = rookie_qbs.groupby(['season', 'team']).size().reset_index(name='has_rookie_qb')

    features_df = features_df.merge(rookie_map, left_on=['season', 'home_team'], right_on=['season', 'team'], how='left').rename(columns={'has_rookie_qb': 'home_rookie_qb'}).drop(columns='team')
    features_df = features_df.merge(rookie_map, left_on=['season', 'away_team'], right_on=['season', 'team'], how='left').rename(columns={'has_rookie_qb': 'away_rookie_qb'}).drop(columns='team')
    features_df[['home_rookie_qb', 'away_rookie_qb']] = features_df[['home_rookie_qb', 'away_rookie_qb']].fillna(0)
    print(" -> Rookie QB flags added.")
except Exception as e:
    print(f" -> Skipped Rookie QBs: {e}")

# --- C. Injuries ---
try:
    # Count players with "Out" or "IR" status
    key_pos = ['QB', 'RB', 'WR', 'TE', 'OL', 'DL', 'LB', 'DB']
    inj_counts = injury_df[
        (injury_df['report_status'].isin(['Out', 'Injured Reserve'])) &
        (injury_df['position'].str.contains('|'.join(key_pos)))
    ].groupby(['season', 'week', 'team']).size().reset_index(name='key_injuries')

    features_df = features_df.merge(inj_counts, left_on=['season', 'week', 'home_team'], right_on=['season', 'week', 'team'], how='left').rename(columns={'key_injuries': 'home_injuries'}).drop(columns='team')
    features_df = features_df.merge(inj_counts, left_on=['season', 'week', 'away_team'], right_on=['season', 'week', 'team'], how='left').rename(columns={'key_injuries': 'away_injuries'}).drop(columns='team')
    features_df[['home_injuries', 'away_injuries']] = features_df[['home_injuries', 'away_injuries']].fillna(0)
    print(" -> Injury counts added.")
except Exception as e:
    print(f" -> Skipped Injuries (Data might not be loaded or column names differ): {e}")

Adding context features...
 -> Rest days added.
 -> Skipped Rookie QBs: 'draft_year'
 -> Injury counts added.


In [50]:
# 5. Save Master Feature Set
print("\nFinalizing dataset...")

# Drop early weeks with NaNs (due to rolling windows)
features_df = features_df.dropna(subset=[f'home_off_epa_per_play_roll{ROLL_WINDOW_LIST[-1]}'])

# Filter for relevant years (2003 onwards, as 2002 is mostly burnt for lag creation)
features_df = features_df[features_df['season'] >= 2003]

print(f"Final shape: {features_df.shape}")

# Save
save_path = config.FEATURES_DIR / 'features_master_2003_2023.parquet'
features_df.to_parquet(save_path, index=False)
print(f"Saved Master Features to: {save_path}")


Finalizing dataset...
Final shape: (5071, 132)
Saved Master Features to: /content/drive/MyDrive/NFL_Prediction_System/data/features/features_master_2003_2023.parquet


In [51]:
# 11. --- Final Cleanup and Save ---
print("\n--- Final Cleanup and Save ---")

# Drop any game from before 2003 (since 2002 has no "last_season" data)
features_df_final = features_df[features_df['season'] >= 2003].copy()

# Drop rows with any remaining NaNs (e.g., from rolling windows in early season)
# This is critical for model training
features_df_final = features_df_final.dropna(subset=[col for col in features_df_final.columns if '_roll' in col])

# Define final columns to keep
# We want game info, the target, spread_line, and our engineered features
base_cols = ['game_id', 'season', 'week', 'home_team', 'away_team', 'home_score', 'away_score', 'spread_line', 'home_win']

# Get all our engineered feature columns
# Note: We filter columns dynamically to avoid KeyErrors if a step was skipped
potential_features = [
    'is_dome', 'temperature', 'wind_speed',
    'rookie_qb_matchup', 'home_rookie_qb', 'away_rookie_qb',
    'rest_advantage', 'home_rest', 'away_rest',
    'home_last_season_win_pct', 'away_last_season_win_pct', 'last_season_win_pct_adv',
    'home_key_players_out', 'away_key_players_out', 'injury_advantage'
]
existing_features = [col for col in potential_features if col in features_df_final.columns]
rolling_features = [col for col in features_df_final.columns if '_roll' in col or '_matchup' in col]

# Combine all columns
final_cols = base_cols + existing_features + list(set(rolling_features) - set(existing_features))

# Select only the columns that exist
final_df = features_df_final[final_cols].copy()

# Save the master feature dataset
save_path = config.FEATURES_DIR / 'features_master_2003_2023.parquet'
final_df.to_parquet(save_path, index=False)

print(f"Final master feature dataset shape: {final_df.shape}")
print(f"\nSuccessfully created and saved master feature dataset to:")
print(save_path)


--- Final Cleanup and Save ---
Final master feature dataset shape: (4753, 102)

Successfully created and saved master feature dataset to:
/content/drive/MyDrive/NFL_Prediction_System/data/features/features_master_2003_2023.parquet
