In [86]:
# import needed libraries 
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [98]:
# import nfl_data_py for direct stream to data
import nfl_data_py as nfl
from nfl_data_py import import_schedules

# Suffixes used to disambiguate overlapping column names, keyed by the merge
# they are meant for.
MERGE_SUFFIXES = dict(
    opp_def='_opp',          # def stats
    schedule='_sched',       # schedule-based info
    season_avg='_season',    # league-season averages
    team_results='_res',     # team results & streak
)

# Known alternate team abbreviations -> the abbreviation used throughout this
# notebook. Extend as more mismatches are discovered.
TEAM_FIX = {
    'WSH': 'WAS',
    'JAC': 'JAX',
}

# Play-by-play rows for the 2021-2023 seasons (the range end is exclusive).
play_by_play = nfl.import_pbp_data(years=range(2021, 2024), downcast=True)

2021 done.
2022 done.
2023 done.
Downcasting floats.


In [105]:
# Load weekly player stats and game schedules for the 2021-2023 seasons
df = nfl.import_weekly_data(years=[2021,2022,2023])
schedules = nfl.import_schedules([2021, 2022, 2023])

# Normalize team abbreviations FIRST, so every frame derived from `schedules`
# below (points allowed, home/away flags, results) uses the same codes as the
# player and defense frames it is later merged with.
schedules['home_team'] = schedules['home_team'].replace(TEAM_FIX)
schedules['away_team'] = schedules['away_team'].replace(TEAM_FIX)

# Create QB dataset for future use and testing
keep_cols_qbs = [
    'player_id', 'player_name', 'season', 'season_type','week', 'recent_team', 'opponent_team', 'passing_yards', 'attempts',
    'completions', 'passing_tds', 'interceptions', 'rushing_yards', 'rushing_tds', 'fantasy_points', 'fantasy_points_ppr', 'sacks',
]
# .copy() gives an independent frame: the column assignments below must not
# write into (or warn about) a slice of `df`.
qbs = df.loc[df['position'] == 'QB', keep_cols_qbs].copy()
qbs['recent_team'] = qbs['recent_team'].replace(TEAM_FIX)
qbs['opponent_team'] = qbs['opponent_team'].replace(TEAM_FIX)

# Create defensive dataset: one row per (season, defending team, week) with
# the totals that defense gave up / produced in the play-by-play rows.
defense = (
    play_by_play.groupby(['season', 'defteam', 'week'])
    .agg(
        pass_yards_allowed = ('passing_yards', 'sum'),
        rush_yards_allowed = ('rushing_yards', 'sum'),
        sacks = ('sack', 'sum'),
        interceptions_forced = ('interception', 'sum'),
        fumbles_forced = ('fumble_forced', 'sum'),
        epa_allowed = ('epa', 'sum'),
        plays = ('play_id', 'count')   # counts every play-by-play row for the group
    )
    .reset_index()
    .rename(columns = {'defteam': 'team'})
)
# Same abbreviation normalization as the other frames, applied before merging.
# NOTE(review): assumes no team appears under two abbreviations within one
# (season, week); otherwise the aggregation above would need to run after the fix.
defense['team'] = defense['team'].replace(TEAM_FIX)
defense['total_yards_allowed'] = defense['pass_yards_allowed'] + defense['rush_yards_allowed']

# Points allowed by the home team are the away team's score ...
home_def = schedules[['season', 'week', 'home_team', 'away_score']].copy()
home_def = home_def.rename(columns={
    'home_team': 'team',
    'away_score': 'points_allowed'
})

# ... and points allowed by the away team are the home team's score.
away_def = schedules[['season', 'week', 'away_team', 'home_score']].copy()
away_def = away_def.rename(columns={
    'away_team': 'team',
    'home_score': 'points_allowed'
})

# One row per (season, week, team) with the points that team allowed
def_points = pd.concat([home_def, away_def], ignore_index = True)

# Left merge keeps every defense row; points_allowed is NaN where no
# schedule row matches.
defense = defense.merge(def_points, on = ['season', 'team', 'week'], how = 'left')

Downcasting floats.


In [73]:
# Validation snippet (disabled) -- checks for missing values, dtypes and dupes:
#   print(df.isna().sum())
#   print(df.dtypes)
#   print(df.groupby(['season','week','player_name']).size())

# Remove any schedule-derived columns left on `qbs` (names containing
# home_team / away_team / home_game) so they can be rebuilt cleanly.
cols_to_drop = [
    c for c in qbs.columns
    if any(marker in c for marker in ('home_team', 'away_team', 'home_game'))
]
qbs = qbs.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# Create helper function to designate home/away flag 
def add_home_away_flags(pos_df, schedules):
    """Attach a ``home_game`` flag (1 = home, 0 = away) to each player-week row.

    Parameters
    ----------
    pos_df : DataFrame with ``season``, ``week`` and ``recent_team`` columns.
    schedules : DataFrame with ``season``, ``week``, ``home_team`` and
        ``away_team`` columns.

    Returns
    -------
    A new DataFrame: ``pos_df`` left-joined with the flag. Rows with no
    matching schedule entry get NaN.
    """
    key_cols = ['season', 'week']

    # Build one (season, week, team, home_game) lookup per side of the game.
    per_side = []
    for side_col, flag in (('home_team', 1), ('away_team', 0)):
        side = schedules[key_cols + [side_col]].copy()
        side['home_game'] = flag
        per_side.append(side.rename(columns={side_col: 'team'}))
    lookup = pd.concat(per_side, ignore_index=True)

    # Join on the player's team for that week.
    result = pos_df.merge(
        lookup,
        left_on=key_cols + ['recent_team'],
        right_on=key_cols + ['team'],
        how='left',
    )
    result = result.drop(columns=['team'], errors='ignore')

    # If pos_df already carried a home_game column the merge suffixes both
    # copies, so guarantee a plain `home_game` column is present.
    if 'home_game' not in result.columns:
        result['home_game'] = np.nan
    return result

# Fix below by recalculating win streak without cumsum

# Create helper function to create win/loss flags and win(loss) streak
def add_team_win_streaks(pos_df, schedules):
    """Add pre-game team form features to each player-week row.

    Adds two columns, both describing the team's state *entering* the game
    (the game's own result is never included):

    * ``team_win_streak`` - consecutive wins immediately before this game,
      reset at the start of every season. NaN when the row has no matching
      schedule entry.
    * ``team_won_last`` - 1 if the team won its previous game of the same
      season, else 0 (also 0 for the first game and for unmatched rows).

    Both are computed once per team-game from ``schedules`` and then joined
    onto ``pos_df``. Computing them by shifting rows of ``pos_df`` would be
    wrong, because several players share the same team-week.

    Parameters
    ----------
    pos_df : DataFrame with ``season``, ``week`` and ``recent_team`` columns.
    schedules : DataFrame with ``season``, ``week``, ``home_team``,
        ``away_team``, ``home_score``, ``away_score`` and either
        ``season_type`` or ``game_type``.

    Returns
    -------
    A new DataFrame; ``pos_df`` itself is not modified.

    Raises
    ------
    pandas.errors.MergeError
        If ``schedules`` holds more than one game for a (season, week, team).
    """
    sch = schedules.copy()
    if 'season_type' not in sch.columns and 'game_type' in sch.columns:
        # Rename to match the name used below
        sch = sch.rename(columns={'game_type': 'season_type'})

    # Collapse the individual playoff rounds into a single POST label
    sch['season_type'] = sch['season_type'].replace(
        {'WC': 'POST', 'DIV': 'POST', 'CON': 'POST', 'SB': 'POST'}
    )

    def _team_view(team_col, own_score, opp_score):
        """One row per game from the perspective of the team in `team_col`."""
        view = sch[['season', 'season_type', 'week', team_col]].rename(columns={team_col: 'team'})
        # A tie or a missing score compares False, i.e. counts as "not a win".
        view['team_win'] = (sch[own_score] > sch[opp_score]).astype(int)
        return view

    team_results = pd.concat(
        [
            _team_view('home_team', 'home_score', 'away_score'),
            _team_view('away_team', 'away_score', 'home_score'),
        ],
        ignore_index=True,
    ).dropna(subset=['team'])

    # Order regular season before postseason within a season
    team_results['stype_order'] = team_results['season_type'].map({'REG': 1, 'POST': 2}).fillna(1)
    team_results = (
        team_results
        .sort_values(['team', 'season', 'stype_order', 'week'])
        .reset_index(drop=True)
    )

    def _streak_entering_game(wins):
        """Win streak before each game: record the count, then update it."""
        streak, entering = 0, []
        for won in wins:
            entering.append(streak)
            streak = streak + 1 if won == 1 else 0
        return pd.Series(entering, index=wins.index)

    wins_by_team_season = team_results.groupby(['team', 'season'])['team_win']
    streaks = wins_by_team_season.transform(_streak_entering_game)
    won_last = wins_by_team_season.shift(1).fillna(0).astype(int)
    team_results['team_win_streak'] = streaks
    team_results['team_won_last'] = won_last

    # Private join-key name so an existing `team` column on pos_df is untouched.
    lookup = team_results[
        ['season', 'week', 'team', 'team_win_streak', 'team_won_last']
    ].rename(columns={'team': '_streak_team'})

    # Drop stale copies so re-running does not yield _x/_y duplicates.
    pos_df = pos_df.drop(columns=['team_win_streak', 'team_won_last'], errors='ignore')

    merged = pos_df.merge(
        lookup,
        left_on=['season', 'week', 'recent_team'],
        right_on=['season', 'week', '_streak_team'],
        how='left',
        validate='m:1',
    ).drop(columns=['_streak_team'])

    # Rows without a schedule match have no known previous result -> 0
    merged['team_won_last'] = merged['team_won_last'].fillna(0).astype(int)
    return merged

# Create rolling 3 week features and general stats
def add_qb_stats(df):
    """Add efficiency, rolling 3-game and momentum features for each QB.

    All group-wise features are produced with ``groupby(...).transform`` (or
    the built-in ``shift`` / ``diff``), which return results on the frame's own
    index. Resetting the index of an ``apply`` result instead would assign
    values by label onto a sorted, filtered frame and put them on the wrong
    rows.

    Columns added
    -------------
    completion_pct, yards_per_attempt
        Same-game ratios; NaN when ``attempts`` is 0.
    <stat>_roll3_mean, <stat>_roll3_std
        Mean / std of the previous three games per player (the window may span
        seasons; the current game is excluded by the shift).
    prev_passing_yards
        Passing yards in the previous game of the same season.
    prev_yards_minus_roll3mean
        ``prev_passing_yards`` minus the same-season mean of the previous
        three games.
    passing_yards_volatility
        Same-season std of the previous three games' passing yards.
    passing_trend_3
        Current passing yards minus the value three games earlier (same season).
    completion_trend_3
        Same-season 3-game rolling mean of game-over-game relative change in
        completions.

    NOTE(review): ``passing_trend_3`` and ``completion_trend_3`` include the
    current game's value. If these feed a model that predicts the current game
    they leak the target - confirm intended use.

    Parameters
    ----------
    df : DataFrame with ``player_id``, ``season``, ``week``, ``passing_yards``,
        ``attempts``, ``completions``, ``passing_tds``, ``interceptions``.

    Returns
    -------
    A new DataFrame sorted by player, season and week; the input is not modified.
    """
    df = df.sort_values(['player_id', 'season', 'week'])

    # Completion percentage and efficiency metrics (0 attempts -> NaN, not inf)
    attempts = df['attempts'].replace(0, np.nan)
    df['completion_pct'] = df['completions'] / attempts
    df['yards_per_attempt'] = df['passing_yards'] / attempts

    # Rolling mean and standard deviation of the three games before this one
    for stat in ['passing_yards', 'attempts', 'passing_tds', 'interceptions', 'completion_pct', 'yards_per_attempt']:
        df[f'{stat}_roll3_mean'] = df.groupby('player_id')[stat].transform(lambda x: x.shift().rolling(3).mean())
        df[f'{stat}_roll3_std'] = df.groupby('player_id')[stat].transform(lambda x: x.shift().rolling(3).std())

    # Momentum indicators, restricted to games within the same season
    season_keys = ['player_id', 'season']
    season_yards = df.groupby(season_keys)['passing_yards']

    prev_yards = season_yards.shift(1)
    season_roll3_mean = season_yards.transform(lambda s: s.shift().rolling(3).mean())
    season_roll3_std = season_yards.transform(lambda s: s.shift().rolling(3).std())
    trend_3 = season_yards.diff(periods=3)

    def _rolling_relative_change(s):
        """3-game rolling mean of (s - previous) / previous within one group."""
        prev = s.shift(1)
        # A previous value of 0 would give +/-inf; treat it as undefined.
        return ((s - prev) / prev.replace(0, np.nan)).rolling(3).mean()

    completion_trend = df.groupby(season_keys)['completions'].transform(_rolling_relative_change)

    df['prev_passing_yards'] = prev_yards
    df['prev_yards_minus_roll3mean'] = prev_yards - season_roll3_mean
    df['passing_yards_volatility'] = season_roll3_std
    df['passing_trend_3'] = trend_3
    df['completion_trend_3'] = completion_trend

    return df

# Create defense merger
def merge_defense_features(pos_df, defense_df):
    """Join opponent-defense features onto each player-week row.

    Three groups of columns are added:

    1. The opposing defense's weekly row from ``defense_df`` (matched on
       season, week and ``opponent_team``). Any defense column whose name
       collides with a ``pos_df`` column (e.g. ``sacks``) gets the
       ``MERGE_SUFFIXES['opp_def']`` suffix, so the player's own column keeps
       its name.
    2. ``opp_avg_*`` - per (season, opponent) means of what the players in
       ``pos_df`` produced against that opponent.
    3. ``season_avg_*`` - per-season means over ``pos_df`` - plus
       ``opp_pass_efficiency_index`` = ``opp_avg_pass_yards_allowed`` /
       ``season_avg_pass_yards``.

    NOTE(review): groups 2 and 3 average over the *whole* season, including
    the current and later weeks, so they leak future information into earlier
    rows. They should become to-date (expanding, shifted) averages.

    Parameters
    ----------
    pos_df : DataFrame with ``season``, ``week``, ``opponent_team``,
        ``passing_yards``, ``passing_tds``, ``interceptions`` and ``sacks``.
    defense_df : DataFrame with ``season``, ``week``, ``team`` and the weekly
        defensive stat columns.

    Returns
    -------
    A new DataFrame with one row per ``pos_df`` row (assuming ``defense_df``
    has at most one row per season/team/week).
    """
    defense_df = defense_df.rename(columns = {'team': 'defense_team'})

    merged = pos_df.merge(
        defense_df,
        left_on = ['season', 'opponent_team', 'week'],
        right_on = ['season', 'defense_team', 'week'],
        how = 'left',
        # Keep pos_df names intact; suffix only the colliding defense columns
        suffixes = ('', MERGE_SUFFIXES['opp_def'])
    ).drop(columns = ['defense_team'])

    # Average production allowed by each opponent over the season
    opp_def_stats = (
        pos_df.groupby(['season', 'opponent_team'])
            .agg(
                opp_avg_pass_yards_allowed=('passing_yards', 'mean'),
                opp_avg_ints_forced=('interceptions', 'mean'),
                opp_avg_sacks=('sacks', 'mean')
            )
            .reset_index()
    )

    # Merge defensive stats to the QB dataset
    merged = merged.merge(opp_def_stats, on=['season', 'opponent_team'], how='left', suffixes=('', MERGE_SUFFIXES['opp_def']))

    # League-wide season averages, used to normalize the opponent metrics
    season_avgs = (
        pos_df.groupby('season')[['passing_yards', 'passing_tds', 'interceptions']].mean()
        .rename(columns={
            'passing_yards': 'season_avg_pass_yards',
            'passing_tds': 'season_avg_pass_tds',
            'interceptions': 'season_avg_ints'
        })
        .reset_index()
    )

    # TODO: fix season_avg so there is no data leakage. Ensure week 4 only has data up to that point.

    merged = merged.merge(season_avgs, on='season', how='left')

    # The season average was renamed above, so reference the renamed column.
    # A zero season average would divide to inf; treat it as undefined.
    season_pass_avg = merged['season_avg_pass_yards'].replace(0, np.nan)
    merged['opp_pass_efficiency_index'] = merged['opp_avg_pass_yards_allowed'] / season_pass_avg

    return merged

# Create final safe merge for all team context metrics
def merge_team_context(pos_df, schedules):
    """Attach ``team_win`` and ``home_game`` to each player-week row.

    Each scheduled game is expanded into two rows, one per participating team,
    and the result is left-joined onto ``pos_df`` by season, week, the player's
    team and the opponent. Rows with no schedule match get NaN for both added
    columns. The output holds at most one row per (player_id, season, week).

    Parameters
    ----------
    pos_df : DataFrame with ``player_id``, ``season``, ``week``,
        ``recent_team`` and ``opponent_team``.
    schedules : DataFrame with ``season``, ``week``, ``home_team``,
        ``away_team``, ``home_score`` and ``away_score``. Not modified.
    """
    sched = schedules.copy()

    # Win flags; a tie or a missing score compares False -> 0 for both sides
    sched['home_win'] = sched['home_score'].gt(sched['away_score']).astype(int)
    sched['away_win'] = sched['away_score'].gt(sched['home_score']).astype(int)

    # One frame per perspective: (own team, other team, win flag, is-home)
    perspectives = []
    for own, other, win_col, is_home in (
        ('home_team', 'away_team', 'home_win', 1),
        ('away_team', 'home_team', 'away_win', 0),
    ):
        view = sched[['season', 'week', 'home_team', 'away_team', win_col]].rename(
            columns={own: 'team', other: 'opponent_team', win_col: 'team_win'}
        )
        view['home_game'] = is_home
        perspectives.append(view)

    # Unified team-game table, one row per (season, week, team)
    team_games = (
        pd.concat(perspectives, ignore_index=True)
        .dropna(subset=['team', 'opponent_team'])
        .drop_duplicates(subset=['season', 'week', 'team'])
    )

    joined = pos_df.merge(
        team_games,
        left_on=['season', 'week', 'recent_team', 'opponent_team'],
        right_on=['season', 'week', 'team', 'opponent_team'],
        how='left',
    )

    # Remove the helper join key, then de-duplicate per player-week
    joined = joined.drop(columns=['team'], errors='ignore')
    return joined.drop_duplicates(subset=['player_id', 'season', 'week'])

# Build QB dataset (final)
def build_qb_dataset(qbs, schedules, defense):
    """Assemble the final QB modelling table.

    Runs the feature steps in order - team context, team win streaks,
    QB rolling stats, opponent-defense features - and then adds:

    * ``bye_last_week`` - 1 when more than one week separates this game from
      the player's previous game of the same season. NOTE(review): a gap also
      appears when the player simply missed a game, so this is "gap before
      this game" rather than strictly a bye.
    * ``season_week`` - copy of ``week``.
    * ``is_playoffs`` - 1 when ``season_type`` is anything other than 'REG'.

    Parameters
    ----------
    qbs : QB player-week DataFrame (must include ``season_type``).
    schedules : game schedule DataFrame.
    defense : weekly team-defense DataFrame.

    Returns
    -------
    A new DataFrame; the ``qbs`` argument is copied, not modified.
    """
    qbs = qbs.copy()

    qbs = merge_team_context(qbs, schedules)
    # The streak helper needs the schedule to derive team results
    qbs = add_team_win_streaks(qbs, schedules)
    qbs = add_qb_stats(qbs)
    qbs = merge_defense_features(qbs, defense)

    # Bye week tracker: gap of more than one week since the previous game
    prev_week = qbs.groupby(['player_id', 'season'])['week'].shift(1)
    qbs['bye_last_week'] = ((qbs['week'] - prev_week) > 1).astype(int)

    # Season week index
    qbs['season_week'] = qbs['week']

    # Playoffs indicator. The player frame carries `season_type`; `game_type`
    # exists only on the schedule and is never merged in.
    qbs['is_playoffs'] = (qbs['season_type'] != 'REG').astype(int)

    return qbs

SyntaxError: invalid syntax (392652199.py, line 169)

In [None]:
# Create predictive metrics for QB: run the full feature pipeline
# (team context, win streaks, rolling QB stats, opponent-defense features).
# NOTE(review): `qbs` is overwritten with the enriched frame, so re-running this
# cell passes already-enriched data back into build_qb_dataset. Re-run the
# loading cell first to start from the raw QB frame.

qbs = build_qb_dataset(qbs, schedules, defense)

# Preview the first 10 rows of the result
print(qbs.head(10))

'qb_name = "P.Mahomes"\nqb_df = qbs[(qbs[\'player_name\'] == qb_name) & (qbs[\'season\'] == 2023)]\n\nqb_df = qb_df.sort_values([\'season\', \'week\'])\n\nplt.figure(figsize=(10,5))\nplt.plot(qb_df[\'week\'], qb_df[\'passing_yards\'], label=\'Actual Passing Yards\', marker=\'o\')\nplt.plot(qb_df[\'week\'], qb_df[\'passing_yards_roll3_mean\'], label=\'3-Game Rolling Mean\', marker=\'x\')\nplt.title(f"{qb_name} - Passing Yards vs 3-Game Rolling Mean")\nplt.xlabel("Week")\nplt.ylabel("Passing Yards")\nplt.legend()\nplt.grid(True)\nplt.show()\n\nqb_df[[\'season\', \'week\', \'passing_yards\', \'passing_yards_roll3_mean\']].head(10) '

In [76]:
# Check df to ensure there are no dupes or other errors
#print(qbs.sample(10))
#print(qbs[['player_id', 'season', 'week']].duplicated().sum())
#print(qbs['home_game'].value_counts(dropna=False))
# The triple-quoted string below is a disabled plotting snippet (passing yards
# vs. 3-game rolling mean with a volatility band for one QB in 2023). It is a
# plain string expression: nothing is plotted, and because it is the last
# expression in the cell its repr is echoed as the cell output.
'''qb_name = "P.Mahomes"
qb_df = qbs[(qbs['player_name'] == qb_name) & (qbs['season'] == 2023)]

plt.figure(figsize=(10,5))
plt.plot(qb_df['week'], qb_df['passing_yards'], label='Passing Yards', marker='o')
plt.plot(qb_df['week'], qb_df['passing_yards_roll3_mean'], label='3-Game Rolling Mean', marker='x')
plt.fill_between(qb_df['week'], 
                 qb_df['passing_yards_roll3_mean'] - qb_df['passing_yards_volatility'],
                 qb_df['passing_yards_roll3_mean'] + qb_df['passing_yards_volatility'],
                 alpha=0.2, label='Volatility Range')
plt.title(f"{qb_name} - Passing Yards, Rolling Mean & Volatility (2023)")
plt.xlabel("Week")
plt.ylabel("Passing Yards")
plt.legend()
plt.grid(True)
plt.show()'''

'qb_name = "P.Mahomes"\nqb_df = qbs[(qbs[\'player_name\'] == qb_name) & (qbs[\'season\'] == 2023)]\n\nplt.figure(figsize=(10,5))\nplt.plot(qb_df[\'week\'], qb_df[\'passing_yards\'], label=\'Passing Yards\', marker=\'o\')\nplt.plot(qb_df[\'week\'], qb_df[\'passing_yards_roll3_mean\'], label=\'3-Game Rolling Mean\', marker=\'x\')\nplt.fill_between(qb_df[\'week\'], \n                 qb_df[\'passing_yards_roll3_mean\'] - qb_df[\'passing_yards_volatility\'],\n                 qb_df[\'passing_yards_roll3_mean\'] + qb_df[\'passing_yards_volatility\'],\n                 alpha=0.2, label=\'Volatility Range\')\nplt.title(f"{qb_name} - Passing Yards, Rolling Mean & Volatility (2023)")\nplt.xlabel("Week")\nplt.ylabel("Passing Yards")\nplt.legend()\nplt.grid(True)\nplt.show()'