In [106]:
# import needed libraries 
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# import nfl_data_py for direct stream to data
import nfl_data_py as nfl
from nfl_data_py import import_schedules

# Define suffix mapping for merges
MERGE_SUFFIXES = {
    'opp_def':'_opp',           # def stats
    'schedule': '_sched',       # schedule-based info
    'season_avg': '_season',    # league-season averages
    'team_results': '_res'      # team results & streak
}

# Define potential team name issues
TEAM_FIX = {'WSH':'WAS', 'JAC':'JAX'} # Add here if more discovered

QB_METRICS = {
    'core' : ['pass_yards', 'pass_td', 'pass_attempts', 'rush_yards', 'rush_tds', 'completions'],
    'rolling': ['passing_yards', 'passing_tds', 'interceptions',
        'air_yards', 'cpoe', 'sack', 'epa',
        'pressure_rate', 'time_to_throw', 'agg_yards',
        'rush_yards', 'rush_tds', 'completions'],
    'efficiency': ['yards_per_attempt', 'td_rate', 'int_rate']
}

RB_METRICS = {
    'core': ['rush_yards', 'rush_tds', 'rec_yards', 'rec_tds', 'rush_attempts', 'targets'],
    'efficiency': ['yards_per_carry', 'missed_tackle_rate', 'target_share']
}

WR_METRICS = {
    'core': ['rec_yards', 'rec_tds', 'targets', 'receptions', 'air_yards'],
    'efficiency': ['yards_per_route', 'target_share']
}

TE_METRICS = {
    'core': ['rec_yards', 'rec_tds', 'receptions', 'targets', 'air_yards'],
    'efficiency': ['yards_per_route', 'target_share']
}

OPPONENT_METRICS = [
    'pass_yards_allowed',
    'rush_yards_allowed',
    'epa_allowed',
    'sacks',
    'interceptions_forced',
    'pressures',
    'explosive_pass_allowed'
]

QB_SEASON_AGGREGATES = [
    'passing_yards', 'passing_tds', 'interceptions',
    'rush_yards', 'epa', 'air_yards', 'cpoe'
]

2021 done.
2022 done.
2023 done.
Downcasting floats.


In [None]:
# import weekly nfl data for 2021 to 2023
df = nfl.import_weekly_data(years=[2021,2022,2023])
schedules = nfl.import_schedules([2021, 2022, 2023])
play_by_play = nfl.import_pbp_data(years=range(2021,2024), downcast = True)
pbp = play_by_play[play_by_play['play_type'].isin(['pass', 'run'])] # Offensive Plays to Build Defense
pbp['defense_team'] = pbp['defteam']

# Potential team name fix
df['recent_team'] = df['recent_team'].replace(TEAM_FIX)
df['opponent_team'] = df['opponent_team'].replace(TEAM_FIX)
schedules['home_team'] = schedules['home_team'].replace(TEAM_FIX)
schedules['away_team'] = schedules['away_team'].replace(TEAM_FIX)
play_by_play['defteam'] = play_by_play['defteam'].replace(TEAM_FIX)

# Create QB dataset for future use and testing
keep_cols_qbs = [
    'player_id', 'player_name', 'season', 'season_type','week', 'recent_team', 'opponent_team', 'passing_yards', 'attempts',
    'completions', 'passing_tds', 'interceptions', 'rushing_yards', 'rushing_tds', 'fantasy_points', 'fantasy_points_ppr', 'sacks',
]

qbs = df[df['position'] == 'QB']
qbs = qbs[keep_cols_qbs]

print(play_by_play['rush'].head(10))

Downcasting floats.
2021 done.
2022 done.
2023 done.
Downcasting floats.
    player_id player_name player_display_name position position_group  \
0  00-0019596     T.Brady           Tom Brady       QB             QB   
1  00-0019596     T.Brady           Tom Brady       QB             QB   
2  00-0019596     T.Brady           Tom Brady       QB             QB   
3  00-0019596     T.Brady           Tom Brady       QB             QB   
4  00-0019596     T.Brady           Tom Brady       QB             QB   
5  00-0019596     T.Brady           Tom Brady       QB             QB   
6  00-0019596     T.Brady           Tom Brady       QB             QB   
7  00-0019596     T.Brady           Tom Brady       QB             QB   
8  00-0019596     T.Brady           Tom Brady       QB             QB   
9  00-0019596     T.Brady           Tom Brady       QB             QB   

                                        headshot_url recent_team  season  \
0  https://static.www.nfl.com/image/private/f_a

In [None]:
'''# Validate data is clean (i.e. no dupes, missing values)
print(df.isna().sum())
print(df.dtypes)
print(df.groupby(['season','week','player_name']).size())'''
#cols_to_drop = [c for c in qbs.columns if 'home_team' in c or 'away_team' in c or 'home_game' in c]
#qbs = qbs.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# Create helper functions for math

def _shifted_rolling(series, window=3, min_period=1):
    return series.shift(1).rolling(window, min_period).mean()

def _shifted_expanding_mean(series):
    return series.shift(1).expanding().mean()

def add_season_metrics(df, metrics):
    df = df.sort_values(['player_id', 'season', 'week'])

    for m in metrics: 
        df[f'szn_total_{m}'] = df.groupby(['player_id', 'season'])[m].cumsum()
        df['szn_avg_{m}'] = df[f'szn_total_{m}'] / df.groupby(['player_id', 'season']).cumcount().add(1)
    return df

def add_rolling_features(df, metrics, windows = [3,5], group = ['player_id', 'season']):
    for m in metrics:
        for w in windows:
            df[f'{m}_roll_avg_{w}'] = (
                df.groupby(group)[m]
                .apply(lambda s: s.shift().rolling(w).mean())
            )
            df[f'{m}_std_{w}'] = (
                df.groupby(group)[m]
                .apply(lambda s: s.shift().rolling(w).std())
            )
            df[f'{m}_roll_change_{w}'] = (
                df.groupby(group)[m]
                .apply(lambda s: (s - s.shift(1)) / s.shift(1))
            )
    return df

# ----------------------------  -  -  ----  -  -  ---------------------------------------------------------------------

# Create defensive features function
def build_defense_features(pbp):

    # Create yards allowed metrics
    pbp['pass_yards_allowed'] = pbp['passing_yards'].fillna(0)
    pbp['rush_yards_allowed'] = pbp['rushing_yards'].fillna(0)
    pbp['rec_yards_allowed'] = pbp['rec_yards']

    pbp['first_down_allowed'] = (pbp['first_down'] == 1).astype(int)
    pbp['pass_td_allowed'] = (pbp['pass_touchdown'] == 1).astype(int)
    pbp['rush_td_allowed'] = (pbp['rush_touchdown'] == 1).astype(int)
    pbp['turnover_created'] = ((pbp['interception'] == 1) | (pbp['fumble_lost'] == 1)).astype(int)
    pbp['sack_made'] = (pbp['sack'] == 1).astype(int)
    pbp['epa_allowed'] = pbp['epa'].fillna(0)

    pbp['negative_play_created'] = ((pbp['sack'] == 1) | (pbp['tackle_for_loss'] == 1)).astype(int)

    # Create defensive dataset
    defense = (
        pbp.groupby(['defense_team', 'season','week'])
        .agg({

            # Passing
            'pass_yards_allowed': 'sum',
            'pass_td_allowed': 'sum',
            'sack_made': 'sum',
            'interception': 'sum',
            'fumble_forced': 'sum',
            'play_id': 'count',
            'pressure': 'sum',
            'air_yards': 'sum',

            # Rushing
            'rush_yards_allowed': 'sum',
            'rush_td_allowed': 'sum',
            'rush_attempt': 'sum',

            # Receiving
            'rec_yards_allowed': 'sum',

            # Other
            'first_down_allowed': 'sum',
            'turnover_created': 'sum',
            'epa_allowed': 'sum',
            'success': 'mean',
            'play_type': 'count'
        })
        .reset_index()
        .rename(columns = {'defense_team': 'opponent_team'})
    )

    defense['total_yards_allowed'] = (
        defense['pass_yards_allowed'] +
        defense['rush_yards_allowed']
    )

    defense['epa_per_play'] = defense['epa_allowed'] / defense['play_type']

    defense = add_rolling_features(
        defense,
        metrics = OPPONENT_METRICS,
        windows = [3, 5, 10],
        group = ['opponent_team', 'season']
    )

    defense = add_season_metrics(defense, OPPONENT_METRICS)

    # Create defensive points allowed for home teams
    home_def = schedules[['season', 'week', 'home_team', 'away_score']].copy()
    home_def = home_def.rename(columns={
        'home_team': 'team',
        'away_score': 'points_allowed'
    })

    # Create defensive points allowed for away teams
    away_def = schedules[['season', 'week', 'away_team', 'home_score']].copy()
    away_def = away_def.rename(columns={
        'away_team': 'team',
        'home_score': 'points_allowed'
    })

    defense['total_yards_allowed'] = defense['pass_yards_allowed'] + defense['rush_yards_allowed']

    def_points = pd.concat([home_def, away_def], ignore_index = True)

    defense = defense.rename(columns = {'opponent_team':'team'})

    defense = defense.merge(def_points, on = ['season', 'team', 'week'], how = 'left')

    defense = defense.rename(columns = {'team': 'opponent_team'})

    return defense


# Create helper function to designate home/away flag 
def add_home_away_flags(pos_df, schedules):
    # Simplify schedules to unique team-week mapping
    home_flags = schedules[['season', 'week', 'home_team']].copy()
    home_flags['home_game'] = 1
    home_flags.rename(columns={'home_team': 'team'}, inplace=True)

    away_flags = schedules[['season', 'week', 'away_team']].copy()
    away_flags['home_game'] = 0
    away_flags.rename(columns={'away_team': 'team'}, inplace=True)

    # Combine into one clean mapping
    team_week_flags = pd.concat([home_flags, away_flags], ignore_index=True)

    # Merge safely on team identity
    merged = pos_df.merge(
        team_week_flags,
        left_on=['season', 'week', 'recent_team'],
        right_on=['season', 'week', 'team'],
        how='left'
    )

    merged.drop(columns=['team'], inplace=True, errors='ignore')

    # Ensure the column exists even for missing merges
    if 'home_game' not in merged.columns:
        merged['home_game'] = np.nan
    return merged

# Create helper function to create win/loss flags and win(loss) streak
def add_team_win_streaks(pos_df, schedules):
    sch = schedules.copy()
    if 'season_type' not in sch.columns and 'game_type' in sch.columns:
        # Rename to match used name
        sch = sch.rename(columns={'game_type':'season_type'})
    
    # Rename game types for later
    sch['season_type'] = sch['season_type'].replace({
        'REG': 'REG',
        'WC': 'POST',
        'DIV': 'POST',
        'CON': 'POST',
        'SB': 'POST'
    })

    # Grab what we need
    sch = sch[['season', 'season_type', 'week', 'home_team', 'away_team', 'home_score', 'away_score']].copy()

    # Create team results 
    home = sch[['season', 'season_type', 'week', 'home_team', 'home_score', 'away_score']].rename(columns={'home_team':'team'})
    home['team_win'] = (home['home_score'] > home['away_score']).astype(int)
    home = home[['season', 'season_type', 'week', 'team', 'team_win']]

    away = sch[['season', 'season_type', 'week', 'away_team', 'away_score', 'home_score']].rename(columns={'away_team':'team'})
    away['team_win'] = (away['away_score'] > away['home_score']).astype(int)
    away = away[['season', 'season_type', 'week', 'team', 'team_win']]

    team_results = pd.concat([home, away], ignore_index=True)

    # Order regular and post season properly for a season
    stype_order = {'REG': 1, 'POST': 2}
    team_results['stype_order'] = team_results['season_type'].map(stype_order).fillna(1)
    team_results = team_results.sort_values(['team', 'season', 'stype_order', 'week']).reset_index(drop=True)

    # Calculate streak
    def compute_streaks(x):
        # Streak counter 
        cnt = 0
        out = []
        for v in x:
            out.append(cnt)
            if v == 1:
                cnt += 1
            else:
                cnt = 0
        return pd.Series(out, index=x.index)
    
    team_results['team_win_streak'] = team_results.groupby(['team', 'season'], group_keys=False)['team_win'].apply(compute_streaks)

    # Merge streaks back into positional df
    pos_df = pos_df.merge(
        team_results[['season', 'week', 'team', 'team_win_streak']],
        left_on = ['season', 'week', 'recent_team'],
        right_on = ['season', 'week', 'team'],
        how = 'left',
        validate = 'm:1'
    )

    # Add flag for outcome of prior game
    pos_df['team_won_last'] = (
        pos_df.groupby(['recent_team', 'season'])['team_win'].shift(1).fillna(0).astype(int)
    )

    pos_df.drop(columns=['team'], inplace=True, errors='ignore')
    return pos_df

# Create rolling 3 week features and general stats
def add_pos_stats(df):
    df = df.sort_values(['player_id', 'season', 'week'])
    pos = df['position'].iloc[0]

    if 'pos' == "QB":
        # Create rolling stats for meaningful OFF metrics
        # Completion percentage and efficiency metrics
        df['completion_pct'] = df['completions'] / df['attempts'].replace(0, np.nan)
        df['yards_per_attempt'] = df['passing_yards'] / df['attempts'].replace(0, np.nan)

        # Rolling mean and standard deviation calculated
        for stat in ['passing_yards', 'attempts', 'passing_tds', 'interceptions', 'completion_pct', 'yards_per_attempt']:
            df[f'{stat}_roll3_mean'] = df.groupby('player_id')[stat].transform(lambda x: x.shift().rolling(3).mean())
            df[f'{stat}_roll3_std'] = df.groupby('player_id')[stat].transform(lambda x: x.shift().rolling(3).std())

        # Momentum indicators
        df['prev_passing_yards'] = df.groupby(['player_id', 'season'])['passing_yards'].shift(1)
        df['prev_yards_minus_roll3mean'] = (
            df.groupby(['player_id', 'season'])['passing_yards']
            .apply(lambda s: s.shift().rolling(3).mean())
            .reset_index(drop = True)
        )

        df['passing_yards_volatility'] = (
            df.groupby(['player_id', 'season'])['passing_yards']
            .apply(lambda s: s.shift().rolling(3).std())
            .reset_index(drop = True)
        )

        df['passing_trend_3'] = (
            df.groupby(['player_id', 'season'])['passing_yards']
            .diff(periods=3)
        )

        df['completion_trend_3'] = (
            df.groupby(['player_id', 'season'])['completions']
            .apply(lambda s: (s - s.shift(1)) / s.shift(1))
            .reset_index(drop = True)
            .rolling(3).mean()
        )
    elif 'pos' == 'RB':
        df = add_season_metrics(df, RB_METRICS['core'])
        df = add_rolling_features(df, RB_METRICS['core'])
    elif 'pos' == 'WR':
        df = add_season_metrics(df, WR_METRICS['core'])
        df = add_rolling_features(df, WR_METRICS['core'])
    elif 'pos' == 'TE':
        df = add_season_metrics(df, TE_METRICS['core'])
        df = add_rolling_features(df, TE_METRICS['core'])
    return df

# Create defense merger
def merge_defense_features(pos_df, defense_df):
    return pos_df.merge(
        defense_df,
        how = 'left',
        on = ['opponent_team', 'season', 'week']
    )

# Create final safe merge for all team context metrics
def merge_team_context(pos_df, schedules):
    # Win/loss flag creation
    sch = schedules.copy()
    sch['home_win'] = (sch['home_score'] > sch['away_score']).astype(int)
    sch['away_win'] = (sch['away_score'] > sch['home_score']).astype(int)

    # Team-level view for both home and away team
    home_games = sch[['season', 'week', 'home_team', 'away_team', 'home_win']].rename(
        columns={'home_team': 'team', 'away_team': 'opponent_team', 'home_win':'team_win'}
    )
    home_games['home_game'] = 1

    away_games = sch[['season', 'week', 'home_team', 'away_team', 'away_win']].rename(
        columns={'away_team': 'team', 'home_team': 'opponent_team', 'away_win':'team_win'}
    )
    away_games['home_game'] = 0

    # Combine into unified "team_games"
    team_games = pd.concat([home_games, away_games], ignore_index=True)
    
    team_games = team_games.dropna(subset=['team', 'opponent_team'])
    team_games.drop_duplicates(subset=['season', 'week', 'team'], inplace=True)

    # Merge
    merged = pos_df.merge(
        team_games,
        left_on = ['season', 'week', 'recent_team', 'opponent_team'],
        right_on = ['season', 'week', 'team', 'opponent_team'],
        how='left'
    )

    # Clean
    merged.drop(columns=['team'], inplace=True, errors='ignore')
    merged = merged.drop_duplicates(subset = ['player_id', 'season', 'week'])
    return merged

# Create defensive efficiency in respect to season
def add_defense_efficiency(df):

    # Passing
    if 'opp_avg_pass_yards_allowed' in df.columns and 'szn_avg_pass_yards' in df.columns:
        df['opp_pass_efficiency_index'] = (
            df['opp_avg_pass_yards_allowed'] / df['szn_avg_pass_yards'].replace(0, np.nan)
        )

    # Rushing
    if 'opp_avg_rush_yards_allowed' in df.columns and 'szn_avg_rush_yards' in df.columns:
        df['opp_rush_efficiency_index'] = (
            df['opp_avg_rush_yards_allowed'] / df['szn_avg_rush_yards'].replace(0, np.nan)
        )

    # Receiving
    if 'opp_avg_rec_yards_allowed' in df.columns and 'szn_avg_rec_yards' in df.columns:
        df['opp_rec_efficiency_index'] = (
            df['opp_avg_rec_yards_allowed'] / df['szn_avg_rec_yards'].replace(0, np.nan)
        )
    return df

# Build QB dataset (final)
def build_qb_dataset(qbs, schedules, defense):
    
    defense = build_defense_features(play_by_play)

    qbs = merge_defense_features(qbs, defense)

    qbs = merge_team_context(qbs, schedules)
    qbs = add_team_win_streaks(qbs, schedules)
    qbs = add_qb_stats(qbs)
    qbs = add_defense_efficiency(qbs)

    # Bye week tracker
    qbs['bye_last_week'] = (qbs['week'] - qbs.groupby(['player_id', 'season'])['week'].shift(1) > 1).astype(int)

    # season week index
    qbs['season_week'] = qbs['week']

    # Playoffs indicator
    qbs['is_playoffs'] = (qbs['season_type'] != 'REG').astype(int)

    return qbs

SyntaxError: '(' was never closed (3953657681.py, line 19)

In [118]:
# Create predictive metrics for QB

qbs = build_qb_dataset(qbs, schedules, defense)

print(qbs.head(10))

    player_id player_name  season season_type  week recent_team opponent_team  \
0  00-0019596     T.Brady    2021         REG     1          TB           DAL   
1  00-0019596     T.Brady    2021         REG     2          TB           ATL   
2  00-0019596     T.Brady    2021         REG     3          TB            LA   
3  00-0019596     T.Brady    2021         REG     4          TB            NE   
4  00-0019596     T.Brady    2021         REG     5          TB           MIA   
5  00-0019596     T.Brady    2021         REG     6          TB           PHI   
6  00-0019596     T.Brady    2021         REG     7          TB           CHI   
7  00-0019596     T.Brady    2021         REG     8          TB            NO   
8  00-0019596     T.Brady    2021         REG    10          TB           WAS   
9  00-0019596     T.Brady    2021         REG    11          TB           NYG   

   passing_yards  attempts  completions  ...  opp_avg_pass_yards_allowed  \
0          379.0        50      

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift().rolling(3).mean())
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift().rolling(3).std())
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: (s - s.shift(1)) / s.shift(1))


In [None]:
# Check df to ensure to dupes or other errors
#print(qbs.sample(10))
#print(qbs[['player_id', 'season', 'week']].duplicated().sum())
#print(qbs['home_game'].value_counts(dropna=False))
'''qb_name = "P.Mahomes"
qb_df = qbs[(qbs['player_name'] == qb_name) & (qbs['season'] == 2023)]

plt.figure(figsize=(10,5))
plt.plot(qb_df['week'], qb_df['passing_yards'], label='Passing Yards', marker='o')
plt.plot(qb_df['week'], qb_df['passing_yards_roll3_mean'], label='3-Game Rolling Mean', marker='x')
plt.fill_between(qb_df['week'], 
                 qb_df['passing_yards_roll3_mean'] - qb_df['passing_yards_volatility'],
                 qb_df['passing_yards_roll3_mean'] + qb_df['passing_yards_volatility'],
                 alpha=0.2, label='Volatility Range')
plt.title(f"{qb_name} - Passing Yards, Rolling Mean & Volatility (2023)")
plt.xlabel("Week")
plt.ylabel("Passing Yards")
plt.legend()
plt.grid(True)
plt.show()'''

'qb_name = "P.Mahomes"\nqb_df = qbs[(qbs[\'player_name\'] == qb_name) & (qbs[\'season\'] == 2023)]\n\nplt.figure(figsize=(10,5))\nplt.plot(qb_df[\'week\'], qb_df[\'passing_yards\'], label=\'Passing Yards\', marker=\'o\')\nplt.plot(qb_df[\'week\'], qb_df[\'passing_yards_roll3_mean\'], label=\'3-Game Rolling Mean\', marker=\'x\')\nplt.fill_between(qb_df[\'week\'], \n                 qb_df[\'passing_yards_roll3_mean\'] - qb_df[\'passing_yards_volatility\'],\n                 qb_df[\'passing_yards_roll3_mean\'] + qb_df[\'passing_yards_volatility\'],\n                 alpha=0.2, label=\'Volatility Range\')\nplt.title(f"{qb_name} - Passing Yards, Rolling Mean & Volatility (2023)")\nplt.xlabel("Week")\nplt.ylabel("Passing Yards")\nplt.legend()\nplt.grid(True)\nplt.show()'