In [184]:
# import needed libraries 
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [185]:
# import nfl_data_py for direct stream to data
import nfl_data_py as nfl
from nfl_data_py import import_schedules

# Define suffix mapping for merges
MERGE_SUFFIXES = {
    'opp_def':'_opp',           # def stats
    'schedule': '_sched',       # schedule-based info
    'season_avg': '_season',    # league-season averages
    'team_results': '_res'      # team results & streak
}

# Define potential team name issues
TEAM_FIX = {'WSH':'WAS', 'JAC':'JAX'} # Add here if more discovered

QB_METRICS = {
    'core' : ['pass_yards', 'pass_td', 'pass_attempts', 'rush_yards', 'rush_tds', 'completions'],
    'rolling': ['passing_yards', 'passing_tds', 'interception',
        'air_yards', 'cpoe', 'sack', 'epa',
        'pressure_rate', 'time_to_throw', 'agg_yards',
        'rush_yards', 'rush_tds', 'completions'],
    'efficiency': ['yards_per_attempt', 'td_rate', 'int_rate', 'epa']
}

RB_METRICS = {
    'core': ['rush_yards', 'rush_tds', 'rec_yards', 'rec_tds', 'rush_attempts', 'targets'],
    'efficiency': ['yards_per_carry', 'missed_tackle_rate', 'target_share']
}

WR_METRICS = {
    'core': ['rec_yards', 'rec_tds', 'targets', 'receptions', 'air_yards'],
    'efficiency': ['yards_per_route', 'target_share']
}

TE_METRICS = {
    'core': ['rec_yards', 'rec_tds', 'receptions', 'targets', 'air_yards'],
    'efficiency': ['yards_per_route', 'target_share']
}

OPPONENT_METRICS = [
    'pass_yards_allowed',
    'rush_yards_allowed',
    'air_yards_allowed',
    'yards_after_catch_allowed',
    'epa_allowed',
    'sack_made',
    'interceptions_forced',
    'turnover_created',
    'pressures',
    'explosive_pass_allowed',
    'fumbles_forced',
    'success',
    'epa_per_play'
]

In [186]:
# import weekly nfl data for 2021 to 2023
df = nfl.import_weekly_data(years=[2021,2022,2023])
schedules = nfl.import_schedules([2021, 2022, 2023])
play_by_play = nfl.import_pbp_data(years=range(2021,2024), downcast = True)
pbp = play_by_play[play_by_play['play_type'].isin(['pass', 'run'])] # Offensive Plays to Build Defense
pbp['defense_team'] = pbp['defteam']

# Potential team name fix
df['recent_team'] = df['recent_team'].replace(TEAM_FIX)
df['opponent_team'] = df['opponent_team'].replace(TEAM_FIX)
schedules['home_team'] = schedules['home_team'].replace(TEAM_FIX)
schedules['away_team'] = schedules['away_team'].replace(TEAM_FIX)
play_by_play['defteam'] = play_by_play['defteam'].replace(TEAM_FIX)
pbp['defteam'] = pbp['defteam'].replace(TEAM_FIX)
pbp['defense_team'] = pbp['defense_team'].replace(TEAM_FIX)

# Create QB dataset for future use and testing
keep_cols_qbs = [
    'player_id', 'player_name', 'season', 'season_type','week', 'recent_team', 'opponent_team', 'passing_yards', 'attempts',
    'completions', 'passing_tds', 'interception', 'rushing_yards', 'rushing_tds', 'fantasy_points', 'fantasy_points_ppr', 'sacks',
]

Downcasting floats.
2021 done.
2022 done.
2023 done.
Downcasting floats.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pbp['defense_team'] = pbp['defteam']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pbp['defteam'] = pbp['defteam'].replace(TEAM_FIX)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pbp['defense_team'] = pbp['defense_team'].replace(TEAM_FIX)


In [187]:
'''# Validate data is clean (i.e. no dupes, missing values)
print(df.isna().sum())
print(df.dtypes)
print(df.groupby(['season','week','player_name']).size())'''
#cols_to_drop = [c for c in qbs.columns if 'home_team' in c or 'away_team' in c or 'home_game' in c]
#qbs = qbs.drop(columns=cols_to_drop, errors='ignore')

print(df.columns.tolist())

['player_id', 'player_name', 'player_display_name', 'position', 'position_group', 'headshot_url', 'recent_team', 'season', 'week', 'season_type', 'opponent_team', 'completions', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles', 'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch', 'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa', 'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share', 'wopr', 'special_teams_tds', 'fantasy_points', 'fantasy_points_ppr']


In [None]:
# Create utility functions

def _shifted_rolling(series, window = 3, min_period = 1):
    return series.shift(1).rolling(window, min_period).mean()

def _shifted_expanding_mean(series):
    return series.shift(1).expanding().mean()

# Create stat generator functions

def add_season_metrics(df, metrics, group_cols = None):
    df = df.copy()

    if group_cols is None:
        # Create group keys
        if 'player_id' in df.columns:
            group_cols = ['player_id']
        elif 'opponent_team' in df.columns:
            group_cols = ['opponent_team']
        elif 'team' in df.columns:
            group_cols = ['team']
        else:
            raise ValueError("add_season_metrics: could not infer group cols, explicitly set group keys")
        group_keys = group_cols + ['season']

        df = df.sort_values(group_keys + ['week']).reset_index(drop = True)

        for m in metrics: 
            # Predictive season totals and averages
            df[f'szn_total_{m}'] = df.groupby(group_keys)[m].cumsum().shift(1)

            prior_sum = df.groupby(group_keys)[m].cumsum().shift(1)
            prior_count = df.groupby(group_keys).cumcount()
            df[f'szn_avg_{m}'] = prior_sum / prior_count.replace(0, np.nan)
            
    return df

def add_rolling_features(df, metrics, windows = [3,5], group = ['player_id', 'season']):
    df = df.copy()

    for m in metrics:
        for w in windows:
            df[f'{m}_roll_avg_{w}'] = (
                df.groupby(group)[m]
                .apply(lambda s: s.shift(1).rolling(w).mean())
            )
            df[f'{m}_std_{w}'] = (
                df.groupby(group)[m]
                .apply(lambda s: s.shift(1).rolling(w).std())
            )
            df[f'{m}_roll_change_{w}'] = (
                df.groupby(group)[m]
                .apply(lambda s: (s - s.shift(1)) / s.shift(1))
            )
    return df

def add_league_rolling_averages(defense_df, metrics):
    df = defense_df.copy()

    league = (
        df.groupby(['season', 'week'])[metrics]
          .mean()
          .reset_index()
          .rename(columns = lambda c: f'league_avg_{c}')
    )

    df = df.merge(league, on = ['season', 'week'], how = 'left')

    # Shift to ensure no data leakage
    for m in metrics:
        col = f'league_avg_{m}'
        if col in df.columns:
            df[col] = df.groupby('season')[col].shift(1)

    return df

# ----------------------------  -  -  ----  -  -  ---------------------------------------------------------------------

# Double check your build defense function works/rebuild needed parts

# Create defensive features function
def build_defense_features(pbp, schedules):
    pbp = pbp.copy()

    # Create yards allowed metrics
    pbp['pass_yards_allowed'] = pbp['passing_yards'].fillna(0)
    pbp['air_yards_allowed'] = pbp['air_yards'].fillna(0)
    pbp['rush_yards_allowed'] = pbp['rushing_yards'].fillna(0)
    pbp['yards_after_catch_allowed'] = pbp['yards_after_catch'].fillna(0)
    pbp['rec_yards_allowed'] = pbp['receiving_yards'].fillna(0)

    # Create remaining metrics

    pbp['first_down_allowed'] = (pbp['first_down'] == 1).astype(int)
    pbp['completions_allowed'] = pbp['complete_pass'].fillna(0)
    pbp['dropbacks_allowed'] = pbp['qb_dropback'].fillna(0)

    # Touchdowns
    pbp['pass_td_allowed'] = (pbp['pass_touchdown'] == 1).astype(int)
    pbp['rush_td_allowed'] = (pbp['rush_touchdown'] == 1).astype(int)

    # Turnovers/Negative Plays
    pbp['turnover_created'] = ((pbp['interception'] == 1) | (pbp['fumble_lost'] == 1)).astype(int)
    pbp['interceptions_forced'] = pbp['interception'].fillna(0)
    pbp['fumbles_forced'] = pbp['fumble_forced'].fillna(0)
    pbp['sack_made'] = (pbp['sack'] == 1).astype(int)
    pbp['negative_play_created'] = ((pbp['sack'] == 1) | (pbp['tackled_for_loss'] == 1)).astype(int)
    pbp['pressures'] = (pbp['was_pressure'] == 1).astype(int)

    # EPA
    pbp['epa_allowed'] = pbp['epa'].fillna(0)
    pbp['pass_epa_allowed'] = pbp['air_epa'].fillna(0) + pbp['yac_epa']
    pbp['rush_epa_allowed'] = pbp['total_home_rush_epa'] + pbp['total_away_rush_epa']

    # Explosives

    pbp['explosive_pass_allowed'] = ((pbp['air_yards'] >= 20) | (pbp['yards_after_catch'] >= 20)).astype(int)
    pbp['explosive_rush_allowed'] = (pbp['rushing_yards'] >=10).astype(int)

    # Create defensive dataset
    defense = (
        pbp.groupby(['defense_team', 'season','week'])
        .agg({

            # Passing
            'pass_yards_allowed': 'sum',
            'pass_td_allowed': 'sum',
            'sack_made': 'sum',
            'interceptions_forced': 'sum',
            'play_id': 'count',
            'pressures': 'sum',
            'air_yards_allowed': 'sum',
            'yards_after_catch_allowed': 'sum',
            'completions_allowed': 'sum',
            'explosive_pass_allowed': 'sum',
            'pass_epa_allowed': 'sum',
            'dropbacks_allowed': 'sum',

            # Rushing
            'rush_yards_allowed': 'sum',
            'rush_td_allowed': 'sum',
            'rush_attempt': 'sum',
            'fumbles_forced': 'sum',
            'explosive_rush_allowed': 'sum',
            'rush_epa_allowed': 'sum',

            # Receiving
            'rec_yards_allowed': 'sum',

            # Other
            'first_down_allowed': 'sum',
            'turnover_created': 'sum',
            'epa_allowed': 'sum',
            'success': 'mean',
            'play_type': 'count'
        })
        .reset_index()
        .rename(columns = {'defense_team': 'opponent_team'})
    )

    defense['total_yards_allowed'] = (
        defense['pass_yards_allowed'] +
        defense['rush_yards_allowed']
    )

    defense['epa_per_play'] = defense['epa_allowed'] / defense['play_type']

    defense['rush_yards_per_attempt'] = (
        defense['rush_yards_allowed'] / defense['rush_attempt'].replace(0, np.nan)
    )

    defense['pass_yards_per_completion'] = (
        defense['pass_yards_allowed'] / (defense['pass_yards_allowed'] > 0).astype(int)
    )

    defense['turnover_rate'] = (
        defense['turnover_created'] / defense['play_type']
    )

    defense['sack_rate'] = (
        defense['sack_made'] / defense['play_type']
    )

    defense['pressure_rate'] = (
        defense['pressures'] / defense['dropbacks_allowed']
    )

    defense = add_rolling_features(
        defense,
        metrics = OPPONENT_METRICS,
        windows = [3, 5, 10],
        group = ['opponent_team', 'season']
    )

    defense = add_season_metrics(defense, OPPONENT_METRICS)

    defense = add_league_rolling_averages(defense, OPPONENT_METRICS)

    # Create defensive points allowed for home teams
    home_def = schedules[['season', 'week', 'home_team', 'away_score']].copy()
    home_def = home_def.rename(columns={
        'home_team': 'team',
        'away_score': 'points_allowed'
    })

    # Create defensive points allowed for away teams
    away_def = schedules[['season', 'week', 'away_team', 'home_score']].copy()
    away_def = away_def.rename(columns={
        'away_team': 'team',
        'home_score': 'points_allowed'
    })

    defense['total_yards_allowed'] = defense['pass_yards_allowed'] + defense['rush_yards_allowed']

    def_points = pd.concat([home_def, away_def], ignore_index = True)

    defense = defense.rename(columns = {'opponent_team':'team'})

    defense = defense.merge(def_points, on = ['season', 'team', 'week'], how = 'left')

    defense = defense.rename(columns = {'team': 'opponent_team'})

    defense = defense.add_prefix("opp_")

    return defense


# Create helper function to designate home/away flag 
def add_home_away_flags(pos_df, schedules):
    # Simplify schedules to unique team-week mapping
    home_flags = schedules[['season', 'week', 'home_team']].copy()
    home_flags['home_game'] = 1
    home_flags.rename(columns={'home_team': 'team'}, inplace=True)

    away_flags = schedules[['season', 'week', 'away_team']].copy()
    away_flags['home_game'] = 0
    away_flags.rename(columns={'away_team': 'team'}, inplace=True)

    # Combine into one clean mapping
    team_week_flags = pd.concat([home_flags, away_flags], ignore_index=True)

    # Merge safely on team identity
    merged = pos_df.merge(
        team_week_flags,
        left_on=['season', 'week', 'recent_team'],
        right_on=['season', 'week', 'team'],
        how='left'
    )

    merged.drop(columns=['team'], inplace=True, errors='ignore')

    # Ensure the column exists even for missing merges
    if 'home_game' not in merged.columns:
        merged['home_game'] = np.nan
    return merged

# Create helper function to create win/loss flags and win(loss) streak
def add_team_win_streaks(pos_df, schedules):
    sch = schedules.copy()
    if 'season_type' not in sch.columns and 'game_type' in sch.columns:
        # Rename to match used name
        sch = sch.rename(columns={'game_type':'season_type'})
    
    # Rename game types for later
    sch['season_type'] = sch['season_type'].replace({
        'REG': 'REG',
        'WC': 'POST',
        'DIV': 'POST',
        'CON': 'POST',
        'SB': 'POST'
    })

    # Grab what we need
    sch = sch[['season', 'season_type', 'week', 'home_team', 'away_team', 'home_score', 'away_score']].copy()

    # Create team results 
    home = sch[['season', 'season_type', 'week', 'home_team', 'home_score', 'away_score']].rename(columns={'home_team':'team'})
    home['team_win'] = (home['home_score'] > home['away_score']).astype(int)
    home = home[['season', 'season_type', 'week', 'team', 'team_win']]

    away = sch[['season', 'season_type', 'week', 'away_team', 'away_score', 'home_score']].rename(columns={'away_team':'team'})
    away['team_win'] = (away['away_score'] > away['home_score']).astype(int)
    away = away[['season', 'season_type', 'week', 'team', 'team_win']]

    team_results = pd.concat([home, away], ignore_index=True)

    # Order regular and post season properly for a season
    stype_order = {'REG': 1, 'POST': 2}
    team_results['stype_order'] = team_results['season_type'].map(stype_order).fillna(1)
    team_results = team_results.sort_values(['team', 'season', 'stype_order', 'week']).reset_index(drop=True)

    # Calculate streak
    def compute_streaks(x):
        # Streak counter 
        cnt = 0
        out = []
        for v in x:
            out.append(cnt)
            if v == 1:
                cnt += 1
            else:
                cnt = 0
        return pd.Series(out, index=x.index)
    
    team_results['team_win_streak'] = team_results.groupby(['team', 'season'], group_keys=False)['team_win'].apply(compute_streaks)

    # Merge streaks back into positional df
    pos_df = pos_df.merge(
        team_results[['season', 'week', 'team', 'team_win_streak']],
        left_on = ['season', 'week', 'recent_team'],
        right_on = ['season', 'week', 'team'],
        how = 'left',
        validate = 'm:1'
    )

    # Add flag for outcome of prior game
    pos_df['team_won_last'] = (
        pos_df.groupby(['recent_team', 'season'])['team_win'].shift(1).fillna(0).astype(int)
    )

    pos_df.drop(columns=['team'], inplace=True, errors='ignore')
    return pos_df

# Create rolling 3 week features and general stats
def add_pos_stats(df):
    df = df.copy().sort_values(['player_id', 'season', 'week'])
    if 'position' not in df.columns:
        return df
    
    pos = df['position'].iloc[0]
    if pos == "QB":
        # Create rolling stats for meaningful OFF metrics
        # Completion percentage and efficiency metrics
        df['completion_pct'] = df['completions'] / df['attempts'].replace(0, np.nan)
        df['yards_per_attempt'] = df['passing_yards'] / df['attempts'].replace(0, np.nan)

        # Rolling mean and standard deviation calculated
        for stat in ['passing_yards', 'attempts', 'passing_tds', 'interceptions', 'completion_pct', 'yards_per_attempt']:
            if stat in df.columns:
                df[f'{stat}_roll3_mean'] = df.groupby('player_id')[stat].transform(lambda x: x.shift(1).rolling(3).mean())
                df[f'{stat}_roll3_std'] = df.groupby('player_id')[stat].transform(lambda x: x.shift(1).rolling(3).std())

        # Momentum indicators
        df['prev_passing_yards'] = df.groupby(['player_id', 'season'])['passing_yards'].shift(1)
        df['prev_yards_minus_roll3mean'] = (
            df.groupby(['player_id', 'season'])['passing_yards']
            .apply(lambda s: s.shift(1).rolling(3).mean())
            .reset_index(drop = True)
        )

        df['passing_yards_volatility'] = (
            df.groupby(['player_id', 'season'])['passing_yards']
            .apply(lambda s: s.shift(1).rolling(3).std())
            .reset_index(drop = True)
        )

        df['passing_trend_3'] = (
            df.groupby(['player_id', 'season'])['passing_yards']
            .diff(periods=3)
        )

        df['completion_trend_3'] = (
            df.groupby(['player_id', 'season'])['completions']
            .apply(lambda s: (s - s.shift(1)) / s.shift(1))
            .reset_index(drop = True)
            .rolling(3).mean()
        )
    elif 'pos' == 'RB':
        df = add_season_metrics(df, RB_METRICS['core'], group_cols = ['player_id'])
        df = add_rolling_features(df, RB_METRICS['core'], group = ['player_id', 'season'])
    elif 'pos' == 'WR':
        df = add_season_metrics(df, WR_METRICS['core'], group_cols = ['player_id'])
        df = add_rolling_features(df, WR_METRICS['core'], group = ['player_id', 'season'])
    elif 'pos' == 'TE':
        df = add_season_metrics(df, TE_METRICS['core'], group_cols = ['player_id'])
        df = add_rolling_features(df, TE_METRICS['core'], group = ['player_id', 'season'])
    return df

# Create defense merger
def merge_defense_features(pos_df, defense_df):
    return pos_df.merge(
        defense_df,
        how = 'left',
        on = ['opponent_team', 'season', 'week']
    )

# Create final safe merge for all team context metrics
def merge_team_context(pos_df, schedules):
    # Win/loss flag creation
    sch = schedules.copy()
    sch['home_win'] = (sch['home_score'] > sch['away_score']).astype(int)
    sch['away_win'] = (sch['away_score'] > sch['home_score']).astype(int)

    # Team-level view for both home and away team
    home_games = sch[['season', 'week', 'home_team', 'away_team', 'home_win']].rename(
        columns={'home_team': 'team', 'away_team': 'opponent_team', 'home_win':'team_win'}
    )
    home_games['home_game'] = 1

    away_games = sch[['season', 'week', 'home_team', 'away_team', 'away_win']].rename(
        columns={'away_team': 'team', 'home_team': 'opponent_team', 'away_win':'team_win'}
    )
    away_games['home_game'] = 0

    # Combine into unified "team_games"
    team_games = pd.concat([home_games, away_games], ignore_index=True)
    
    team_games = team_games.dropna(subset=['team', 'opponent_team'])
    team_games.drop_duplicates(subset=['season', 'week', 'team'], inplace=True)

    # Merge
    merged = pos_df.merge(
        team_games,
        left_on = ['season', 'week', 'recent_team', 'opponent_team'],
        right_on = ['season', 'week', 'team', 'opponent_team'],
        how='left'
    )

    # Clean
    merged.drop(columns=['team'], inplace=True, errors='ignore')
    merged = merged.drop_duplicates(subset = ['player_id', 'season', 'week'])
    return merged

# Create league average metrics
def compute_league_averages(defense):
    # Num columns Only
    numeric_cols = defense.select_dtypes(include = 'number').columns

    league = defense.groupby('season')[numeric_cols].mean()
    league = league.add_prefix('league_avg_')

    return league


# Create defensive efficiency in respect to season
def add_defense_efficiency(df):
    df = df.copy()

    def has(cols):
        return all(c in df.columns for c in cols)

    # Passing
    if 'opp_avg_pass_yards_allowed' in df.columns and 'league_avg_pass_yards_allowed' in df.columns:
        df['opp_pass_efficiency_index'] = (
            df['opp_avg_pass_yards_allowed'] / df['league_avg_pass_yards_allowed'].replace(0, np.nan)
        )
    if 'opp_avg_air_yards_allowed' in df.columns and 'league_avg_air_yards_allowed' in df.columns:
        df['opp_air_efficiency_index'] = (
            df['opp_avg_air_yards_allowed'] / df['league_avg_air_yards_allowed'].replace(0, np.nan)
        )
    if 'opp_avg_completions_allowed' in df.columns and 'league_avg_completions_allowed' in df.columns:
        df['opp_completion_index'] = (
            df['opp_avg_completions_allowed'] / df['league_avg_completions_allowed'].replace(0, np.nan)
        )

    # Rushing
    if 'opp_avg_rush_yards_allowed' in df.columns and 'league_avg_rush_yards_allowed' in df.columns:
        df['opp_rush_efficiency_index'] = (
            df['opp_avg_rush_yards_allowed'] / df['league_avg_rush_yards_allowed'].replace(0, np.nan)
        )

    # Receiving
    if 'opp_avg_rec_yards_allowed' in df.columns and 'league_avg_rec_yards_allowed' in df.columns:
        df['opp_rec_efficiency_index'] = (
            df['opp_avg_rec_yards_allowed'] / df['league_avg_rec_yards_allowed'].replace(0, np.nan)
        )
    if 'opp_avg_yards_after_catch_allowed' in df.columns and 'league_avg_yards_after_catch_allowed' in df.columns:
        df['opp_yac_efficiency_index'] = (
            df['opp_avg_yards_after_catch_allowed'] / df['league_avg_yards_after_catch_allowed'].replace(0, np.nan)
        )

    # TD Rate
    if 'opp_avg_pass_td_allowed' in df.columns and 'league_avg_pass_td_allowed' in df.columns:
        df['opp_pass_td_rate_index'] = (
            df['opp_avg_pass_td_allowed'] / df['league_avg_pass_td_allowed'].replace(0, np.nan)
        )
    if 'opp_avg_rush_td_allowed' in df.columns and 'league_avg_rush_td_allowed' in df.columns:
        df['opp_rush_td_rate_index'] = (
            df['opp_avg_rush_td_allowed'] / df['league_avg_rush_td_allowed'].replace(0, np.nan)
        )

    # EPA 
    if 'opp_pass_epa_allowed' in df.columns and 'league_avg_pass_epa_allowed' in df.columns:
        df['opp_pass_epa_index'] = (
            df['opp_pass_epa_allowed'] / df['league_avg_pass_epa_allowed'].replace(0, np.nan)
        )
    if 'opp_rush_epa_allowed' in df.columns and 'league_avg_rush_epa_allowed' in df.columns:
        df['opp_rush_epa_index'] = (
            df['opp_rush_epa_allowed'] / df['league_avg_rush_epa_allowed'].replace(0, np.nan)
        )

    # Explosive Plays
    if 'opp_explosive_pass_allowed' in df.columns and 'league_avg_explosive_pass_allowed' in df.columns:
        df['opp_explosive_pass_index'] = (
            df['opp_explosive_pass_allowed'] / df['league_avg_explosive_pass_allowed'].replace(0, np.nan)
        )
    if 'opp_explosive_rush_allowed' in df.columns and 'league_avg_explosive_rush_allowed' in df.columns:
        df['opp_explosive_rush_index'] = (
            df['opp_explosive_rush_allowed'] / df['league_avg_explosive_rush_allowed'].replace(0, np.nan)
        )

    return df

# Build QB dataset (final)
def build_dataset(df, schedules, play_by_play):
    df = df.copy()
    
    defense = build_defense_features(play_by_play)

    df = merge_defense_features(df, defense)

    df = merge_team_context(df, schedules)
    df = add_team_win_streaks(df, schedules)
    df = add_pos_stats(df)
    df = add_defense_efficiency(df)

    # Bye week tracker
    df['bye_last_week'] = (df['week'] - df.groupby(['player_id', 'season'])['week'].shift(1) > 1).astype(int)

    # season week index
    df['season_week'] = df['week']

    # Playoffs indicator
    df['is_playoffs'] = (df['season_type'] != 'REG').astype(int)

    return df

In [189]:
# Create predictive metrics for QB

db = build_dataset(df, schedules, pbp)

print(db.head(10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pbp['pass_yards_allowed'] = pbp['passing_yards'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pbp['air_yards_allowed'] = pbp['air_yards'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pbp['rush_yards_allowed'] = pbp['rushing_yards'].fillna(0)
A value is trying to be set on a 

ValueError: Could not find a valid group key, figure it out

In [None]:
# Check df to ensure to dupes or other errors
#print(qbs.sample(10))
#print(qbs[['player_id', 'season', 'week']].duplicated().sum())
#print(qbs['home_game'].value_counts(dropna=False))
'''qb_name = "P.Mahomes"
qb_df = qbs[(qbs['player_name'] == qb_name) & (qbs['season'] == 2023)]

plt.figure(figsize=(10,5))
plt.plot(qb_df['week'], qb_df['passing_yards'], label='Passing Yards', marker='o')
plt.plot(qb_df['week'], qb_df['passing_yards_roll3_mean'], label='3-Game Rolling Mean', marker='x')
plt.fill_between(qb_df['week'], 
                 qb_df['passing_yards_roll3_mean'] - qb_df['passing_yards_volatility'],
                 qb_df['passing_yards_roll3_mean'] + qb_df['passing_yards_volatility'],
                 alpha=0.2, label='Volatility Range')
plt.title(f"{qb_name} - Passing Yards, Rolling Mean & Volatility (2023)")
plt.xlabel("Week")
plt.ylabel("Passing Yards")
plt.legend()
plt.grid(True)
plt.show()'''

'qb_name = "P.Mahomes"\nqb_df = qbs[(qbs[\'player_name\'] == qb_name) & (qbs[\'season\'] == 2023)]\n\nplt.figure(figsize=(10,5))\nplt.plot(qb_df[\'week\'], qb_df[\'passing_yards\'], label=\'Passing Yards\', marker=\'o\')\nplt.plot(qb_df[\'week\'], qb_df[\'passing_yards_roll3_mean\'], label=\'3-Game Rolling Mean\', marker=\'x\')\nplt.fill_between(qb_df[\'week\'], \n                 qb_df[\'passing_yards_roll3_mean\'] - qb_df[\'passing_yards_volatility\'],\n                 qb_df[\'passing_yards_roll3_mean\'] + qb_df[\'passing_yards_volatility\'],\n                 alpha=0.2, label=\'Volatility Range\')\nplt.title(f"{qb_name} - Passing Yards, Rolling Mean & Volatility (2023)")\nplt.xlabel("Week")\nplt.ylabel("Passing Yards")\nplt.legend()\nplt.grid(True)\nplt.show()'