In [1]:
# %matplotlib inline
# %pip install scikit-learn
import pandas as pd
import numpy as np  
import os
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import warnings
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from typing import Dict, List, Tuple, Optional, Union, Callable

# Suppress all warnings
# NOTE(review): this silences every warning category for the whole kernel session,
# including deprecation notices from pandas/numpy - confirm that is intended.
warnings.filterwarnings('ignore')

# Configuration
# File name of the supplementary CSV; it is read from the 'data' directory.
SUPPLEMENT_FILE = 'supplementary_data.csv'
# File name template for the weekly tracking CSVs; formatted with the week
# number, zero-padded to two digits (week 1 -> 'input_2023_w01.csv').
INPUT_PATTERN = 'input_2023_w{:02d}.csv'
# File name template with the same week formatting; not referenced in the
# visible part of this notebook - presumably used when writing results later.
OUTPUT_PATTERN = 'supplement-input_2023_w{:02d}.csv'
# Number of weekly files to load; the loader iterates weeks 1..NUM_WEEKS inclusive.
NUM_WEEKS = 18

## Configuration Constants

In [2]:
# ============================================================================
# CONFIGURATION CONSTANTS
# ============================================================================

# Pressure zone definitions (yards from QB)
# Each value is the exclusive outer radius of its zone. The convergence
# calculation counts a defender in exactly one band:
#   immediate: distance < 3.0
#   closing:   3.0 <= distance < 5.0
#   potential: 5.0 <= distance < 7.0
PRESSURE_ZONES = {
    'immediate': 3.0,    # 0-3 yards: Must throw/scramble NOW
    'closing': 5.0,      # 3-5 yards: Pressure building rapidly
    'potential': 7.0     # 5-7 yards: Defenders approaching
}

# Convergence classification thresholds
# Minimum number of defenders (summed over all three zones) required for each
# category; the classifier checks 'critical', then 'high', then 'moderate' with
# `>=` and falls through to 'None'. The 'none' entry is not read by the
# classifier in the visible code.
CONVERGENCE_THRESHOLDS = {
    'critical': 3,    # 3+ defenders = critical convergence
    'high': 2,        # 2 defenders = high convergence
    'moderate': 1,    # 1 defender = moderate convergence
    'none': 0         # 0 defenders = no convergence
}

# Expected frame rate (frames per second)
# NOTE(review): not referenced in the visible code - the velocity calculation
# uses its own 0.1 s frame interval; keep the two in sync if either changes.
FRAME_RATE = 10  # NFL tracking data standard


In [3]:
# ============================================================================
# CONFIGURATION: ENHANCED COLLAPSE THRESHOLDS
# ============================================================================

# Thresholds for the multi-criteria pocket-collapse indicator.
# NOTE(review): in the visible part of the notebook the only consumer is the
# commented-out create_enhanced_collapse_indicator, which reads the separation,
# convergence and velocity keys via config.get(...). The two *_threat_zone keys
# are not read anywhere visible and duplicate the 'immediate' and 'closing'
# values in PRESSURE_ZONES.
ENHANCED_COLLAPSE_CONFIG = {
    # QB Separation thresholds (yards)
    'separation_threshold': 7.0,
    'critical_separation': 4.0,

    # Convergence thresholds (defender count)
    'min_converging_defenders': 2,
    'critical_converging_defenders': 3,

    # Velocity threshold (yards/second)
    # Negative: separation shrinking faster than 5 yards per second.
    'velocity_threshold': -5.0,

    # Zone-specific thresholds
    'immediate_threat_zone': 3.0,
    'closing_threat_zone': 5.0
}

## Load Data

In [4]:
# Load supplementary data
supplementary_data = pd.read_csv(os.path.join('data', SUPPLEMENT_FILE))

# Add the derived column 'pass_type', bucketed from 'pass_length' (yards):
#   'short'        : pass_length < 5
#   'intermediate' : 5 <= pass_length <= 15
#   'long'         : pass_length > 15
#   'unknown'      : none of the above matches (e.g. pass_length is NaN)
# NOTE(review): an earlier version of this comment described short as "<= 5" and
# called the column 'pass_category'. The code below puts exactly 5 yards in
# 'intermediate' and writes 'pass_type' - confirm which boundary is intended.
conditions = [
    supplementary_data['pass_length'] < 5,
    (supplementary_data['pass_length'] >= 5) & (supplementary_data['pass_length'] <= 15),
    supplementary_data['pass_length'] > 15
]

# Labels are matched to `conditions` by position.
categories = ['short', 'intermediate', 'long']

supplementary_data['pass_type'] = np.select(
    conditions, 
    categories, 
    default='unknown'
)


In [5]:
# (rows, columns) of the supplementary table, including the derived 'pass_type' column
supplementary_data.shape

(18009, 42)

In [6]:
# Read every weekly tracking file, tag its rows with the week it came from,
# and stack all weeks into a single DataFrame.
input_data_frames = []
for week in range(1, NUM_WEEKS + 1):
    file_path = os.path.join('data', 'input', INPUT_PATTERN.format(week))
    # assign() adds the 'week' column so each row keeps track of its source file
    df_week = pd.read_csv(file_path).assign(week=week)
    input_data_frames.append(df_week)

# Single concat at the end; ignore_index rebuilds a clean 0..N-1 row index
input_data = pd.concat(input_data_frames, ignore_index=True)
print(f"Loaded {len(input_data_frames)} weeks of data with {len(input_data)} total rows")
print(f"Test : {input_data['week'].max()}")

Loaded 18 weeks of data with 4880579 total rows
Test : 18


In [7]:
# Merge the supplementary column 'route_of_targeted_receiver' into the input data.
#
# The cell reassigns `input_data`, so running it a second time would merge into a
# frame that already carries the column and pandas would emit
# 'route_of_targeted_receiver_x' / '_y' duplicates. Dropping any existing copy
# first (errors='ignore' makes this a no-op on the first run) keeps the cell
# idempotent without changing its result.
input_data = pd.merge(
    input_data.drop(columns=['route_of_targeted_receiver'], errors='ignore'),  # left: tracking rows
    supplementary_data[['game_id', 'play_id', 'route_of_targeted_receiver']],  # right: keys + the column to add
    on=['game_id', 'play_id'],  # play-level join keys
    how='left'                  # keep every tracking row, matched or not
)

In [8]:
# (rows, columns) after the left merge; the row count stays equal to the pre-merge
# count as long as (game_id, play_id) is unique in supplementary_data
input_data.shape

(4880579, 25)

In [9]:
# Distinct (game_id, play_id) pairs present in the tracking data
unique_plays = input_data.loc[:, ['game_id', 'play_id']].drop_duplicates()
print(f"Total unique plays in input data: {len(unique_plays)}")

# Cross-check: grouping the de-duplicated pairs by the same keys yields one
# group per pair, so this count is expected to match the one above
play_counts = (
    unique_plays
    .groupby(['game_id', 'play_id'])
    .size()
    .reset_index(name='count')
)
print(f"Total unique plays after grouping: {len(play_counts)}")

Total unique plays in input data: 14108
Total unique plays after grouping: 14108


## Feature Engineering

In [10]:
def calculate_euclidean_distance_vectorized(tr_x, tr_y, def_x, def_y):
    """
    Straight-line distance from one reference point to each defender, computed
    element-wise in a single vectorized pass.

    Parameters:
    tr_x, tr_y: Coordinates of the reference player (scalars). The callers in
                this notebook pass the quarterback's position.
    def_x, def_y: Defender coordinates (arrays/series of matching length)

    Returns:
    Array: Euclidean distance from the reference point to each defender
    """
    # Per-defender offsets from the reference point along each axis
    delta_x = def_x - tr_x
    delta_y = def_y - tr_y

    squared_distance = delta_x**2 + delta_y**2
    return np.sqrt(squared_distance)

### Frame Level - QB 
#### QB separation from nearest defender 

In [11]:
def calculate_frame_level_separation_qb(df):
    """
    Calculate minimum separation between Quarterback and defensive players at FRAME level.
    Returns only Quarterback data with qb_min_separation per frame.

    Parameters:
    df: DataFrame containing tracking data with columns:
        - game_id, play_id, frame_id, nfl_id
        - player_role: 'Passer' or 'Defensive Coverage' (rows with any other
          role are ignored)
        - player_name, player_position
        - x, y: Player coordinates

    Returns:
    DataFrame: one row per Passer per frame with columns
        (game_id, play_id, nfl_id, frame_id, player_name, player_role,
         player_position, qb_min_separation), sorted by
        game_id, play_id, nfl_id, frame_id.

        - Frames without a Passer produce no row.
        - Frames with a Passer but no 'Defensive Coverage' rows get the
          sentinel value 999.0 as qb_min_separation.
        - qb_min_separation is rounded to 2 decimals.
        - When no Passer is found at all, an empty DataFrame that still
          carries the columns above is returned, so downstream column checks
          keep working.
    """
    # Sentinel separation for frames that have a QB but no defenders
    # (very large value = no pressure)
    no_defender_separation = 999.0

    output_columns = [
        'game_id', 'play_id', 'nfl_id', 'frame_id',
        'player_name', 'player_role', 'player_position', 'qb_min_separation'
    ]

    # Only passer and coverage rows are ever read below, so drop everything
    # else before grouping; this shrinks the groupby without changing results.
    relevant_rows = df[df['player_role'].isin(['Passer', 'Defensive Coverage'])]

    # Group by game, play, and frame (FRAME-LEVEL calculation)
    grouped = relevant_rows.groupby(['game_id', 'play_id', 'frame_id'])

    qb_results = []

    for (game_id, play_id, frame_id), frame_group in grouped:
        quarterbacks = frame_group[frame_group['player_role'] == 'Passer']
        if quarterbacks.empty:
            continue  # No QB = can't analyze this frame

        defenders = frame_group[frame_group['player_role'] == 'Defensive Coverage']
        has_defenders = not defenders.empty

        # Defender coordinates as arrays, extracted once per frame
        def_x = defenders['x'].values
        def_y = defenders['y'].values

        for _, qb_row in quarterbacks.iterrows():
            if has_defenders:
                # Vectorized distance from this QB to every defender in the frame
                distances = calculate_euclidean_distance_vectorized(
                    qb_row['x'], qb_row['y'], def_x, def_y
                )
                min_distance = np.round(np.min(distances), 2)
            else:
                min_distance = no_defender_separation

            qb_results.append({
                'game_id': game_id,
                'play_id': play_id,
                'nfl_id': qb_row['nfl_id'],
                'frame_id': frame_id,
                'player_name': qb_row['player_name'],
                'player_role': qb_row['player_role'],
                'player_position': qb_row['player_position'],
                'qb_min_separation': min_distance
            })

    if not qb_results:
        # Keep the schema even when there is nothing to report
        return pd.DataFrame(columns=output_columns)

    result_df = pd.DataFrame(qb_results, columns=output_columns)

    # Sort by game, play, quarterback, and frame for chronological analysis
    return result_df.sort_values(
        ['game_id', 'play_id', 'nfl_id', 'frame_id']
    ).reset_index(drop=True)

#### QB Pressure Velocity Calculation

In [12]:
# ----------------------------------------------------------------------------
# 4.1 QB PRESSURE VELOCITY CALCULATION
# ----------------------------------------------------------------------------

def calculate_frame_level_pressure_velocity_qb(
    qb_separation_df: pd.DataFrame,
    frame_interval: float = 0.1
) -> pd.DataFrame:
    """
    Calculate rate of change in QB separation (pressure velocity) and acceleration.

    Both quantities are finite differences between consecutive rows of the same
    (game_id, play_id, nfl_id) group after sorting by frame_id, divided by the
    time between frames.

    Parameters
    ----------
    qb_separation_df : DataFrame
        Frame-level QB separation data with columns:
        ['game_id', 'play_id', 'nfl_id', 'frame_id', 'qb_min_separation']
    frame_interval : float, default=0.1
        Seconds between two consecutive frames. The default corresponds to
        tracking data sampled at 10 frames per second. Must be positive.

    Returns
    -------
    DataFrame
        A sorted copy of the input (original index labels are kept) with two
        additional columns:
        - separation_velocity: change in qb_min_separation per second
          (yards/second), rounded to 2 decimals
        - pressure_acceleration: change in separation_velocity per second
          (yards/second²), rounded to 2 decimals

    Raises
    ------
    ValueError
        If a required column is missing or frame_interval is not positive.

    Notes
    -----
    - Negative velocity: separation shrinking (pressure increasing)
    - Positive velocity: separation growing (pressure decreasing)
    - Negative acceleration: separation_velocity is decreasing between frames
    - Positive acceleration: separation_velocity is increasing between frames
    - The first frame of each group has NaN velocity (no previous frame); the
      first two frames have NaN acceleration.
    - Differences are taken between adjacent rows, so frames are assumed to be
      consecutive within a group.
    - Acceleration is derived from the already-rounded velocity.
    - NOTE(review): sentinel separations (e.g. 999.0 for frames without
      defenders) in the input produce very large velocity/acceleration values
      around those frames - confirm downstream code accounts for that.

    Examples
    --------
    >>> qb_velocity_frame_level = calculate_frame_level_pressure_velocity_qb(qb_separation_df)
    >>> qb_velocity_frame_level[['frame_id', 'qb_min_separation', 'separation_velocity']].head()
       frame_id  qb_min_separation  separation_velocity
    0         1               10.5                  NaN
    1         2               10.2                 -3.0
    2         3                9.8                 -4.0
    """

    # Input validation
    required_cols = ['game_id', 'play_id', 'nfl_id', 'frame_id', 'qb_min_separation']
    missing_cols = set(required_cols) - set(qb_separation_df.columns)
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")
    if not frame_interval > 0:
        raise ValueError(f"frame_interval must be positive, got {frame_interval}")

    # Sort chronologically within each QB/play; copy so the caller's frame is untouched
    df = qb_separation_df.sort_values(
        ['game_id', 'play_id', 'nfl_id', 'frame_id']
    ).copy()

    # Group per QB per play so differences never cross play boundaries
    grouping_cols = ['game_id', 'play_id', 'nfl_id']

    # Frame-to-frame separation change -> velocity (yards per second)
    separation_diff = df.groupby(grouping_cols)['qb_min_separation'].diff()
    df['separation_velocity'] = (separation_diff / frame_interval).round(2)

    # Frame-to-frame velocity change -> acceleration (yards per second squared)
    velocity_diff = df.groupby(grouping_cols)['separation_velocity'].diff()
    df['pressure_acceleration'] = (velocity_diff / frame_interval).round(2)

    return df

#### Defender Convergence Calculation

In [13]:
# ============================================================================
# CALCULATE DEFENDER CONVERGENCE Frame Level
# ============================================================================

def calculate_frame_level_defender_convergence(
    input_data: pd.DataFrame,
    zones: Optional[Dict[str, float]] = None,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Calculate defender convergence metrics for each frame.

    For every frame that contains both a passer and at least one coverage
    defender, counts how many defenders fall into each distance band around
    the quarterback and classifies the frame by the total count.

    Parameters
    ----------
    input_data : pd.DataFrame
        Frame-level tracking data containing:

        Required columns:
        - game_id : game identifier
        - play_id : play identifier within game
        - frame_id : frame number
        - nfl_id : player identifier
        - player_role : str
            'Passer' for QB, 'Defensive Coverage' for defenders; rows with
            any other role are ignored
        - x, y : player coordinates on field (yards)

        Optional columns:
        - player_name : str
            Copied to the output; 'Unknown QB' when the column is absent
        - player_position : str
            Copied to the output; 'QB' when the column is absent

    zones : Dict[str, float], optional
        Pressure zone radii (yards from QB) with keys 'immediate', 'closing'
        and 'potential'. Defaults to a copy of the PRESSURE_ZONES constant.

    verbose : bool, default=True
        Print progress messages and a summary.

    Returns
    -------
    pd.DataFrame
        One row per passer per processed frame, sorted by
        game_id, play_id, nfl_id, frame_id, with columns:

        Identifiers:
        - game_id, play_id, frame_id, nfl_id
        - player_name, player_role, player_position

        Convergence counts (mutually exclusive distance bands):
        - defenders_immediate_zone : int
            Defenders with distance < zones['immediate']
        - defenders_closing_zone : int
            Defenders with zones['immediate'] <= distance < zones['closing']
        - defenders_potential_zone : int
            Defenders with zones['closing'] <= distance < zones['potential']
        - total_converging_defenders : int
            Sum of the three bands, i.e. defenders with
            distance < zones['potential'] when the radii are increasing

        Convergence classification:
        - convergence_category : str
            'Critical', 'High', 'Moderate' or 'None', decided by comparing
            total_converging_defenders against CONVERGENCE_THRESHOLDS

        Validation metrics:
        - closest_defender_distance : float
            Unrounded distance to the nearest defender (yards)
        - total_defenders_on_field : int
            Number of 'Defensive Coverage' rows in the frame

        When no frame qualifies, the returned DataFrame is empty but still
        has these columns.

    Raises
    ------
    ValueError
        If required columns are missing from input_data, or if `zones` lacks
        one of the keys 'immediate', 'closing', 'potential'.

    Notes
    -----
    - Frames without a passer or without any coverage defender are skipped
      and produce no output row.
    - Because the three zone columns are exclusive bands, "defenders within
      the closing radius" is defenders_immediate_zone + defenders_closing_zone,
      not defenders_closing_zone alone.
    - With the default thresholds: Critical = 3+ defenders inside the
      outermost radius, High = 2, Moderate = 1, None = 0.

    Examples
    --------
    >>> # Basic usage with default zones
    >>> convergence_df = calculate_frame_level_defender_convergence(input_data)
    >>>
    >>> # Custom zones for different analysis
    >>> custom_zones = {'immediate': 2.5, 'closing': 4.0, 'potential': 6.0}
    >>> convergence_df = calculate_frame_level_defender_convergence(
    ...     input_data,
    ...     zones=custom_zones,
    ...     verbose=False
    ... )
    >>>
    >>> # Analyze high convergence frames
    >>> high_convergence = convergence_df[
    ...     convergence_df['convergence_category'].isin(['Critical', 'High'])
    ... ]
    >>> print(f"High convergence frames: {len(high_convergence)}")
    """

    # -------------------------------------------------------------------------
    # INPUT VALIDATION
    # -------------------------------------------------------------------------

    required_columns = [
        'game_id', 'play_id', 'frame_id', 'nfl_id',
        'player_role', 'x', 'y'
    ]

    missing_columns = [col for col in required_columns
                       if col not in input_data.columns]

    if missing_columns:
        raise ValueError(
            f"Missing required columns for convergence calculation: "
            f"{missing_columns}\n"
            f"Required: {required_columns}"
        )

    # Use provided zones or defaults
    if zones is None:
        zones = PRESSURE_ZONES.copy()

    # Fail early with a clear message instead of a KeyError deep in the loop
    required_zone_keys = ('immediate', 'closing', 'potential')
    missing_zone_keys = [key for key in required_zone_keys if key not in zones]
    if missing_zone_keys:
        raise ValueError(
            f"Missing pressure zone definitions: {missing_zone_keys}\n"
            f"Required: {list(required_zone_keys)}"
        )

    # Zone radii are loop-invariant; look them up once
    immediate_limit = zones['immediate']
    closing_limit = zones['closing']
    potential_limit = zones['potential']

    output_columns = [
        'game_id', 'play_id', 'frame_id', 'nfl_id',
        'player_name', 'player_role', 'player_position',
        'defenders_immediate_zone', 'defenders_closing_zone',
        'defenders_potential_zone', 'total_converging_defenders',
        'convergence_category',
        'closest_defender_distance', 'total_defenders_on_field'
    ]

    # -------------------------------------------------------------------------
    # PROCESSING INITIALIZATION
    # -------------------------------------------------------------------------

    if verbose:
        print("="*70)
        print("CALCULATING DEFENDER CONVERGENCE")
        print("="*70)
        print(f"Pressure Zones: {zones}")
        print("-"*70)
        print("Processing frames...")

    # Group data by frame
    grouped = input_data.groupby(['game_id', 'play_id', 'frame_id'])
    total_frames = len(grouped)

    convergence_results = []

    # Progress tracking
    frames_processed = 0
    frames_with_data = 0

    # -------------------------------------------------------------------------
    # FRAME-LEVEL PROCESSING LOOP
    # -------------------------------------------------------------------------

    for (game_id, play_id, frame_id), frame_group in grouped:

        frames_processed += 1

        # Progress reporting (every 1000 frames)
        if verbose and frames_processed % 1000 == 0:
            print(f"  Processed {frames_processed:,} / {total_frames:,} frames...")

        qb_data = frame_group[frame_group['player_role'] == 'Passer']
        defender_data = frame_group[
            frame_group['player_role'] == 'Defensive Coverage'
        ]

        # Skip frames without QB or defenders
        if qb_data.empty or defender_data.empty:
            continue

        frames_with_data += 1

        # Defender positions are the same for every QB in the frame
        defender_x = defender_data['x'].values
        defender_y = defender_data['y'].values
        total_defenders = len(defender_x)

        # Process each QB in the frame (typically only 1)
        for _, qb_row in qb_data.iterrows():

            # Distance from this QB to every defender (vectorized)
            distances = calculate_euclidean_distance_vectorized(
                qb_row['x'], qb_row['y'], defender_x, defender_y
            )

            # Count defenders per exclusive distance band
            defenders_immediate_zone = np.sum(distances < immediate_limit)
            defenders_closing_zone = np.sum(
                (distances >= immediate_limit) & (distances < closing_limit)
            )
            defenders_potential_zone = np.sum(
                (distances >= closing_limit) & (distances < potential_limit)
            )

            # Bands are disjoint, so their sum is the count inside the outermost radius
            total_converging = (
                defenders_potential_zone
                + defenders_closing_zone
                + defenders_immediate_zone
            )

            # Classify convergence level, most severe first
            if total_converging >= CONVERGENCE_THRESHOLDS['critical']:
                convergence_category = 'Critical'
            elif total_converging >= CONVERGENCE_THRESHOLDS['high']:
                convergence_category = 'High'
            elif total_converging >= CONVERGENCE_THRESHOLDS['moderate']:
                convergence_category = 'Moderate'
            else:
                convergence_category = 'None'

            convergence_results.append({
                # Identifiers
                'game_id': game_id,
                'play_id': play_id,
                'frame_id': frame_id,
                'nfl_id': qb_row['nfl_id'],

                # QB information (fallbacks apply when the optional columns are absent)
                'player_name': qb_row.get('player_name', 'Unknown QB'),
                'player_role': qb_row['player_role'],
                'player_position': qb_row.get('player_position', 'QB'),

                # Convergence counts (zone-based)
                'defenders_immediate_zone': int(defenders_immediate_zone),
                'defenders_closing_zone': int(defenders_closing_zone),
                'defenders_potential_zone': int(defenders_potential_zone),
                'total_converging_defenders': int(total_converging),

                # Convergence classification
                'convergence_category': convergence_category,

                # Validation metrics
                'closest_defender_distance': float(np.min(distances)),
                'total_defenders_on_field': total_defenders
            })

    # -------------------------------------------------------------------------
    # CREATE OUTPUT DATAFRAME
    # -------------------------------------------------------------------------

    if convergence_results:
        # Sort by game, play, QB, and frame for chronological analysis
        convergence_df = pd.DataFrame(
            convergence_results, columns=output_columns
        ).sort_values(
            ['game_id', 'play_id', 'nfl_id', 'frame_id']
        ).reset_index(drop=True)
    else:
        # Keep the schema so downstream merges and column checks still work
        convergence_df = pd.DataFrame(columns=output_columns)

    # -------------------------------------------------------------------------
    # PROCESSING SUMMARY
    # -------------------------------------------------------------------------

    if verbose:
        print(f"\n{'='*70}")
        print("CONVERGENCE CALCULATION COMPLETE")
        print("="*70)
        print(f"Total frames processed: {frames_processed:,}")
        print(f"Frames with QB & defenders: {frames_with_data:,}")
        print(f"Convergence records created: {len(convergence_df):,}")

        if not convergence_df.empty:
            print(f"\nConvergence Category Distribution:")
            category_counts = convergence_df['convergence_category'].value_counts()
            for category in ['Critical', 'High', 'Moderate', 'None']:
                count = category_counts.get(category, 0)
                pct = (count / len(convergence_df) * 100) if len(convergence_df) > 0 else 0
                print(f"  • {category:12s}: {count:5,} frames ({pct:5.1f}%)")

            # Labels name the actual bands (and honor custom zones) rather than
            # implying cumulative "within N yards" counts
            print(f"\nDefender Zone Statistics:")
            print(f"  • Avg defenders within {immediate_limit:g}yds: "
                  f"{convergence_df['defenders_immediate_zone'].mean():.2f}")
            print(f"  • Avg defenders {immediate_limit:g}-{closing_limit:g}yds: "
                  f"{convergence_df['defenders_closing_zone'].mean():.2f}")
            print(f"  • Avg defenders {closing_limit:g}-{potential_limit:g}yds: "
                  f"{convergence_df['defenders_potential_zone'].mean():.2f}")

            print(f"\nClosest Defender Statistics:")
            print(f"  • Min distance: "
                  f"{convergence_df['closest_defender_distance'].min():.2f} yards")
            print(f"  • Avg distance: "
                  f"{convergence_df['closest_defender_distance'].mean():.2f} yards")
            print(f"  • Max distance: "
                  f"{convergence_df['closest_defender_distance'].max():.2f} yards")

        print("="*70)

    return convergence_df

#### Pocket collapse detection

In [14]:
# def create_enhanced_collapse_indicator(
#     merged_df: pd.DataFrame,
#     config: Optional[Dict] = None,
#     verbose: bool = True
# ) -> pd.DataFrame:
#     """
#     Create enhanced pocket collapse indicator using multi-dimensional criteria.
    
#     Applies sophisticated collapse detection logic that combines QB separation,
#     defender convergence, and velocity metrics. Produces both binary collapse
#     flags and severity classifications for each frame.
    
#     Parameters
#     ----------
#     merged_df : pd.DataFrame
#         Frame-level data with both QB separation and convergence metrics.
        
#         Required columns:
#         - qb_min_separation : float
#         - defenders_immediate_zone : int
#         - defenders_closing_zone : int
#         - defenders_potential_zone : int
        
#         Optional columns:
#         - separation_velocity : float
    
#     config : Dict, optional
#         Custom configuration for collapse thresholds.
#         If None, uses ENHANCED_COLLAPSE_CONFIG constant.
    
#     verbose : bool, default=True
#         Print collapse detection statistics and diagnostics.
    
#     Returns
#     -------
#     pd.DataFrame
#         Input dataframe with additional collapse-related columns:
#         - collapse_factor_separation : bool
#         - collapse_factor_convergence : bool
#         - collapse_factor_immediate_threat : bool
#         - collapse_factor_velocity : bool
#         - is_pocket_collapse : bool
#         - collapse_severity : str
    
#     Raises
#     ------
#     ValueError
#         If required columns are missing from input DataFrame.
#     TypeError
#         If merged_df is not a pandas DataFrame.
    
#     Examples
#     --------
#     >>> enhanced_df = create_enhanced_collapse_indicator(merged_features)
#     >>> collapse_rate = enhanced_df['is_pocket_collapse'].mean()
#     >>> print(f"Collapse rate: {collapse_rate:.1%}")
    
#     Notes
#     -----
#     Enhanced Collapse Logic detects pocket collapse if ANY of these conditions are true:
#     1. Separation + Convergence: QB separation < 7 yds AND 2+ defenders within 7-yard zone
#     2. High Convergence (swarm): 3+ defenders within 5-yard zone
#     3. Immediate Threat: 2+ defenders within 3-yard immediate zone
#     4. Velocity + Convergence: Rapid closure (< -5 yds/sec) AND 1+ defenders within 5-yard zone
#     """
    
#     # =========================================================================
#     # INPUT VALIDATION
#     # =========================================================================
    
#     # Check if input is None
#     if merged_df is None:
#         raise ValueError(
#             "Input DataFrame is None. "
#             "Please ensure merged_pressure_features is created successfully."
#         )
    
#     # Check if input is a DataFrame
#     if not isinstance(merged_df, pd.DataFrame):
#         raise TypeError(
#             f"Expected pandas DataFrame, got {type(merged_df).__name__}. "
#             f"Please provide a valid merged DataFrame."
#         )
    
#     # Check if DataFrame is empty
#     if merged_df.empty:
#         raise ValueError(
#             "Input DataFrame is empty. "
#             "Please ensure data is loaded and merged correctly."
#         )
    
#     # Define required columns
#     required_columns = [
#         'qb_min_separation',
#         'defenders_immediate_zone',
#         'defenders_closing_zone',
#         'defenders_potential_zone'
#     ]
    
#     # Check for missing columns
#     missing_columns = [col for col in required_columns if col not in merged_df.columns]
    
#     if missing_columns:
#         print("\n" + "="*70)
#         print("ERROR: MISSING REQUIRED COLUMNS")
#         print("="*70)
#         print(f"Missing columns: {missing_columns}")
#         print(f"\nAvailable columns in input DataFrame:")
#         print(merged_df.columns.tolist())
#         print("="*70)
#         raise ValueError(
#             f"Missing required columns: {missing_columns}\n"
#             f"Required columns: {required_columns}\n"
#             f"Please ensure you've merged QB separation with convergence metrics."
#         )
    
#     # =========================================================================
#     # CONFIGURATION SETUP
#     # =========================================================================
    
#     # Use provided config or default
#     if config is None:
#         config = ENHANCED_COLLAPSE_CONFIG
    
#     if verbose:
#         print("\n" + "="*70)
#         print("CREATING ENHANCED POCKET COLLAPSE INDICATOR")
#         print("="*70)
#         print(f"Input DataFrame shape: {merged_df.shape}")
#         print(f"Method: Multi-dimensional collapse detection")
#         print(f"\nConfiguration:")
#         for key, value in config.items():
#             print(f"  • {key}: {value}")
#         print("-"*70)
    
#     # =========================================================================
#     # DATA PREPARATION
#     # =========================================================================
    
#     try:
#         # Create working copy to avoid modifying original data
#         df = merged_df.copy()
        
#         if verbose:
#             print(f"\nDataFrame copied successfully")
#             print(f"Working with {len(df):,} frames")
#             print(f"\nAvailable columns: {df.columns.tolist()}")
    
#     except Exception as e:
#         print(f"\n{'='*70}")
#         print("ERROR: Failed to copy DataFrame")
#         print("="*70)
#         print(f"Error type: {type(e).__name__}")
#         print(f"Error message: {str(e)}")
#         print("="*70)
#         raise
    
#     # =========================================================================
#     # EXTRACT CONFIGURATION PARAMETERS
#     # =========================================================================
    
#     sep_threshold = config.get('separation_threshold', 7.0)
#     critical_sep = config.get('critical_separation', 4.0)
#     min_converging = config.get('min_converging_defenders', 2)
#     critical_converging = config.get('critical_converging_defenders', 3)
#     velocity_threshold = config.get('velocity_threshold', -5.0)
    
#     if verbose:
#         print(f"\nConfiguration extracted:")
#         print(f"  • Separation threshold: {sep_threshold} yards")
#         print(f"  • Critical separation: {critical_sep} yards")
#         print(f"  • Min converging defenders: {min_converging}")
#         print(f"  • Critical converging defenders: {critical_converging}")
#         print(f"  • Velocity threshold: {velocity_threshold} yds/sec")
    
#     # =========================================================================
#     # CONDITION 1: SEPARATION + CONVERGENCE COLLAPSE
#     # =========================================================================
#     # QB separation low AND multiple defenders converging
    
#     df['collapse_factor_separation'] = (
#         (df['qb_min_separation'] < sep_threshold) &
#         (df['defenders_potential_zone'] >= min_converging)
#     )
    
#     # =========================================================================
#     # CONDITION 2: HIGH CONVERGENCE (SWARM)
#     # =========================================================================
#     # 3+ defenders within 5-yard zone regardless of QB separation
    
#     df['collapse_factor_convergence'] = (
#         df['defenders_closing_zone'] >= critical_converging
#     )
    
#     # =========================================================================
#     # CONDITION 3: IMMEDIATE THREAT
#     # =========================================================================
#     # 2+ defenders within 3-yard immediate zone (imminent sack)
    
#     df['collapse_factor_immediate_threat'] = (
#         df['defenders_immediate_zone'] >= 2
#     )
    
#     # =========================================================================
#     # CONDITION 4: VELOCITY + CONVERGENCE (IF VELOCITY AVAILABLE)
#     # =========================================================================
#     # Rapid closure (velocity < -5 yds/sec) AND defenders within 5-yard zone
    
#     if 'separation_velocity' in df.columns:
#         # Check for non-null velocity values
#         has_velocity_data = df['separation_velocity'].notna().any()
        
#         if has_velocity_data:
#             df['collapse_factor_velocity'] = (
#                 (df['separation_velocity'] < velocity_threshold) &
#                 (df['defenders_closing_zone'] >= 1)
#             )
#             if verbose:
#                 print(f"\nVelocity-based detection enabled")
#                 print(f"  • Frames with velocity data: {df['separation_velocity'].notna().sum():,}")
#         else:
#             df['collapse_factor_velocity'] = False
#             if verbose:
#                 print(f"\nVelocity column exists but contains no valid data")
#     else:
#         df['collapse_factor_velocity'] = False
#         if verbose:
#             print(f"\nVelocity-based detection disabled (no velocity column)")
    
#     # =========================================================================
#     # PRIMARY COLLAPSE INDICATOR (ANY CONDITION TRIGGERS COLLAPSE)
#     # =========================================================================
    
#     df['is_pocket_collapse'] = (
#         df['collapse_factor_separation'] |
#         df['collapse_factor_convergence'] |
#         df['collapse_factor_immediate_threat'] |
#         df['collapse_factor_velocity']
#     )
    
#     # =========================================================================
#     # COLLAPSE SEVERITY CLASSIFICATION
#     # =========================================================================
    
#     # Define severity conditions (evaluated in order of severity)
#     conditions = [
#         # Critical: 2+ defenders within 3 yards OR separation < 4 with 3+ converging
#         (df['defenders_immediate_zone'] >= 2) | 
#         ((df['qb_min_separation'] < critical_sep) & 
#          (df['defenders_potential_zone'] >= critical_converging)),
        
#         # Severe: 3+ defenders within 5 yards OR separation < 5 with 2+ converging
#         (df['defenders_closing_zone'] >= critical_converging) |
#         ((df['qb_min_separation'] < 5.0) & 
#          (df['defenders_potential_zone'] >= min_converging)),
        
#         # Moderate: 2+ defenders within 7 yards AND separation < 7
#         (df['defenders_potential_zone'] >= min_converging) & 
#         (df['qb_min_separation'] < sep_threshold),
        
#         # Light: Separation < 7 BUT only 0-1 converging defenders
#         (df['qb_min_separation'] < sep_threshold) & 
#         (df['defenders_potential_zone'] < min_converging)
#     ]
    
#     severity_levels = ['Critical', 'Severe', 'Moderate', 'Light']
#     df['collapse_severity'] = np.select(conditions, severity_levels, default='None')
    
#     # =========================================================================
#     # SUMMARY STATISTICS
#     # =========================================================================
    
#     if verbose:
#         print("\n" + "="*70)
#         print("COLLAPSE DETECTION COMPLETE")
#         print("="*70)
        
#         print(f"\nOverall Statistics:")
#         print(f"  • Total frames processed: {len(df):,}")
#         print(f"  • Collapsed frames: {df['is_pocket_collapse'].sum():,} "
#               f"({df['is_pocket_collapse'].mean():.1%})")
#         print(f"  • Clean pocket frames: {(~df['is_pocket_collapse']).sum():,} "
#               f"({(~df['is_pocket_collapse']).mean():.1%})")
        
#         print(f"\nSeverity Distribution:")
#         severity_counts = df['collapse_severity'].value_counts()
#         for severity in ['Critical', 'Severe', 'Moderate', 'Light', 'None']:
#             count = severity_counts.get(severity, 0)
#             pct = (count / len(df) * 100) if len(df) > 0 else 0
#             print(f"  • {severity:12s}: {count:5,} frames ({pct:5.1f}%)")
        
#         print(f"\nContributing Factors (frames triggered by each condition):")
#         print(f"  • Separation + Convergence: {df['collapse_factor_separation'].sum():,} frames")
#         print(f"  • High Convergence (Swarm): {df['collapse_factor_convergence'].sum():,} frames")
#         print(f"  • Immediate Threat (3yds): {df['collapse_factor_immediate_threat'].sum():,} frames")
#         print(f"  • Velocity-based Closure: {df['collapse_factor_velocity'].sum():,} frames")
        
#         print(f"\nNew Columns Added:")
#         new_cols = [
#             'collapse_factor_separation',
#             'collapse_factor_convergence',
#             'collapse_factor_immediate_threat',
#             'collapse_factor_velocity',
#             'is_pocket_collapse',
#             'collapse_severity'
#         ]
#         for col in new_cols:
#             print(f"  • {col}")
        
#         print("="*70)
    
#     return df

#### QB Merge Frame Level - Separation, Pressure Velocity, Convergence

In [15]:
# def create_enhanced_collapse_indicator(
#     merged_df: pd.DataFrame,
#     config: Optional[Dict] = None,
#     verbose: bool = True
# ) -> pd.DataFrame:
#     """
#     Create enhanced pocket collapse indicator using multi-dimensional criteria.
    
#     Applies sophisticated collapse detection logic that combines QB separation,
#     defender convergence, and velocity metrics. Produces both binary collapse
#     flags and severity classifications for each frame.
    
#     Parameters
#     ----------
#     merged_df : pd.DataFrame
#         Frame-level data with both QB separation and convergence metrics.
        
#         Required columns:
#         - qb_min_separation : float
#         - defenders_immediate_zone : int
#         - defenders_closing_zone : int
#         - defenders_potential_zone : int
        
#         Optional columns:
#         - separation_velocity : float
    
#     config : Dict, optional
#         Custom configuration for collapse thresholds.
#         If None, uses ENHANCED_COLLAPSE_CONFIG constant.
    
#     verbose : bool, default=True
#         Print collapse detection statistics and diagnostics.
    
#     Returns
#     -------
#     pd.DataFrame
#         Input dataframe with additional collapse-related columns:
#         - collapse_factor_separation : bool
#         - collapse_factor_convergence : bool
#         - collapse_factor_immediate_threat : bool
#         - collapse_factor_velocity : bool
#         - is_pocket_collapse : bool
#         - collapse_severity : str
    
#     Raises
#     ------
#     ValueError
#         If required columns are missing from input DataFrame.
#     TypeError
#         If merged_df is not a pandas DataFrame.
    
#     Examples
#     --------
#     >>> enhanced_df = create_enhanced_collapse_indicator(merged_features)
#     >>> collapse_rate = enhanced_df['is_pocket_collapse'].mean()
#     >>> print(f"Collapse rate: {collapse_rate:.1%}")
    
#     Notes
#     -----
#     Enhanced Collapse Logic detects pocket collapse if ANY of these conditions are true:
#     1. Separation + Convergence: QB separation < 7 yds AND 2+ defenders within 7-yard zone
#     2. High Convergence (swarm): 3+ defenders within 5-yard zone
#     3. Immediate Threat: 2+ defenders within 3-yard immediate zone
#     4. Velocity + Convergence: Rapid closure (< -5 yds/sec) AND 1+ defenders within 5-yard zone
#     """
    
#     # =========================================================================
#     # INPUT VALIDATION
#     # =========================================================================
    
#     # Check if input is None
#     if merged_df is None:
#         raise ValueError(
#             "Input DataFrame is None. "
#             "Please ensure merged_pressure_features is created successfully."
#         )
    
#     # Check if input is a DataFrame
#     if not isinstance(merged_df, pd.DataFrame):
#         raise TypeError(
#             f"Expected pandas DataFrame, got {type(merged_df).__name__}. "
#             f"Please provide a valid merged DataFrame."
#         )
    
#     # Check if DataFrame is empty
#     if merged_df.empty:
#         raise ValueError(
#             "Input DataFrame is empty. "
#             "Please ensure data is loaded and merged correctly."
#         )
    
#     # Define required columns
#     required_columns = [
#         'qb_min_separation',
#         'defenders_immediate_zone',
#         'defenders_closing_zone',
#         'defenders_potential_zone'
#     ]
    
#     # Check for missing columns
#     missing_columns = [col for col in required_columns if col not in merged_df.columns]
    
#     if missing_columns:
#         print("\n" + "="*70)
#         print("ERROR: MISSING REQUIRED COLUMNS")
#         print("="*70)
#         print(f"Missing columns: {missing_columns}")
#         print(f"\nAvailable columns in input DataFrame:")
#         print(merged_df.columns.tolist())
#         print("="*70)
#         raise ValueError(
#             f"Missing required columns: {missing_columns}\n"
#             f"Required columns: {required_columns}\n"
#             f"Please ensure you've merged QB separation with convergence metrics."
#         )
    
#     # =========================================================================
#     # CONFIGURATION SETUP
#     # =========================================================================
    
#     # Use provided config or default
#     if config is None:
#         config = ENHANCED_COLLAPSE_CONFIG
    
#     if verbose:
#         print("\n" + "="*70)
#         print("CREATING ENHANCED POCKET COLLAPSE INDICATOR")
#         print("="*70)
#         print(f"Input DataFrame shape: {merged_df.shape}")
#         print(f"Method: Multi-dimensional collapse detection")
#         print(f"\nConfiguration:")
#         for key, value in config.items():
#             print(f"  • {key}: {value}")
#         print("-"*70)
    
#     # =========================================================================
#     # DATA PREPARATION
#     # =========================================================================
    
#     try:
#         # Create working copy to avoid modifying original data
#         df = merged_df.copy()
        
#         if verbose:
#             print(f"\nDataFrame copied successfully")
#             print(f"Working with {len(df):,} frames")
#             print(f"\nAvailable columns: {df.columns.tolist()}")
    
#     except Exception as e:
#         print(f"\n{'='*70}")
#         print("ERROR: Failed to copy DataFrame")
#         print("="*70)
#         print(f"Error type: {type(e).__name__}")
#         print(f"Error message: {str(e)}")
#         print("="*70)
#         raise
    
#     # =========================================================================
#     # EXTRACT CONFIGURATION PARAMETERS
#     # =========================================================================
    
#     sep_threshold = config.get('separation_threshold', 7.0)
#     critical_sep = config.get('critical_separation', 4.0)
#     min_converging = config.get('min_converging_defenders', 2)
#     critical_converging = config.get('critical_converging_defenders', 3)
#     velocity_threshold = config.get('velocity_threshold', -5.0)
    
#     if verbose:
#         print(f"\nConfiguration extracted:")
#         print(f"  • Separation threshold: {sep_threshold} yards")
#         print(f"  • Critical separation: {critical_sep} yards")
#         print(f"  • Min converging defenders: {min_converging}")
#         print(f"  • Critical converging defenders: {critical_converging}")
#         print(f"  • Velocity threshold: {velocity_threshold} yds/sec")
    
#     # =========================================================================
#     # CONDITION 1: SEPARATION + CONVERGENCE COLLAPSE
#     # =========================================================================
#     # QB separation low AND multiple defenders converging
    
#     df['collapse_factor_separation'] = (
#         (df['qb_min_separation'] < sep_threshold) &
#         (df['defenders_potential_zone'] >= min_converging)
#     )
    
#     # =========================================================================
#     # CONDITION 2: HIGH CONVERGENCE (SWARM)
#     # =========================================================================
#     # 3+ defenders within 5-yard zone regardless of QB separation
    
#     df['collapse_factor_convergence'] = (
#         df['defenders_closing_zone'] >= critical_converging
#     )
    
#     # =========================================================================
#     # CONDITION 3: IMMEDIATE THREAT
#     # =========================================================================
#     # 2+ defenders within 3-yard immediate zone (imminent sack)
    
#     df['collapse_factor_immediate_threat'] = (
#         df['defenders_immediate_zone'] >= 2
#     )
    
#     # =========================================================================
#     # CONDITION 4: VELOCITY + CONVERGENCE (IF VELOCITY AVAILABLE)
#     # =========================================================================
#     # Rapid closure (velocity < -5 yds/sec) AND defenders within 5-yard zone
    
#     if 'separation_velocity' in df.columns:
#         # Check for non-null velocity values
#         has_velocity_data = df['separation_velocity'].notna().any()
        
#         if has_velocity_data:
#             df['collapse_factor_velocity'] = (
#                 (df['separation_velocity'] < velocity_threshold) &
#                 (df['defenders_closing_zone'] >= 1)
#             )
#             if verbose:
#                 print(f"\nVelocity-based detection enabled")
#                 print(f"  • Frames with velocity data: {df['separation_velocity'].notna().sum():,}")
#         else:
#             df['collapse_factor_velocity'] = False
#             if verbose:
#                 print(f"\nVelocity column exists but contains no valid data")
#     else:
#         df['collapse_factor_velocity'] = False
#         if verbose:
#             print(f"\nVelocity-based detection disabled (no velocity column)")
    
#     # =========================================================================
#     # PRIMARY COLLAPSE INDICATOR (ANY CONDITION TRIGGERS COLLAPSE)
#     # =========================================================================
    
#     df['is_pocket_collapse'] = (
#         df['collapse_factor_separation'] |
#         df['collapse_factor_convergence'] |
#         df['collapse_factor_immediate_threat'] |
#         df['collapse_factor_velocity']
#     )
    
#     # =========================================================================
#     # COLLAPSE SEVERITY CLASSIFICATION
#     # =========================================================================
    
#     # Define severity conditions (evaluated in order of severity)
#     conditions = [
#         # Critical: 2+ defenders within 3 yards OR separation < 4 with 3+ converging
#         (df['defenders_immediate_zone'] >= 2) | 
#         ((df['qb_min_separation'] < critical_sep) & 
#          (df['defenders_potential_zone'] >= critical_converging)),
        
#         # Severe: 3+ defenders within 5 yards OR separation < 5 with 2+ converging
#         (df['defenders_closing_zone'] >= critical_converging) |
#         ((df['qb_min_separation'] < 5.0) & 
#          (df['defenders_potential_zone'] >= min_converging)),
        
#         # Moderate: 2+ defenders within 7 yards AND separation < 7
#         (df['defenders_potential_zone'] >= min_converging) & 
#         (df['qb_min_separation'] < sep_threshold),
        
#         # Light: Separation < 7 BUT only 0-1 converging defenders
#         (df['qb_min_separation'] < sep_threshold) & 
#         (df['defenders_potential_zone'] < min_converging)
#     ]
    
#     severity_levels = ['Critical', 'Severe', 'Moderate', 'Light']
#     df['collapse_severity'] = np.select(conditions, severity_levels, default='None')
    
#     # =========================================================================
#     # SUMMARY STATISTICS
#     # =========================================================================
    
#     if verbose:
#         print("\n" + "="*70)
#         print("COLLAPSE DETECTION COMPLETE")
#         print("="*70)
        
#         print(f"\nOverall Statistics:")
#         print(f"  • Total frames processed: {len(df):,}")
#         print(f"  • Collapsed frames: {df['is_pocket_collapse'].sum():,} "
#               f"({df['is_pocket_collapse'].mean():.1%})")
#         print(f"  • Clean pocket frames: {(~df['is_pocket_collapse']).sum():,} "
#               f"({(~df['is_pocket_collapse']).mean():.1%})")
        
#         print(f"\nSeverity Distribution:")
#         severity_counts = df['collapse_severity'].value_counts()
#         for severity in ['Critical', 'Severe', 'Moderate', 'Light', 'None']:
#             count = severity_counts.get(severity, 0)
#             pct = (count / len(df) * 100) if len(df) > 0 else 0
#             print(f"  • {severity:12s}: {count:5,} frames ({pct:5.1f}%)")
        
#         print(f"\nContributing Factors (frames triggered by each condition):")
#         print(f"  • Separation + Convergence: {df['collapse_factor_separation'].sum():,} frames")
#         print(f"  • High Convergence (Swarm): {df['collapse_factor_convergence'].sum():,} frames")
#         print(f"  • Immediate Threat (3yds): {df['collapse_factor_immediate_threat'].sum():,} frames")
#         print(f"  • Velocity-based Closure: {df['collapse_factor_velocity'].sum():,} frames")
        
#         print(f"\nNew Columns Added:")
#         new_cols = [
#             'collapse_factor_separation',
#             'collapse_factor_convergence',
#             'collapse_factor_immediate_threat',
#             'collapse_factor_velocity',
#             'is_pocket_collapse',
#             'collapse_severity'
#         ]
#         for col in new_cols:
#             print(f"  • {col}")
        
#         print("="*70)
    
#     return df

In [16]:
# ============================================================================
# FUNCTION 1: MERGE QB SEPARATION WITH CONVERGENCE
# ============================================================================

def merge_qb_separation_with_convergence(
    qb_separation_df: pd.DataFrame,
    convergence_df: pd.DataFrame,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Merge frame-level QB separation data with defender convergence metrics.

    Left-joins the convergence columns onto the QB separation frames so that
    every QB frame is retained, then fills convergence gaps with neutral
    values (0 for counts, 'None' for the category).

    Parameters
    ----------
    qb_separation_df : pd.DataFrame
        Frame-level QB data (left side of the join). Must contain the merge
        keys ``game_id``, ``play_id``, ``frame_id`` and ``nfl_id``. All other
        columns (e.g. ``qb_min_separation``, ``separation_velocity``,
        ``pressure_acceleration``, ``player_name``) are passed through
        unchanged.

    convergence_df : pd.DataFrame
        Frame-level defender convergence data (right side of the join). Must
        contain the same four merge keys. The following columns are merged
        when present and silently skipped when absent:

        - defenders_immediate_zone
        - defenders_closing_zone
        - defenders_potential_zone
        - total_converging_defenders
        - convergence_category
        - total_defenders_on_field

        ``closest_defender_distance`` is intentionally NOT merged because it
        duplicates ``qb_min_separation``.

    verbose : bool, default=True
        Print input sizes, merge statistics and data-quality warnings.

    Returns
    -------
    pd.DataFrame
        All columns of ``qb_separation_df`` plus the available convergence
        columns. Numeric convergence columns are cast to ``int`` after missing
        values are replaced with 0; a missing ``convergence_category`` becomes
        the string ``'None'``. Collapse indicators are not created here.

    Raises
    ------
    KeyError
        Raised by pandas when a merge key is missing from either input.

    Notes
    -----
    **Merge strategy**

    - Type: LEFT join on (game_id, play_id, frame_id, nfl_id).
    - If a non-key column name exists on both sides, the convergence copy
      receives the ``_conv`` suffix and the fill step below applies to the
      un-suffixed (QB-side) column.

    **Match rate**

    The match rate is the share of merged rows whose key was found in
    ``convergence_df``. It is taken from the merge indicator *before* missing
    values are filled; computing it afterwards would always report 100%
    because the fill removes every NaN. A rate below 95% triggers a printed
    warning (verbose mode only); no exception is raised.

    **Row count**

    Duplicate keys in ``convergence_df`` make a left join emit more rows than
    ``qb_separation_df`` has. This is reported as a warning in verbose mode.

    Examples
    --------
    >>> merged_df = merge_qb_separation_with_convergence(
    ...     qb_velocity_frame_level,
    ...     convergence_frame_level
    ... )
    >>> assert 'is_pocket_collapse' not in merged_df.columns
    """

    merge_keys = ['game_id', 'play_id', 'frame_id', 'nfl_id']

    if verbose:
        print("="*70)
        print("MERGING QB SEPARATION WITH CONVERGENCE (FRAME-LEVEL)")
        print("="*70)
        print(f"QB separation data: {qb_separation_df.shape[0]:,} frames")
        print(f"Convergence data: {convergence_df.shape[0]:,} frames")
        print("\nPurpose: Combine pressure metrics for collapse detection")
        print("Note: Collapse indicators created in next step")

    # -------------------------------------------------------------------------
    # COLUMN SELECTION
    # -------------------------------------------------------------------------

    # Convergence columns to bring across; 'closest_defender_distance' is
    # left out on purpose because it duplicates qb_min_separation.
    convergence_cols_to_merge = merge_keys + [
        'defenders_immediate_zone',
        'defenders_closing_zone',
        'defenders_potential_zone',
        'total_converging_defenders',
        'convergence_category',
        'total_defenders_on_field'
    ]

    # Keep only the columns that the convergence frame actually provides
    convergence_available_cols = [
        col for col in convergence_cols_to_merge
        if col in convergence_df.columns
    ]

    # -------------------------------------------------------------------------
    # MERGE DATAFRAMES
    # -------------------------------------------------------------------------

    # Temporary indicator column; prefix underscores until the name is unused
    # so it can never clash with a real column on either side.
    indicator_col = '_convergence_merge_status'
    while (indicator_col in qb_separation_df.columns
           or indicator_col in convergence_df.columns):
        indicator_col = '_' + indicator_col

    # Left join keeps every QB frame; the indicator records which rows found
    # a partner in the convergence data.
    merged_df = qb_separation_df.merge(
        convergence_df[convergence_available_cols],
        on=merge_keys,
        how='left',
        suffixes=('', '_conv'),  # Handle any column overlaps
        indicator=indicator_col
    )

    # Capture match status BEFORE filling NaNs, then drop the helper column
    matched_mask = merged_df.pop(indicator_col) == 'both'

    # -------------------------------------------------------------------------
    # HANDLE MISSING VALUES
    # -------------------------------------------------------------------------

    # Frames without convergence data are treated as zero convergence
    convergence_numeric_cols = [
        'defenders_immediate_zone',
        'defenders_closing_zone',
        'defenders_potential_zone',
        'total_converging_defenders',
        'total_defenders_on_field'
    ]

    for col in convergence_numeric_cols:
        if col in merged_df.columns:
            merged_df[col] = merged_df[col].fillna(0).astype(int)

    # Fill missing convergence category
    if 'convergence_category' in merged_df.columns:
        merged_df['convergence_category'] = merged_df['convergence_category'].fillna('None')

    # -------------------------------------------------------------------------
    # MERGE QUALITY VALIDATION
    # -------------------------------------------------------------------------

    if verbose:
        matched_frames = int(matched_mask.sum())
        match_rate = (matched_frames / len(merged_df) * 100) if len(merged_df) > 0 else 0

        print(f"\nMerge Results:")
        print(f"  • Total frames after merge: {len(merged_df):,}")
        print(f"  • Frames with convergence data: {matched_frames:,}")
        print(f"  • Match rate: {match_rate:.1f}%")
        print(f"  • Total columns: {merged_df.shape[1]}")

        # A left join only grows when the right side repeats a key
        if len(merged_df) != len(qb_separation_df):
            print(f"\n  ⚠ Warning: Row count changed during merge "
                  f"({len(qb_separation_df):,} → {len(merged_df):,})")
            print(f"     convergence_df likely contains duplicate merge keys")

        # Validation warnings
        if match_rate < 95:
            print(f"\n  ⚠ Warning: Low match rate ({match_rate:.1f}%)")
            print(f"     Expected ~100% if same input_data used")
            print(f"     Missing data filled with zeros (safe for frames without defenders)")
        else:
            print(f"  ✓ Good match rate - data sources aligned")

        print("="*70)

    return merged_df


In [17]:
def organize_qb_frame_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return ``df`` with its columns rearranged into the canonical QB layout.

    Known columns come first, in the fixed order identifiers → player info →
    separation metrics → convergence counts → collapse indicators. Columns
    that are not part of that layout keep their original relative order and
    are appended at the end. Known columns missing from ``df`` are skipped.
    """
    identifier_cols = ['game_id', 'play_id', 'nfl_id', 'frame_id']
    player_cols = ['player_name', 'player_role', 'player_position']
    separation_cols = [
        'qb_min_separation',
        'separation_velocity',
        'pressure_acceleration',
    ]
    convergence_cols = [
        'defenders_immediate_zone',
        'defenders_closing_zone',
        'defenders_potential_zone',
        'total_converging_defenders',
        'convergence_category',
        'total_defenders_on_field',
    ]
    collapse_cols = [
        'collapse_factor_separation',
        'collapse_factor_convergence',
        'collapse_factor_immediate_threat',
        'collapse_factor_velocity',
        'is_pocket_collapse',
        'collapse_severity',
    ]
    preferred_order = (
        identifier_cols
        + player_cols
        + separation_cols
        + convergence_cols
        + collapse_cols
    )

    # Known columns that exist in this frame, in canonical order
    leading = []
    for name in preferred_order:
        if name in df.columns:
            leading.append(name)

    # Everything else, in the order it already appears in the frame
    trailing = []
    for name in df.columns:
        if name not in leading:
            trailing.append(name)

    return df[leading + trailing]

### Frame Level - TR

In [18]:
def calculate_frame_level_separation_tr(df):
    """
    Compute, per frame, the minimum distance from each Targeted Receiver to
    any defender in coverage.

    Parameters
    ----------
    df : pd.DataFrame
        Tracking data. Columns read by this function:

        - game_id, play_id, frame_id : grouping keys (one group per frame)
        - nfl_id, player_name, player_position : copied to the output
        - player_role : rows equal to 'Targeted Receiver' are the receivers,
          rows equal to 'Defensive Coverage' are the defenders
        - x, y : player coordinates used for the distance calculation
        - route_of_targeted_receiver, dir : copied to the output

    Returns
    -------
    pd.DataFrame
        One row per (frame, Targeted Receiver) with columns, in this order:
        game_id, play_id, nfl_id, frame_id, player_name, player_role,
        player_position, tr_min_separation, route_of_targeted_receiver, dir.

        ``tr_min_separation`` is the smallest receiver-to-defender distance in
        that frame, rounded to 2 decimals. Rows are sorted by
        (game_id, play_id, nfl_id, frame_id) with a fresh index.

        Frames that lack either a Targeted Receiver or a defender are skipped.
        If no frame qualifies, an EMPTY DataFrame that still carries the
        columns above is returned, so downstream column access and merges do
        not fail on a column-less frame.

    Notes
    -----
    Distances come from ``calculate_euclidean_distance_vectorized``, which is
    defined elsewhere in this notebook.
    NOTE(review): it is assumed to return one distance per defender for the
    given receiver position - confirm against its definition.
    """

    # Fixed output schema; also used to shape the empty result
    output_columns = [
        'game_id', 'play_id', 'nfl_id', 'frame_id',
        'player_name', 'player_role', 'player_position',
        'tr_min_separation', 'route_of_targeted_receiver', 'dir'
    ]

    # Group by game, play, and frame (FRAME-LEVEL calculation)
    grouped = df.groupby(['game_id', 'play_id', 'frame_id'])

    # Collect one record per receiver per frame; DataFrame is built once below
    tr_results = []

    for (game_id, play_id, frame_id), frame_group in grouped:
        # Identify Targeted Receivers and defenders in this specific frame
        target_receivers = frame_group[frame_group['player_role'] == 'Targeted Receiver']
        defenders = frame_group[frame_group['player_role'] == 'Defensive Coverage']

        # A separation needs at least one receiver and one defender
        if target_receivers.empty or defenders.empty:
            continue

        # Extract defender coordinates once per frame for vectorized distance
        def_x = defenders['x'].values
        def_y = defenders['y'].values

        for _, tr_row in target_receivers.iterrows():
            # Distance from this receiver to every defender in the frame
            distances = calculate_euclidean_distance_vectorized(
                tr_row['x'], tr_row['y'], def_x, def_y
            )

            tr_results.append({
                'game_id': game_id,
                'play_id': play_id,
                'nfl_id': tr_row['nfl_id'],
                'frame_id': frame_id,
                'player_name': tr_row['player_name'],
                'player_role': tr_row['player_role'],
                'player_position': tr_row['player_position'],
                # Closest defender, rounded to 2 decimals
                'tr_min_separation': np.round(np.min(distances), 2),
                'route_of_targeted_receiver': tr_row['route_of_targeted_receiver'],
                'dir': tr_row['dir']
            })

    # Passing the schema keeps the columns present even with zero records
    result_df = pd.DataFrame(tr_results, columns=output_columns)

    # Sort by game, play, receiver, and frame for chronological analysis
    if not result_df.empty:
        result_df = result_df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id']).reset_index(drop=True)

    return result_df

### Play Level Aggregation

#### Play Level - QB Separation, Pressure Velocity, Convergence and Pocket Collapse

In [19]:
# ============================================================================
# QB PLAY-LEVEL AGGREGATION
# ============================================================================
# Project: NFL Passing Game Analysis
# Author: Hsotuhsa-S
# Date: 2025-11-19 23:35:15
# Purpose: Aggregate complete QB frame-level features to play-level
# Version: Corrected - Pure aggregation, no feature engineering
#
# Design Principle:
#   - Input: Complete frame-level features (already calculated)
#   - Process: GROUP BY play, AGGREGATE with statistical functions
#   - Output: Play-level summary statistics
#   
#
# Dependencies:
#   - pandas >= 1.3.0
#   - numpy >= 1.21.0
# ============================================================================

import pandas as pd
import numpy as np
from typing import Optional
import warnings
warnings.filterwarnings('ignore')


def aggregate_qb_features_to_play_level(
    frame_df: pd.DataFrame,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Aggregate QB frame-level features to play-level using pandas groupby.

    This is pure aggregation: rows are grouped by (game_id, play_id, nfl_id)
    and each available feature column is summarised with built-in pandas
    reducers. No new frame-level features are computed here.

    Parameters
    ----------
    frame_df : pd.DataFrame
        Frame-level QB data.

        Required columns:
        - game_id, play_id, nfl_id (grouping keys)

        Optional columns (each is aggregated only when present):
        - qb_min_separation
        - separation_velocity
        - pressure_acceleration
        - total_converging_defenders
        - defenders_immediate_zone, defenders_closing_zone,
          defenders_potential_zone
        - player_name, player_role (metadata, first value per play)

    verbose : bool, default=True
        Print aggregation summary.

    Returns
    -------
    pd.DataFrame
        One row per (game_id, play_id, nfl_id). Columns are the three
        grouping keys followed by, for every optional input column present:

        - qb_min_separation -> qb_play_min_separation (min),
          qb_play_avg_separation (mean), qb_play_var_separation (var)
        - separation_velocity -> max_pressure_velocity (min, i.e. the most
          negative value), pressure_velocity_avg (mean),
          pressure_volatility (std)
        - pressure_acceleration -> max_pressure_acceleration (min),
          pressure_acceleration_avg (mean)
        - total_converging_defenders -> max_converging_defenders (max),
          avg_converging_defenders (mean), std_converging_defenders (std)
        - defenders_immediate_zone -> max_defenders_immediate_zone (max)
        - defenders_closing_zone -> max_defenders_closing_zone (max),
          avg_defenders_closing_zone (mean)
        - defenders_potential_zone -> max_defenders_potential_zone (max)
        - player_name, player_role -> first value in the play

        When no play can be formed (empty input, or every row has a null
        grouping key) an empty frame with the same column layout is returned.

    Raises
    ------
    KeyError
        If any of game_id, play_id, nfl_id is missing from ``frame_df``.

    Notes
    -----
    - Standard-deviation style columns (names containing ``std_`` or
      ``_volatility``) are NaN for single-frame plays and are filled with 0.
      ``qb_play_var_separation`` is left as NaN in that situation.
    - This function does not produce ``time_to_throw`` or a frame count.
    - Rows whose grouping keys are null are dropped by ``groupby``.

    Examples
    --------
    >>> qb_play = aggregate_qb_features_to_play_level(
    ...     enhanced_collapse_frame_level
    ... )
    >>> qb_play[['game_id', 'play_id', 'qb_play_min_separation']].head()
    """

    group_keys = ['game_id', 'play_id', 'nfl_id']

    missing_keys = [key for key in group_keys if key not in frame_df.columns]
    if missing_keys:
        raise KeyError(
            f"frame_df is missing required grouping columns: {missing_keys}"
        )

    if verbose:
        print("="*70)
        print("AGGREGATING QB FRAME-LEVEL FEATURES TO PLAY-LEVEL")
        print("="*70)
        print(f"Input: {frame_df.shape[0]:,} frames")

    # -------------------------------------------------------------------------
    # DEFINE AGGREGATION SPECIFICATION
    # -------------------------------------------------------------------------
    # source column -> list of (output column name, pandas reducer).
    # Insertion order fixes the output column order.

    candidate_aggregations = {
        'qb_min_separation': [
            ('qb_play_min_separation', 'min'),      # Peak pressure
            ('qb_play_avg_separation', 'mean'),     # Sustained pressure
            ('qb_play_var_separation', 'var')       # Volatility
        ],
        'separation_velocity': [
            ('max_pressure_velocity', 'min'),       # Most negative
            ('pressure_velocity_avg', 'mean'),
            ('pressure_volatility', 'std')
        ],
        'pressure_acceleration': [
            ('max_pressure_acceleration', 'min'),
            ('pressure_acceleration_avg', 'mean')
        ],
        'total_converging_defenders': [
            ('max_converging_defenders', 'max'),
            ('avg_converging_defenders', 'mean'),
            ('std_converging_defenders', 'std')
        ],
        'defenders_immediate_zone': [
            ('max_defenders_immediate_zone', 'max')
        ],
        'defenders_closing_zone': [
            ('max_defenders_closing_zone', 'max'),
            ('avg_defenders_closing_zone', 'mean')
        ],
        'defenders_potential_zone': [
            ('max_defenders_potential_zone', 'max')
        ],
        'player_name': [('player_name', 'first')],
        'player_role': [('player_role', 'first')],
    }

    # Only aggregate the columns that actually exist in the input
    agg_dict = {
        source: spec
        for source, spec in candidate_aggregations.items()
        if source in frame_df.columns
    }
    output_columns = group_keys + [
        new_name for spec in agg_dict.values() for new_name, _ in spec
    ]

    # -------------------------------------------------------------------------
    # GUARD: NOTHING TO GROUP
    # -------------------------------------------------------------------------
    # groupby drops rows with null keys, so without at least one fully keyed
    # row there is no play to report (and the frames-to-plays ratio below
    # would divide by zero).

    if not frame_df[group_keys].notna().all(axis=1).any():
        if verbose:
            print("No frames to aggregate - returning an empty play-level frame")
            print("="*70)
        return pd.DataFrame(columns=output_columns)

    # -------------------------------------------------------------------------
    # PERFORM AGGREGATION (Simple groupby)
    # -------------------------------------------------------------------------

    if agg_dict:
        play_agg = frame_df.groupby(group_keys).agg(agg_dict)
        # Columns are a (source, new_name) MultiIndex; keep only the new names
        play_agg.columns = play_agg.columns.get_level_values(1)
        play_agg = play_agg.reset_index()
    else:
        # No feature columns present: .agg({}) would fail, so return the
        # distinct play identifiers in the same sorted order groupby uses.
        play_agg = (
            frame_df[group_keys]
            .dropna()
            .drop_duplicates()
            .sort_values(group_keys)
            .reset_index(drop=True)
        )

    if verbose:
        print(f"✓ Aggregated to {len(play_agg):,} rows")
        print(f"✓ Features: {len(play_agg.columns)} columns")

    # -------------------------------------------------------------------------
    # FILL MISSING VALUES
    # -------------------------------------------------------------------------

    # std is NaN for single-frame plays; treat that as "no variation"
    std_cols = [col for col in play_agg.columns if 'std_' in col or '_volatility' in col]
    for col in std_cols:
        play_agg[col] = play_agg[col].fillna(0)

    # -------------------------------------------------------------------------
    # SUMMARY
    # -------------------------------------------------------------------------

    if verbose:
        print(f"Output: {play_agg.shape[0]:,} plays × {play_agg.shape[1]} features")
        print(f"Reduction: {frame_df.shape[0] / play_agg.shape[0]:.1f}:1 (frames to plays)")

        feature_groups = {
            'Separation': [c for c in play_agg if 'separation' in c.lower()],
            'Velocity': [c for c in play_agg if 'velocity' in c.lower() or 'acceleration' in c.lower()],
            'Convergence': [c for c in play_agg if 'defender' in c.lower() or 'converg' in c.lower()],
            'Temporal': [c for c in play_agg if 'frame' in c.lower() or 'time' in c.lower()]
        }

        print(f"\nFeature Groups:")
        for group, features in feature_groups.items():
            if features:
                print(f"  • {group:15s}: {len(features):2d} features")

        print("="*70)

    return play_agg


#### Play Level - TR Separation

In [20]:
def tr_calculate_play_level_aggregates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate TR frame-level metrics to play-level statistics.

    Parameters
    ----------
    df : pd.DataFrame
        Frame-level TR separation data with columns:
        - game_id, play_id, nfl_id, frame_id
        - tr_min_separation : float
        - route_of_targeted_receiver : str
        - player_name, player_role, player_position
        - dir (its last value per play becomes ``tr_last_dir``)

        Only rows with ``player_role == 'Targeted Receiver'`` are used.

    Returns
    -------
    pd.DataFrame
        One row per (game_id, play_id, nfl_id) with columns, in order:
        game_id, play_id, nfl_id,
        tr_play_min_separation, tr_play_avg_separation,
        tr_play_var_separation (all rounded to 2 decimals),
        player_name, player_role, player_position,
        route_of_targeted_receiver,
        nums_frame_pre_throw (count of non-null frame_id values),
        time_to_throw (nums_frame_pre_throw * 0.1, rounded to 2 decimals),
        tr_min_separation_first, tr_min_separation_last, tr_last_dir.

        If there are no Targeted Receiver rows, an empty frame with this same
        column layout is returned so that downstream column selection and
        merges on game_id/play_id keep working.

    Notes
    -----
    **TR-Specific Aggregation Strategy:**

    - FIRST/LAST: Separation in the earliest / latest frame (by frame_id)
    - MIN: Tightest coverage moment
    - MEAN: Overall coverage quality
    - VAR: Coverage consistency (NaN for single-frame plays)

    ``GroupBy.first()`` / ``GroupBy.last()`` return the first / last
    *non-null* value of each column, so a null in the boundary frame falls
    back to the nearest frame that has a value.

    A line reporting the shape of the aggregate (before the first/last
    columns are attached) is always printed.

    Examples
    --------
    >>> tr_play_agg = tr_calculate_play_level_aggregates(tr_separation_df)
    >>>
    >>> # Analyze route effectiveness
    >>> route_analysis = tr_play_agg.groupby('route_of_targeted_receiver').agg({
    ...     'tr_play_avg_separation': 'mean',
    ...     'tr_play_min_separation': 'mean'
    ... })
    """

    group_keys = ['game_id', 'play_id', 'nfl_id']
    output_columns = group_keys + [
        'tr_play_min_separation', 'tr_play_avg_separation', 'tr_play_var_separation',
        'player_name', 'player_role', 'player_position',
        'route_of_targeted_receiver',
        'nums_frame_pre_throw', 'time_to_throw',
        'tr_min_separation_first', 'tr_min_separation_last', 'tr_last_dir',
    ]

    # Filter to Targeted Receivers only
    tr_data = df[df['player_role'] == 'Targeted Receiver']

    if tr_data.empty:
        # Keep the schema so callers can still select / merge on these columns
        return pd.DataFrame(columns=output_columns)

    # -------------------------------------------------------------------------
    # AGGREGATION FUNCTIONS
    # -------------------------------------------------------------------------

    play_aggregates = tr_data.groupby(group_keys).agg({
        # Separation metrics
        'tr_min_separation': [
            ('tr_play_min_separation', 'min'),      # Tightest coverage
            ('tr_play_avg_separation', 'mean'),     # Average coverage
            ('tr_play_var_separation', 'var')       # Coverage consistency
        ],

        # Player information
        'player_name': [('player_name', 'first')],
        'player_role': [('player_role', 'first')],
        'player_position': [('player_position', 'first')],
        'route_of_targeted_receiver': [('route_of_targeted_receiver', 'first')],

        # Temporal features
        'frame_id': [
            ('nums_frame_pre_throw', 'count')       # Route duration
        ]
    })

    # Columns are a (source, new_name) MultiIndex; keep only the new names
    play_aggregates.columns = play_aggregates.columns.get_level_values(1)
    play_aggregates = play_aggregates.reset_index(drop=False)

    # Convert frame count to seconds: 0.1 s per frame (10 frames per second)
    play_aggregates['time_to_throw'] = (play_aggregates['nums_frame_pre_throw'] * 0.1).round(2)

    # Print shape of aggregated DataFrame
    print(f"TR Play-level aggregates shape: {play_aggregates.shape}")

    # Round separation metrics to a consistent 2 decimal places
    separation_cols = ['tr_play_min_separation', 'tr_play_avg_separation', 'tr_play_var_separation']
    for col in separation_cols:
        if col in play_aggregates.columns:
            play_aggregates[col] = play_aggregates[col].round(2)

    # -------------------------------------------------------------------------
    # FIRST AND LAST FRAME SEPARATION
    # -------------------------------------------------------------------------

    # Sort once by frame and reuse the grouping for both ends of the play
    frames_in_order = tr_data.sort_values('frame_id').groupby(group_keys)

    first_frames = frames_in_order[['tr_min_separation']].first().rename(
        columns={'tr_min_separation': 'tr_min_separation_first'}
    )
    last_frames = frames_in_order[['tr_min_separation', 'dir']].last().rename(
        columns={'tr_min_separation': 'tr_min_separation_last', 'dir': 'tr_last_dir'}
    )

    # Both frames share the same group index, so an index join lines them up
    first_last = first_frames.join(last_frames).reset_index()

    # Merge with play aggregates
    play_aggregates = pd.merge(
        play_aggregates,
        first_last,
        on=group_keys,
        how='left'
    )

    return play_aggregates

#### Play Level - QB + TR + Supplementary Features

In [21]:
def merge_play_level_features(
    qb_play_enhanced: pd.DataFrame,
    tr_play_agg: pd.DataFrame,
    supplementary_data: pd.DataFrame,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Combine QB, TR and supplementary play-level data with inner joins.

    Only plays present in all three inputs survive: QB-only plays, plays
    without TR data and plays without a supplementary record are dropped.

    Parameters
    ----------
    qb_play_enhanced : pd.DataFrame
        Play-level QB features; used as the base. Must contain game_id and
        play_id. All of its columns are kept.
    tr_play_agg : pd.DataFrame
        Play-level TR features. Must contain game_id and play_id. Of the
        remaining columns only these are carried over (when present):
        tr_play_min_separation, tr_play_avg_separation,
        tr_play_var_separation, tr_min_separation_first,
        tr_min_separation_last, nums_frame_pre_throw, time_to_throw,
        route_of_targeted_receiver.
    supplementary_data : pd.DataFrame
        Play context / outcomes. Must contain game_id and play_id. Of the
        remaining columns only these are carried over (when present):
        pass_result, pass_length, pass_type, yards_gained,
        pass_location_type, dropback_type, dropback_distance, play_action,
        offense_formation, defenders_in_the_box, team_coverage_man_zone,
        team_coverage_type.
    verbose : bool, default=True
        Print a step-by-step merge report.

    Returns
    -------
    pd.DataFrame
        QB columns, then the selected TR columns, then the selected
        supplementary columns, for plays found in all three inputs.

    Notes
    -----
    - Joins are on (game_id, play_id). If ``tr_play_agg`` or
      ``supplementary_data`` holds more than one row for a play, the inner
      join repeats that play once per matching row.
    - The verbose report tolerates empty inputs (percentages are shown as
      0.0) and a supplementary table without ``pass_result``.
    """

    join_keys = ['game_id', 'play_id']

    def _pct(part: int, whole: int) -> float:
        # Percentage that is safe when the denominator is zero
        return (part / whole * 100) if whole else 0.0

    if verbose:
        print("\n" + "="*70)
        print("CORRECTED: TRACKING DATA ONLY - DROP INCOMPLETE PLAYS")
        print("="*70)
        print("Strategy: Inner joins - only keep plays with QB AND TR tracking data")

    # -------------------------------------------------------------------------
    # STEP 1: START WITH QB DATA (HAS ACTUAL PRESSURE METRICS)
    # -------------------------------------------------------------------------

    # merge() returns a new frame, so the caller's data is never modified
    base_df = qb_play_enhanced

    if verbose:
        print(f"\n[STEP 1] QB pressure data as base:")
        print(f"  • QB plays with pressure metrics: {len(base_df):,}")
        print(f"  • These have actual tracking-derived features")

    # -------------------------------------------------------------------------
    # STEP 2: INNER JOIN WITH TR DATA (KEEP ONLY MATCHING PLAYS)
    # -------------------------------------------------------------------------

    # TR columns to merge (nfl_id / player metadata are deliberately left out
    # so they do not collide with the QB columns of the same name)
    tr_cols_to_merge = [
        'game_id', 'play_id',
        'tr_play_min_separation',
        'tr_play_avg_separation',
        'tr_play_var_separation',
        'tr_min_separation_first',
        'tr_min_separation_last',
        'nums_frame_pre_throw',
        'time_to_throw',
        'route_of_targeted_receiver'
    ]

    tr_available = [col for col in tr_cols_to_merge if col in tr_play_agg.columns]

    # INNER JOIN: Only keep plays that exist in BOTH QB and TR datasets
    merged_qb_tr = base_df.merge(
        tr_play_agg[tr_available],
        on=join_keys,
        how='inner'
    )

    if verbose:
        qb_only_plays = len(base_df) - len(merged_qb_tr)
        print(f"\n[STEP 2] After QB + TR inner join:")
        print(f"  • Plays with BOTH QB and TR data: {len(merged_qb_tr):,}")
        print(f"  • QB-only plays dropped: {qb_only_plays:,}")
        print(f"  • Retention rate: {_pct(len(merged_qb_tr), len(base_df)):.1f}%")

        # Verify time_to_throw preservation
        if 'time_to_throw' in merged_qb_tr.columns:
            missing_ttt = merged_qb_tr['time_to_throw'].isna().sum()
            print(f"  • time_to_throw missing: {missing_ttt} (should be 0)")

    # -------------------------------------------------------------------------
    # STEP 3: INNER JOIN WITH SUPPLEMENTARY (KEEP ONLY PLAYS WITH OUTCOMES)
    # -------------------------------------------------------------------------

    supp_cols = [
        'game_id', 'play_id',
        'pass_result',                  # TARGET VARIABLE
        'pass_length', 'pass_type',
        'yards_gained',
        'pass_location_type',
        'dropback_type', 'dropback_distance',
        'play_action',
        'offense_formation',
        'defenders_in_the_box',
        'team_coverage_man_zone',
        'team_coverage_type'
    ]

    available_supp_cols = [col for col in supp_cols if col in supplementary_data.columns]

    # INNER JOIN: Only keep plays that have outcomes
    final_merged = merged_qb_tr.merge(
        supplementary_data[available_supp_cols],
        on=join_keys,
        how='inner'
    )

    if verbose:
        tracking_only_plays = len(merged_qb_tr) - len(final_merged)
        original_supp = len(supplementary_data)

        print(f"\n[STEP 3] After adding supplementary data (inner join):")
        print(f"  • Final plays with complete data: {len(final_merged):,}")
        print(f"  • Tracking-only plays dropped: {tracking_only_plays:,}")
        print(f"  • Total supplementary plays: {original_supp:,}")
        print(f"  • Supplementary plays used: {len(final_merged):,} ({_pct(len(final_merged), original_supp):.1f}%)")

        # Target variable check (the column is optional in the merge above,
        # so it may legitimately be absent here)
        if 'pass_result' in final_merged.columns:
            target_available = (~final_merged['pass_result'].isna()).sum()
            print(f"  • Plays with target variable: {target_available:,}")

            if target_available == len(final_merged):
                print(f"  ✓ All plays have target variable (perfect)")
            else:
                print(f"  ⚠ Some plays missing target: {len(final_merged) - target_available}")
        else:
            print(f"  ⚠ Target column 'pass_result' not found in supplementary data")

        # Final data quality summary
        print(f"\n[SUMMARY] Data Quality:")
        print(f"  ✓ All plays have QB pressure metrics (tracking-derived)")
        print(f"  ✓ All plays have TR coverage metrics (tracking-derived)")
        print(f"  ✓ All plays have game context (supplementary)")
        print(f"  ✓ No missing time_to_throw (from TR tracking)")
        print(f"  ✓ High-quality dataset: {len(final_merged):,} complete plays")

        print("="*70)

    return final_merged



## Create Pressure Component Scores

In [22]:
def create_pressure_component_scores(
    df: pd.DataFrame,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Derive three 0-100 pressure component scores for every play.

    A copy of ``df`` is returned with these columns added (each rounded to
    one decimal):

    - convergence_pressure_score: ``max_converging_defenders`` (nulls -> 0)
      divided by the 95th percentile of its positive values (floored at 1),
      times 100, capped to [0, 100].
    - velocity_pressure_score: ``max_pressure_velocity`` (nulls -> 0)
      clipped to [-25, 0]; its magnitude is divided by 15, times 100, and
      capped to [0, 100], so a closing speed of 15 or more scores 100.
    - tr_coverage_pressure_score: ``tr_play_min_separation`` (nulls -> 8.0)
      mapped inversely over a 0-10 range: 0 scores 100, 10 or more scores 0.

    A score whose source column is absent is set to 0.0 for all rows.

    Parameters
    ----------
    df : pd.DataFrame
        Play-level dataset. The three source columns are optional.
    verbose : bool, default=True
        Print per-component ranges, summary statistics, pairwise
        correlations and a list of components that are all zero.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the three score columns; ``df`` is not modified.
    """

    wide_rule = "=" * 70
    narrow_rule = "=" * 50
    source_columns = df.columns

    if verbose:
        print(wide_rule)
        print("CREATING PRESSURE COMPONENT SCORES")
        print(wide_rule)
        print("Components:")
        print("  1. Convergence Pressure: Defender convergence")
        print("  2. Velocity Pressure: Velocity pressure")
        print("  3. TR Coverage Pressure: TR separation")
        print("-" * 70)

    scored = df.copy()

    # -------------------------------------------------------------------------
    # 1. CONVERGENCE SCORE (defender count relative to its 95th percentile)
    # -------------------------------------------------------------------------

    if 'max_converging_defenders' not in source_columns:
        scored['convergence_pressure_score'] = 0.0
        if verbose:
            print("[CONVERGENCE PRESSURE] ⚠ max_converging_defenders not found - using 0")
    else:
        defender_counts = scored['max_converging_defenders'].fillna(0)

        # Scale by the 95th percentile of plays that had any convergence;
        # with no positive counts the placeholder 4 is used (scores are 0).
        positive_counts = defender_counts[defender_counts > 0]
        p95 = np.percentile(positive_counts, 95) if len(positive_counts) > 0 else 4
        scale = max(p95, 1)  # never divide by less than 1
        convergence_score = (defender_counts / scale * 100).clip(0, 100)

        scored['convergence_pressure_score'] = convergence_score.round(1)

        if verbose:
            print("[CONVERGENCE PRESSURE]")
            print("  • Source: max_converging_defenders")
            print(f"  • Raw range: {defender_counts.min():.1f} - {defender_counts.max():.1f}")
            print(f"  • 95th percentile: {p95:.1f}")
            print(f"  • Score range: {convergence_score.min():.1f} - {convergence_score.max():.1f}")
            print(f"  • Score mean: {convergence_score.mean():.1f}")

    # -------------------------------------------------------------------------
    # 2. VELOCITY SCORE (magnitude of the most negative separation velocity)
    # -------------------------------------------------------------------------

    if 'max_pressure_velocity' not in source_columns:
        scored['velocity_pressure_score'] = 0.0
        if verbose:
            print("\n[VELOCITY PRESSURE] ⚠ max_pressure_velocity not found - using 0")
    else:
        closing_velocity = scored['max_pressure_velocity'].fillna(0)

        # Positive (opening) velocities count as no pressure; values are
        # bounded at -25, then 15 in magnitude already maps to the 100 cap.
        bounded_velocity = closing_velocity.clip(-25, 0)
        velocity_score = (bounded_velocity.abs() / 15 * 100).clip(0, 100)

        scored['velocity_pressure_score'] = velocity_score.round(1)

        if verbose:
            print("\n[VELOCITY PRESSURE]")
            print("  • Source: max_pressure_velocity")
            print(f"  • Raw range: {closing_velocity.min():.1f} - {closing_velocity.max():.1f} yds/sec")
            print(f"  • Clipped range: {bounded_velocity.min():.1f} - {bounded_velocity.max():.1f}")
            print(f"  • Score range: {velocity_score.min():.1f} - {velocity_score.max():.1f}")
            print(f"  • Score mean: {velocity_score.mean():.1f}")

    # -------------------------------------------------------------------------
    # 3. TR COVERAGE SCORE (tighter receiver separation = more pressure)
    # -------------------------------------------------------------------------

    if 'tr_play_min_separation' not in source_columns:
        scored['tr_coverage_pressure_score'] = 0.0
        if verbose:
            print("\n[TR COVERAGE PRESSURE] ⚠ tr_play_min_separation not found - using 0")
    else:
        receiver_separation = scored['tr_play_min_separation'].fillna(8.0)

        # Fraction of a 10-unit window, inverted so 0 separation scores 100
        separation_fraction = (receiver_separation / 10.0).clip(0, 1)
        coverage_score = (1 - separation_fraction) * 100

        scored['tr_coverage_pressure_score'] = coverage_score.round(1)

        if verbose:
            print("\n[COVERAGE PRESSURE]")
            print("  • Source: tr_play_min_separation")
            print(f"  • Raw range: {receiver_separation.min():.1f} - {receiver_separation.max():.1f} yards")
            print(f"  • Score range: {coverage_score.min():.1f} - {coverage_score.max():.1f}")
            print(f"  • Score mean: {coverage_score.mean():.1f}")
            print("  • Logic: Lower separation = Higher pressure")

    # -------------------------------------------------------------------------
    # SUMMARY STATISTICS
    # -------------------------------------------------------------------------

    if not verbose:
        return scored

    print("\n" + narrow_rule)
    print("COMPONENT SCORES SUMMARY")
    print(narrow_rule)

    # (score column, display label) in reporting order
    components = [
        ('convergence_pressure_score', 'Spatial (Convergence)'),
        ('velocity_pressure_score', 'Temporal (Velocity)'),
        ('tr_coverage_pressure_score', 'Coverage (TR Sep)')
    ]
    label_for = dict(components)

    print("Component Statistics:")
    for column, label in components:
        if column not in scored.columns:
            continue
        values = scored[column]
        print(f"  • {label:20s}: {values.mean():5.1f} ± {values.std():4.1f} (range: {values.min():4.1f}-{values.max():4.1f})")

    # Pairwise correlations between the score columns that exist
    present = [column for column, _ in components if column in scored.columns]
    if len(present) >= 2:
        print("\nComponent Correlations:")
        for left_pos in range(len(present)):
            for right_pos in range(left_pos + 1, len(present)):
                left, right = present[left_pos], present[right_pos]
                corr = scored[left].corr(scored[right])
                print(f"  • {label_for[left]:20s} ↔ {label_for[right]:20s}: {corr:+.3f}")

    # A component counts as missing when its column is absent or entirely 0
    missing_components = [
        label
        for column, label in components
        if column not in scored.columns or (scored[column] == 0).all()
    ]

    if missing_components:
        print("\n⚠ Missing Components:")
        for label in missing_components:
            print(f"    • {label}")
    else:
        print("\n✓ All components successfully calculated")

    print(narrow_rule)

    return scored



## Validation

In [23]:
# ============================================================================
# FUNCTION 4: VALIDATE FINAL FEATURES FOR ML
# ============================================================================

def validate_final_features(
    final_df: pd.DataFrame,
    target_column: str = 'pass_result'
) -> Dict[str, bool]:
    """
    Validate final feature set for ML pipeline readiness.
    
    Performs comprehensive data quality checks to ensure dataset meets
    requirements for exploratory data analysis (EDA) and model training.
    
    This is the final quality gate before proceeding to the ML pipeline.
    
    Parameters
    ----------
    final_df : pd.DataFrame
        Final consolidated play-level feature set.
        Output from merge_play_level_features().
    
    target_column : str, default='pass_result'
        Name of target variable column.
        Should contain binary classification labels.
    
    Returns
    -------
    Dict[str, bool]
        Validation results dictionary with boolean flags:
        
        - 'has_target_variable' : bool
            Target column exists and has < 5% missing values
        
        - 'sufficient_samples' : bool
            At least 100 samples (minimum for ML)
        
        - 'target_is_binary' : bool
            Target has exactly 2 classes (binary classification)
        
        - 'balanced_target' : bool
            Minority class ≥ 20% (reasonably balanced)
        
        - 'no_duplicate_rows' : bool
            No duplicate play records exist
        
        - 'numeric_features_valid' : bool
            No infinite values or excessive missing (> 50%)
        
        - 'ready_for_ml' : bool
            Overall ML readiness (all critical checks passed)
    
    Notes
    -----
    **Validation Criteria:**
    
    1. Target Variable Quality:
       - Exists in dataframe
       - < 5% missing values
       - Exactly 2 unique classes (binary classification)
       - Minority class ≥ 20% (avoid severe imbalance)
    
    2. Sample Size:
       - Minimum: 100 samples (required)
       - Recommended: 500+ samples (ideal for Random Forest)
       - Current dataset: ~818 samples (✓ Good)
    
    3. Data Integrity:
       - No duplicate play records
       - No infinite values in numeric features
       - < 50% missing in any feature column
    
    4. Feature Quality:
       - At least 10 features available
       - Mix of pressure, convergence, and context features
       - Numeric features in valid ranges
    
    **Pass Criteria:**
    All critical checks must pass for 'ready_for_ml' = True:
    - has_target_variable
    - sufficient_samples
    - target_is_binary
    - no_duplicate_rows
    - numeric_features_valid
    
    Optional checks (warnings if fail):
    - balanced_target (can proceed with class weights)
    
    **Next Steps Based on Results:**
    
    If ready_for_ml = True:
    → Proceed to Data Cleaning & Preprocessing
    → Exploratory Data Analysis (EDA)
    → Feature Selection
    → Model Training
    
    If ready_for_ml = False:
    → Address failed checks
    → Re-validate before proceeding
    
    Examples
    --------
    >>> # Validate dataset
    >>> validation = validate_final_features(final_ml_features)
    >>> 
    >>> # Check results
    >>> if validation['ready_for_ml']:
    ...     print("✓ Dataset ready for ML pipeline")
    ...     print("→ Proceed to Data Cleaning")
    ... else:
    ...     print("⚠ Dataset needs attention")
    ...     for check, passed in validation.items():
    ...         if not passed:
    ...             print(f"  ✗ Failed: {check}")
    >>> 
    >>> # Detailed inspection
    >>> print(f"Sample size: {len(final_ml_features)}")
    >>> print(f"Target balance: {final_ml_features[target_column].value_counts()}")
    >>> print(f"Missing values: {final_ml_features.isnull().sum().sum()}")
    
    See Also
    --------
    merge_play_level_features : Creates final feature set
    """
    
    print("\n" + "="*70)
    print("VALIDATING FINAL FEATURES FOR ML PIPELINE")
    print("="*70)
    print(f"Target column: '{target_column}'")
    print("-"*70)
    
    validation_results = {}
    
    # -------------------------------------------------------------------------
    # CHECK 1: TARGET VARIABLE EXISTS AND POPULATED
    # -------------------------------------------------------------------------
    
    if target_column in final_df.columns:
        target_missing = final_df[target_column].isna().sum()
        target_missing_pct = (target_missing / len(final_df) * 100) if len(final_df) > 0 else 100
        
        validation_results['has_target_variable'] = (target_missing_pct < 5)
        
        if validation_results['has_target_variable']:
            print(f"✓ CHECK 1 PASSED: Target variable '{target_column}' available")
            print(f"    Missing: {target_missing} ({target_missing_pct:.1f}%)")
        else:
            print(f"✗ CHECK 1 FAILED: Target variable has {target_missing_pct:.1f}% missing")
            print(f"    Threshold: < 5% missing required")
    else:
        validation_results['has_target_variable'] = False
        print(f"✗ CHECK 1 FAILED: Target column '{target_column}' not found")
        print(f"    Available columns: {list(final_df.columns[:10])}...")
    
    # -------------------------------------------------------------------------
    # CHECK 2: SUFFICIENT SAMPLE SIZE
    # -------------------------------------------------------------------------
    
    sample_count = len(final_df)
    validation_results['sufficient_samples'] = (sample_count >= 100)
    
    if validation_results['sufficient_samples']:
        print(f"✓ CHECK 2 PASSED: Sufficient samples ({sample_count:,})")
        if sample_count >= 500:
            print(f"    → Excellent sample size for Random Forest")
        else:
            print(f"    → Adequate (500+ recommended, but {sample_count} is workable)")
    else:
        print(f"✗ CHECK 2 FAILED: Insufficient samples ({sample_count})")
        print(f"    Minimum: 100 samples required")
        print(f"    Recommended: 500+ samples")
    
    # -------------------------------------------------------------------------
    # CHECK 3: BINARY TARGET VARIABLE
    # -------------------------------------------------------------------------
    
    if validation_results['has_target_variable']:
        target_classes = final_df[target_column].dropna().nunique()
        validation_results['target_is_binary'] = (target_classes == 2)
        
        if validation_results['target_is_binary']:
            print(f"✓ CHECK 3 PASSED: Binary classification ({target_classes} classes)")
        else:
            print(f"✗ CHECK 3 FAILED: Expected 2 classes, found {target_classes}")
            print(f"    Classes: {final_df[target_column].unique()}")
    else:
        validation_results['target_is_binary'] = False
        print(f"⊘ CHECK 3 SKIPPED: No target variable to validate")
    
    # -------------------------------------------------------------------------
    # CHECK 4: TARGET CLASS BALANCE
    # -------------------------------------------------------------------------
    
    if validation_results['has_target_variable'] and validation_results['target_is_binary']:
        target_value_counts = final_df[target_column].value_counts()
        minority_class_pct = (target_value_counts.min() / target_value_counts.sum() * 100)
        
        validation_results['balanced_target'] = (minority_class_pct >= 20)
        
        print(f"\n  Target Distribution:")
        for class_val, count in target_value_counts.items():
            pct = (count / target_value_counts.sum() * 100)
            print(f"    • {class_val}: {count:,} ({pct:.1f}%)")
        
        if validation_results['balanced_target']:
            print(f"✓ CHECK 4 PASSED: Reasonably balanced (minority: {minority_class_pct:.1f}%)")
        else:
            print(f"⚠ CHECK 4 WARNING: Imbalanced (minority: {minority_class_pct:.1f}%)")
            print(f"    Recommendation: Use class_weight='balanced' in Random Forest")
            print(f"    Note: This is not a critical failure")
    else:
        validation_results['balanced_target'] = False
        print(f"⊘ CHECK 4 SKIPPED: No valid target variable")
    
    # -------------------------------------------------------------------------
    # CHECK 5: NO DUPLICATE ROWS
    # -------------------------------------------------------------------------
    
    duplicates = final_df.duplicated(subset=['game_id', 'play_id']).sum()
    validation_results['no_duplicate_rows'] = (duplicates == 0)
    
    if validation_results['no_duplicate_rows']:
        print(f"✓ CHECK 5 PASSED: No duplicate play records")
    else:
        print(f"✗ CHECK 5 FAILED: Found {duplicates} duplicate plays")
        print(f"    Action: Remove duplicates before training")
    
    # -------------------------------------------------------------------------
    # CHECK 6: NUMERIC FEATURES VALID
    # -------------------------------------------------------------------------
    
    numeric_cols = final_df.select_dtypes(include=[np.number]).columns
    invalid_features = []
    
    for col in numeric_cols:
        # Skip identifier columns
        if col in ['game_id', 'play_id', 'nfl_id', 'frame_id']:
            continue
        
        # Check for infinite values
        if np.isinf(final_df[col]).any():
            invalid_features.append((col, 'infinite values'))
        
        # Check for excessive missing values
        missing_pct = (final_df[col].isna().sum() / len(final_df) * 100)
        if missing_pct > 50:
            invalid_features.append((col, f'{missing_pct:.1f}% missing'))
    
    validation_results['numeric_features_valid'] = (len(invalid_features) == 0)
    
    if validation_results['numeric_features_valid']:
        print(f"✓ CHECK 6 PASSED: Numeric features valid ({len(numeric_cols)} features)")
    else:
        print(f"✗ CHECK 6 FAILED: Issues with {len(invalid_features)} features")
        for feature, issue in invalid_features[:5]:  # Show first 5
            print(f"    • {feature}: {issue}")
        if len(invalid_features) > 5:
            print(f"    ... and {len(invalid_features) - 5} more")
    
    # -------------------------------------------------------------------------
    # OVERALL ML READINESS
    # -------------------------------------------------------------------------
    
    # Required checks (all must pass)
    required_checks = [
        'has_target_variable',
        'sufficient_samples',
        'target_is_binary',
        'no_duplicate_rows',
        'numeric_features_valid'
    ]
    
    validation_results['ready_for_ml'] = all(
        validation_results.get(check, False) for check in required_checks
    )
    
    print("\n" + "="*70)
    if validation_results['ready_for_ml']:
        print("✓✓✓ DATASET READY FOR ML PIPELINE ✓✓✓")
        print("="*70)
        print("\nRecommended Next Steps:")
        print("  1. Data Cleaning & Preprocessing")
        print("     - Handle outliers (IQR method)")
        print("     - Impute missing values (if any)")
        print("     - Encode categorical features")
        print("  2. Exploratory Data Analysis (EDA)")
        print("     - Feature distributions")
        print("     - Correlation analysis")
        print("     - Feature-target relationships")
        print("  3. Feature Selection")
        print("     - Remove highly correlated features")
        print("     - Feature importance analysis")
        print("  4. Model Training")
        print("     - Train/Test split (80/20)")
        print("     - Random Forest Classifier")
        print("     - Cross-validation")
        print("  5. Model Tuning & Evaluation")
        print("     - GridSearchCV for hyperparameters")
        print("     - Confusion matrix, ROC curve")
        print("     - Feature importance")
    else:
        print("⚠⚠⚠ DATASET NEEDS ATTENTION BEFORE ML ⚠⚠⚠")
        print("="*70)
        print("\nFailed Checks (must be resolved):")
        for check, passed in validation_results.items():
            if not passed and check != 'ready_for_ml' and check != 'balanced_target':
                print(f"  ✗ {check}")
        
        print("\nAction Required:")
        print("  1. Address failed checks above")
        print("  2. Re-run validation")
        print("  3. Proceed only after all checks pass")
    
    print("="*70)
    
    return validation_results

# Pipeline

### Frame Level - QB Separation

In [24]:
# Build the frame-level minimum-separation table for quarterbacks.
# The banner and the status lines are each emitted with a single print call.
print(
    f"\n{'='*70}",
    "STEP 5: CALCULATING FRAME-LEVEL MINIMUM SEPARATIONS FOR QUARTERBACKS",
    "="*70,
    "Processing all frames using vectorized distance calculations...",
    sep="\n",
)

qb_separation_frame_level = calculate_frame_level_separation_qb(input_data)

print(
    "✓ Frame-level separation calculations complete",
    f"✓ Returned {len(qb_separation_frame_level)} Quarterback records with separation data",
    f"✓ Columns: {list(qb_separation_frame_level.columns)}",
    sep="\n",
)


STEP 5: CALCULATING FRAME-LEVEL MINIMUM SEPARATIONS FOR QUARTERBACKS
Processing all frames using vectorized distance calculations...
✓ Frame-level separation calculations complete
✓ Returned 396765 Quarterback records with separation data
✓ Columns: ['game_id', 'play_id', 'nfl_id', 'frame_id', 'player_name', 'player_role', 'player_position', 'qb_min_separation']


### Frame Level - QB Pressure Velocity

In [25]:
# Extend the frame-level QB separation table with pressure velocity columns.
print("\n[1/6] Calculating QB pressure velocity...")

qb_velocity_frame_level = calculate_frame_level_pressure_velocity_qb(
    qb_separation_frame_level
)

# Report what was added and the resulting table layout.
print(
    f"    ✓ Added columns: separation_velocity, pressure_acceleration",
    f"    ✓ Data shape: {qb_velocity_frame_level.shape}",
    f"✓ Columns: {list(qb_velocity_frame_level.columns)}",
    sep="\n",
)


[1/6] Calculating QB pressure velocity...
    ✓ Added columns: separation_velocity, pressure_acceleration
    ✓ Data shape: (396765, 10)
✓ Columns: ['game_id', 'play_id', 'nfl_id', 'frame_id', 'player_name', 'player_role', 'player_position', 'qb_min_separation', 'separation_velocity', 'pressure_acceleration']


### Frame Level - QB Defender Convergence

In [26]:
# Frame-level defender convergence around the QB, computed from the raw tracking input.
convergence_frame_level = calculate_frame_level_defender_convergence(input_data)

# Quick look at the closing-zone counts and the category label.
display(
    convergence_frame_level.loc[
        :, ['frame_id', 'defenders_closing_zone', 'convergence_category']
    ].head()
)

# Frames labelled exactly 'High' by the convergence calculation.
high_conv = convergence_frame_level.loc[
    lambda frames: frames['convergence_category'] == 'High'
]
print(f"Frames with high convergence: {len(high_conv)}")

# QB frames (separation/velocity table) whose minimum separation is under 3 yards.
min_sep_below_3yds = qb_velocity_frame_level.loc[
    lambda frames: frames['qb_min_separation'] < 3.0
]
print(f"Frames with QB separation below 3 yards: {len(min_sep_below_3yds)}")

CALCULATING DEFENDER CONVERGENCE
Pressure Zones: {'immediate': 3.0, 'closing': 5.0, 'potential': 7.0}
----------------------------------------------------------------------
Processing frames...
  Processed 1,000 / 396,914 frames...
  Processed 2,000 / 396,914 frames...
  Processed 3,000 / 396,914 frames...
  Processed 4,000 / 396,914 frames...
  Processed 5,000 / 396,914 frames...
  Processed 6,000 / 396,914 frames...
  Processed 7,000 / 396,914 frames...
  Processed 8,000 / 396,914 frames...
  Processed 9,000 / 396,914 frames...
  Processed 10,000 / 396,914 frames...
  Processed 11,000 / 396,914 frames...
  Processed 12,000 / 396,914 frames...
  Processed 13,000 / 396,914 frames...
  Processed 14,000 / 396,914 frames...
  Processed 15,000 / 396,914 frames...
  Processed 16,000 / 396,914 frames...
  Processed 17,000 / 396,914 frames...
  Processed 18,000 / 396,914 frames...
  Processed 19,000 / 396,914 frames...
  Processed 20,000 / 396,914 frames...
  Processed 21,000 / 396,914 frames

Unnamed: 0,frame_id,defenders_closing_zone,convergence_category
0,1,0,
1,2,0,
2,3,0,
3,4,0,
4,5,0,


Frames with high convergence: 9953
Frames with QB separation below 3 yards: 61


In [27]:
# Convergence-table frames where the closest defender distance is under 3 yards.
min_sep_belo3yds = convergence_frame_level.loc[
    lambda frames: frames['closest_defender_distance'] < 3.0
]
display(min_sep_belo3yds.head())

Unnamed: 0,game_id,play_id,frame_id,nfl_id,player_name,player_role,player_position,defenders_immediate_zone,defenders_closing_zone,defenders_potential_zone,total_converging_defenders,convergence_category,closest_defender_distance,total_defenders_on_field
21205,2023091013,1737,49,47789,Daniel Jones,Passer,QB,1,0,0,1,Moderate,2.726243,6
21206,2023091013,1737,50,47789,Daniel Jones,Passer,QB,1,0,0,1,Moderate,2.394097,6
21207,2023091013,1737,51,47789,Daniel Jones,Passer,QB,1,0,0,1,Moderate,2.087223,6
44216,2023091800,1854,59,55865,Bryce Young,Passer,QB,1,0,0,1,Moderate,2.748545,8
110235,2023100810,2049,50,44822,Patrick Mahomes,Passer,QB,1,0,0,1,Moderate,2.877846,7


### Frame Level - QB Merged Data Separation + Convergence

In [28]:
# -----------------------------------------------------------------------------
# Merge Frame-Level QB Separation with Convergence
# -----------------------------------------------------------------------------
# Combines the QB separation/velocity frame table with the defender convergence
# frame table into `qb_features_frame_level`. Any failure is reported and then
# re-raised, because later cells read `qb_features_frame_level` directly.

print("\n[1/4] Merging QB separation with convergence metrics...")
print("Input: qb_velocity_frame_level + convergence_frame_level")
print("Output: Merged pressure features (NO collapse indicators yet)")

try:
    qb_features_frame_level = merge_qb_separation_with_convergence(
        qb_velocity_frame_level,    # QB separation + velocity (from earlier steps)
        convergence_frame_level,    # Convergence metrics (from Step A.2)
        verbose=True
    )
    
    print(f"\n✓ Frame-level merge completed successfully")
    # Report the column COUNT: interpolating `.columns` itself printed the whole
    # Index object in the middle of the "shape" message.
    print(f"✓ Output shape: {qb_features_frame_level.shape[1]} columns, {len(qb_features_frame_level):,} rows")
    print(f"✓ Ready for collapse detection")
    
except Exception as e:
    print(f"\n✗ Error merging frame-level data: {e}")
    print(f"   Check that qb_velocity_frame_level and convergence_frame_level exist")
    raise


[1/4] Merging QB separation with convergence metrics...
Input: qb_velocity_frame_level + convergence_frame_level
Output: Merged pressure features (NO collapse indicators yet)
MERGING QB SEPARATION WITH CONVERGENCE (FRAME-LEVEL)
QB separation data: 396,765 frames
Convergence data: 396,765 frames

Purpose: Combine pressure metrics for collapse detection
Note: Collapse indicators created in next step

Merge Results:
  • Total frames after merge: 396,765
  • Frames with convergence data: 396,765
  • Match rate: 100.0%
  • Total columns: 16
  ✓ Good match rate - data sources aligned

✓ Frame-level merge completed successfully
✓ Output shape: Index(['game_id', 'play_id', 'nfl_id', 'frame_id', 'player_name',
       'player_role', 'player_position', 'qb_min_separation',
       'separation_velocity', 'pressure_acceleration',
       'defenders_immediate_zone', 'defenders_closing_zone',
       'defenders_potential_zone', 'total_converging_defenders',
       'convergence_category', 'total_defende

In [29]:
# Spot-check merged frames where QB minimum separation is under 3 yards,
# showing only the separation and convergence columns.
test = qb_features_frame_level.loc[
    lambda frames: frames['qb_min_separation'] < 3.0
]
display(
    test.loc[
        :,
        [
            'game_id', 'play_id', 'frame_id', 'qb_min_separation',
            'defenders_immediate_zone', 'defenders_closing_zone',
            'defenders_potential_zone', 'convergence_category',
            'total_converging_defenders',
        ],
    ].head()
)

Unnamed: 0,game_id,play_id,frame_id,qb_min_separation,defenders_immediate_zone,defenders_closing_zone,defenders_potential_zone,convergence_category,total_converging_defenders
21205,2023091013,1737,49,2.73,1,0,0,Moderate,1
21206,2023091013,1737,50,2.39,1,0,0,Moderate,1
21207,2023091013,1737,51,2.09,1,0,0,Moderate,1
44216,2023091800,1854,59,2.75,1,0,0,Moderate,1
110235,2023100810,2049,50,2.88,1,0,0,Moderate,1


### Frame Level - TR Separation

In [30]:
# Build the frame-level minimum-separation table for targeted receivers.
# The banner and the status lines are each emitted with a single print call.
print(
    f"\n{'='*70}",
    " CALCULATING FRAME-LEVEL MINIMUM SEPARATIONS",
    "="*70,
    "Processing all frames using vectorized distance calculations...",
    sep="\n",
)

tr_separation_df = calculate_frame_level_separation_tr(input_data)

print(
    "✓ Frame-level separation calculations complete",
    f"✓ Returned {len(tr_separation_df)} Targeted Receiver records with separation data",
    f"✓ Columns: {list(tr_separation_df.columns)}",
    sep="\n",
)


 CALCULATING FRAME-LEVEL MINIMUM SEPARATIONS
Processing all frames using vectorized distance calculations...
✓ Frame-level separation calculations complete
✓ Returned 396879 Targeted Receiver records with separation data
✓ Columns: ['game_id', 'play_id', 'nfl_id', 'frame_id', 'player_name', 'player_role', 'player_position', 'tr_min_separation', 'route_of_targeted_receiver', 'dir']


### Play Level - QB Aggregation

In [31]:
# Aggregate the merged frame-level QB table up to play level,
# then list the resulting feature columns.
complete_qb_play_features = aggregate_qb_features_to_play_level(qb_features_frame_level)
print(complete_qb_play_features.columns)

AGGREGATING QB FRAME-LEVEL FEATURES TO PLAY-LEVEL
Input: 396,765 frames
✓ Aggregated to 14,105 rows
✓ Features: 20 columns
Output: 14,105 plays × 20 features
Reduction: 28.1:1 (frames to plays)

Feature Groups:
  • Separation     :  3 features
  • Velocity       :  4 features
  • Convergence    :  7 features
Index(['game_id', 'play_id', 'nfl_id', 'qb_play_min_separation',
       'qb_play_avg_separation', 'qb_play_var_separation',
       'max_pressure_velocity', 'pressure_velocity_avg', 'pressure_volatility',
       'max_pressure_acceleration', 'pressure_acceleration_avg',
       'max_converging_defenders', 'avg_converging_defenders',
       'std_converging_defenders', 'max_defenders_immediate_zone',
       'max_defenders_closing_zone', 'avg_defenders_closing_zone',
       'max_defenders_potential_zone', 'player_name', 'player_role'],
      dtype='object')


### Play Level - TR Separation

In [32]:
# Targeted-receiver play-level aggregates are built independently of the QB ones.
tr_play_features = tr_calculate_play_level_aggregates(tr_separation_df)

# Show both play-level schemas (TR first, then QB) ahead of the merge.
print(tr_play_features.columns, complete_qb_play_features.columns, sep="\n")

TR Play-level aggregates shape: (14107, 12)
Index(['game_id', 'play_id', 'nfl_id', 'tr_play_min_separation',
       'tr_play_avg_separation', 'tr_play_var_separation', 'player_name',
       'player_role', 'player_position', 'route_of_targeted_receiver',
       'nums_frame_pre_throw', 'time_to_throw', 'tr_min_separation_first',
       'tr_min_separation_last', 'tr_last_dir'],
      dtype='object')
Index(['game_id', 'play_id', 'nfl_id', 'qb_play_min_separation',
       'qb_play_avg_separation', 'qb_play_var_separation',
       'max_pressure_velocity', 'pressure_velocity_avg', 'pressure_volatility',
       'max_pressure_acceleration', 'pressure_acceleration_avg',
       'max_converging_defenders', 'avg_converging_defenders',
       'std_converging_defenders', 'max_defenders_immediate_zone',
       'max_defenders_closing_zone', 'avg_defenders_closing_zone',
       'max_defenders_potential_zone', 'player_name', 'player_role'],
      dtype='object')


### Play Level - QB + TR + Supplementary Features

In [33]:
# Merge the play-level QB features, TR features and supplementary data into the
# final ML feature table, summarise missing values, and persist it to CSV.
final_ml_features = merge_play_level_features(
    complete_qb_play_features,  # ← Complete QB features (all in one)
    tr_play_features,           # ← TR features
    supplementary_data          # ← Context + target
)

# Missing values summary (only columns that actually have missing entries)
missing_summary = final_ml_features.isnull().sum()
missing_summary = missing_summary[missing_summary > 0]
print("\nMissing Values Summary:")
print(missing_summary)

# Save final dataset with QB pressure features as CSV.
# to_csv does not create parent folders, so make sure the output directory
# exists first; otherwise a fresh checkout fails here with an OSError.
os.makedirs('data/output', exist_ok=True)
final_ml_features.to_csv('data/output/final_ml_features_with_qb_pressure.csv', index=False)

# Verify no missing time_to_throw
print("\nVERIFICATION:")
print(f"Original TR time_to_throw missing: {tr_play_features['time_to_throw'].isna().sum()}")
print(f"Final corrected time_to_throw missing: {final_ml_features['time_to_throw'].isna().sum()}")
print(f"Final dataset shape: {final_ml_features.shape}")




CORRECTED: TRACKING DATA ONLY - DROP INCOMPLETE PLAYS
Strategy: Inner joins - only keep plays with QB AND TR tracking data

[STEP 1] QB pressure data as base:
  • QB plays with pressure metrics: 14,105
  • These have actual tracking-derived features

[STEP 2] After QB + TR inner join:
  • Plays with BOTH QB and TR data: 14,105
  • QB-only plays dropped: 0
  • Retention rate: 100.0%
  • time_to_throw missing: 0 (should be 0)

[STEP 3] After adding supplementary data (inner join):
  • Final plays with complete data: 14,105
  • Tracking-only plays dropped: 0
  • Total supplementary plays: 18,009
  • Supplementary plays used: 14,105 (78.3%)
  • Plays with target variable: 14,105
  ✓ All plays have target variable (perfect)

[SUMMARY] Data Quality:
  ✓ All plays have QB pressure metrics (tracking-derived)
  ✓ All plays have TR coverage metrics (tracking-derived)
  ✓ All plays have game context (supplementary)
  ✓ No missing time_to_throw (from TR tracking)
  ✓ High-quality dataset: 14,105 

In [34]:
# Row counts at each stage of the pipeline, printed as one numbered list.
print(
    "Data Flow Tracking:",
    f"1. Supplementary data: {len(supplementary_data):,} plays",
    f"2. QB frame-level: {len(qb_separation_frame_level):,} QB-frames",
    f"3. QB play-level: {len(complete_qb_play_features):,} plays",
    f"4. TR play-level: {len(tr_play_features):,} plays",
    f"5. Final ML features: {len(final_ml_features):,} plays",
    sep="\n",
)

# Per-week play counts, shown only when the supplementary data has a 'week' column.
if 'week' in supplementary_data.columns:
    week_counts = supplementary_data['week'].value_counts().sort_index()
    print(f"\nWeek distribution in supplementary:", week_counts, sep="\n")


Data Flow Tracking:
1. Supplementary data: 18,009 plays
2. QB frame-level: 396,765 QB-frames
3. QB play-level: 14,105 plays
4. TR play-level: 14,107 plays
5. Final ML features: 14,105 plays

Week distribution in supplementary:
week
1      819
2      850
3      904
4      779
5      742
6      793
7      693
8      827
9      711
10     721
11     707
12     854
13     666
14    1460
15    1594
16    1686
17    1603
18    1600
Name: count, dtype: int64


### Pressure Component Score Calculation

In [40]:
# ============================================================================
# EXECUTE THE FUNCTION
# ============================================================================
# Adds the pressure component score columns to the play-level feature table,
# previews the first rows, and writes the enhanced table to CSV.

# Guard: only run when the merged play-level table has already been built.
if 'final_ml_features' in locals():
    final_ml_features_with_scores = create_pressure_component_scores(
        final_ml_features,
        verbose=True
    )
    
    print(f"\n✓ Pressure component scores added to dataset")
    # Use the same score column names that are previewed below; the previous
    # message listed spatial_/temporal_/coverage_ names instead.
    print(f"✓ New columns: convergence_pressure_score, velocity_pressure_score, tr_coverage_pressure_score")
    print(f"✓ Dataset shape: {final_ml_features_with_scores.shape}")
    
    # Display sample results
    print(f"\nSample Results (First 10 plays):")
    sample_cols = [
        'game_id', 'play_id',
        'convergence_pressure_score', 'velocity_pressure_score', 'tr_coverage_pressure_score',
        'pass_result'
    ]
    # Only preview columns that are actually present, so a missing one does not raise.
    available_sample_cols = [col for col in sample_cols if col in final_ml_features_with_scores.columns]
    
    if available_sample_cols:
        display(final_ml_features_with_scores[available_sample_cols].head(10))
    
    # Save the enhanced dataset.
    # to_csv does not create parent folders, so ensure the directory exists first.
    output_path = 'data/output/final_ml_features_with_component_scores.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    final_ml_features_with_scores.to_csv(output_path, index=False)
    print(f"\n✓ Saved enhanced dataset to: {output_path}")
    print(f"  Shape: {final_ml_features_with_scores.shape}")
    print(f"  Contains: All features for ML pipeline")
    
else:
    print("⚠ final_ml_features not found. Please run the previous pipeline steps first.")

CREATING PRESSURE COMPONENT SCORES
Components:
  1. Convergence Pressure: Defender convergence
  2. Velocity Pressure: Velocity pressure
  3. TR Coverage Pressure: TR separation
----------------------------------------------------------------------
[CONVERGENCE PRESSURE]
  • Source: max_converging_defenders
  • Raw range: 0.0 - 5.0
  • 95th percentile: 3.0
  • Score range: 0.0 - 100.0
  • Score mean: 13.4

[VELOCITY PRESSURE]
  • Source: max_pressure_velocity
  • Raw range: -10.4 - 2.4 yds/sec
  • Clipped range: -10.4 - 0.0
  • Score range: 0.0 - 69.3
  • Score mean: 5.6

[COVERAGE PRESSURE]
  • Source: tr_play_min_separation
  • Raw range: 0.0 - 19.8 yards
  • Score range: 0.0 - 100.0
  • Score mean: 67.5
  • Logic: Lower separation = Higher pressure

COMPONENT SCORES SUMMARY
Component Statistics:
  • Spatial (Convergence):  13.4 ± 25.3 (range:  0.0-100.0)
  • Temporal (Velocity) :   5.6 ±  9.2 (range:  0.0-69.3)
  • Coverage (TR Sep)   :  67.5 ± 25.3 (range:  0.0-100.0)

Component Co

Unnamed: 0,game_id,play_id,convergence_pressure_score,velocity_pressure_score,tr_coverage_pressure_score,pass_result
0,2023090700,101,0.0,0.0,89.3,I
1,2023090700,194,33.3,10.7,90.0,C
2,2023090700,219,0.0,0.0,50.9,C
3,2023090700,361,33.3,8.7,78.6,C
4,2023090700,436,0.0,10.7,71.3,C
5,2023090700,461,100.0,0.0,76.7,C
6,2023090700,530,0.0,0.7,65.6,C
7,2023090700,621,0.0,14.7,85.3,C
8,2023090700,713,0.0,0.0,86.1,I
9,2023090700,736,0.0,0.0,79.2,C


  Shape: (14105, 43)
  Contains: All features for ML pipeline


In [42]:
# Summarise the three pressure component scores and check that none of them
# is strongly correlated with another.
pressure_components = [
    'convergence_pressure_score',
    'velocity_pressure_score',
    'tr_coverage_pressure_score',
]

# Mean of every component score that is present in the dataset.
print("\nComponent Analysis:")
for component in pressure_components:
    if component not in final_ml_features_with_scores.columns:
        continue
    mean_val = final_ml_features_with_scores[component].mean()
    print(f"  • {component:25s}: {mean_val:.1f} avg")

# The correlation check needs all three components to be available.
if set(pressure_components) <= set(final_ml_features_with_scores.columns):
    correlation_matrix = final_ml_features_with_scores[pressure_components].corr()
    print(f"\nComponent Correlations:")
    print(correlation_matrix.round(2))

    # Walk the upper triangle only, so each pair is examined once;
    # a pair is flagged when |r| exceeds 0.7.
    high_corr_pairs = [
        (first, second, correlation_matrix.iloc[row, col])
        for row, first in enumerate(pressure_components)
        for col, second in enumerate(pressure_components)
        if col > row and abs(correlation_matrix.iloc[row, col]) > 0.7
    ]

    if not high_corr_pairs:
        print(f"\n✓ No excessive correlations found")
    else:
        print(f"\nHigh Correlation Warnings (>0.7):")
        for comp1, comp2, corr in high_corr_pairs:
            print(f"  ⚠ {comp1} ↔ {comp2}: {corr:.3f}")


Component Analysis:
  • convergence_pressure_score: 13.4 avg
  • velocity_pressure_score  : 5.6 avg
  • tr_coverage_pressure_score: 67.5 avg

Component Correlations:
                            convergence_pressure_score  \
convergence_pressure_score                        1.00   
velocity_pressure_score                           0.11   
tr_coverage_pressure_score                        0.05   

                            velocity_pressure_score  \
convergence_pressure_score                     0.11   
velocity_pressure_score                        1.00   
tr_coverage_pressure_score                     0.14   

                            tr_coverage_pressure_score  
convergence_pressure_score                        0.05  
velocity_pressure_score                           0.14  
tr_coverage_pressure_score                        1.00  

✓ No excessive correlations found


### Validate and Display sample results

In [43]:
# -----------------------------------------------------------------------------
# Validate Final Features for ML Readiness
# -----------------------------------------------------------------------------
# Runs the readiness checks against the scored play-level table with
# 'pass_result' as the target. Validation is best-effort: an exception is
# reported but not re-raised.

print("\n[4/4] Validating final features for ML pipeline...")

try:
    validation_results = validate_final_features(
        final_ml_features_with_scores,
        target_column='pass_result',
    )

    # One closing line, chosen by the overall readiness flag.
    print(
        f"\n✓ Validation complete - READY FOR ML PIPELINE ✓"
        if validation_results['ready_for_ml']
        else f"\n⚠ Validation complete - REVIEW WARNINGS ABOVE ⚠"
    )

except Exception as e:
    print(f"\n✗ Error during validation: {e}")
    print("⚠ Proceeding without full validation (not critical)")


[4/4] Validating final features for ML pipeline...

VALIDATING FINAL FEATURES FOR ML PIPELINE
Target column: 'pass_result'
----------------------------------------------------------------------
✓ CHECK 1 PASSED: Target variable 'pass_result' available
    Missing: 0 (0.0%)
✓ CHECK 2 PASSED: Sufficient samples (14,105)
    → Excellent sample size for Random Forest
✗ CHECK 3 FAILED: Expected 2 classes, found 3
    Classes: ['I' 'C' 'IN']
⊘ CHECK 4 SKIPPED: No valid target variable
✓ CHECK 5 PASSED: No duplicate play records
✓ CHECK 6 PASSED: Numeric features valid (32 features)

⚠⚠⚠ DATASET NEEDS ATTENTION BEFORE ML ⚠⚠⚠

Failed Checks (must be resolved):
  ✗ target_is_binary

Action Required:
  1. Address failed checks above
  2. Re-run validation
  3. Proceed only after all checks pass



In [44]:
# -----------------------------------------------------------------------------
# DISPLAY SAMPLE RESULTS
# -----------------------------------------------------------------------------
# Prints the first 10 plays of the scored play-level table for a handful of
# key columns, then a completion summary with frame- and play-level row counts.

print("\n" + "="*70)
print("SAMPLE: FINAL ML FEATURES")
print("="*70)

sample_cols = [
    'game_id', 'play_id', 'player_name_qb',
    'qb_play_avg_separation', 'max_converging_defenders',
    'tr_play_avg_separation', 'time_to_throw',
    'pass_result'
]

# Keep only the requested columns that exist in the scored table.
available_sample_cols = [col for col in sample_cols if col in final_ml_features_with_scores.columns]

if available_sample_cols:
    # Index the SAME frame the column list was filtered against; indexing
    # final_ml_features here could raise KeyError for a column that exists
    # only in the scored table.
    print(final_ml_features_with_scores[available_sample_cols].head(10).to_string(index=False))
else:
    print("⚠ Sample columns not available, showing first 5 columns:")
    print(final_ml_features_with_scores.iloc[:, :5].head(10))

# -----------------------------------------------------------------------------
# COMPLETION SUMMARY
# -----------------------------------------------------------------------------

print("\n" + "="*70)
print("✓ STEP A.3 COMPLETE - CONVERGENCE INTEGRATION FINISHED")
print("="*70)
print("\n🎉 DEFENDER CONVERGENCE FEATURE ENGINEERING COMPLETE! 🎉")
print("\nAll Steps Summary:")
print("  ✓ A.1 - Data Validation (defender positions confirmed)")
print("  ✓ A.2 - Core Convergence Calculation (frame + play level)")
print("  ✓ A.3 - Integration with Pocket Collapse (enhanced detection)")
print("\nDataset Statistics:")
print(f"  • Frame-level records: {qb_features_frame_level.shape[0]:,}")
print(f"  • Play-level records: {final_ml_features_with_scores.shape[0]:,}")


SAMPLE: FINAL ML FEATURES
   game_id  play_id  qb_play_avg_separation  max_converging_defenders  tr_play_avg_separation  time_to_throw pass_result
2023090700      101               10.694615                         0                    2.77            2.6           I
2023090700      194                7.291563                         1                    4.19            3.2           C
2023090700      219               10.027059                         0                    5.93            1.7           C
2023090700      361               13.231961                         1                    6.06            5.1           C
2023090700      436                8.061500                         0                    4.65            2.0           C
2023090700      461                7.239130                         3                    3.18            2.3           C
2023090700      530               10.251500                         0                    5.07            2.0           C
20230