# NFL Big Data Bowl 2026 - Comprehensive EDA
## Detailed Exploratory Data Analysis with Results Export

This notebook performs a complete analysis of the NFL player tracking data to understand:
- Data structure and quality
- Player movement patterns
- Ball trajectory characteristics
- Temporal patterns
- Feature distributions and correlations

All results are saved to `eda_outputs/eda_results.json`, and all figures to `eda_outputs/plots/`, for future reference.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import glob
import warnings
from pathlib import Path
from collections import defaultdict
from datetime import datetime

# Notebook-wide settings: silence all warnings, use seaborn's grid theme and
# give every figure a wide 12x6 default canvas.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Make sure the export folders exist before any later cell writes into them.
for output_dir in (Path('eda_outputs'), Path('eda_outputs') / 'plots'):
    output_dir.mkdir(exist_ok=True)

started_at = datetime.now()
print("Libraries imported successfully!")
print(f"Analysis started at: {started_at}")

## 1. Data Loading

In [None]:
# Discover the weekly training files. Both lists are sorted by file name so
# that position k in each list refers to the same week (assuming every week
# has both an input and an output file).
input_files = sorted(glob.glob('train/input_2023_w*.csv'))
output_files = sorted(glob.glob('train/output_2023_w*.csv'))

print(f"Found {len(input_files)} input files")
print(f"Found {len(output_files)} output files")

# Fail early with an actionable message instead of an opaque IndexError on
# input_files[0] / output_files[0] below.
if not input_files or not output_files:
    raise FileNotFoundError(
        "No training CSVs matched 'train/input_2023_w*.csv' / 'train/output_2023_w*.csv'; "
        "run this notebook from the directory that contains the 'train' folder."
    )

# Load only the first week to get a quick look at the structure
print("\nLoading first week for initial inspection...")
sample_input = pd.read_csv(input_files[0])
sample_output = pd.read_csv(output_files[0])

print(f"\nSample input shape: {sample_input.shape}")
print(f"Sample output shape: {sample_output.shape}")

In [None]:
# Read every week's input/output pair and stack them into two season-long frames.
print("Loading all training data...")
input_dfs, output_dfs = [], []

for week_number, (input_path, output_path) in enumerate(zip(input_files, output_files), start=1):
    print(f"Loading week {week_number:02d}...", end='\r')

    # Tag each row with the file's 1-based position in the sorted list so
    # later cells can group by week.
    input_dfs.append(pd.read_csv(input_path).assign(week=week_number))
    output_dfs.append(pd.read_csv(output_path).assign(week=week_number))

df_input = pd.concat(input_dfs, ignore_index=True)
df_output = pd.concat(output_dfs, ignore_index=True)

print(f"\n\nTotal input records: {len(df_input):,}")
print(f"Total output records: {len(df_output):,}")
for label, frame in (('Input', df_input), ('Output', df_output)):
    # deep=True also counts the payload of object (string) columns
    megabytes = frame.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage - {label}: {megabytes:.2f} MB")

## 2. Data Structure Analysis

In [None]:
# Central container for every statistic this notebook exports to JSON.
# Each later section fills in its own top-level key.
eda_results = {
    'metadata': {
        'analysis_date': str(datetime.now()),
        'total_weeks': len(input_files)
    },
    'data_structure': {},
    'data_quality': {},
    'feature_analysis': {},
    'play_characteristics': {},
    'player_analysis': {},
    'movement_patterns': {},
    'ball_trajectory': {},
    'correlations': {}
}

# Basic structure: shapes, column names and dtypes (stringified so that they
# can be written to JSON).
eda_results['data_structure'] = {
    'input_shape': list(df_input.shape),
    'output_shape': list(df_output.shape),
    'input_columns': list(df_input.columns),
    'output_columns': list(df_output.columns),
    'input_dtypes': df_input.dtypes.astype(str).to_dict(),
    'output_dtypes': df_output.dtypes.astype(str).to_dict()
}

print("INPUT DATA COLUMNS:")
print(df_input.columns.tolist())
print("\nOUTPUT DATA COLUMNS:")
print(df_output.columns.tolist())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
print("\n" + "="*80)
print("INPUT DATA INFO:")
print("="*80)
df_input.info()

print("\n" + "="*80)
print("OUTPUT DATA INFO:")
print("="*80)
df_output.info()

In [None]:
# Show the first ten rows of each table; display() gives the rich notebook rendering.
previews = (
    ("SAMPLE INPUT DATA:", df_input),
    ("\nSAMPLE OUTPUT DATA:", df_output),
)
for heading, frame in previews:
    print(heading)
    display(frame.head(10))

## 3. Data Quality Analysis

In [None]:
def _missing_report(frame):
    """Summarise null values in ``frame``.

    Returns a tuple ``(counts, percentages, affected)`` where ``counts`` is the
    per-column null count, ``percentages`` is the same as a percentage of the
    row count (rounded to 2 decimals) and ``affected`` is a JSON-ready dict
    containing only the columns that have at least one null.
    """
    counts = frame.isnull().sum()
    percentages = (counts / len(frame) * 100).round(2)
    affected = {
        col: {'count': int(counts[col]), 'percentage': float(percentages[col])}
        for col in frame.columns if counts[col] > 0
    }
    return counts, percentages, affected


input_missing, input_missing_pct, input_affected = _missing_report(df_input)
output_missing, output_missing_pct, output_affected = _missing_report(df_output)

eda_results['data_quality']['input_missing_values'] = input_affected
eda_results['data_quality']['output_missing_values'] = output_affected

# Only columns with at least one missing value are printed.
print("INPUT DATA - MISSING VALUES:")
print(input_missing[input_missing > 0])
print("\nPercentage:")
print(input_missing_pct[input_missing_pct > 0])

print("\n" + "="*80)
print("OUTPUT DATA - MISSING VALUES:")
print(output_missing[output_missing > 0])
print("\nPercentage:")
print(output_missing_pct[output_missing_pct > 0])

In [None]:
# Cardinality of the main identifiers. Plays are counted per game and then
# summed (presumably because play ids are only unique within a game).
unique_games = df_input['game_id'].nunique()
unique_plays = df_input.groupby('game_id')['play_id'].nunique().sum()
unique_players = df_input['nfl_id'].nunique()
game_play_pairs = df_input.groupby(['game_id', 'play_id']).ngroups

eda_results['data_quality']['unique_counts'] = dict(
    unique_games=int(unique_games),
    unique_plays=int(unique_plays),
    unique_players=int(unique_players),
    total_play_instances=int(game_play_pairs),
)

for caption, value in (
    ("Unique games", unique_games),
    ("Total unique plays", unique_plays),
    ("Unique players", unique_players),
    ("Unique game-play combinations", game_play_pairs),
):
    print(f"{caption}: {value:,}")

## 4. Play Characteristics Analysis

In [None]:
# Number of distinct tracked players in each (game, play)
players_per_play = df_input.groupby(['game_id', 'play_id'])['nfl_id'].nunique()

# Statistic name -> cast applied before storing (counts stay integers)
ppp_casts = {'mean': float, 'median': float, 'min': int, 'max': int, 'std': float}
eda_results['play_characteristics']['players_per_play'] = {
    stat: cast(getattr(players_per_play, stat)()) for stat, cast in ppp_casts.items()
}

print("PLAYERS PER PLAY:")
print(players_per_play.describe())

# Bar chart: how many plays have each player count
fig, ax = plt.subplots(figsize=(10, 6))
players_per_play.value_counts().sort_index().plot.bar(ax=ax)
ax.set_title('Distribution of Players per Play', fontsize=14, fontweight='bold')
ax.set(xlabel='Number of Players', ylabel='Frequency')
fig.tight_layout()
fig.savefig('eda_outputs/plots/01_players_per_play.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# The highest frame_id of each play is used as its input frame count
# (assumes frame ids start at 1 and are contiguous - TODO confirm).
input_frames_per_play = df_input.groupby(['game_id', 'play_id'])['frame_id'].max()

frame_casts = {'mean': float, 'median': float, 'min': int, 'max': int, 'std': float}
eda_results['play_characteristics']['input_frames_per_play'] = {
    stat: cast(getattr(input_frames_per_play, stat)()) for stat, cast in frame_casts.items()
}

print("INPUT FRAMES PER PLAY:")
print(input_frames_per_play.describe())

# Histogram with mean/median reference lines
mean_frames = input_frames_per_play.mean()
median_frames = input_frames_per_play.median()

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(input_frames_per_play, bins=50, edgecolor='black', alpha=0.7)
ax.axvline(mean_frames, color='red', linestyle='--', linewidth=2,
           label=f'Mean: {mean_frames:.1f}')
ax.axvline(median_frames, color='green', linestyle='--', linewidth=2,
           label=f'Median: {median_frames:.1f}')
ax.set_title('Distribution of Input Frames per Play (Pre-throw)', fontsize=14, fontweight='bold')
ax.set(xlabel='Number of Frames', ylabel='Frequency')
ax.legend()
fig.tight_layout()
fig.savefig('eda_outputs/plots/02_input_frames_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Frames each player must be predicted for, taken from the first row of each
# (game, play, player) group in the input table.
per_player_keys = ['game_id', 'play_id', 'nfl_id']
output_frames = df_input.groupby(per_player_keys)['num_frames_output'].first()

# One frame is treated as 0.1 s, so frames / 10 gives seconds.
time_in_air = output_frames / 10

eda_results['play_characteristics']['output_frames_to_predict'] = {
    stat: cast(getattr(output_frames, stat)())
    for stat, cast in (('mean', float), ('median', float), ('min', int), ('max', int), ('std', float))
}
eda_results['play_characteristics']['ball_time_in_air_seconds'] = {
    stat: float(getattr(time_in_air, stat)())
    for stat in ('mean', 'median', 'min', 'max', 'std')
}

print("OUTPUT FRAMES TO PREDICT (per player):")
print(output_frames.describe())

# Cross-check against the highest frame_id actually present in the output table
actual_output_frames = df_output.groupby(per_player_keys)['frame_id'].max()
print("\nACTUAL OUTPUT FRAMES (verification):")
print(actual_output_frames.describe())

# Left: frames to predict. Right: the same quantity expressed in seconds.
fig, (frames_ax, seconds_ax) = plt.subplots(1, 2, figsize=(16, 6))

mean_output_frames = output_frames.mean()
frames_ax.hist(output_frames, bins=50, edgecolor='black', alpha=0.7, color='steelblue')
frames_ax.axvline(mean_output_frames, color='red', linestyle='--', linewidth=2,
                  label=f'Mean: {mean_output_frames:.1f}')
frames_ax.set_title('Distribution of Output Frames to Predict', fontsize=14, fontweight='bold')
frames_ax.set(xlabel='Number of Frames', ylabel='Frequency')
frames_ax.legend()

mean_air_time = time_in_air.mean()
seconds_ax.hist(time_in_air, bins=50, edgecolor='black', alpha=0.7, color='coral')
seconds_ax.axvline(mean_air_time, color='red', linestyle='--', linewidth=2,
                   label=f'Mean: {mean_air_time:.2f}s')
seconds_ax.set_title('Distribution of Ball Time in Air', fontsize=14, fontweight='bold')
seconds_ax.set(xlabel='Time (seconds)', ylabel='Frequency')
seconds_ax.legend()

fig.tight_layout()
fig.savefig('eda_outputs/plots/03_output_frames_and_time.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# One value per play: the first row of each (game, play) group supplies the
# play-level attributes.
plays = df_input.groupby(['game_id', 'play_id'])

# Play direction: how many plays go each way
play_direction_counts = plays['play_direction'].first().value_counts()
eda_results['play_characteristics']['play_direction'] = dict(
    zip(map(str, play_direction_counts.index), map(int, play_direction_counts.values))
)

print("PLAY DIRECTION DISTRIBUTION:")
print(play_direction_counts)

# Absolute yardline at which each play is recorded
yardline_stats = plays['absolute_yardline_number'].first()
eda_results['play_characteristics']['absolute_yardline'] = {
    stat: float(getattr(yardline_stats, stat)())
    for stat in ('mean', 'median', 'min', 'max', 'std')
}

print("\nABSOLUTE YARDLINE STATISTICS:")
print(yardline_stats.describe())

## 5. Player Role Analysis

In [None]:
# Row-level frequency of each categorical player descriptor. Counts are per
# tracking row, not per distinct player.

# Player positions
position_counts = df_input['player_position'].value_counts()
print("PLAYER POSITIONS:")
print(position_counts)

eda_results['player_analysis']['position_distribution'] = position_counts.to_dict()

# Player sides
side_counts = df_input['player_side'].value_counts()
print("\nPLAYER SIDES:")
print(side_counts)

eda_results['player_analysis']['side_distribution'] = side_counts.to_dict()

# Player roles
role_counts = df_input['player_role'].value_counts()
print("\nPLAYER ROLES:")
print(role_counts)

eda_results['player_analysis']['role_distribution'] = role_counts.to_dict()

# Plot
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Top positions
position_counts.head(15).plot(kind='barh', ax=axes[0, 0], color='steelblue')
axes[0, 0].set_title('Top 15 Player Positions', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Count')

# Player sides
side_counts.plot(kind='bar', ax=axes[0, 1], color=['#2ecc71', '#e74c3c'])
axes[0, 1].set_title('Player Sides (Offense vs Defense)', fontsize=12, fontweight='bold')
axes[0, 1].set_ylabel('Count')
axes[0, 1].set_xticklabels(axes[0, 1].get_xticklabels(), rotation=0)

# Player roles
role_counts.plot(kind='barh', ax=axes[1, 0], color='coral')
axes[1, 0].set_title('Player Roles in Pass Plays', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Count')

# Players to predict.
# value_counts() orders by frequency, so the wedge order is data-dependent:
# labels and colours are looked up from the flag value instead of assuming
# that False is the most frequent value.
# NOTE(review): assumes 'player_to_predict' is a boolean column - confirm.
predict_counts = df_input['player_to_predict'].value_counts()
predict_labels = {False: 'Not Predicted', True: 'To Predict'}
predict_colors = {False: '#95a5a6', True: '#3498db'}
axes[1, 1].pie(predict_counts,
               labels=[predict_labels[bool(flag)] for flag in predict_counts.index],
               autopct='%1.1f%%',
               colors=[predict_colors[bool(flag)] for flag in predict_counts.index],
               startangle=90)
axes[1, 1].set_title('Players to Predict Distribution', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('eda_outputs/plots/04_player_roles_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

eda_results['player_analysis']['players_to_predict'] = {
    'total_to_predict': int(df_input['player_to_predict'].sum()),
    'total_not_predict': int((~df_input['player_to_predict']).sum()),
    'percentage_to_predict': float(df_input['player_to_predict'].mean() * 100)
}

In [None]:
# Contingency table: rows are player roles, columns are sides, cells are row counts.
role_side = pd.crosstab(index=df_input['player_role'], columns=df_input['player_side'])

# DataFrame.to_dict() nests column -> index -> value, i.e. side -> role -> count.
eda_results['player_analysis']['role_by_side'] = role_side.to_dict()

print("PLAYER ROLE BY SIDE:")
print(role_side)

## 6. Movement & Tracking Features Analysis

In [None]:
# Tracking columns to summarise; the list is reused by the distribution plots.
tracking_features = ['x', 'y', 's', 'a', 'o', 'dir']

print("TRACKING FEATURES STATISTICS (INPUT DATA):")
print("="*80)
stats_df = df_input[tracking_features].describe()
print(stats_df)

# Store the same kind of summary per feature in a JSON-friendly form
for feat in tracking_features:
    column = df_input[feat]
    lower_quartile, upper_quartile = column.quantile(0.25), column.quantile(0.75)
    eda_results['feature_analysis'][f'{feat}_stats'] = dict(
        mean=float(column.mean()),
        median=float(column.median()),
        std=float(column.std()),
        min=float(column.min()),
        max=float(column.max()),
        q25=float(lower_quartile),
        q75=float(upper_quartile),
    )

In [None]:
# Histogram of every tracking feature, one panel per feature (3x2 grid)
fig, axes = plt.subplots(3, 2, figsize=(16, 18))
axes = axes.flatten()

for idx, feat in enumerate(tracking_features):
    # Sample data for faster plotting. The sample size is capped by the number
    # of non-null values: sizing it from len(df_input) raises ValueError when
    # dropna() leaves fewer rows than requested.
    feature_values = df_input[feat].dropna()
    sample_data = feature_values.sample(min(100000, len(feature_values)), random_state=42)

    axes[idx].hist(sample_data, bins=100, edgecolor='black', alpha=0.7)
    axes[idx].axvline(sample_data.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {sample_data.mean():.2f}')
    axes[idx].axvline(sample_data.median(), color='green', linestyle='--', linewidth=2, label=f'Median: {sample_data.median():.2f}')
    axes[idx].set_title(f'Distribution of {feat.upper()}', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel(feat)
    axes[idx].set_ylabel('Frequency')
    axes[idx].legend()

plt.tight_layout()
plt.savefig('eda_outputs/plots/05_tracking_features_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Speed analysis by role (describe() over all tracking rows of each role)
speed_by_role = df_input.groupby('player_role')['s'].describe()
print("SPEED STATISTICS BY PLAYER ROLE:")
print(speed_by_role)

eda_results['movement_patterns']['speed_by_role'] = speed_by_role.to_dict()

# Acceleration analysis by role
accel_by_role = df_input.groupby('player_role')['a'].describe()
print("\nACCELERATION STATISTICS BY PLAYER ROLE:")
print(accel_by_role)

eda_results['movement_patterns']['acceleration_by_role'] = accel_by_role.to_dict()

# Plot
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Speed by role
df_input.boxplot(column='s', by='player_role', ax=axes[0])
axes[0].set_title('Speed Distribution by Player Role', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Player Role')
axes[0].set_ylabel('Speed (yards/sec)')
axes[0].get_figure().suptitle('')  # Remove default title
plt.setp(axes[0].xaxis.get_majorticklabels(), rotation=45, ha='right')

# Acceleration by role
df_input.boxplot(column='a', by='player_role', ax=axes[1])
axes[1].set_title('Acceleration Distribution by Player Role', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Player Role')
# The exponent is written as mathtext: the previous label contained a
# mis-encoded superscript-two character, and mathtext keeps the source ASCII-only.
axes[1].set_ylabel(r'Acceleration (yards/sec$^2$)')
axes[1].get_figure().suptitle('')  # Remove default title
plt.setp(axes[1].xaxis.get_majorticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.savefig('eda_outputs/plots/06_speed_accel_by_role.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Field Position Analysis

In [None]:
# Sample a few plays for visualization. first() yields one row per
# (game, play); it supplies the play-level ball landing coordinates below.
sample_plays = df_input.groupby(['game_id', 'play_id']).first().sample(5, random_state=42)

fig, axes = plt.subplots(1, 5, figsize=(25, 5))

for idx, ((game_id, play_id), play_data) in enumerate(sample_plays.iterrows()):
    # Get all players for this play at the last input frame
    play_input = df_input[(df_input['game_id'] == game_id) & (df_input['play_id'] == play_id)]
    last_frame = play_input['frame_id'].max()
    play_snapshot = play_input[play_input['frame_id'] == last_frame]

    # Plot field (drawn as 120 x 53.3 yards)
    axes[idx].set_xlim(0, 120)
    axes[idx].set_ylim(0, 53.3)
    axes[idx].set_aspect('equal')

    # Plot players
    offense = play_snapshot[play_snapshot['player_side'] == 'Offense']
    defense = play_snapshot[play_snapshot['player_side'] == 'Defense']

    axes[idx].scatter(offense['x'], offense['y'], c='blue', s=100, alpha=0.6, label='Offense', edgecolors='black')
    axes[idx].scatter(defense['x'], defense['y'], c='red', s=100, alpha=0.6, label='Defense', edgecolors='black')

    # Highlight targeted receiver
    target = play_snapshot[play_snapshot['player_role'] == 'Targeted Receiver']
    if not target.empty:
        axes[idx].scatter(target['x'], target['y'], c='gold', s=200, marker='*',
                         edgecolors='black', linewidths=2, label='Target', zorder=5)

    # Ball landing location. The draw-order keyword is `zorder`; matplotlib has
    # no `zindex` property, so the old keyword made this call raise.
    ball_x = play_data['ball_land_x']
    ball_y = play_data['ball_land_y']
    axes[idx].scatter(ball_x, ball_y, c='green', s=300, marker='X',
                     edgecolors='black', linewidths=2, label='Ball Land', zorder=6)

    axes[idx].set_title(f'Play {idx+1}\nGame: {game_id}, Play: {play_id}', fontsize=10)
    axes[idx].set_xlabel('X (yards)')
    if idx == 0:
        axes[idx].set_ylabel('Y (yards)')
    axes[idx].grid(True, alpha=0.3)
    axes[idx].legend(loc='upper right', fontsize=8)

plt.suptitle('Sample Play Snapshots at Throw Time', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('eda_outputs/plots/07_sample_plays_field_position.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Overall position heatmap: one hexbin panel per side
fig, axes = plt.subplots(1, 2, figsize=(20, 6))

# Sample for faster plotting. Stored under its own name so the first-week
# `sample_input` frame from the loading cell is not silently overwritten.
position_sample = df_input.sample(min(50000, len(df_input)), random_state=42)

# (axes, player_side value, colormap, panel title)
heatmap_panels = (
    (axes[0], 'Offense', 'Blues', 'Offensive Player Position Heatmap'),
    (axes[1], 'Defense', 'Reds', 'Defensive Player Position Heatmap'),
)

for panel, side, cmap, title in heatmap_panels:
    side_rows = position_sample[position_sample['player_side'] == side]
    # mincnt=1 leaves hexagons without any observation blank
    panel.hexbin(side_rows['x'], side_rows['y'], gridsize=50, cmap=cmap, mincnt=1)
    panel.set_xlim(0, 120)
    panel.set_ylim(0, 53.3)
    panel.set_title(title, fontsize=14, fontweight='bold')
    panel.set_xlabel('X (yards)')
    panel.set_ylabel('Y (yards)')
    panel.set_aspect('equal')

plt.tight_layout()
plt.savefig('eda_outputs/plots/08_position_heatmaps.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Ball Trajectory Analysis

In [None]:
# One ball landing point per play: first row of each (game, play) group
ball_positions = (
    df_input
    .groupby(['game_id', 'play_id'])[['ball_land_x', 'ball_land_y']]
    .first()
)

print("BALL LANDING POSITION STATISTICS:")
print(ball_positions.describe())

# Same five summary statistics for both coordinates
for landing_column in ('ball_land_x', 'ball_land_y'):
    landing_values = ball_positions[landing_column]
    eda_results['ball_trajectory'][landing_column] = {
        stat: float(getattr(landing_values, stat)())
        for stat in ('mean', 'median', 'std', 'min', 'max')
    }

In [None]:
# Approximate pass distance: |ball landing x - absolute yardline| per play.
# Only the x axis is used, so lateral travel is ignored.
play_info = df_input.groupby(['game_id', 'play_id']).first()
play_info['pass_distance'] = (play_info['ball_land_x'] - play_info['absolute_yardline_number']).abs()

pass_distance = play_info['pass_distance']

print("PASS DISTANCE STATISTICS:")
print(pass_distance.describe())

eda_results['ball_trajectory']['pass_distance'] = {
    stat: float(getattr(pass_distance, stat)())
    for stat in ('mean', 'median', 'std', 'min', 'max')
}

# 2x2 figure: landing x, landing y, landing heatmap, pass distance
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Top row: one histogram per landing coordinate with the mean marked
landing_panels = (
    (axes[0, 0], 'ball_land_x', 'steelblue', 'Ball Landing X Position Distribution', 'X (yards)'),
    (axes[0, 1], 'ball_land_y', 'coral', 'Ball Landing Y Position Distribution', 'Y (yards)'),
)
for panel, column, color, title, xlabel in landing_panels:
    values = ball_positions[column]
    panel.hist(values, bins=50, edgecolor='black', alpha=0.7, color=color)
    panel.axvline(values.mean(), color='red', linestyle='--', linewidth=2)
    panel.set_title(title, fontsize=12, fontweight='bold')
    panel.set_xlabel(xlabel)
    panel.set_ylabel('Frequency')

# Bottom left: 2D density of landing points on the field
heat_ax = axes[1, 0]
heat_ax.hexbin(ball_positions['ball_land_x'], ball_positions['ball_land_y'],
               gridsize=40, cmap='YlOrRd', mincnt=1)
heat_ax.set_xlim(0, 120)
heat_ax.set_ylim(0, 53.3)
heat_ax.set_title('Ball Landing Position Heatmap', fontsize=12, fontweight='bold')
heat_ax.set_xlabel('X (yards)')
heat_ax.set_ylabel('Y (yards)')
heat_ax.set_aspect('equal')

# Bottom right: pass distance histogram
dist_ax = axes[1, 1]
mean_pass_distance = pass_distance.mean()
dist_ax.hist(pass_distance, bins=50, edgecolor='black', alpha=0.7, color='green')
dist_ax.axvline(mean_pass_distance, color='red', linestyle='--', linewidth=2,
                label=f'Mean: {mean_pass_distance:.1f} yards')
dist_ax.set_title('Pass Distance Distribution', fontsize=12, fontweight='bold')
dist_ax.set_xlabel('Distance (yards)')
dist_ax.set_ylabel('Frequency')
dist_ax.legend()

fig.tight_layout()
fig.savefig('eda_outputs/plots/09_ball_trajectory_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 9. Movement Patterns - Input vs Output

In [None]:
# Compare input and output positions: last input frame vs first output frame
# for the same (game, play, player).
#
# The boundary frames are selected explicitly by frame_id. groupby().last() /
# .first() return the last/first *non-null value per column in row order*,
# which is only the true boundary frame if the rows happen to be stored in
# frame order and contain no gaps.
player_keys = ['game_id', 'play_id', 'nfl_id']

last_input = (
    df_input[player_keys + ['frame_id', 'x', 'y', 's', 'a']]
    .dropna(subset=player_keys)            # groupby() also ignores rows with a missing key
    .sort_values('frame_id', kind='stable')
    .drop_duplicates(subset=player_keys, keep='last')
    .set_index(player_keys)[['x', 'y', 's', 'a']]
    .sort_index()
)
first_output = (
    df_output[player_keys + ['frame_id', 'x', 'y']]
    .dropna(subset=player_keys)
    .sort_values('frame_id', kind='stable')
    .drop_duplicates(subset=player_keys, keep='first')
    .set_index(player_keys)[['x', 'y']]
    .sort_index()
)

# Merge: overlapping output columns receive the '_out' suffix (x_out, y_out)
transition = last_input.join(first_output, how='inner', rsuffix='_out')
transition['dx'] = transition['x_out'] - transition['x']
transition['dy'] = transition['y_out'] - transition['y']
transition['displacement'] = np.sqrt(transition['dx']**2 + transition['dy']**2)

print("MOVEMENT FROM LAST INPUT TO FIRST OUTPUT FRAME:")
print(transition[['dx', 'dy', 'displacement']].describe())

eda_results['movement_patterns']['input_to_output_transition'] = {
    'dx_mean': float(transition['dx'].mean()),
    'dx_std': float(transition['dx'].std()),
    'dy_mean': float(transition['dy'].mean()),
    'dy_std': float(transition['dy'].std()),
    'displacement_mean': float(transition['displacement'].mean()),
    'displacement_std': float(transition['displacement'].std())
}

# Note: 1 frame = 0.1 seconds
print(f"\nAverage displacement per frame: {transition['displacement'].mean():.3f} yards")
print(f"This corresponds to average speed: {transition['displacement'].mean() / 0.1:.3f} yards/sec")

In [None]:
# Analyze total displacement in output sequences: straight-line distance
# between each player's first and last output frame.
#
# Rows are ordered by frame_id once and the boundary frames are picked
# explicitly; groupby().first()/.last() would instead depend on the stored row
# order and skip nulls column by column.
player_keys = ['game_id', 'play_id', 'nfl_id']
output_by_frame = (
    df_output[player_keys + ['frame_id', 'x', 'y']]
    .dropna(subset=player_keys)            # groupby() also ignores rows with a missing key
    .sort_values('frame_id', kind='stable')
)
output_first = (
    output_by_frame.drop_duplicates(subset=player_keys, keep='first')
    .set_index(player_keys)[['x', 'y']]
    .sort_index()
)
output_last = (
    output_by_frame.drop_duplicates(subset=player_keys, keep='last')
    .set_index(player_keys)[['x', 'y']]
    .sort_index()
)

# Overlapping end-frame columns receive the '_end' suffix (x_end, y_end)
total_movement = output_first.join(output_last, how='inner', rsuffix='_end')
total_movement['total_dx'] = total_movement['x_end'] - total_movement['x']
total_movement['total_dy'] = total_movement['y_end'] - total_movement['y']
total_movement['total_displacement'] = np.sqrt(total_movement['total_dx']**2 + total_movement['total_dy']**2)

print("\nTOTAL MOVEMENT DURING OUTPUT SEQUENCE:")
print(total_movement[['total_dx', 'total_dy', 'total_displacement']].describe())

eda_results['movement_patterns']['total_output_movement'] = {
    'total_dx_mean': float(total_movement['total_dx'].mean()),
    'total_dx_std': float(total_movement['total_dx'].std()),
    'total_dy_mean': float(total_movement['total_dy'].mean()),
    'total_dy_std': float(total_movement['total_dy'].std()),
    'total_displacement_mean': float(total_movement['total_displacement'].mean()),
    'total_displacement_std': float(total_movement['total_displacement'].std())
}

# Plot
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Frame-to-frame displacement (uses `transition` from the previous cell)
axes[0].hist(transition['displacement'], bins=50, edgecolor='black', alpha=0.7, color='steelblue')
axes[0].axvline(transition['displacement'].mean(), color='red', linestyle='--', linewidth=2,
                label=f'Mean: {transition["displacement"].mean():.3f} yards')
axes[0].set_title('Frame-to-Frame Displacement', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Displacement (yards)')
axes[0].set_ylabel('Frequency')
axes[0].legend()

# Total displacement
axes[1].hist(total_movement['total_displacement'], bins=50, edgecolor='black', alpha=0.7, color='coral')
axes[1].axvline(total_movement['total_displacement'].mean(), color='red', linestyle='--', linewidth=2,
                label=f'Mean: {total_movement["total_displacement"].mean():.2f} yards')
axes[1].set_title('Total Displacement During Ball Flight', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Total Displacement (yards)')
axes[1].set_ylabel('Frequency')
axes[1].legend()

plt.tight_layout()
plt.savefig('eda_outputs/plots/10_displacement_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 10. Correlation Analysis

In [None]:
# Correlation matrix for numerical features
numerical_features = ['x', 'y', 's', 'a', 'o', 'dir', 'absolute_yardline_number',
                     'num_frames_output', 'ball_land_x', 'ball_land_y']

# Sample for correlation calculation. Only rows complete in every listed
# feature are eligible, and the sample size is capped by how many such rows
# exist: sizing it from len(df_input) raises ValueError when dropna() leaves
# fewer rows than requested.
complete_rows = df_input[numerical_features].dropna()
sample_for_corr = complete_rows.sample(min(50000, len(complete_rows)), random_state=42)
corr_matrix = sample_for_corr.corr()

print("CORRELATION MATRIX:")
print(corr_matrix)

# Save to results
eda_results['correlations']['feature_correlation_matrix'] = corr_matrix.to_dict()

# Plot
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('eda_outputs/plots/11_correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

## 11. Temporal Patterns

In [None]:
# Analyze trends over weeks
weekly_stats = df_input.groupby('week').agg({
    'game_id': 'nunique',
    's': 'mean',
    'a': 'mean',
    'num_frames_output': 'mean'
}).rename(columns={'game_id': 'num_games'})

# Plays are counted per game and then summed per week, matching how the
# unique-counts cell counts plays. A plain nunique() of play_id per week would
# merge plays from different games that share the same play_id.
plays_per_week = (
    df_input.groupby(['week', 'game_id'])['play_id'].nunique()
    .groupby(level='week').sum()
)
# Keep the column order: num_games, num_plays, s, a, num_frames_output
weekly_stats.insert(1, 'num_plays', plays_per_week)

print("WEEKLY STATISTICS:")
print(weekly_stats)

eda_results['temporal_patterns'] = {'weekly_stats': weekly_stats.to_dict()}

# Plot
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

axes[0, 0].plot(weekly_stats.index, weekly_stats['num_games'], marker='o', linewidth=2)
axes[0, 0].set_title('Number of Games per Week', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Week')
axes[0, 0].set_ylabel('Number of Games')
axes[0, 0].grid(True, alpha=0.3)

axes[0, 1].plot(weekly_stats.index, weekly_stats['num_plays'], marker='o', linewidth=2, color='coral')
axes[0, 1].set_title('Number of Plays per Week', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Week')
axes[0, 1].set_ylabel('Number of Plays')
axes[0, 1].grid(True, alpha=0.3)

axes[1, 0].plot(weekly_stats.index, weekly_stats['s'], marker='o', linewidth=2, color='green')
axes[1, 0].set_title('Average Speed per Week', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Week')
axes[1, 0].set_ylabel('Speed (yards/sec)')
axes[1, 0].grid(True, alpha=0.3)

axes[1, 1].plot(weekly_stats.index, weekly_stats['num_frames_output'], marker='o', linewidth=2, color='purple')
axes[1, 1].set_title('Average Output Frames per Week', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Week')
axes[1, 1].set_ylabel('Number of Frames')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('eda_outputs/plots/12_temporal_patterns.png', dpi=150, bbox_inches='tight')
plt.show()

## 12. Player Physical Attributes

In [None]:
# Analyze player height and weight: one row per player, taken from the first
# row of each nfl_id group.
# This frame gets its own name on purpose: `unique_players` already holds the
# integer player count from the unique-counts cell, and the summary cell still
# needs it as a number.
player_attributes = df_input.groupby('nfl_id').first()

print("PLAYER PHYSICAL ATTRIBUTES:")
print(player_attributes[['player_height', 'player_weight']].describe())

eda_results['player_analysis']['physical_attributes'] = {
    'height_stats': player_attributes['player_height'].describe().to_dict(),
    'weight_stats': player_attributes['player_weight'].dropna().describe().to_dict()
}

# Age calculation (if birth date available)
if 'player_birth_date' in player_attributes.columns:
    # Unparseable dates become NaT instead of raising
    player_attributes['player_birth_date'] = pd.to_datetime(player_attributes['player_birth_date'], errors='coerce')
    # Age in years on the fixed reference date 2023-09-01
    player_attributes['age'] = (pd.Timestamp('2023-09-01') - player_attributes['player_birth_date']).dt.days / 365.25
    print("\nPLAYER AGE DISTRIBUTION:")
    print(player_attributes['age'].describe())
    eda_results['player_analysis']['age_stats'] = player_attributes['age'].describe().to_dict()

## 13. Target Analysis - Distance to Ball

In [None]:
# Calculate the Euclidean distance from each player to the ball landing
# location, for every tracking row. The column is added to df_input itself.
df_input['distance_to_ball'] = np.sqrt(
    (df_input['x'] - df_input['ball_land_x'])**2 +
    (df_input['y'] - df_input['ball_land_y'])**2
)

# Get initial distance for each player: first row of each (game, play, player)
# group (presumably the first frame - depends on the stored row order).
initial_distance = df_input.groupby(['game_id', 'play_id', 'nfl_id'])['distance_to_ball'].first()

print("INITIAL DISTANCE TO BALL LANDING:")
print(initial_distance.describe())

# By role, over all tracking rows (not just the initial frame).
# The previous full self-merge of df_input to re-attach player_role was
# removed: its result was never used and the column is already on df_input.
distance_by_role = df_input.groupby('player_role')['distance_to_ball'].describe()
print("\nDISTANCE TO BALL BY ROLE:")
print(distance_by_role)

eda_results['ball_trajectory']['distance_to_ball_stats'] = initial_distance.describe().to_dict()
eda_results['ball_trajectory']['distance_by_role'] = distance_by_role.to_dict()

# Plot
fig, ax = plt.subplots(figsize=(12, 6))
df_input.boxplot(column='distance_to_ball', by='player_role', ax=ax)
ax.set_title('Distance to Ball Landing Location by Player Role', fontsize=14, fontweight='bold')
ax.set_xlabel('Player Role')
ax.set_ylabel('Distance (yards)')
ax.get_figure().suptitle('')  # Remove the automatic "grouped by" title
plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.savefig('eda_outputs/plots/13_distance_to_ball_by_role.png', dpi=150, bbox_inches='tight')
plt.show()

## 14. Data Summary & Key Insights

In [None]:
# Generate summary insights from the values computed in the earlier sections
summary_insights = {
    'dataset_overview': {
        'total_input_records': int(len(df_input)),
        'total_output_records': int(len(df_output)),
        'unique_games': int(unique_games),
        'unique_plays': int(unique_plays),
        # Recomputed from the data rather than read from `unique_players`:
        # the physical-attributes cell rebinds that name to a per-player
        # DataFrame, on which int(...) would raise.
        'unique_players': int(df_input['nfl_id'].nunique()),
        'weeks_covered': len(input_files)
    },
    'key_statistics': {
        'avg_players_per_play': float(players_per_play.mean()),
        'avg_input_frames': float(input_frames_per_play.mean()),
        'avg_output_frames': float(output_frames.mean()),
        'avg_ball_time_in_air_sec': float(time_in_air.mean()),
        'avg_speed_yards_per_sec': float(df_input['s'].mean()),
        'avg_acceleration': float(df_input['a'].mean()),
        'avg_pass_distance_yards': float(play_info['pass_distance'].mean())
    },
    'prediction_task': {
        # Sum over tracking rows flagged for prediction (rows, not distinct players)
        'records_to_predict': int(df_input['player_to_predict'].sum()),
        'avg_frames_to_predict_per_player': float(output_frames.mean()),
        'avg_displacement_per_frame': float(transition['displacement'].mean()),
        'avg_total_displacement': float(total_movement['total_displacement'].mean())
    }
}

eda_results['summary'] = summary_insights

print("="*80)
print("KEY INSIGHTS SUMMARY")
print("="*80)
print(json.dumps(summary_insights, indent=2))

## 15. Save Results to JSON

In [None]:
# Write the accumulated results dictionary to disk as indented JSON
output_file = 'eda_outputs/eda_results.json'
results_path = Path(output_file)

with results_path.open('w') as f:
    json.dump(eda_results, f, indent=2)

# Count the PNG files currently in the plots folder
saved_plot_count = sum(1 for _png in Path('eda_outputs/plots').glob('*.png'))
separator = "="*80

print(f"EDA results saved to: {output_file}")
print(f"Total plots saved: {saved_plot_count}")
print(f"\nFile size: {results_path.stat().st_size / 1024:.2f} KB")
print("\n" + separator)
print("EDA COMPLETE!")
print(separator)
print(f"\nCompleted at: {datetime.now()}")

## 16. Additional Visualizations - Trajectory Examples

In [None]:
# Visualize actual output trajectories for three randomly chosen plays
sample_play_ids = df_output.groupby(['game_id', 'play_id']).size().sample(3, random_state=42).index

fig, axes = plt.subplots(1, 3, figsize=(24, 6))

for idx, (game_id, play_id) in enumerate(sample_play_ids):
    # Get output trajectory and input rows for this play
    play_output = df_output[(df_output['game_id'] == game_id) & (df_output['play_id'] == play_id)]
    play_input = df_input[(df_input['game_id'] == game_id) & (df_input['play_id'] == play_id)]

    # Get ball landing (read from the play's first input row)
    ball_x = play_input['ball_land_x'].iloc[0]
    ball_y = play_input['ball_land_y'].iloc[0]

    # Plot trajectories for each player
    for nfl_id in play_output['nfl_id'].unique()[:10]:  # Limit to 10 players for clarity
        player_traj = play_output[play_output['nfl_id'] == nfl_id].sort_values('frame_id')
        # Side/role come from the player's first input row for this play
        player_info = play_input[play_input['nfl_id'] == nfl_id].iloc[0]

        # Offense is blue, everyone else red; the targeted receiver is drawn bolder
        color = 'blue' if player_info['player_side'] == 'Offense' else 'red'
        alpha = 0.8 if player_info['player_role'] == 'Targeted Receiver' else 0.4
        linewidth = 3 if player_info['player_role'] == 'Targeted Receiver' else 1.5

        axes[idx].plot(player_traj['x'], player_traj['y'],
                      color=color, alpha=alpha, linewidth=linewidth, marker='o', markersize=3)

        # Mark start (circle) and end (square)
        axes[idx].scatter(player_traj['x'].iloc[0], player_traj['y'].iloc[0],
                         color=color, s=100, marker='o', edgecolors='black', zorder=5)
        axes[idx].scatter(player_traj['x'].iloc[-1], player_traj['y'].iloc[-1],
                         color=color, s=100, marker='s', edgecolors='black', zorder=5)

    # Plot ball landing
    axes[idx].scatter(ball_x, ball_y, c='green', s=400, marker='X',
                     edgecolors='black', linewidths=3, label='Ball Landing', zorder=10)

    axes[idx].set_xlim(0, 120)
    axes[idx].set_ylim(0, 53.3)
    axes[idx].set_aspect('equal')
    axes[idx].set_title(f'Player Trajectories\nGame: {game_id}, Play: {play_id}', fontsize=11, fontweight='bold')
    axes[idx].set_xlabel('X (yards)')
    if idx == 0:
        axes[idx].set_ylabel('Y (yards)')
    axes[idx].grid(True, alpha=0.3)
    # Build the legend from artist labels. Passing ['Ball Landing'] as a
    # positional label list would attach that text to the first plotted
    # artist (a player trajectory line) instead of the ball marker.
    axes[idx].legend(loc='upper right')

plt.suptitle('Sample Player Trajectories During Ball Flight', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('eda_outputs/plots/14_sample_trajectories.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nTrajectory visualization complete!")

In [None]:
# Closing banner: where the artefacts were written and when the run finished
banner_rule = "="*80
plot_image_count = len(list(Path('eda_outputs/plots').glob('*.png')))

print("\n" + banner_rule)
print("COMPREHENSIVE EDA COMPLETE")
print(banner_rule)
print("\nOutputs saved to:")
print("  - JSON: eda_outputs/eda_results.json")
print(f"  - Plots: eda_outputs/plots/ ({plot_image_count} images)")
print(f"\nAnalysis completed at: {datetime.now()}")