# Feature Engineering:

This notebook creates new features and transforms existing ones for use in later model development and analysis.

In [12]:
# Importing libraries
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

In [13]:
# Function creation
def create_baseline_features(df):
    '''Create high-priority features for baseline model.

    The input frame is left untouched: all work is done on the sorted copy
    returned by ``sort_values``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below: ``game_id``, ``play_id``,
        ``nfl_id``, ``frame_id``, ``s``, ``dir``, ``o``, ``x``, ``y``,
        ``ball_land_x``, ``ball_land_y``, ``absolute_yardline_number``,
        ``player_role`` and ``player_side``. ``dir`` and ``o`` are treated
        as angles in degrees.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by (game_id, play_id, nfl_id, frame_id) with the
        feature columns appended. The original index labels are kept.
    '''
    track_keys = ['game_id', 'play_id', 'nfl_id']

    # sort_values returns a new frame, so every assignment below lands on the
    # copy rather than on the caller's DataFrame.
    df = df.sort_values(track_keys + ['frame_id'])

    dir_rad = np.radians(df['dir'])
    o_rad = np.radians(df['o'])

    # Velocity components
    # NOTE(review): this treats `dir` as measured counter-clockwise from the
    # +x axis. If the tracking data measures it clockwise from +y, sin/cos
    # should be swapped here -- confirm against the data dictionary.
    df['velocity_x'] = df['s'] * np.cos(dir_rad)
    df['velocity_y'] = df['s'] * np.sin(dir_rad)

    # Circular encoding of angles
    df['dir_sin'] = np.sin(dir_rad)
    df['dir_cos'] = np.cos(dir_rad)
    df['o_sin'] = np.sin(o_rad)
    df['o_cos'] = np.cos(o_rad)

    # Distance to ball landing
    df['dist_to_ball'] = np.sqrt(
        (df['x'] - df['ball_land_x'])**2 +
        (df['y'] - df['ball_land_y'])**2
    )

    # Angle to ball (radians, as returned by arctan2)
    df['angle_to_ball'] = np.arctan2(
        df['ball_land_y'] - df['y'],
        df['ball_land_x'] - df['x']
    )

    # Position relative to LOS
    df['x_from_los'] = df['x'] - df['absolute_yardline_number']
    df['dist_to_sideline'] = np.minimum(df['y'], 53.3 - df['y'])

    # Frame-to-frame changes, computed within each player's track on a play
    tracks = df.groupby(track_keys)
    df['speed_change'] = tracks['s'].diff()
    # Wrap the heading difference into [-180, 180) so that crossing the
    # 0/360 boundary (e.g. 350 -> 10) reads as +20 rather than -340.
    df['dir_change'] = (tracks['dir'].diff() + 180) % 360 - 180

    # Rolling averages (window of 3 frames, shorter at the start of a track)
    df['speed_ma3'] = tracks['s'].transform(
        lambda x: x.rolling(3, min_periods=1).mean()
    )

    # Player role encoding
    df['is_targeted_receiver'] = (df['player_role'] == 'Targeted Receiver').astype(int)
    df['is_passer'] = (df['player_role'] == 'Passer').astype(int)
    df['is_defense'] = (df['player_side'] == 'defense').astype(int)

    return df

In [14]:
# Load datasets
# Root folder of the Kaggle download. Set the NFL_BDB_DATA_DIR environment
# variable to run this notebook on another machine; when it is unset the
# original local location is used.
DATA_DIR = Path(os.environ.get(
    'NFL_BDB_DATA_DIR',
    r'C:\Users\jrzem\Downloads\NFL-Big-Data-Bowl-2026-Analytics-Challenge\Kaggle Data',
))
TRAIN_DIR = DATA_DIR / 'train'

input_df = pd.read_csv(TRAIN_DIR / 'input_2023_w01.csv')
output_df = pd.read_csv(TRAIN_DIR / 'output_2023_w01.csv')
supp_df = pd.read_csv(DATA_DIR / 'supplementary_data.csv')

In [15]:
# Merge supplementary data
# validate='many_to_one' makes pandas raise if supp_df ever holds more than one
# row per (game_id, play_id); without it such duplicates would silently
# multiply the tracking rows.
input_df = input_df.merge(
    supp_df,
    on=['game_id', 'play_id'],
    how='left',
    validate='many_to_one',
)

# Sort by game, play, player, and frame for temporal features
input_df = input_df.sort_values(['game_id', 'play_id', 'nfl_id', 'frame_id']).reset_index(drop=True)

# Apply baseline features
input_df = create_baseline_features(input_df)

In [16]:
# 1. TEMPORAL FEATURES
# Highest frame_id seen in each play, broadcast back onto every row of that play.
play_last_frame = input_df.groupby(['game_id', 'play_id'])['frame_id'].transform('max')
current_frame = input_df['frame_id']

input_df['max_frame_id'] = play_last_frame
# Frames remaining until the play's last recorded frame (a frame count, not seconds).
input_df['time_to_throw'] = play_last_frame - current_frame
input_df['frame_pct'] = current_frame / play_last_frame
# Fixed frame thresholds: before frame 5 is "early", after frame 20 is "late".
input_df['is_early_play'] = current_frame.lt(5).astype(int)
input_df['is_late_play'] = current_frame.gt(20).astype(int)
# presumably frame_id 1 is the snap frame -- TODO confirm
input_df['frames_since_snap'] = current_frame - 1

# 2. ADDITIONAL VELOCITY & MOTION FEATURES
# One group per player per play; all diffs and rolling windows stay inside a track.
player_tracks = input_df.groupby(['game_id', 'play_id', 'nfl_id'])

input_df['accel_change'] = player_tracks['a'].diff()
# `o` is an angle in degrees, so wrap the frame-to-frame difference into
# [-180, 180): a turn from 350 to 10 is +20, not -340.
input_df['orientation_change'] = (player_tracks['o'].diff() + 180) % 360 - 180
input_df['speed_ma_5'] = player_tracks['s'].transform(lambda x: x.rolling(5, min_periods=1).mean())
input_df['acceleration_ma_3'] = player_tracks['a'].transform(lambda x: x.rolling(3, min_periods=1).mean())
# Same values as accel_change: a per-frame difference, not divided by a time step.
input_df['jerk'] = input_df['accel_change']
input_df['is_accelerating'] = (input_df['a'] > 0.5).astype(int)
# NOTE(review): this can only fire if `a` is signed. If `a` is a non-negative
# magnitude the column is always 0 -- confirm against the data dictionary.
input_df['is_decelerating'] = (input_df['a'] < -0.5).astype(int)

# 3. ADDITIONAL POSITION & DISTANCE FEATURES
player_x = input_df['x']
los_x = input_df['absolute_yardline_number']
offense_moving_left = input_df['play_direction'] == 'left'

input_df['dist_to_los'] = (player_x - los_x).abs()
# NOTE(review): for 'left' plays this measures the distance to the x=110 goal
# line and for other plays to the x=10 goal line. Check that this is the end
# zone intended (attacking vs. defending) -- confirm.
input_df['dist_to_endzone'] = np.where(offense_moving_left, 110 - player_x, player_x - 10)
# 26.65 is half of the 53.3 field width used for dist_to_sideline.
input_df['dist_to_center'] = (input_df['y'] - 26.65).abs()
# NOTE(review): the two LOS-based flags below ignore play_direction -- confirm
# they mean what is intended for plays going each way.
input_df['is_behind_los'] = (player_x < los_x).astype(int)
input_df['is_in_endzone'] = ((player_x > 110) | (player_x < 10)).astype(int)
input_df['is_red_zone'] = (los_x >= 80).astype(int)

# 4. ADDITIONAL BALL-RELATIVE FEATURES
bearing_to_ball = input_df['angle_to_ball']   # radians
heading_rad = np.radians(input_df['dir'])
facing_rad = np.radians(input_df['o'])

# A positive cosine of the angular gap means the two directions are within 90
# degrees of each other.
# NOTE(review): assumes dir/o use the same angle convention as angle_to_ball
# (counter-clockwise from +x) -- confirm.
heading_gap_cos = np.cos(bearing_to_ball - heading_rad)
facing_gap_cos = np.cos(bearing_to_ball - facing_rad)
input_df['moving_toward_ball'] = (heading_gap_cos > 0).astype(int)
input_df['facing_ball'] = (facing_gap_cos > 0).astype(int)

input_df['ball_x_diff'] = input_df['ball_land_x'] - input_df['x']
input_df['ball_y_diff'] = input_df['ball_land_y'] - input_df['y']

# 5. DIRECTIONAL FEATURES
# 1 when movement direction and orientation coincide, -1 when they are opposite.
heading_minus_facing_deg = input_df['dir'] - input_df['o']
input_df['dir_alignment'] = np.cos(np.radians(heading_minus_facing_deg))

# 6. PLAYER-SPECIFIC FEATURES
# One fitted LabelEncoder is kept per source column so the integer codes can be
# mapped back later (label_encoders[col].classes_ / .inverse_transform).
# Refitting a single shared encoder would discard every mapping except the last.
label_encoders = {}

for col in ['player_position', 'player_role']:
    le = LabelEncoder()
    # astype(str) turns missing values into the literal category 'nan'.
    input_df[f'{col}_encoded'] = le.fit_transform(input_df[col].astype(str))
    label_encoders[col] = le

input_df['player_side_encoded'] = (input_df['player_side'] == 'offense').astype(int)
input_df['play_direction_encoded'] = (input_df['play_direction'] == 'left').astype(int)

# 7. PLAY CONTEXT FEATURES
input_df['is_3rd_down'] = (input_df['down'] == 3).astype(int)
input_df['is_4th_down'] = (input_df['down'] == 4).astype(int)
input_df['is_short_yardage'] = (input_df['yards_to_go'] <= 3).astype(int)
input_df['is_long_yardage'] = (input_df['yards_to_go'] >= 10).astype(int)
input_df['is_man_coverage'] = (input_df['team_coverage_man_zone'] == 'Man').astype(int)
input_df['is_zone_coverage'] = (input_df['team_coverage_man_zone'] == 'Zone').astype(int)
# Explicit comparison so missing values map to 0 instead of raising in astype.
input_df['play_action_encoded'] = (input_df['play_action'] == True).astype(int)

# Encode categorical features (columns absent from the frame are skipped)
for col in ['offense_formation', 'route_of_targeted_receiver', 'dropback_type', 'pass_location_type']:
    if col in input_df.columns:
        le = LabelEncoder()
        input_df[f'{col}_encoded'] = le.fit_transform(input_df[col].astype(str))
        label_encoders[col] = le

# 8. ROUTE & PATTERN FEATURES
# Change in position relative to 5 frames earlier in the same player's track;
# the first 5 frames of each track are NaN.
track_groups = input_df.groupby(['game_id', 'play_id', 'nfl_id'])
input_df['x_displacement_5'] = track_groups['x'].diff(periods=5)
input_df['y_displacement_5'] = track_groups['y'].diff(periods=5)

# The "features" count is every column in the frame, including ids and raw inputs.
n_rows, n_cols = input_df.shape
print("Feature engineering complete!")
print(f"Total features: {n_cols}")
print(f"Total rows: {n_rows}")

Feature engineering complete!
Total features: 119
Total rows: 285714
