In [1]:
import pandas as pd
import numpy as np

In [34]:
def compute_output_features(input_df, output_df):
    """
    Processes the output dataframe to add new features based on input data.

    This function:
    1. Gets the unique ball landing spot (x, y) for each play from the input_df.
    2. Merges this landing spot info into the output_df.
    3. Calculates 'time_left_s': (max_frame - current_frame) / 10.
    4. Calculates 'dist_to_ball_land': Euclidean distance from player (x, y)
       to the (ball_land_x, ball_land_y).
    5. Calculates 'speed', 'accel' (tangential), 'direction' (of motion),
       and 'accel_direction' (direction of total acceleration vector)
       by linking the last input frame to the first output frame.

    Args:
        input_df (pd.DataFrame): The complete dataframe from an 'input_*.csv' file.
        output_df (pd.DataFrame): The complete dataframe from an 'output_*.csv' file.

    Returns:
        pd.DataFrame: A copy of the output_df with the new
                      'time_left_s' and 'dist_to_ball_land' columns added.
    """
    
    # --- 1. Get unique ball landing spots from input data ---
    # We assume 'ball_land_x' and 'ball_land_y' are constant for a given play.
    play_landing_spots = input_df[
        ['game_id', 'play_id', 'ball_land_x', 'ball_land_y']
    ].drop_duplicates()

    # --- 2. Merge landing spots into output data ---
    # This adds 'ball_land_x' and 'ball_land_y' to every row in output_df
    data = pd.merge(
        output_df, 
        play_landing_spots, 
        on=['game_id', 'play_id'], 
        how='left'
    )
    
    # Handle cases where a play in output might not have ball_land data
    if data['ball_land_x'].isnull().any():
        print("Warning: Some output plays had no matching ball_land data in input.")
        # Fill with 0 or np.nan if you prefer to handle it differently
        data['ball_land_x'] = data['ball_land_x'].fillna(0)
        data['ball_land_y'] = data['ball_land_y'].fillna(0)

    # --- 3. Calculate max frame for each play ---
    # We use transform('max') to create a new column where every row
    # for a play contains the max 'frame_id' for that play.
    data['max_frame'] = data.groupby(
        ['game_id', 'play_id']
    )['frame_id'].transform('max')

    # --- 4. Calculate 'time_left_s' ---
    # (max_frame - current_frame) / 10 (assuming 10 frames per second)
    data['time_left_s'] = (data['max_frame'] - data['frame_id']) / 10.0

    # --- 5. Calculate 'dist_to_ball_land' ---
    # Euclidean distance: sqrt((x2-x1)^2 + (y2-y1)^2)
    data['dist_to_ball_land'] = np.sqrt(
        (data['x'] - data['ball_land_x'])**2 + 
        (data['y'] - data['ball_land_y'])**2
    )
    # --- 6. Get last input frame (x,y,s,dir) for velocity/accel calculation ---
    # We need the last known position (x,y), speed (s), and direction (dir)
    # from the input data to calculate the first frame of the output data.
    
    # Find the index of the max frame_id for each player/play
    try:
        idx = input_df.groupby(
            ['game_id', 'play_id', 'nfl_id']
        )['frame_id'].idxmax()
        
        # Get the x, y, s, and dir data at that last frame
        last_input_frames = input_df.loc[idx, [
            'game_id', 'play_id', 'nfl_id', 'x', 'y', 's', 'dir'
        ]].rename(columns={
            'x': 'last_input_x', 
            'y': 'last_input_y',
            's': 'last_input_speed',
            'dir': 'last_input_dir'
        })

        # Merge this last (x,y,s,dir) into our output data
        data = pd.merge(
            data, 
            last_input_frames, 
            on=['game_id', 'play_id', 'nfl_id'], 
            how='left'
        )
    except KeyError:
        # This can happen if input_df is missing 'frame_id', 'x', 'y', 's', or 'dir'
        print("Warning: Could not find 'frame_id','x','y','s', or 'dir' in input_df.")
        data['last_input_x'] = np.nan
        data['last_input_y'] = np.nan
        data['last_input_speed'] = np.nan
        data['last_input_dir'] = np.nan

    # --- 7. Calculate velocities and angles ---
    # Sort values to ensure 'shift' operations work correctly within each group
    data = data.sort_values(by=['game_id', 'play_id', 'nfl_id', 'frame_id'])
    
    # Group by player
    grouped = data.groupby(['game_id', 'play_id', 'nfl_id'])
    
    # Get the lagged x and y (i.e., position from the previous frame)
    data['lag_x'] = grouped['x'].shift(1)
    data['lag_y'] = grouped['y'].shift(1)
    
    # Fill the *first* frame's lag with the *last input frame's* position
    data['lag_x'] = np.where(
        data['lag_x'].isnull(), 
        data['last_input_x'], 
        data['lag_x']
    )
    data['lag_y'] = np.where(
        data['lag_y'].isnull(), 
        data['last_input_y'], 
        data['lag_y']
    )
    
    # Now calculate derivatives
    data['dx'] = data['x'] - data['lag_x']
    data['dy'] = data['y'] - data['lag_y']
    data['dt'] = 0.1  # 10 frames per second
    
    # --- Motion (Velocity) Calculations ---
    data['speed'] = np.sqrt(data['dx']**2 + data['dy']**2) / data['dt']
    # Get direction in degrees
    data['direction'] = np.arctan2(data['dy'], data['dx']) * 180 / np.pi
    # Get velocity components
    data['v_x'] = data['dx'] / data['dt']
    data['v_y'] = data['dy'] / data['dt']


    # --- Acceleration Calculations ---
    
    # Get lag_speed to calculate tangential acceleration
    data['lag_speed'] = grouped['speed'].shift(1)
    
    # Fill the *first* frame's lag_speed with the *last input frame's* speed ('s')
    data['lag_speed'] = np.where(
        data['lag_speed'].isnull(), 
        data['last_input_speed'], 
        data['lag_speed']
    )
    
    # This is tangential acceleration (change in speed magnitude)
    data['accel'] = (data['speed'] - data['lag_speed']) / data['dt']
    
    # --- Acceleration Direction (from vector change) ---
    
    # Get lagged velocity components
    data['lag_v_x'] = grouped['v_x'].shift(1)
    data['lag_v_y'] = grouped['v_y'].shift(1)
    
    # Calculate v_x and v_y for the last input frame
    data['last_input_dir_rad'] = data['last_input_dir'] * (np.pi / 180)
    data['last_input_v_x'] = data['last_input_speed'] * np.cos(data['last_input_dir_rad'])
    data['last_input_v_y'] = data['last_input_speed'] * np.sin(data['last_input_dir_rad'])

    # Fill the *first* frame's lag_v_x/y with the *last input frame's* v_x/y
    data['lag_v_x'] = np.where(
        data['lag_v_x'].isnull(), 
        data['last_input_v_x'], 
        data['lag_v_x']
    )
    data['lag_v_y'] = np.where(
        data['lag_v_y'].isnull(), 
        data['last_input_v_y'], 
        data['lag_v_y']
    )

    # Calculate change in velocity components
    data['dv_x'] = data['v_x'] - data['lag_v_x']
    data['dv_y'] = data['v_y'] - data['lag_v_y']

    # Get direction of the acceleration vector
    data['accel_direction'] = np.arctan2(data['dv_y'], data['dv_x']) * 180 / np.pi

#np.arctan2(data['dy'], data['dx']) * 180 / np.pi
    data['optimal_angle'] = np.arctan2(data['ball_land_y']-data['y'], data['ball_land_x']-data['x']) * 180 /np.pi
    data['angle_diff'] = abs(data['direction'] - data['optimal_angle'])
    
    # --- 8. Clean up and return ---
    # Drop all the temporary helper columns
    final_data = data.drop(columns=[
        'max_frame', 'last_input_x', 'last_input_y', 'last_input_speed', 
        'last_input_dir', 'last_input_dir_rad', 'last_input_v_x', 'last_input_v_y',
        'lag_x', 'lag_y', 'dx', 'dy', 'dt', 'lag_speed',
        'v_x', 'v_y', 'lag_v_x', 'lag_v_y', 'dv_x', 'dv_y'
    ])
    
    return final_data

def generate_clean_data(week_num):
    """
    Loads the input and output data for a specific week and
    computes the clean output features.

    Args:
        week_num (int): The week number (1 through 18).

    Returns:
        pd.DataFrame: Processed output data with new features.
    """
    # Format week number as a two-digit string (e.g., 1 -> "01")
    week_str = str(week_num).zfill(2)
    
    # Use forward slashes for cross-platform path compatibility
    input_path = f'raw_data/input_2023_w{week_str}.csv'
    output_path = f'raw_data/output_2023_w{week_str}.csv'

    print(f"Loading data for week {week_num}...")
    print(f"Input file: {input_path}")
    print(f"Output file: {output_path}")

    raw_data_input = pd.read_csv(input_path)
    raw_data_output = pd.read_csv(output_path)

    # Call the processing function to get the final data
    return compute_output_features(raw_data_input, raw_data_output)

In [35]:
clean_data_1 = generate_clean_data(1)
clean_data_1
#range(clean_data_1['angle_diff'])

Loading data for week 1...
Input file: raw_data/input_2023_w01.csv
Output file: raw_data/output_2023_w01.csv


Unnamed: 0,game_id,play_id,nfl_id,frame_id,x,y,ball_land_x,ball_land_y,time_left_s,dist_to_ball_land,speed,direction,accel,accel_direction,optimal_angle,angle_diff
42,2023090700,101,44930,1,53.20,13.98,63.259998,-0.22,2.0,17.402401,7.864477,-11.738571,-0.355229,-46.333038,-54.684313,42.945742
43,2023090700,101,44930,2,53.96,13.78,63.259998,-0.22,1.9,16.807438,7.858753,-14.743563,-0.057240,-104.036243,-56.404485,41.660922
44,2023090700,101,44930,3,54.70,13.54,63.259998,-0.22,1.8,16.205282,7.779460,-17.969140,-0.792929,-116.565051,-58.114550,40.145410
45,2023090700,101,44930,4,55.41,13.27,63.259998,-0.22,1.7,15.607773,7.596052,-20.820893,-1.834085,-135.000000,-59.804344,38.983451
46,2023090700,101,44930,5,56.09,12.95,63.259998,-0.22,1.6,14.995258,7.515318,-25.201124,-0.807339,-120.963757,-61.435287,36.234164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32072,2023091100,3987,46211,7,73.27,15.19,73.870003,11.48,0.4,3.758205,4.045986,-39.986886,4.654829,-68.198591,-80.813337,40.826451
32073,2023091100,3987,46211,8,73.59,14.86,73.870003,11.48,0.3,3.391578,4.596738,-45.881404,5.507523,-81.869898,-85.264371,39.382967
32074,2023091100,3987,46211,9,73.90,14.48,73.870003,11.48,0.2,3.000150,4.904080,-50.792796,3.073420,-101.309932,-90.572886,39.780090
32075,2023091100,3987,46211,10,74.21,14.05,73.870003,11.48,0.1,2.592393,5.300943,-54.211027,3.968634,-90.000000,-97.536162,43.325135


In [36]:
print(clean_data_1['angle_diff'].describe())

print(clean_data_1['optimal_angle'].describe())

print(clean_data_1['direction'].describe())

count    32088.000000
mean        70.347727
std         78.881522
min          0.000003
25%         15.990332
50%         38.771104
75%         91.972988
max        358.321878
Name: angle_diff, dtype: float64
count    32088.000000
mean         0.434888
std        102.417919
min       -180.000000
25%        -87.729069
50%         -0.149328
75%         89.841653
max        179.999999
Name: optimal_angle, dtype: float64
count    32088.000000
mean        -2.812285
std         99.788259
min       -179.341457
25%        -88.619646
50%         -7.650651
75%         86.987212
max        180.000000
Name: direction, dtype: float64
