In [1]:
# import packages

import os
import re
import glob
import json
import joblib

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import ElasticNet

In [2]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)


In [3]:
files_input = sorted(glob.glob("train/input_2023_w*.csv"))
files_output = sorted(glob.glob("train/output_2023_w*.csv"))

INPUT_DFS = []
OUTPUT_DFS = []

for f_in, f_out in zip(files_input, files_output):
    df_in = pd.read_csv(f_in)
    df_out = pd.read_csv(f_out)

    week_match = re.search(r"w(\d+)", f_out)
    week = int(week_match.group(1)) if week_match else None

    df_in["week"] = week
    df_out["week"] = week

    INPUT_DFS.append(df_in)
    OUTPUT_DFS.append(df_out)

    print(f"{f_in}: {df_in.shape}, {f_out}: {df_out.shape}, week: {week}")

train\input_2023_w01.csv: (285714, 24), train\output_2023_w01.csv: (32088, 7), week: 1
train\input_2023_w02.csv: (288586, 24), train\output_2023_w02.csv: (32180, 7), week: 2
train\input_2023_w03.csv: (297757, 24), train\output_2023_w03.csv: (36080, 7), week: 3
train\input_2023_w04.csv: (272475, 24), train\output_2023_w04.csv: (30147, 7), week: 4
train\input_2023_w05.csv: (254779, 24), train\output_2023_w05.csv: (29319, 7), week: 5
train\input_2023_w06.csv: (270676, 24), train\output_2023_w06.csv: (31162, 7), week: 6
train\input_2023_w07.csv: (233597, 24), train\output_2023_w07.csv: (27443, 7), week: 7
train\input_2023_w08.csv: (281011, 24), train\output_2023_w08.csv: (33017, 7), week: 8
train\input_2023_w09.csv: (252796, 24), train\output_2023_w09.csv: (28291, 7), week: 9
train\input_2023_w10.csv: (260372, 24), train\output_2023_w10.csv: (29008, 7), week: 10
train\input_2023_w11.csv: (243413, 24), train\output_2023_w11.csv: (27623, 7), week: 11
train\input_2023_w12.csv: (294940, 24), t

In [4]:
input_1 = INPUT_DFS[0]
output_1 = OUTPUT_DFS[0]

print(input_1.shape, output_1.shape)

(285714, 24) (32088, 7)


In [5]:
eda_df = pd.concat(INPUT_DFS, ignore_index=True)
eda_df.shape

(4880579, 24)

In [6]:
# Geometry mirroring

In [7]:
X_LIMIT = 120
Y_LIMIT = 53.3

right_eda = eda_df[eda_df["play_direction"] == "right"].copy()
left_eda = eda_df[eda_df["play_direction"] == "left"].copy()

right_eda["was_left"] = 0
left_eda["was_left"]  = 1

right_eda.shape, left_eda.shape

((2459074, 25), (2421505, 25))

In [8]:
left_eda["x"] = X_LIMIT - left_eda["x"]
left_eda["y"] = Y_LIMIT - left_eda["y"]

left_eda["ball_land_x"] = X_LIMIT - left_eda["ball_land_x"]
left_eda["ball_land_y"] = Y_LIMIT - left_eda["ball_land_y"]

left_eda["dir"] = (left_eda["dir"] + 180) % 360
left_eda["o"] = (left_eda["o"] + 180) % 360

left_eda["play_direction"] = "right"

In [9]:
eda_df_normalized = pd.concat([right_eda, left_eda], ignore_index=True)

eda_df_normalized["play_direction"].value_counts()

off_norm_left  = eda_df_normalized[eda_df_normalized["play_direction"]=="left"]["x"]
off_norm_right = eda_df_normalized[eda_df_normalized["play_direction"]=="right"]["x"]
print("Mean x  left : ", off_norm_left.mean())
print("Mean x right : ", off_norm_right.mean())

Mean x  left :  nan
Mean x right :  60.82776974002468


In [10]:
sample_frac = 0.2
idx = eda_df.sample(frac=sample_frac, random_state=17).index
df_orig_sample = eda_df.loc[idx]
df_norm_sample = eda_df_normalized.loc[idx]

In [11]:
eda_df_normalized = pd.concat([right_eda, left_eda], ignore_index=True)

eda_df_normalized["play_direction"].value_counts()

play_direction
right    4880579
Name: count, dtype: int64

In [12]:
df = eda_df_normalized.copy()

df = df[df["x"].between(0, X_LIMIT)]
df = df[df["y"].between(0, Y_LIMIT)]

df = df[df["ball_land_x"].between(0, X_LIMIT)]
df = df[df["ball_land_y"].between(0, Y_LIMIT)]

In [13]:
targets = df[df["player_to_predict"] == True].copy()

In [14]:
# Remove unnecessary columns
exclude_cols = [
    "player_name", "player_position", "player_role", "player_side", "play_direction",
    "player_height", "player_birth_date",
    "game_id", "play_id", "nfl_id",  "ball_land_x", "ball_land_y", "x_out", "y_out", "frame_id_in", "frame_id_out",
    "target_dx", "target_dy",
]

# Start with a model targeting only defensive players

In [15]:
# keep the defense players to predict for the first model
df_def_players = df[(df["player_side"] == "Defense") & 
                    (df["player_to_predict"] == True)].copy()

df_def_players.head()

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,...,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y,week,was_left
336,2023090700,361,True,46137,1,right,22,Justin Reid,6-1,204,...,35.12,0.79,1.72,200.79,246.56,16,26.1,49.18,1,0
337,2023090700,361,True,46137,2,right,22,Justin Reid,6-1,204,...,35.06,0.55,2.08,216.05,253.62,16,26.1,49.18,1,0
338,2023090700,361,True,46137,3,right,22,Justin Reid,6-1,204,...,35.05,0.46,2.78,270.62,263.88,16,26.1,49.18,1,0
339,2023090700,361,True,46137,4,right,22,Justin Reid,6-1,204,...,35.08,0.7,3.16,307.3,270.71,16,26.1,49.18,1,0
340,2023090700,361,True,46137,5,right,22,Justin Reid,6-1,204,...,35.14,1.11,3.51,321.64,273.19,16,26.1,49.18,1,0


In [16]:
# Prepare features to include in the model based on Uliana's notebook

# Direction
df_def_players["dir_rad"] = np.deg2rad(df_def_players["dir"])

# Body orientation
df_def_players["o_rad"] = np.deg2rad(df_def_players["o"])

# Velocity components
df_def_players["vx"] = df_def_players["s"] * np.cos(df_def_players["dir_rad"])
df_def_players["vy"] = df_def_players["s"] * np.sin(df_def_players["dir_rad"])

# Acceleration components
df_def_players["ax"] = df_def_players["a"] * np.cos(df_def_players["dir_rad"])
df_def_players["ay"] = df_def_players["a"] * np.sin(df_def_players["dir_rad"])

# Ball geometry
dx_ball = df_def_players["ball_land_x"] - df_def_players["x"]
dy_ball = df_def_players["ball_land_y"] - df_def_players["y"]

df_def_players["dist_to_ball"] = np.sqrt(dx_ball**2 + dy_ball**2)
df_def_players["angle_to_ball"] = np.arctan2(dy_ball, dx_ball)

# Alignment with ball
df_def_players["angle_diff"] = (
    (df_def_players["dir_rad"] - df_def_players["angle_to_ball"] + np.pi)
    % (2*np.pi)
    - np.pi
)

df_def_players["cos_angle_diff"] = np.cos(df_def_players["angle_diff"])
df_def_players["sin_angle_diff"] = np.sin(df_def_players["angle_diff"])

df_def_players["target_dx"] = df_def_players["ball_land_x"] - df_def_players["x"]
df_def_players["target_dy"] = df_def_players["ball_land_y"] - df_def_players["y"]

target_cols = ["target_dx", "target_dy"]

In [17]:
# create the list of features which will be used in the model
feature_cols = [
    "x", "y", "s", "a",
    "vx", "vy", "ax", "ay", "dir_rad", "o_rad",
    "dist_to_ball", "angle_to_ball",
    "angle_diff", "cos_angle_diff", "sin_angle_diff",
]

X = df_def_players[feature_cols].astype(float)
y = df_def_players[target_cols].astype(float)


groups = df_def_players["game_id"]

gkf = GroupKFold(n_splits=5)
rmses = []

for tr, va in gkf.split(X, y, groups):
    Xtr, Xva = X.iloc[tr], X.iloc[va]
    ytr, yva = y.iloc[tr], y.iloc[va]

    model = MultiOutputRegressor(ElasticNet(alpha=0.1, l1_ratio=0.5))
    model.fit(Xtr, ytr)

    pred = model.predict(Xva)

    rmse_dx = np.sqrt(mean_squared_error(yva["target_dx"], pred[:,0]))
    rmse_dy = np.sqrt(mean_squared_error(yva["target_dy"], pred[:,1]))
    rmse_total = np.sqrt(mean_squared_error(yva.values, pred))

    rmses.append([rmse_dx, rmse_dy, rmse_total])

rmse_array = np.array(rmses)
print("\nRMSE dx:", rmse_array[:,0].mean())
print("RMSE dy:", rmse_array[:,1].mean())
print("OVERALL RMSE:", rmse_array[:,2].mean())


RMSE dx: 5.424456479255109
RMSE dy: 7.301358686819365
OVERALL RMSE: 6.432027521377416


# Create a model only for the offense players

In [18]:
# keep the offense players
df_off_players = df[(df["player_side"] == "Offense") & 
                    (df["player_to_predict"] == True)].copy()

df_off_players.head()

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,...,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y,week,was_left
744,2023090700,361,True,38696,1,right,22,Marvin Jones,6-2,198,...,37.9,0.07,0.63,93.26,51.79,16,26.1,49.18,1,0
745,2023090700,361,True,38696,2,right,22,Marvin Jones,6-2,198,...,37.91,0.23,1.25,68.27,50.26,16,26.1,49.18,1,0
746,2023090700,361,True,38696,3,right,22,Marvin Jones,6-2,198,...,37.94,0.5,1.87,61.64,50.26,16,26.1,49.18,1,0
747,2023090700,361,True,38696,4,right,22,Marvin Jones,6-2,198,...,37.97,0.79,3.02,57.49,52.68,16,26.1,49.18,1,0
748,2023090700,361,True,38696,5,right,22,Marvin Jones,6-2,198,...,38.06,1.34,4.1,46.71,52.68,16,26.1,49.18,1,0


In [19]:
# Prepare features to include in the model based on Uliana's notebook

# Direction
df_off_players["dir_rad"] = np.deg2rad(df_off_players["dir"])

# that model won't include orientation as these players don't move backwards

# Velocity components
df_off_players["vx"] = df_off_players["s"] * np.cos(df_off_players["dir_rad"])
df_off_players["vy"] = df_off_players["s"] * np.sin(df_off_players["dir_rad"])

# Acceleration components
df_off_players["ax"] = df_off_players["a"] * np.cos(df_off_players["dir_rad"])
df_off_players["ay"] = df_off_players["a"] * np.sin(df_off_players["dir_rad"])

# Ball geometry
dx_ball = df_off_players["ball_land_x"] - df_off_players["x"]
dy_ball = df_off_players["ball_land_y"] - df_off_players["y"]

df_off_players["dist_to_ball"] = np.sqrt(dx_ball**2 + dy_ball**2)
df_off_players["angle_to_ball"] = np.arctan2(dy_ball, dx_ball)

# Alignment with ball
df_off_players["angle_diff"] = (
    (df_off_players["dir_rad"] - df_off_players["angle_to_ball"] + np.pi)
    % (2*np.pi)
    - np.pi
)

df_off_players["cos_angle_diff"] = np.cos(df_off_players["angle_diff"])
df_off_players["sin_angle_diff"] = np.sin(df_off_players["angle_diff"])

df_off_players["target_dx"] = df_off_players["ball_land_x"] - df_off_players["x"]
df_off_players["target_dy"] = df_off_players["ball_land_y"] - df_off_players["y"]

target_cols = ["target_dx", "target_dy"]

# create a list with the selected features
feature_cols = [
    "x", "y", "s", "a", "dir",
    "vx", "vy", "ax", "ay", "dir_rad",
    "dist_to_ball", "angle_to_ball",
    "angle_diff", "cos_angle_diff", "sin_angle_diff",
]

y = df_off_players[target_cols].astype(float)
X = df_off_players[feature_cols].astype(float)

groups = df_off_players["game_id"]

gkf = GroupKFold(n_splits=5)
rmses = []

for tr, va in gkf.split(X, y, groups):
    Xtr, Xva = X.iloc[tr], X.iloc[va]
    ytr, yva = y.iloc[tr], y.iloc[va]

    model = MultiOutputRegressor(ElasticNet(alpha=0.1, l1_ratio=0.5))
    model.fit(Xtr, ytr)

    pred = model.predict(Xva)

    rmse_dx = np.sqrt(mean_squared_error(yva["target_dx"], pred[:,0]))
    rmse_dy = np.sqrt(mean_squared_error(yva["target_dy"], pred[:,1]))
    rmse_total = np.sqrt(mean_squared_error(yva.values, pred))

    rmses.append([rmse_dx, rmse_dy, rmse_total])

rmse_array = np.array(rmses)
print("\nRMSE dx:", rmse_array[:,0].mean())
print("RMSE dy:", rmse_array[:,1].mean())
print("OVERALL RMSE:", rmse_array[:,2].mean())


RMSE dx: 3.5529263367803265
RMSE dy: 5.892856591564595
OVERALL RMSE: 4.865680798948894
