In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.multioutput import RegressorChain
from sklearn.metrics import mean_squared_error
import glob
import re
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor




path = "C:/Users/volte/Documents/kaggle_superbowl/nfl-big-data-bowl-2026-prediction/"

In [2]:
def create_training_rows(input_df: pd.DataFrame, output_df: pd.DataFrame) -> pd.DataFrame:
    agg = (
        input_df.sort_values(["game_id","play_id","nfl_id","frame_id"])
                .groupby(["game_id","play_id","nfl_id"], as_index=False)
                .tail(1)
                .reset_index(drop=True)
                .rename(columns={"frame_id":"last_frame_id"})
    )

    out = output_df.copy()
    out = out.rename(columns={"x":"target_x","y":"target_y"})
    out["id"] = (
        out["game_id"].astype(str) + "_" +
        out["play_id"].astype(str) + "_" +
        out["nfl_id"].astype(str) + "_" +
        out["frame_id"].astype(str)
    )
    m = out.merge(agg, on=["game_id","play_id","nfl_id"], how="left", suffixes=("","_last"))
    m["delta_frames"] = (m["frame_id"] - m["last_frame_id"]).clip(lower=0).astype(float)
    m["delta_t"] = m["delta_frames"] / 10.0
    return m

class Data():
    def __init__(self, path=""):
        files_input = sorted(glob.glob(path + "train/input_2023_w*.csv"))
        files_output = sorted(glob.glob(path + "train/output_2023_w*.csv"))

        INPUT_DFS = []
        OUTPUT_DFS = []

        for f_in, f_out in zip(files_input, files_output):
            df_in = pd.read_csv(f_in)
            df_out = pd.read_csv(f_out)

            week_match = re.search(r"w(\d+)", f_out)
            week = int(week_match.group(1)) if week_match else None

            df_in["week"] = week
            df_out["week"] = week

            INPUT_DFS.append(df_in)
            OUTPUT_DFS.append(df_out)

            print(f"{f_in}: {df_in.shape}, {f_out}: {df_out.shape}, week: {week}")
        input_df = pd.concat(INPUT_DFS, ignore_index=True)
        output_df = pd.concat(OUTPUT_DFS, ignore_index=True)

        
        df = create_training_rows(input_df, output_df)

        self.data = df


    def preproc(self, type=""):
        X_LIMIT = 120
        Y_LIMIT = 53.3

        right_eda = self.data[self.data["play_direction"] == "right"].copy()
        left_eda = self.data[self.data["play_direction"] == "left"].copy()

        right_eda["was_left"] = 0
        left_eda["was_left"]  = 1

        left_eda["x"] = X_LIMIT - left_eda["x"]
        left_eda["y"] = Y_LIMIT - left_eda["y"]

        left_eda["ball_land_x"] = X_LIMIT - left_eda["ball_land_x"]
        left_eda["ball_land_y"] = Y_LIMIT - left_eda["ball_land_y"]

        left_eda["dir"] = (left_eda["dir"] + 180) % 360
        left_eda["o"] = (left_eda["o"] + 180) % 360

        left_eda["play_direction"] = "right"

        df = pd.concat([right_eda, left_eda], ignore_index=True)

        df = df[df["player_to_predict"]==True]

        targets = df[df["player_to_predict"] == True].copy()

        #df["target_dx"] = df["ball_land_x"] - df["x"]
        #df["target_dy"] = df["ball_land_y"] - df["y"]

        exclude_cols = [
            "player_name", "player_position", "player_role", "player_side", "play_direction",
            "player_height", "player_birth_date",
            "game_id", "play_id", "nfl_id",
            "x_out", "y_out", "frame_id_in", "frame_id_out",
            "target_dx", "target_dy",
        ]

        feature_cols = [c for c in self.data.columns if c not in exclude_cols]

        self.feature_cols = feature_cols

        self.preprocessed = df

In [3]:
dat = Data(path = path)

C:/Users/volte/Documents/kaggle_superbowl/nfl-big-data-bowl-2026-prediction/train\input_2023_w01.csv: (285714, 24), C:/Users/volte/Documents/kaggle_superbowl/nfl-big-data-bowl-2026-prediction/train\output_2023_w01.csv: (32088, 7), week: 1
C:/Users/volte/Documents/kaggle_superbowl/nfl-big-data-bowl-2026-prediction/train\input_2023_w02.csv: (288586, 24), C:/Users/volte/Documents/kaggle_superbowl/nfl-big-data-bowl-2026-prediction/train\output_2023_w02.csv: (32180, 7), week: 2
C:/Users/volte/Documents/kaggle_superbowl/nfl-big-data-bowl-2026-prediction/train\input_2023_w03.csv: (297757, 24), C:/Users/volte/Documents/kaggle_superbowl/nfl-big-data-bowl-2026-prediction/train\output_2023_w03.csv: (36080, 7), week: 3
C:/Users/volte/Documents/kaggle_superbowl/nfl-big-data-bowl-2026-prediction/train\input_2023_w04.csv: (272475, 24), C:/Users/volte/Documents/kaggle_superbowl/nfl-big-data-bowl-2026-prediction/train\output_2023_w04.csv: (30147, 7), week: 4
C:/Users/volte/Documents/kaggle_superbowl/nf

In [4]:
dat.preproc()
dat.preprocessed.head()

dat.preprocessed.columns
#print(dat.preprocessed[["x","y", "target_x", "target_y"]])

Index(['game_id', 'play_id', 'nfl_id', 'frame_id', 'target_x', 'target_y',
       'week', 'id', 'player_to_predict', 'last_frame_id', 'play_direction',
       'absolute_yardline_number', 'player_name', 'player_height',
       'player_weight', 'player_birth_date', 'player_position', 'player_side',
       'player_role', 'x', 'y', 's', 'a', 'dir', 'o', 'num_frames_output',
       'ball_land_x', 'ball_land_y', 'week_last', 'delta_frames', 'delta_t',
       'was_left'],
      dtype='object')

In [5]:
dat.preprocessed.drop(["player_birth_date", "player_height", "player_name", "id"], axis=1, inplace=True)

In [6]:
dat.preprocessed.columns

Index(['game_id', 'play_id', 'nfl_id', 'frame_id', 'target_x', 'target_y',
       'week', 'player_to_predict', 'last_frame_id', 'play_direction',
       'absolute_yardline_number', 'player_weight', 'player_position',
       'player_side', 'player_role', 'x', 'y', 's', 'a', 'dir', 'o',
       'num_frames_output', 'ball_land_x', 'ball_land_y', 'week_last',
       'delta_frames', 'delta_t', 'was_left'],
      dtype='object')

In [7]:
dat.preprocessed.shape

(562936, 28)

In [9]:

df = dat.preprocessed.sort_values(
    ["game_id", "play_id", "nfl_id", "frame_id"]
).reset_index(drop=True)

df = df[df["player_to_predict"] == True]

df["group"] = df["game_id"].astype(str) + "_" + df["play_id"].astype(str)

targets = ["target_x", "target_y"]

num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in df.columns if c not in num_cols]

cat_cols = [c for c in cat_cols if c not in targets + ["group"]]

df_enc = pd.get_dummies(df, columns=cat_cols, drop_first=True)

X = df_enc[[c for c in df_enc.columns if c not in targets]]
y = df_enc[targets].copy()

groups = df_enc["group"].values

gkf = GroupKFold(n_splits=2)
oof = np.zeros((len(df_enc), 2))

rmse_x_folds = []
rmse_y_folds = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
    print(f"\nFold {fold+1}")

    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    base = KNeighborsRegressor(n_neighbors=3)
    chain = RegressorChain(base, order=[0, 1])
    chain.fit(X_tr, y_tr)

    preds_val = chain.predict(X_val)
    oof[val_idx] = preds_val

    rmse_x = mean_squared_error(y_val["target_x"], preds_val[:, 0])
    rmse_y = mean_squared_error(y_val["target_y"], preds_val[:, 1])

    rmse_x_folds.append(rmse_x)
    rmse_y_folds.append(rmse_y)

    print(f"RMSE target_x: {rmse_x:.4f}")
    print(f"RMSE target_y: {rmse_y:.4f}")


oof_df = pd.DataFrame(oof, columns=["oof_target_x", "oof_target_y"])

print("target_x:", np.mean(rmse_x_folds))
print("target_y:", np.mean(rmse_y_folds))


final_base = KNeighborsRegressor(n_neighbors=3)
final_chain = RegressorChain(final_base, order=[0, 1])
final_chain.fit(X, y)




Fold 1
RMSE target_x: 1157.8821
RMSE target_y: 296.8185

Fold 2
RMSE target_x: 1123.9075
RMSE target_y: 285.6713
target_x: 1140.894815774865
target_y: 291.2448886030379


0,1,2
,estimator,KNeighborsReg...n_neighbors=3)
,order,"[0, 1]"
,cv,
,random_state,
,verbose,False
,base_estimator,'deprecated'

0,1,2
,n_neighbors,3
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,
