# Transforming Data:

In [19]:
# Necessary Libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Modeling Libraries:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, brier_score_loss

In [10]:
# Load the datasets of choice, can change the weeks input
inputs = pd.read_csv('C:\\Users\\jrzem\\Downloads\\NFL-Big-Data-Bowl-2026-Analytics-Challenge\\Kaggle Data\\train\\input_2023_w01.csv')
outputs = pd.read_csv('C:\\Users\\jrzem\\Downloads\\NFL-Big-Data-Bowl-2026-Analytics-Challenge\\Kaggle Data\\train\\output_2023_w01.csv')
supp = pd.read_csv('C:\\Users\\jrzem\\Downloads\\NFL-Big-Data-Bowl-2026-Analytics-Challenge\\Kaggle Data\\supplementary_data.csv')

# Distance Modeling:

### Identify Targeted Receivers and Defenders:

In [None]:
# Restrict to targeted throws only
targets = inputs[inputs["player_role"].str.contains("Targeted Receiver", na=False)].copy()

# Throw frame = last input frame for that player/play
targets_throw = (
    targets
    .sort_values(["game_id", "play_id", "nfl_id", "frame_id"])
    .groupby(["game_id", "play_id", "nfl_id"])
    .tail(1)
    .rename(columns={"frame_id": "throw_frame"})
)

# Get all coverage defenders at throw time (same play & frame)
defenders_throw = (
    inputs[inputs["player_role"].str.contains("Defensive Coverage", na=False)]
    .merge(
        targets_throw[["game_id", "play_id", "throw_frame"]],
        on=["game_id", "play_id"],
        how="inner"
    )
)

defenders_throw = defenders_throw[defenders_throw["frame_id"] == defenders_throw["throw_frame"]]

# Join with supplementary (play-level context)
plays = (
    targets_throw
    .merge(
        supp,
        on=["game_id", "play_id"],
        how="left",
        suffixes=("", "_supp")
    )
)

### Use Euclidean Distance:

In [13]:
FRAME_RATE = 10.0  # NFL tracking is 10 Hz → 0.1s per frame
DT = 1.0 / FRAME_RATE

def distance_to_ball(df, x_ball_col="ball_land_x", y_ball_col="ball_land_y"):
    return np.sqrt(
        (df["x"] - df[x_ball_col])**2 + 
        (df["y"] - df[y_ball_col])**2
    )

At the throw frame, assume each player runs straight toward the landing point at their current scalar speed s (from input). Ignore acceleration/orientation for a first pass:

In [14]:
V_MIN = 1.0  # to avoid dividing by ~0; tune

# Compute receiver time-to-arrive at release
targets_throw["dist_to_ball_release"] = distance_to_ball(targets_throw)
targets_throw["T_WR_release"] = (
    targets_throw["dist_to_ball_release"] / 
    targets_throw["s"].clip(lower=V_MIN)
)

# Compute defenders' time-to-arrive at release
defenders_throw["dist_to_ball_release"] = distance_to_ball(defenders_throw)
defenders_throw["T_def_release"] = (
    defenders_throw["dist_to_ball_release"] /
    defenders_throw["s"].clip(lower=V_MIN)
)

# Aggregate to per-throw metrics
def_race = (
    defenders_throw
    .groupby(["game_id", "play_id"])
    .agg(
        T_def_min_release=("T_def_release", "min"),
        T_def_mean_release=("T_def_release", "mean"),
        n_defenders_release=("nfl_id", "nunique")
    )
    .reset_index()
)

race_release = (
    targets_throw
    .merge(def_race, on=["game_id", "play_id"], how="left")
)

race_release["deltaT_release"] = (
    race_release["T_def_min_release"] - race_release["T_WR_release"]
)

deltaT_release > 0: WR is projected to get there first.

deltaT_release < 0: some defender is projected to beat him there.

In [15]:
R_THRESH = 1.0  # yards, within “catchable” radius

# 1. Tag WR vs defenders using input info
wr_ids = targets_throw[["game_id", "play_id", "nfl_id"]].copy()
wr_ids["is_wr"] = 1

def_ids = defenders_throw[["game_id", "play_id", "nfl_id"]].copy()
def_ids["is_wr"] = 0

role_tags = pd.concat([wr_ids, def_ids], ignore_index=True).drop_duplicates()

# 2. Join role tags into output tracking
output = outputs.merge(role_tags, on=["game_id", "play_id", "nfl_id"], how="inner")

# 3. Compute distance to ball for all tagged players
# Need ball_land_x/y per (game_id, play_id, nfl_id) from input (they’re identical across players)
ball_land = (
    inputs[["game_id", "play_id", "nfl_id", "ball_land_x", "ball_land_y"]]
    .drop_duplicates(["game_id", "play_id", "nfl_id"])
)

output = output.merge(
    ball_land[["game_id", "play_id", "nfl_id", "ball_land_x", "ball_land_y"]],
    on=["game_id", "play_id", "nfl_id"],
    how="left"
)

output["dist_to_ball"] = distance_to_ball(output)

# 4. First-arrival frame per player
arrivals = (
    output[output["dist_to_ball"] <= R_THRESH]
    .sort_values(["game_id", "play_id", "nfl_id", "frame_id"])
    .groupby(["game_id", "play_id", "nfl_id"])
    .agg(first_arrival_frame=("frame_id", "first"))
    .reset_index()
)

arrivals["T_arrive"] = arrivals["first_arrival_frame"] * DT  # relative to output frame 0

# 5. Aggregate to WR vs nearest defender per play
arrivals = arrivals.merge(role_tags, on=["game_id", "play_id", "nfl_id"], how="left")

wr_arrivals = arrivals[arrivals["is_wr"] == 1].rename(columns={"T_arrive": "T_WR_emp"})
def_arrivals = (
    arrivals[arrivals["is_wr"] == 0]
    .groupby(["game_id", "play_id"])
    .agg(T_def_min_emp=("T_arrive", "min"))
    .reset_index()
)

race_emp = (
    wr_arrivals[["game_id", "play_id", "T_WR_emp"]]
    .merge(def_arrivals, on=["game_id", "play_id"], how="left")
)

race_emp["deltaT_empirical"] = race_emp["T_def_min_emp"] - race_emp["T_WR_emp"]

# Probability Modeling:

#### Building a completion probability model
Now build a per-play, per-target row with:

- Response:
  - is_complete = (pass_result == "C")
- Race features:
  - deltaT_release, deltaT_empirical
  - T_WR_release, T_def_min_release
  - T_WR_emp, T_def_min_emp
  - optional: distance-based features (min separation, etc.)
- Context features from supplementary:
  - pass_length, yards_to_go, down, quarter, expected_points
  - team_coverage_man_zone, team_coverage_type
  - pass_location_type, dropback_type
  - plus team & game state variables.

In [16]:
# Basic binary label
plays["is_complete"] = (plays["pass_result"] == "C").astype(int)

# Merge race features
model_df = (
    plays
    .merge(race_release[[
        "game_id", "play_id", "nfl_id",
        "T_WR_release", "T_def_min_release", "deltaT_release"
    ]], on=["game_id", "play_id", "nfl_id"], how="left")
    .merge(race_emp[[
        "game_id", "play_id",
        "T_WR_emp", "T_def_min_emp", "deltaT_empirical"
    ]], on=["game_id", "play_id"], how="left")
)

# Optional: filter to canonical pass types (exclude sacks, scrambles, penalties, etc.)
model_df = model_df[model_df["play_nullified_by_penalty"] != "Y"]
model_df = model_df[model_df["pass_result"].isin(["C", "I", "IN"])]

Start with logistic regression to see whether ΔT is meaningful:

In [None]:
feature_cols_num = [
    "deltaT_release", "deltaT_empirical",
    "T_WR_release", "T_def_min_release",
    "pass_length", "yards_to_go", "expected_points"
]

feature_cols_cat = [
    "team_coverage_man_zone", "team_coverage_type",
    "pass_location_type", "dropback_type"
]



X = model_df[feature_cols_num + feature_cols_cat]
y = model_df["is_complete"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", feature_cols_num),
        ("cat", OneHotEncoder(handle_unknown="ignore"), feature_cols_cat),
    ]
)

logit = LogisticRegression(max_iter=500, penalty="l2")

clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", logit)
])

clf.fit(X_train, y_train)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

auc = roc_auc_score(y_test, y_pred_proba)
brier = brier_score_loss(y_test, y_pred_proba)
print("AUC:", auc, "Brier:", brier)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values