### Data Processing Pipeline
  
This notebook performs the following steps for the 2025 NFL Big Data Bowl competition:
1. Load raw data from `games.csv`, `players.csv`, `player_play.csv`, `plays.csv`, and `tracking_week_X.csv`.
2. Clean and preprocess data. 
3. Save the processed data for use in downstream tasks.

In [1]:
import random
import os
import numpy as np
import polars as pl
def resolve_project_root(start_dir):
    """Return the directory from which the `data.scripts` package is importable.

    If `start_dir` already contains a `data/scripts` folder it is returned
    unchanged; otherwise its parent directory is returned (the notebook is
    launched from a sub-folder of the project root).

    Making this decision explicit keeps the cell idempotent: re-running it
    no longer climbs one directory higher on every execution.
    """
    if os.path.isdir(os.path.join(start_dir, "data", "scripts")):
        return start_dir
    return os.path.abspath(os.path.join(start_dir, ".."))


root_dir = os.getcwd()
print(root_dir)

# Go back a directory to access the data folder (only if we are not already there)
os.chdir(resolve_project_root(root_dir))
print(os.getcwd())

from data.scripts.data_cleaning import clean_data, aggregate_data, strip_unused_data


C:\Users\chess\OneDrive\Desktop\Non PhD work\NFLbigdatabowl\BigData25\notebooks
C:\Users\chess\OneDrive\Desktop\Non PhD work\NFLbigdatabowl\BigData25


In [2]:
# Set paths to local data files (relative to the current working directory)
RAW_DIR = os.path.join("data", "raw")
game_fname = os.path.join(RAW_DIR, "games.csv")
players_fname = os.path.join(RAW_DIR, "players.csv")
plays_fname = os.path.join(RAW_DIR, "plays.csv")
player_play_fname = os.path.join(RAW_DIR, "player_play.csv")

# Tracking weeks to process: tracking_week_1.csv ... tracking_week_9.csv
TRACKING_WEEKS = range(1, 10)

# Columns kept after cleaning; everything else is dropped to reduce the size of the dataframe
useful_columns = [
        "gameId",
        "playId",
        "frameId",
        "nflId",
        "displayName",
        "position",
        "club",
        "homeTeamAbbr",
        "possessionTeam",
        "defensiveTeam",
        "preSnapHomeScore",
        "preSnapVisitorScore",
        "quarter",
        "gameClock",
        "down",
        "yardsToGo",
        "yardlineNumber",
        "yardlineSide",
        "offenseFormation",
        "receiverAlignment",
        "preSnapHomeTeamWinProbability",
        "preSnapVisitorTeamWinProbability",
        "o_clean",
        "a_clean",
        "s_clean",
        "x_clean",
        "y_clean",
        "dir_clean",
        "playDescription",
        "passLocationType",
        "rushLocationType",
        "pff_runConceptPrimary",
        "yardsGained",
        "wasInitialPassRusher",
        "event",
        "shiftSinceLineset",
        "homeTeamWinProbabilityAdded",
]

# Done for memory savings: load, clean and reduce one tracking week at a time,
# so only a single week of raw tracking data is aggregated per iteration.
weekly_frames = []
for week in TRACKING_WEEKS:
    tracking_fname_list = [os.path.join(RAW_DIR, f"tracking_week_{week}.csv")]

    # Aggregate data from the plays.csv, players.csv, and this week's tracking data
    # into one aggregate dataframe.
    df = aggregate_data(
        game_fname=game_fname,
        players_fname=players_fname,
        plays_fname=plays_fname,
        player_play_fname=player_play_fname,
        tracking_fname_list=tracking_fname_list,
        )

    # Preprocess and clean the data
    df_clean = clean_data(df, 'at_snap') # ['at_snap', 'presnap', 'postsnap', 'all']

    # Reduce the size of the dataframe by removing unnecessary columns
    df_reduced = strip_unused_data(df_clean, useful_columns)
    weekly_frames.append(df_reduced)

# Stack all weeks once, in week order, instead of re-concatenating inside the loop.
df_base = pl.concat(weekly_frames, how="vertical")

INFO: Aggregating data from players, play data, tracking data, and players data into a master dataframe...
INFO: Loaded 16124 rows of plays, 354727 rows of player plays, and 7104700 rows of player tracking data
INFO: Aggregated dataframe has 7107583 rows
INFO: Removing inactive frames...
INFO: 7022421 rows removed
INFO: Removing garbage time frames...
INFO: 12386 rows removed
INFO: Transforming orientation and direction angles so that 0° points from left to right, and increasing angle goes counterclockwise...
INFO: Flipping plays so that they all run from left to right...
INFO: Removing QB kneels, spikes, sneaks...
INFO: 946 rows removed
INFO: Converting geometry variables from floats to int...
INFO: Removing unused columns from dataframe...
INFO: 99 columns removed
INFO: Aggregating data from players, play data, tracking data, and players data into a master dataframe...
INFO: Loaded 16124 rows of plays, 354727 rows of player plays, and 6704339 rows of player tracking data
INFO: Aggreg

In [3]:
# Toggle for writing the processed dataframe to disk.
# NOTE: this name shadows the stdlib `csv` module if that is ever imported in this notebook.
csv = True

# Add a per-play key: playId followed by gameId, both cast to strings and concatenated.
df_base = df_base.with_columns(
    (pl.col("playId").cast(pl.Utf8) + pl.col("gameId").cast(pl.Utf8)).alias("playgameID")
)

if csv:
    # Save the cleaned data to a csv file; create the output folder first so the
    # write does not fail on a fresh checkout where data/processed does not exist yet.
    processed_dir = os.path.join("data", "processed")
    os.makedirs(processed_dir, exist_ok=True)
    df_base.write_csv(os.path.join(processed_dir, "df_clean.csv"))

shape: (591_204,)
Series: 'homeTeamAbbr' [str]
[
	"LA"
	"LA"
	"LA"
	"LA"
	"LA"
	…
	"NO"
	"NO"
	"NO"
	"NO"
	"NO"
]
