## Initialization

In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt

In [None]:
# seasons covered by the play-by-play exports: 1999 through 2023 inclusive
years = [*range(1999, 2024)]

In [None]:
# one play-by-play CSV path per season, in the same order as `years`
csv_files = [
    f"./Data/play_by_play_{season}.csv"
    for season in years
]

In [None]:
# read every season file into its own dataframe; the list keeps the order of csv_files
dataframes = [pd.read_csv(season_path) for season_path in csv_files]

In [None]:
# stack all seasons row-wise into a single dataframe;
# ignore_index=True discards the per-file indices and numbers the rows 0..n-1
combined_df = pd.concat(dataframes, axis=0, ignore_index=True)

## Data Validity Check
Tok & Joel

In [None]:
# plausibility check of the data set: overall size, duplicates and category values
display(combined_df.shape)
print("number of duplicates:", combined_df.duplicated().sum())
print("number of distinct teams ('posteam'):", combined_df['posteam'].nunique())
print("different 'posteam_type' attributes:", list(combined_df['posteam_type'].unique()))
print("different 'play_type' attributes:", list(combined_df["play_type"].unique()))

# Row filters whose match counts are inspected below. Only the shape of each matching
# subset is displayed; nothing is removed from combined_df here.
row_filters_to_inspect = [
    # yard line outside the open interval (0, 100)
    (combined_df['yardline_100'] <= 0) | (combined_df['yardline_100'] >= 100),
    # season outside the loaded range 1999-2023
    (combined_df['season'] < 1999) | (combined_df['season'] > 2023),
    # remaining game time outside [0, 3600] seconds
    (combined_df['game_seconds_remaining'] < 0) | (combined_df['game_seconds_remaining'] > 3600),
    # remaining half time outside [0, 1800] seconds
    (combined_df['half_seconds_remaining'] < 0) | (combined_df['half_seconds_remaining'] > 1800),
    # remaining quarter time outside [0, 900] seconds
    (combined_df['quarter_seconds_remaining'] < 0) | (combined_df['quarter_seconds_remaining'] > 900),
    # down outside 1..4
    (combined_df['down'] < 1) | (combined_df['down'] > 4),
    # more than 99 yards to go
    combined_df['ydstogo'] > 99,
    # more than 99 yards gained on a single play
    combined_df['yards_gained'] > 99,
    # run plays that are flagged as an interception
    (combined_df['play_type'] == 'run') & (combined_df['interception'] == 1),
    # all plays flagged as an incomplete pass
    combined_df['incomplete_pass'] == 1,
]
for row_filter in row_filters_to_inspect:
    display(combined_df[row_filter].shape)

In [None]:
# detect NAs: print the number of missing values for every column.
# Series.isna().sum() counts in vectorised code; the builtin sum() would iterate over
# every single cell in the Python interpreter, which is very slow on a frame this size.
for column in combined_df.columns:
    print(f"{column}: {combined_df[column].isna().sum()}")

In [None]:
# keep only pass and run plays: every row whose play_type is not "pass" or "run"
# (including rows with a missing play_type) is dropped in place
is_pass_or_run = combined_df["play_type"].isin(["pass", "run"])
combined_df.drop(index=combined_df.loc[~is_pass_or_run].index, inplace=True)

In [None]:
# detect NAs again, now that only pass and run plays are left
display(combined_df.shape)

# vectorised count per column (the builtin sum() would loop over every cell in Python)
for column in combined_df.columns:
    print(f"{column}: {combined_df[column].isna().sum()}")

In [None]:
# Summary statistics for all columns. option_context shows every column for this
# display only and restores the previous setting afterwards, even if display() raises.
with pd.option_context("display.max_columns", None):
    display(combined_df.describe())

mean = combined_df['yards_gained'].mean()
median = combined_df['yards_gained'].median()

# Histogram of yards_gained limited to the window [-20, 40] with one bin per yard.
# Plays outside that window are only left out of the plot; they stay in combined_df.
plt.hist(combined_df['yards_gained'], range=(-20, 40), bins=60, alpha=0.7)
plt.axvline(mean, color='red', linestyle='dashed', linewidth=2, label='Mean')
plt.axvline(median, color='green', linestyle='dashed', linewidth=2, label='Median')
plt.title('Histogram: yards_gained')
plt.xlabel('yards_gained')
plt.ylabel('Number of plays')
plt.legend()  # required, otherwise the 'Mean' / 'Median' labels are never drawn
plt.show()

# Skewness and kurtosis come from pandas (Series.skew / Series.kurt), not from scipy.
# The pandas estimators are bias-corrected, so the numbers differ slightly from the
# defaults of scipy.stats.skew / scipy.stats.kurtosis.
skewness = combined_df['yards_gained'].skew()

print(f'Skewness using pandas Series.skew: {skewness}')
print('Mean', mean)
print('Median', median)
print('Kurtosis using pandas Series.kurt:', combined_df['yards_gained'].kurt())

## Removal of Exceptional Observations
Tok

In [None]:
# Compare plays with a penalty (penalty == 1) against plays without one (penalty == 0).
# Author's finding: with penalty = 1 the average yards_gained is higher -> biased
# (free plays etc).
# option_context widens the output for these two tables only and restores the previous
# setting on exit, also when one of the displays fails.
with pd.option_context("display.max_columns", None):
    for penalty_flag in (1, 0):
        display(combined_df[combined_df['penalty'] == penalty_flag].describe())

In [None]:
# remove every play that carries a penalty flag (penalty == 1), in place
penalty_rows = combined_df.loc[combined_df["penalty"] == 1].index
combined_df.drop(index=penalty_rows, inplace=True)

In [None]:
# remove two point conversion plays, i.e. rows that have any two_point_conv_result
has_two_point_result = combined_df["two_point_conv_result"].notna()
combined_df.drop(index=combined_df.loc[has_two_point_result].index, inplace=True)

In [None]:
# Lateral receptions: look at when in the game they happen.
# Author's finding: 75 perc. quant. is at 48 sec / half -> only end of half -> biased
lateral_reception_entries = combined_df[combined_df['lateral_reception'] == 1]
display(lateral_reception_entries.describe())

# number of lateral-reception plays per distinct 'game_seconds_remaining' value
grouped_data = (
    lateral_reception_entries
    .groupby('game_seconds_remaining')
    .size()
    .reset_index(name='count')
)

# distribution of those plays over the remaining game time
plt.figure(figsize=(10, 6))
plt.hist(
    lateral_reception_entries['game_seconds_remaining'],
    bins=50,
    edgecolor='black',
)
plt.title('Number of plays with lateral receptions plotted against remaining game seconds')
plt.xlabel('Seconds remaining in the game')
plt.ylabel('Number of plays with lateral receptions')
plt.grid(True)
plt.show()

In [None]:
# remove all plays that contain a lateral reception (lateral_reception == 1), in place
lateral_reception_rows = combined_df.loc[combined_df["lateral_reception"] == 1].index
combined_df.drop(index=lateral_reception_rows, inplace=True)

In [None]:
# Lateral rushes are NOT dropped.
# Author's reasoning: lateral rushes are also end-arounds or trick plays -> same
# reasoning as for lateral receptions, but the other way around.

display(combined_df[combined_df['lateral_rush'] == 1].describe())

lateral_rush_entries = combined_df[combined_df['lateral_rush'] == 1]

# grouped by 'half_seconds_remaining' and count number of occurrences
grouped_data = lateral_rush_entries.groupby('half_seconds_remaining').size().reset_index(name='count')

plt.figure(figsize=(10, 6))
plt.hist(lateral_rush_entries['half_seconds_remaining'], bins=20, edgecolor='black')
plt.title('Number of plays with lateral rushes plotted against remaining half seconds')
plt.xlabel('Seconds remaining in the half')
# the y axis counts lateral rushes (the label previously said "lateral receptions")
plt.ylabel('Number of plays with lateral rushes')
plt.grid(True)
plt.show()

In [None]:
# Replayed / challenged plays are kept.
# Author's reasoning: it doesn't change anything whether a play was replayed or not.
display(combined_df.loc[combined_df['replay_or_challenge'] == 1].describe())

In [None]:
# summary of aborted plays (plays where the snap is mishandled or dropped etc)
display(combined_df.loc[combined_df['aborted_play'] == 1].describe())

In [None]:
# remove all aborted plays (aborted_play == 1), in place
aborted_rows = combined_df.loc[combined_df["aborted_play"] == 1].index
combined_df.drop(index=aborted_rows, inplace=True)

In [None]:
# Adjust the spread line to the view of the team with possession of the ball:
# the sign is flipped on every play where the possessing team is the away team.
# NOTE: this is not idempotent - running the cell a second time flips the sign back.
away_has_ball = combined_df['posteam_type'] == 'away'
combined_df.loc[away_has_ball, 'spread_line'] = -combined_df.loc[away_has_ball, 'spread_line']

## Removal of Non-Decisive Features
Thilo & Tok

In [None]:
# First batch of column names that are removed from combined_df further below.
# The trailing comment on each entry is the author's reason for removing it.
# Entries that are commented out are not part of this list, i.e. this list does not
# drop them.
drop_columns1 = [
    "play_id", # -> only for identification
    # "game_id",
    "old_game_id", # -> only for identification
    "home_team", # -> correlated with posteam/defteam
    "away_team", # -> correlated with posteam/defteam
    # "season_type",
    "week", # -> only for identification
    "side_of_field", # -> correlated with yardline_100
    "game_date", # -> only for identification
    # "quarter_seconds_remaining", # -> correlated with game time
    "quarter_end", # -> not known before the play
    "sp", # Binary indicator for whether or not a score occurred on the play -> not known before the play
    "time", # -> exactly the same as quarter seconds remaining, only in other format
    "yrdln", # -> perfectly correlated with yardline_100
    "ydsnet", # -> only known at the end of the drive
    "qb_kneel", # -> all plays already dropped -> always 0
    "qb_spike", # -> all plays already dropped -> always 0
    "pass_length", # -> not known before the play
    "pass_location", # -> not known before the play
    "run_location", # -> not known before the play
    "run_gap", # -> not known before the play
    "field_goal_result", # -> all plays already dropped -> always 0
    "kick_distance", # -> all plays already dropped -> always 0
    "extra_point_result", # -> all plays already dropped -> always 0
    "two_point_conv_result", # -> all plays already dropped -> always 0
    "home_timeouts_remaining", # -> correlated with posteam/defteam
    "away_timeouts_remaining", # -> correlated with posteam/defteam
    "timeout", # -> all plays already dropped -> always 0
    "timeout_team", # -> all plays already dropped -> always 0
    "td_team", # -> not known before the play
    "td_player_name", # -> not known before the play
    "td_player_id", # -> not known before the play
    # "posteam_timeouts_remaining",
    # "defteam_timeouts_remaining",
    "total_home_score", # Score for the home team at the end of the play -> not known before the play
    "total_away_score", # Score for the away team at the end of the play -> not known before the play
    "posteam_score", # -> not known before the play
    "defteam_score", # -> not known before the play
    "posteam_score_post", # -> not known before the play
    "defteam_score_post", # -> not known before the play
    "score_differential_post", # -> not known before the play
    "no_score_prob", # -> based on expected points model -> bias for our model
    "opp_fg_prob", # -> based on expected points model -> bias for our model
    "opp_safety_prob", # -> based on expected points model -> bias for our model
    "opp_td_prob", # -> based on expected points model -> bias for our model
    "fg_prob", # -> based on expected points model -> bias for our model
    "safety_prob", # -> based on expected points model -> bias for our model
    "extra_point_prob", # -> based on expected points model -> bias for our model
    "two_point_conversion_prob", # -> based on expected points model -> bias for our model
    "total_home_epa", # -> based on expected points model -> bias for our model
    "total_away_epa", # -> based on expected points model -> bias for our model
    "total_home_rush_epa", # -> based on expected points model -> bias for our model
    "total_away_rush_epa", # -> based on expected points model -> bias for our model
    "total_home_pass_epa", # -> based on expected points model -> bias for our model
    "total_away_pass_epa", # -> based on expected points model -> bias for our model
    "air_epa", # -> based on expected points model -> bias for our model
    "yac_epa", # -> based on expected points model -> bias for our model
    "comp_air_epa", # -> based on expected points model -> bias for our model
    "comp_yac_epa", # -> based on expected points model -> bias for our model
    "total_home_comp_air_epa", # -> based on expected points model -> bias for our model
    "total_away_comp_air_epa", # -> based on expected points model -> bias for our model
    "total_home_comp_yac_epa", # -> based on expected points model -> bias for our model
    "total_away_comp_yac_epa", # -> based on expected points model -> bias for our model
    "total_home_raw_air_epa", # -> based on expected points model -> bias for our model
    "total_away_raw_air_epa", # -> based on expected points model -> bias for our model
    "total_home_raw_yac_epa", # -> based on expected points model -> bias for our model
    "total_away_raw_yac_epa", # -> based on expected points model -> bias for our model
    "vegas_wpa",  # -> not known before the play
    "vegas_home_wpa",  # -> not known before the play
    "home_wp_post",  # -> not known before the play
    "away_wp_post",  # -> not known before the play
    "total_home_rush_wpa", # -> not known before the play
    "total_away_rush_wpa", # -> not known before the play
    "total_home_pass_wpa", # -> not known before the play
    "total_away_pass_wpa", # -> not known before the play
    "air_wpa", # -> not known before the play
    "yac_wpa", # -> not known before the play
    "comp_air_wpa", # -> not known before the play
    "comp_yac_wpa", # -> not known before the play
    "total_home_comp_air_wpa", # -> not known before the play
    "total_away_comp_air_wpa", # -> not known before the play
    "total_home_comp_yac_wpa", # -> not known before the play
    "total_away_comp_yac_wpa", # -> not known before the play
    "total_home_raw_air_wpa", # -> not known before the play
    "total_away_raw_air_wpa", # -> not known before the play
    "total_home_raw_yac_wpa", # -> not known before the play
    "total_away_raw_yac_wpa", # -> not known before the play
    "punt_blocked", # -> all plays already dropped -> always 0
    "touchback", # -> all plays already dropped -> always 0
]

In [None]:
# Second batch of column names that are removed from combined_df further below.
# The trailing comment on each entry is the author's reason for removing it.
# Entries that are commented out are not part of this list, i.e. this list does not
# drop them.
drop_columns2 = [
    "punt_attempt", # -> all plays already dropped -> always 0
    "safety_player_name", # -> irrelevant for use case
    "punt_inside_twenty", # -> all plays already dropped -> always 0
    "kicker_player_name", # -> all plays already dropped -> always 0
    "passing_yards", # -> not known before the play
    "interception_player_name", # -> not known before the play
    "lateral_kickoff_returner_player_id", # -> all plays already dropped -> always 0
    "assist_tackle", # -> not known before the play
    "qb_hit_2_player_id", # -> not known before the play
    "penalty_team", # -> not known before the play
    "lateral_receiver_player_name", # -> not known before the play
    "lateral_recovery", # -> not known before the play
    "rush_touchdown", # -> not known before the play
    "defensive_two_point_conv", # -> all plays already dropped -> always 0
    "receiver_jersey_number", # -> not known before the play
    "tackle_with_assist", # -> not known before the play
    "fumbled_1_player_name", # -> not known before the play
    "tackle_with_assist_2_player_id", # -> not known before the play
    "two_point_attempt", # -> all plays already dropped -> always 0
    "drive_quarter_start", # Numeric value indicating in which quarter the given drive has started -> correlated with game time
    "lateral_receiving_yards", # -> not known before the play
    "end_yard_line", # -> not known before the play
    "defensive_extra_point_attempt", # -> all plays already dropped -> always 0
    "pass_touchdown", # -> not known before the play
    "lateral_rusher_player_name", # -> not known before the play
    "rusher_id", # -> not known before the play
    "aborted_play", # -> all plays already dropped -> always 0
    "drive_yards_penalized", # Numeric value of how many yards the offense gained or lost through penalties in the given drive -> irrelevant for use case
    "fumble_not_forced", # -> not known before the play
    "penalty_player_id", # -> not known before the play
    "tackle_for_loss_2_player_id", # -> irrelevant for use case
    "drive_end_yard_line", # -> not known before the play
    # "passer_id",
    "stadium_id", # -> irrelevant for use case
    "sack_player_name", # -> not known before the play
    "punt_out_of_bounds", # -> all plays already dropped -> always 0
    "tackle_with_assist_2_team", # -> not known before the play
    "kickoff_downed", # -> all plays already dropped -> always 0
    "extra_point_attempt", # -> all plays already dropped -> always 0
    "punt_fair_catch", # -> all plays already dropped -> always 0
    "assist_tackle_2_player_name", # -> not known before the play
    "fumble_forced", # -> not known before the play
    "special_teams_play", # -> all plays already dropped -> always 0
    "drive_ended_with_score", # -> not known before the play
    "half_sack_1_player_name", # -> not known before the play
    "stadium", # -> correlated with hometeam
    "lateral_receiver_player_id", # -> not known before the play
    "lateral_sack_player_name", # -> not known before the play
    "play", # -> not known before the play
    "tackle_with_assist_1_player_id", # -> not known before the play
    "forced_fumble_player_2_team", # -> not known before the play
    "home_coach", # -> irrelevant for use case (why home_coach and not pos_coach)
    "xyac_epa", # -> not known before the play
    "lateral_punt_returner_player_name", # -> not known before the play
    "fantasy", # -> irrelevant for use case
    "solo_tackle_1_team", # -> not known before the play
    "drive_start_transition", # -> irrelevant for use case
    "first_down", # -> not known before the play
    "first_down_rush", # -> not known before the play
    "first_down_pass", # -> not known before the play
    "game_stadium", # -> correlated with hometeam
    "xyac_fd", # -> not known before the play
    "drive_play_count",  # -> only known at the end of the drive
    "passer", # -> correlated with passer id
    "fumbled_1_player_id", # -> not known before the play
    "replay_or_challenge_result", # -> not known before the play
    "drive_real_start_time", # Local day time when the drive started -> irrelevant for use case
    "receiver_player_id", # -> not known before the play
    "solo_tackle_2_player_id", # -> not known before the play
    "fumbled_2_player_name", # -> not known before the play
    "qb_hit_1_player_name", # -> not known before the play
    "kickoff_attempt", # -> all plays already dropped -> always 0
    "xyac_success", # -> not known before the play
    # "season",
    "rush", # -> all other plays already dropped -> always 1 in the run dataframe (always 0 in pass df)
    "tackle_with_assist_2_player_name", # -> not known before the play
    "assist_tackle_2_team", # -> not known before the play
    "sack_player_id", # -> not known before the play
    "assist_tackle_1_team", # -> not known before the play
    "play_deleted", # -> all values for deleted plays are nan -> will be eliminated later -> always 0
    "rusher_jersey_number", # -> not known before the play
    "pass_oe", # -> only available after 2006
    "return_team", # -> all plays already dropped -> always nan
    "tackle_for_loss_2_player_name", # -> not known before the play
    "time_of_day", # -> only available after 2011
    "end_clock_time", # -> not known before the play
    "tackle_with_assist_1_team", # -> not known before the play
    "home_score", # -> not known before the play
    "tackle_with_assist_1_player_name", # -> not known before the play
    "kickoff_inside_twenty", # -> all plays already dropped -> always 0
    "own_kickoff_recovery_player_id", # -> all plays already dropped -> always 0
    "lateral_reception", # -> not known before the play
    "qb_hit_1_player_id", # -> not known before the play
    "own_kickoff_recovery_td", # -> all plays already dropped -> always 0
    "pass_defense_2_player_name", # -> not known before the play
    "jersey_number", # -> not known before the play
    "punter_player_name", # -> irrelevant for use case
    "blocked_player_name", # -> not known before the play
    "pass_defense_1_player_name", # -> not known before the play
    "xyac_median_yardage", # -> influence on model
    "st_play_type", # -> all plays already dropped -> always 0
    "success", # -> not known before the play
    "penalty_player_name", # -> not known before the play
    "punt_returner_player_name", # -> not known before the play
    "return_touchdown", # -> not known before the play
    "blocked_player_id", # -> not known before the play
    "assist_tackle_1_player_id", # -> not known before the play
    "receiving_yards", # -> not known before the play
    "half_sack_2_player_name", # -> not known before the play
    "drive_game_clock_start", # -> irrelevant for single play
    "rusher", # -> not known before the play
    "pass_defense_1_player_id", # -> not known before the play
    "touchdown", # -> not known before the play
    "assist_tackle_4_player_id", # -> not known before the play
    "lateral_return", # -> all plays already dropped -> always 0
    "solo_tackle_2_team", # -> not known before the play
    "kickoff_in_endzone", # -> all plays already dropped -> always 0
    "fumble_out_of_bounds", # -> not known before the play
    "return_yards", # -> not known before the play
    "punt_downed", # -> not known before the play
    "nfl_api_id", # -> only for identification
    "defensive_extra_point_conv", # -> all plays already dropped -> always 0
    "out_of_bounds", # -> not known before the play
    "lateral_interception_player_name", # -> not known before the play
    "lateral_rush", # -> not known before the play
    "interception_player_id", # -> not known before the play
    "assist_tackle_3_player_name", # -> not known before the play
    "pass_defense_2_player_id", # -> not known before the play
    "receiver_player_name", # -> not known before the play
    "away_score", # -> not known before the play
    "forced_fumble_player_2_player_name", # -> not known before the play
    "qb_hit_2_player_name", # -> not known before the play
    "order_sequence", # -> only available after 2011
    "lateral_rusher_player_id", # -> not known before the play
    "punt_returner_player_id", # -> not known before the play
    "cpoe", # -> correlated with cp (1-cp = cpoe if compl. and 0-cp if incompl.)
    "punt_in_endzone",  # -> not known before the play
    "fantasy_player_name", # -> not known before the play
    "passer_player_name", # -> correlated with passer id
    "xyac_mean_yardage", # -> irrelevant for use case
    "fixed_drive", # -> correlated with drive
    "forced_fumble_player_1_player_name", # -> not known before the play
    "lateral_interception_player_id", # -> not known before the play
    "solo_tackle", # -> not known before the play
    "kickoff_out_of_bounds", # -> all plays already dropped -> always 0
    "fumbled_2_player_id", # -> not known before the play
    "fumbled_1_team", # -> not known before the play
    "defensive_two_point_attempt", # -> all plays already dropped -> always 0
    # "spread_line",
    "drive_game_clock_end", # -> not known before the play
    "home_opening_kickoff", # -> all plays already dropped -> always 0
    "fantasy_id", # -> irrelevant for use case
    "forced_fumble_player_1_player_id", # -> not known before the play
    "away_coach", # -> irrelevant for use case (why away_coach and not pos_coach)
    "fumbled_2_team", # -> not known before the play
    "kickoff_fair_catch", # -> not known before the play
    "half_sack_1_player_id", # -> not known before the play
    "receiver", # -> not known before the play
    "punter_player_id", # -> not known before the play
    "xpass", # -> only available after 2006
    "replay_or_challenge", # -> no influence on play
    "rusher_player_name", # -> not known before the play
    "pass", # -> all other plays already dropped -> always 1 in the pass dataframe (always 0 in run df)
    "assist_tackle_1_player_name", # -> not known before the play
    "fixed_drive_result", # -> not known before the play
    "kickoff_returner_player_id", # -> all plays already dropped -> always 0
    "forced_fumble_player_1_team", # -> not known before the play
    "half_sack_2_player_id", # -> not known before the play
    "id", # -> only for identification
    "drive_end_transition", # -> not known before the play
    "passer_player_id", # -> correlated with passer id
    "tackle_for_loss_1_player_name", # -> not known before the play
    "field_goal_attempt", # -> all plays already dropped -> always 0
    "lateral_punt_returner_player_id", # -> all plays already dropped -> always 0
    "play_type_nfl", # -> correlated with play type
    "drive_first_downs", # -> correlated with drive play count
    "result", # -> home_score - away_score
    "receiver_id", # -> not known before the play
    "start_time", # -> too many unique values (175)
    "name", # -> only for identification
    "rusher_player_id", # -> not known before the play
    "passer_jersey_number", # -> correlated with passer id
    "fantasy_player_id", # -> not known before the play
    "tackle_for_loss_1_player_id", # -> not known before the play
    "own_kickoff_recovery_player_name", # -> all plays already dropped -> always 0
    "drive_time_of_possession", # -> not known before the play
    "forced_fumble_player_2_player_id", # -> not known before the play
    "assist_tackle_2_player_id", # -> not known before the play
    "own_kickoff_recovery", # -> all plays already dropped -> always 0
    "solo_tackle_1_player_name", # -> not known before the play
    "special", # -> all plays already dropped -> always 0
    "lateral_sack_player_id", # -> not known before the play
    "lateral_rushing_yards", # -> not known before the play
    "assist_tackle_3_team", # -> not known before the play
    "drive_quarter_end", # -> not known before the play
    "location", # -> only home or neutral -> neutral only includes the super bowl
    "total", # -> only known after the game
    "rushing_yards", # -> not known before the play
    # "total_line",
    "solo_tackle_1_player_id", # -> not known before the play
    "assist_tackle_4_player_name", # -> not known before the play
    "assist_tackle_4_team", # -> not known before the play
    "safety_player_id", # -> not known before the play
    # "drive_start_yard_line",
    "kicker_player_id", # -> all plays already dropped -> always nan
    "assist_tackle_3_player_id", # -> not known before the play
    "lateral_kickoff_returner_player_name", # -> not known before the play
    "kickoff_returner_player_name", # -> all plays already dropped -> always nan
    "solo_tackle_2_player_name", # -> not known before the play
]

In [None]:
# Third batch of column names that are removed from combined_df further below.
# The trailing comment on each entry is the author's reason for removing it.
# Entries that are commented out are not part of this list, i.e. this list does not
# drop them.
drop_columns3 = [
    "drive_inside20", # -> not known before the play
    "penalty", # -> not known before the play
    "penalty_yards", # -> not known before the play
    "penalty_type", # -> not known before the play
    "drive_play_id_ended", # -> not known before the play
    "drive_play_id_started", # -> only for identification
    "first_down_penalty", # -> not known before the play
    "fourth_down_converted", # -> not known before the play
    "fourth_down_failed", # -> not known before the play
    "fumble_lost", # -> not known before the play
    "fumble_recovery_1_player_id", # -> not known before the play
    "fumble_recovery_1_player_name", # -> not known before the play
    "fumble_recovery_1_team", # -> not known before the play
    "fumble_recovery_1_yards", # -> not known before the play
    "fumble_recovery_2_player_id", # -> not known before the play
    "fumble_recovery_2_player_name", # -> not known before the play
    "fumble_recovery_2_team", # -> not known before the play
    "fumble_recovery_2_yards", # -> not known before the play
    "pass_attempt", # -> all other plays already dropped -> always 1 in the pass dataframe (always 0 in run df)
    "qb_epa", # -> not known before the play
    "rush_attempt", # -> all other plays already dropped -> always 0 in the pass dataframe (always 1 in run df)
    "safety", # -> not known before the play
    "series_result", # -> not known before the play
    "series_success", # -> not known before the play
    "third_down_converted", # -> not known before the play
    "third_down_failed", # -> not known before the play
    "complete_pass", # -> not known before the play
    "incomplete_pass", # -> not known before the play
    "sack", # -> not known before the play
    "tackled_for_loss", # -> not known before the play
    # "half_seconds_remaining", # -> correlated with game time
    "game_half", # -> correlated with game time
    "air_yards", # -> not known before the play
    "yards_after_catch", # -> not known before the play
    "epa", # -> not known before the play
    "wpa", # -> not known before the play
    "interception", # -> not known before the play
    "fumble", # -> not known before the play
    "qb_hit", # -> not known before the play
    "qb_scramble", # -> not known before the play
]

In [None]:
# remove the three batches of unwanted columns from combined_df, one batch after the other
for column_batch in (drop_columns1, drop_columns2, drop_columns3):
    combined_df.drop(columns=column_batch, inplace=True)

In [None]:
# Preview the first ten rows of the reduced frame without truncating columns or rows.
# option_context applies the display options to this cell only and restores the
# previous values afterwards, even if display() raises.
with pd.option_context("display.max_columns", None, "display.max_rows", None):
    display(combined_df.head(10))

In [None]:
# (rows, columns) of combined_df after the row and column removals above
combined_df.shape

## Splitting
Thilo

In [None]:
# Split the plays into one frame per play type and remove the now constant
# 'play_type' column from each.
# DataFrame.drop (without inplace) returns a new, independent frame. The previous
# in-place drop operated on a boolean-indexed slice of combined_df, which triggers
# pandas' SettingWithCopyWarning and leaves run_df / pass_df flagged as slices.
run_df = combined_df[combined_df["play_type"] == "run"].drop(columns=["play_type"])
pass_df = combined_df[combined_df["play_type"] == "pass"].drop(columns=["play_type"])

In [None]:
# Preview both frames without truncation. option_context limits the display options
# to this block and restores the previous values afterwards, even if display() raises.
with pd.option_context("display.max_columns", None, "display.max_rows", None):
    display(run_df.head())
    display(pass_df.head())

# Write both frames to disk. The dataframe index is written as the first CSV column
# (pandas default).
# NOTE(review): the export happens before the passer-based filtering and the NA
# handling in the following sections - confirm that this intermediate state is the
# one that should be saved.
run_df.to_csv("Data/run_df.csv")
pass_df.to_csv("Data/pass_df.csv")

## Passer-based Outlier Removal
Tok

In [None]:
# number of distinct passer_id values in combined_df (a missing id counts as one value)
len(combined_df["passer_id"].unique())

In [None]:
# number of pass plays of every passer in every game
passer_game_counts = (
    pass_df
    .groupby(['passer_id', 'game_id'])
    .size()
    .reset_index(name='count')
)

# ids of the passers that reached at least 14 pass plays in at least one game
reached_14_passes = passer_game_counts['count'] >= 14
passers_with_14_passes = set(passer_game_counts.loc[reached_14_passes, 'passer_id'])

# split the pass plays by whether their passer belongs to that set
thrown_by_such_passer = pass_df['passer_id'].isin(passers_with_14_passes)
filtered_qb_pass_plays_df = pass_df[thrown_by_such_passer]
filtered_noqb_pass_plays_df = pass_df[~thrown_by_such_passer]

# show all columns; this option is intentionally left set for the following cell
pd.set_option("display.max_columns", None)

for pass_plays in (filtered_qb_pass_plays_df, filtered_noqb_pass_plays_df):
    display(pass_plays.describe())

In [None]:
# Count the number of pass plays per passer and SEASON.
# (The variable keeps its earlier name 'passer_game_counts' although it now holds
# per-season counts, so that any later use of that name keeps working.)
passer_game_counts = pass_df.groupby(['passer_id', 'season']).size().reset_index(name='count')

# Keep the passers with at least 224 pass plays in at least one season.
# NOTE(review): 224 presumably is 14 passes x 16 games - confirm the intended threshold.
passers_with_min_passes = set(passer_game_counts[passer_game_counts['count'] >= 224]['passer_id'])

# Split the pass plays by whether their passer_id is in the passers_with_min_passes set
filtered_pass_plays_df = pass_df[pass_df['passer_id'].isin(passers_with_min_passes)]
filtered_nopass_plays_df = pass_df[~pass_df['passer_id'].isin(passers_with_min_passes)]

display(filtered_pass_plays_df.describe())
display(filtered_nopass_plays_df.describe())

# From here on pass_df only contains plays of those passers. The explicit copy makes
# pass_df an independent frame, so the later in-place edits (column drops, roof fill,
# yard line transformation) do not act on a slice and raise no SettingWithCopyWarning.
pass_df = filtered_pass_plays_df.copy()

# reset_option takes the option name only; the stray second argument was removed
pd.reset_option("display.max_columns")

print(len(passers_with_min_passes))

## Column Transformation and Handling of Missing Values
Thilo & Tok

In [None]:
# detect NAs in the pass plays: number of missing values per column
# (vectorised Series.isna().sum(); the builtin sum() would loop over every cell in Python)
for column in pass_df.columns:
    print(f"{column}: {pass_df[column].isna().sum()}")

In [None]:
# detect NAs in the run plays: number of missing values per column
# (vectorised Series.isna().sum(); the builtin sum() would loop over every cell in Python)
for column in run_df.columns:
    print(f"{column}: {run_df[column].isna().sum()}")

In [None]:
# drop attributes with >5% of nan values from both frames (same columns for each)
for plays in (pass_df, run_df):
    plays.drop(
        columns=["cp", "weather", "play_clock", "wind", "temp", "surface"],
        inplace=True,
    )

In [None]:
# per game: non-null counts of every column for the plays whose 'roof' is missing,
# first for the pass plays, then for the run plays
for plays in (pass_df, run_df):
    roof_is_missing = plays['roof'].isna()
    display(plays[roof_is_missing].groupby('game_id').count())

In [None]:
"""
https://www.nfl.com/games/jaguars-at-texans-2021-reg-1 (closed)
https://www.nfl.com/games/eagles-at-falcons-2021-reg-1 (open)
https://www.nfl.com/games/seahawks-at-colts-2021-reg-1 (closed)
https://www.nfl.com/games/rams-at-colts-2021-reg-2 (closed)
https://www.nfl.com/games/panthers-at-texans-2021-reg-3 (closed)
https://www.nfl.com/games/football-team-at-falcons-2021-reg-4 (open)
https://www.nfl.com/games/patriots-at-texans-2021-reg-5  (closed)
https://www.nfl.com/games/texans-at-colts-2021-reg-6 (open)
https://www.nfl.com/games/panthers-at-falcons-2021-reg-8 (open)
https://www.nfl.com/games/rams-at-texans-2021-reg-8 (closed)
https://www.nfl.com/games/titans-at-colts-2021-reg-8 (open)
https://www.nfl.com/games/jets-at-colts-2021-reg-9 (closed)
https://www.nfl.com/games/jaguars-at-colts-2021-reg-10 (closed)
https://www.nfl.com/games/patriots-at-falcons-2021-reg-11 (closed)
https://www.nfl.com/games/jets-at-texans-2021-reg-12 (closed)
https://www.nfl.com/games/buccaneers-at-colts-2021-reg-12 (closed)
https://www.nfl.com/games/colts-at-texans-2021-reg-13 (closed)
https://www.nfl.com/games/buccaneers-at-falcons-2021-reg-13 (closed)
https://www.nfl.com/games/seahawks-at-texans-2021-reg-14 (open)
https://www.nfl.com/games/patriots-at-colts-2021-reg-15 (closed)
https://www.nfl.com/games/lions-at-falcons-2021-reg-16 (open)
https://www.nfl.com/games/chargers-at-texans-2021-reg-16 (closed)
https://www.nfl.com/games/raiders-at-colts-2021-reg-17 (closed)
https://www.nfl.com/games/saints-at-falcons-2021-reg-18 (closed)
https://www.nfl.com/games/titans-at-texans-2021-reg-18 (closed)
"""


# game ids of the 2021 games above that are marked "(closed)"
closed_roof = [
    "2021_01_JAX_HOU", "2021_01_SEA_IND", "2021_02_LA_IND",
    "2021_03_CAR_HOU", "2021_05_NE_HOU", "2021_08_LA_HOU",
    "2021_09_NYJ_IND", "2021_10_JAX_IND", "2021_11_NE_ATL",
    "2021_12_NYJ_HOU", "2021_12_TB_IND", "2021_13_IND_HOU",
    "2021_13_TB_ATL", "2021_15_NE_IND", "2021_16_LAC_HOU",
    "2021_17_LV_IND", "2021_18_NO_ATL", "2021_18_TEN_HOU",
]

# game ids of the 2021 games above that are marked "(open)"
open_roof = [
    "2021_01_PHI_ATL", "2021_04_WAS_ATL", "2021_06_HOU_IND",
    "2021_08_CAR_ATL", "2021_08_TEN_IND", "2021_14_SEA_HOU",
    "2021_16_DET_ATL",
]


def update_roof(row):
    """Return the roof state for one play.

    Looks at ``row['game_id']``: if it contains one of the ids in ``open_roof`` the
    result is "open", otherwise if it contains one of the ids in ``closed_roof`` the
    result is "closed". For every other game the row's current ``'roof'`` value is
    returned unchanged.
    """
    played_game = row['game_id']
    for roof_state, known_games in (("open", open_roof), ("closed", closed_roof)):
        for game_id in known_games:
            if game_id in played_game:
                return roof_state
    return row['roof']

# Fill the still-missing 'roof' values via update_roof.
# The row mask is taken from the frame that is being updated (not from combined_df):
# the right-hand side only holds rows that are NaN in that frame, so a mask built from
# another frame could select additional rows and overwrite them with NaN.
pass_missing_roof = pass_df['roof'].isna()
if pass_missing_roof.any():
    pass_df.loc[pass_missing_roof, 'roof'] = pass_df[pass_missing_roof].apply(update_roof, axis=1)

run_missing_roof = run_df['roof'].isna()
if run_missing_roof.any():
    run_df.loc[run_missing_roof, 'roof'] = run_df[run_missing_roof].apply(update_roof, axis=1)



In [None]:
# Plays that still have no roof value, counted per game
pass_without_roof = pass_df.loc[pass_df['roof'].isna()]
display(pass_without_roof.groupby('game_id').count())

run_without_roof = run_df.loc[run_df['roof'].isna()]
display(run_without_roof.groupby('game_id').count())

In [None]:
# passer id and game id are not needed from here on, so remove them from both frames
run_df.drop(columns=["passer_id", "game_id"], inplace=True)
pass_df.drop(columns=["passer_id", "game_id"], inplace=True)

In [None]:
# keep only plays without any missing value
run_df = run_df.dropna(axis=0, how="any")
pass_df = pass_df.dropna(axis=0, how="any")

In [None]:
# number of missing values per column of the pass plays
for column, missing_count in pass_df.isna().sum().items():
    print(f"{column}: {missing_count}")

In [None]:
# number of missing values per column of the run plays
for column, missing_count in run_df.isna().sum().items():
    print(f"{column}: {missing_count}")

In [None]:
def transform_dsyl(row):
    """Convert a row's 'drive_start_yard_line' text into a number.

    Parameters
    ----------
    row : mapping with the keys 'drive_start_yard_line' and 'posteam'.

    Returns
    -------
    - For "<TEAM><number>" or "<TEAM> <number>": the number if TEAM equals the
      row's 'posteam', otherwise 100 minus the number.
    - For other strings containing whitespace: the second token as int.
    - For strings without whitespace (e.g. '50'): the string as int.
    - NaN / non-string values are returned unchanged.

    Raises
    ------
    ValueError if a fallback string cannot be parsed as an integer.
    """
    value = row["drive_start_yard_line"]

    # NaN and non-string values are passed through untouched
    if not (pd.notna(value) and isinstance(value, str)):
        return value

    # Optional whitespace between team and number, so that both "KC25" and "KC 25"
    # take the team-relative branch (previously the spaced form skipped it and the
    # side of the field was lost).
    match = re.match(r"([A-Z]+)\s*(\d+)", value)
    if match:
        team, number = match.groups()
        return int(number) if row["posteam"] == team else 100 - int(number)
    if " " in value:
        # A space but no team/number match: use the second token
        return int(value.split()[1])
    # No space at all (e.g. '50')
    return int(value)

# replace the textual drive start marker by its numeric form in both frames
for plays in (pass_df, run_df):
    plays["drive_start_yard_line"] = plays.apply(transform_dsyl, axis=1)


## Encoding of Categorical Features
Thilo

In [None]:

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Plotting: mean rushing yards per season as bars plus a linear trend line
fig, ax = plt.subplots(figsize=(10, 6))

# the trend line is fitted against the positions 0..n-1 (one per season), not the season values
season_positions = np.array(range(len(run_df['season'].unique())))
season_means = run_df.groupby('season')['yards_gained'].mean().values

sns.barplot(x='season', y='yards_gained', data=run_df, errorbar=None, estimator='mean', ax=ax)
sns.regplot(x=season_positions, y=season_means, scatter=False, ax=ax, color='red', ci=None)

plt.xticks(rotation=45, ha='right')
ax.set_title('Average Rushing Yards Gained Per Season')
ax.set_xlabel('Season')
ax.set_ylabel('Average Yards Gained')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Plotting: mean passing yards per season as bars plus a linear trend line
fig, ax = plt.subplots(figsize=(10, 6))

# the trend line is fitted against the positions 0..n-1 (one per season), not the season values
season_positions = np.array(range(len(pass_df['season'].unique())))
season_means = pass_df.groupby('season')['yards_gained'].mean().values

sns.barplot(x='season', y='yards_gained', data=pass_df, errorbar=None, estimator='mean', ax=ax)
sns.regplot(x=season_positions, y=season_means, scatter=False, ax=ax, color='red', ci=None)

plt.xticks(rotation=45, ha='right')
ax.set_title('Average Passing Yards Gained Per Season')
ax.set_xlabel('Season')
ax.set_ylabel('Average Yards Gained')
plt.show()

In [None]:
# encode categorical features
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical columns of the run plays (first category of each
# column is dropped); every other column is passed through unchanged.
categorical_columns = ["posteam", "posteam_type", "roof", "defteam", "season_type"]
encoder = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(drop="first"), categorical_columns)],
    remainder="passthrough",  # include non-transformed columns
)
encoded_data = encoder.fit_transform(run_df)

# strip the transformer prefixes so the columns keep their plain names
feature_names = []
for raw_name in encoder.get_feature_names_out():
    feature_names.append(raw_name.replace("encoder__", "").replace("remainder__", ""))

# convert preprocessed data to DataFrame
encoded_run_df = pd.DataFrame(encoded_data, columns=feature_names)

In [None]:
# One-hot encode the categorical columns of the pass plays (first category of each
# column is dropped); every other column is passed through unchanged.
categorical_columns = ["posteam", "posteam_type", "roof", "defteam", "season_type"]
encoder = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(drop="first"), categorical_columns)],
    remainder="passthrough",  # include non-transformed columns
)
encoded_data = encoder.fit_transform(pass_df)

# strip the transformer prefixes so the columns keep their plain names
feature_names = []
for raw_name in encoder.get_feature_names_out():
    feature_names.append(raw_name.replace("encoder__", "").replace("remainder__", ""))

# convert preprocessed data to DataFrame
encoded_pass_df = pd.DataFrame(encoded_data, columns=feature_names)

In [None]:
# Show the first rows of both encoded frames without column/row truncation.
# option_context restores the previous display settings on exit, even if display() raises,
# instead of resetting them to the pandas defaults.
with pd.option_context("display.max_columns", None, "display.max_rows", None):
    display(encoded_run_df.head())
    display(encoded_pass_df.head())

In [None]:
# remove the remaining feature that is useless for modelling
encoded_run_df.drop(columns=["desc"], inplace=True)
encoded_pass_df.drop(columns=["desc"], inplace=True)

## Correlation Analysis
Thilo

In [None]:
import plotly.graph_objects as go
import seaborn as sns

# Pairwise correlations of the encoded run features, drawn twice:
# as an interactive plotly heatmap and as a static seaborn heatmap.
correlation_matrix = encoded_run_df.corr()
column_list = encoded_run_df.columns
axis_labels = list(column_list)

heatmap_trace = go.Heatmap(
    z=correlation_matrix,
    x=axis_labels,
    y=axis_labels,
    colorscale="rdylbu",
    zmin=-1,
    zmax=1,
)
fig = go.Figure(data=heatmap_trace)
fig.update_layout(
    showlegend=False,
    width=800,
    height=800,
    autosize=False,
    title="Correlation matrix",
)
# tick labels are hidden on both axes; the y axis is reversed
fig.update_yaxes(showticklabels=False, autorange="reversed")
fig.update_xaxes(showticklabels=False)

sns.heatmap(correlation_matrix, cmap=sns.color_palette("coolwarm", as_cmap=True))

fig.show()

In [None]:
# Pairwise correlations of the encoded pass features, drawn twice:
# as an interactive plotly heatmap and as a static seaborn heatmap.
correlation_matrix = encoded_pass_df.corr()
column_list = encoded_pass_df.columns
axis_labels = list(column_list)

heatmap_trace = go.Heatmap(
    z=correlation_matrix,
    x=axis_labels,
    y=axis_labels,
    colorscale="rdylbu",
    zmin=-1,
    zmax=1,
)
fig = go.Figure(data=heatmap_trace)

sns.heatmap(correlation_matrix, cmap=sns.color_palette("coolwarm", as_cmap=True))

fig.update_layout(
    showlegend=False,
    width=800,
    height=800,
    autosize=False,
    title="Correlation matrix",
)
# tick labels are hidden on both axes; the y axis is reversed
fig.update_yaxes(showticklabels=False, autorange="reversed")
fig.update_xaxes(showticklabels=False)

fig.show()

In [None]:
# remove highly correlated features (one of each correlated pair, not both)
correlated_features = ['ep', 'wp', 'series', 'qtr', 'drive', 'def_wp', 'vegas_wp', 'home_wp', 'away_wp', 'vegas_home_wp']
for encoded_frame in (encoded_pass_df, encoded_run_df):
    encoded_frame.drop(columns=correlated_features, inplace=True)

## Outlier Removal
Joel

In [None]:
# Imports
import matplotlib.pyplot as plt
import seaborn as sns

# show all columns/rows for the displays in this cell (reset at the end of the cell)
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, None)

# first 5 rows of the encoded pass plays
display(encoded_pass_df.head())

# Convert every column to a numeric dtype where possible; columns that cannot be
# parsed as numbers are stored as strings instead.
for column in encoded_pass_df.columns:
    raw_values = encoded_pass_df[column]
    try:
        converted = pd.to_numeric(raw_values)
    except ValueError:
        converted = raw_values.apply(str)
    encoded_pass_df[column] = converted

# Binary columns are cast to bool so that select_dtypes('number') leaves them out of
# the outlier filtering below.
# NOTE(review): encoded columns not listed here (e.g. a season_type indicator, if the
# encoder produced one) stay numeric and are IQR-filtered like any other column - confirm.
boolean_variables = [
    'posteam_ATL', 'posteam_BAL', 'posteam_BUF', 'posteam_CAR', 'posteam_CHI', 'posteam_CIN',
    'posteam_CLE', 'posteam_DAL', 'posteam_DEN', 'posteam_DET', 'posteam_GB', 'posteam_HOU',
    'posteam_IND', 'posteam_JAX', 'posteam_KC', 'posteam_LA', 'posteam_LAC', 'posteam_LV',
    'posteam_MIA', 'posteam_MIN', 'posteam_NE', 'posteam_NO', 'posteam_NYG', 'posteam_NYJ',
    'posteam_PHI', 'posteam_PIT', 'posteam_SEA', 'posteam_SF', 'posteam_TB', 'posteam_TEN',
    'posteam_WAS',
    'roof_dome', 'roof_open', 'roof_outdoors',
    'goal_to_go', 'shotgun', 'no_huddle', 'qb_dropback', 'div_game',
    'posteam_type_home',
    'defteam_ATL', 'defteam_BAL', 'defteam_BUF', 'defteam_CAR', 'defteam_CHI', 'defteam_CIN',
    'defteam_CLE', 'defteam_DAL', 'defteam_DEN', 'defteam_DET', 'defteam_GB', 'defteam_HOU',
    'defteam_IND', 'defteam_JAX', 'defteam_KC', 'defteam_LA', 'defteam_LAC', 'defteam_LV',
    'defteam_MIA', 'defteam_MIN', 'defteam_NE', 'defteam_NO', 'defteam_NYG', 'defteam_NYJ',
    'defteam_PHI', 'defteam_PIT', 'defteam_SEA', 'defteam_SF', 'defteam_TB', 'defteam_TEN',
    'defteam_WAS',
]

for variable in boolean_variables:
    flag_values = encoded_pass_df[variable]
    encoded_pass_df[variable] = flag_values.astype(bool)

# 1. Work on a copy that only holds the numeric columns
numeric_df = encoded_pass_df.select_dtypes(include=['number']).copy()

print(numeric_df)

# 2. Ranges and distributions: one row of summary statistics per numeric column
ranges_df = pd.DataFrame(columns=['min','max','mean', 'median', 'quantile1', 'quantile3', 'iqr', 'lower', 'upper'])

# lower quantile; the upper one is its mirror (1 - quantile_value)
quantile_value = 0.25
marker_style = dict(linestyle='dashed', linewidth=2)

for column in numeric_df.columns:
    values = numeric_df[column]

    q1 = values.quantile(quantile_value)
    q3 = values.quantile(1 - quantile_value)
    iqr = q3 - q1
    # outlier fences: 3.0 * IQR below Q1 / above Q3
    lower_bound = q1 - 3.0 * iqr
    upper_bound = q3 + 3.0 * iqr

    ranges_df.loc[column] = [
        values.min(), values.max(), values.mean(), values.median(),
        q1, q3, iqr, lower_bound, upper_bound,
    ]

    # Histogram of the column with the quartiles and both fences marked
    plt.hist(values, bins=20, alpha=0.7)
    for position, colour, label in (
        (q1, 'red', 'quantile 1'),
        (q3, 'blue', 'quantile 3'),
        (lower_bound, 'black', 'lower_bound'),
        (upper_bound, 'black', 'upper_bound'),
    ):
        plt.axvline(position, color=colour, label=label, **marker_style)
    plt.legend()
    plt.title('Histogram: ' + column)
    plt.show()

# Summary of ranges
display(ranges_df)

# Analyzing the diagrams, the following statements can be made:
# a) Most of the outlying values can be explained and are therefore meaningful for the dataset
#       --> Exclude attributes that are negligible for outlier identification
drop_columns = ['yardline_100', 'game_seconds_remaining', 'down']
numeric_df.drop(drop_columns, axis=1, inplace=True)

# b) The data is already well defined and does not contain too many outliers
#       --> Keep only the rows whose values are within 3.0 times the IQR from Q1 and Q3
ranges_updated_df = pd.DataFrame(columns=['min','max','mean', 'median'])
# Outlier rows are collected per column and concatenated once after the loop, instead of
# growing a DataFrame with pd.concat on every iteration (repeated copying, and
# concatenating onto an empty frame is deprecated in recent pandas versions).
discarded_chunks = []

# 3. Remove the rows that hold outliers
for column in numeric_df.columns:
    lower_bound = ranges_df.loc[column, 'lower']
    upper_bound = ranges_df.loc[column, 'upper']

    # numeric_df and encoded_pass_df are filtered together in this loop, so one mask serves both
    is_outlier = (numeric_df[column] < lower_bound) | (numeric_df[column] > upper_bound)
    discarded_chunks.append(encoded_pass_df.loc[is_outlier])
    encoded_pass_df = encoded_pass_df.loc[~is_outlier]
    numeric_df = numeric_df.loc[~is_outlier]

    # Histogram after removal of extreme values
    plt.hist(numeric_df[column], bins=20, alpha=0.7)
    plt.title('Histogram: ' + column)
    plt.show()

    min_values = numeric_df[column].min()
    max_values = numeric_df[column].max()
    mean = numeric_df[column].mean()
    median = numeric_df[column].median()

    ranges_updated_df.loc[column] = [min_values, max_values, mean, median]

# all discarded rows in one frame (empty frame with the same columns if nothing was checked)
if discarded_chunks:
    discarded_rows = pd.concat(discarded_chunks)
else:
    discarded_rows = pd.DataFrame(columns=encoded_pass_df.columns)

# 4. Display results
display(ranges_updated_df)
display(discarded_rows.head())
display(encoded_pass_df.head())

# reset options
pd.reset_option("display.max_columns")
pd.reset_option("display.max_rows")

In [None]:
# first 5 rows of the encoded run plays
display(encoded_run_df.head())

# Convert every column to a numeric dtype where possible; columns that cannot be
# parsed as numbers are stored as strings instead.
for column in encoded_run_df.columns:
    raw_values = encoded_run_df[column]
    try:
        converted = pd.to_numeric(raw_values)
    except ValueError:
        converted = raw_values.apply(str)
    encoded_run_df[column] = converted

# Binary columns are cast to bool so that select_dtypes('number') leaves them out of
# the outlier filtering below.
# NOTE(review): encoded columns not listed here (e.g. a season_type indicator, if the
# encoder produced one) stay numeric and are IQR-filtered like any other column - confirm.
boolean_variables = [
    'posteam_ATL', 'posteam_BAL', 'posteam_BUF', 'posteam_CAR', 'posteam_CHI', 'posteam_CIN',
    'posteam_CLE', 'posteam_DAL', 'posteam_DEN', 'posteam_DET', 'posteam_GB', 'posteam_HOU',
    'posteam_IND', 'posteam_JAX', 'posteam_KC', 'posteam_LA', 'posteam_LAC', 'posteam_LV',
    'posteam_MIA', 'posteam_MIN', 'posteam_NE', 'posteam_NO', 'posteam_NYG', 'posteam_NYJ',
    'posteam_PHI', 'posteam_PIT', 'posteam_SEA', 'posteam_SF', 'posteam_TB', 'posteam_TEN',
    'posteam_WAS',
    'roof_dome', 'roof_open', 'roof_outdoors',
    'goal_to_go', 'shotgun', 'no_huddle', 'qb_dropback', 'div_game',
    'posteam_type_home',
    'defteam_ATL', 'defteam_BAL', 'defteam_BUF', 'defteam_CAR', 'defteam_CHI', 'defteam_CIN',
    'defteam_CLE', 'defteam_DAL', 'defteam_DEN', 'defteam_DET', 'defteam_GB', 'defteam_HOU',
    'defteam_IND', 'defteam_JAX', 'defteam_KC', 'defteam_LA', 'defteam_LAC', 'defteam_LV',
    'defteam_MIA', 'defteam_MIN', 'defteam_NE', 'defteam_NO', 'defteam_NYG', 'defteam_NYJ',
    'defteam_PHI', 'defteam_PIT', 'defteam_SEA', 'defteam_SF', 'defteam_TB', 'defteam_TEN',
    'defteam_WAS',
]

for variable in boolean_variables:
    flag_values = encoded_run_df[variable]
    encoded_run_df[variable] = flag_values.astype(bool)

# 1. Work on a copy that only holds the numeric columns
numeric_df = encoded_run_df.select_dtypes(include=['number']).copy()

print(numeric_df)

# 2. Ranges and distributions: one row of summary statistics per numeric column
ranges_df = pd.DataFrame(columns=['min','max','mean', 'median', 'quantile1', 'quantile3', 'iqr', 'lower', 'upper'])

# lower quantile; the upper one is its mirror (1 - quantile_value)
quantile_value = 0.25
marker_style = dict(linestyle='dashed', linewidth=2)

for column in numeric_df.columns:
    values = numeric_df[column]

    q1 = values.quantile(quantile_value)
    q3 = values.quantile(1 - quantile_value)
    iqr = q3 - q1
    # outlier fences: 3.0 * IQR below Q1 / above Q3
    lower_bound = q1 - 3.0 * iqr
    upper_bound = q3 + 3.0 * iqr

    ranges_df.loc[column] = [
        values.min(), values.max(), values.mean(), values.median(),
        q1, q3, iqr, lower_bound, upper_bound,
    ]

    # Histogram of the column with the quartiles and both fences marked
    plt.hist(values, bins=20, alpha=0.7)
    for position, colour, label in (
        (q1, 'red', 'quantile 1'),
        (q3, 'blue', 'quantile 3'),
        (lower_bound, 'black', 'lower_bound'),
        (upper_bound, 'black', 'upper_bound'),
    ):
        plt.axvline(position, color=colour, label=label, **marker_style)
    plt.legend()
    plt.title('Histogram: ' + column)
    plt.show()

# Summary of ranges
display(ranges_df)

# Analyzing the diagrams, the following statements can be made:
# a) Most of the outlying values can be explained and are therefore meaningful for the dataset
#       --> Exclude attributes that are negligible for outlier identification
drop_columns = ['yardline_100', 'game_seconds_remaining', 'down']
numeric_df.drop(drop_columns, axis=1, inplace=True)

# b) The data is already well defined and does not contain too many outliers
#       --> Keep only the rows whose values are within 3.0 times the IQR from Q1 and Q3
ranges_updated_df = pd.DataFrame(columns=['min','max','mean', 'median'])
# Outlier rows are collected per column and concatenated once after the loop, instead of
# growing a DataFrame with pd.concat on every iteration (repeated copying, and
# concatenating onto an empty frame is deprecated in recent pandas versions).
discarded_chunks = []

# 3. Remove the rows that hold outliers
for column in numeric_df.columns:
    lower_bound = ranges_df.loc[column, 'lower']
    upper_bound = ranges_df.loc[column, 'upper']

    # numeric_df and encoded_run_df are filtered together in this loop, so one mask serves both
    is_outlier = (numeric_df[column] < lower_bound) | (numeric_df[column] > upper_bound)
    discarded_chunks.append(encoded_run_df.loc[is_outlier])
    encoded_run_df = encoded_run_df.loc[~is_outlier]
    numeric_df = numeric_df.loc[~is_outlier]

    # Histogram after removal of extreme values
    plt.hist(numeric_df[column], bins=20, alpha=0.7)
    plt.title('Histogram: ' + column)
    plt.show()

    min_values = numeric_df[column].min()
    max_values = numeric_df[column].max()
    mean = numeric_df[column].mean()
    median = numeric_df[column].median()

    ranges_updated_df.loc[column] = [min_values, max_values, mean, median]

# all discarded rows in one frame (empty frame with the same columns if nothing was checked)
if discarded_chunks:
    discarded_rows = pd.concat(discarded_chunks)
else:
    discarded_rows = pd.DataFrame(columns=encoded_run_df.columns)

# 4. Display results
display(ranges_updated_df)
display(discarded_rows.head())
display(encoded_run_df.head())

# reset options
pd.reset_option("display.max_columns")
pd.reset_option("display.max_rows")

## Normalization
Thilo

In [None]:
# turn the binary features back from bool into 0/1 integers in both frames
for variable in boolean_variables:
    for encoded_frame in (encoded_run_df, encoded_pass_df):
        encoded_frame[variable] = encoded_frame[variable].astype(int)

In [None]:
# numeric (non-binary) features that get plotted and scaled in the cells below
numeric_features = [
    "yardline_100", "game_seconds_remaining", "down", "ydstogo",
    "score_differential", "td_prob", "spread_line", "total_line", "season",
    "posteam_timeouts_remaining", "defteam_timeouts_remaining",
    "quarter_seconds_remaining", "half_seconds_remaining",
    "drive_start_yard_line",
]

In [None]:
# Histogram per numeric feature of the run plays, to see the approximate distribution.
# Values outside the fences stored in ranges_df are left out of each plot.
for column in numeric_features:
    lower_bound = ranges_df.loc[column, 'lower']
    upper_bound = ranges_df.loc[column, 'upper']

    below_fence = encoded_run_df[column] < lower_bound
    above_fence = encoded_run_df[column] > upper_bound
    numeric_df = encoded_run_df.loc[~(below_fence | above_fence)]

    # Histogram without the extreme values
    plt.hist(numeric_df[column], bins=20, alpha=0.7)
    plt.title('Histogram: ' + column)
    plt.show()

In [None]:
# Histogram per numeric feature of the pass plays, to see the approximate distribution.
# Values outside the fences stored in ranges_df are left out of each plot.
# NOTE(review): ranges_df holds the fences of whichever outlier cell ran last; in notebook
# order that is the run-play cell, not the pass-play one - confirm this is intended.
for column in numeric_features:
    lower_bound = ranges_df.loc[column, 'lower']
    upper_bound = ranges_df.loc[column, 'upper']

    below_fence = encoded_pass_df[column] < lower_bound
    above_fence = encoded_pass_df[column] > upper_bound
    numeric_df = encoded_pass_df.loc[~(below_fence | above_fence)]

    # Histogram without the extreme values
    plt.hist(numeric_df[column], bins=20, alpha=0.7)
    plt.title('Histogram: ' + column)
    plt.show()

Only `score_differential` looks approximately normally distributed. Therefore, it will be standardized (z-score scaling with `StandardScaler`), while all other numerical features will be scaled to a fixed range with min-max scaling (`MinMaxScaler`).

In [None]:
# score_differential gets its own scaler below, so take it out of the min-max feature list.
# Filtering (instead of list.remove) keeps this cell safe to run more than once:
# a second list.remove call would raise ValueError.
numeric_features = [feature for feature in numeric_features if feature != 'score_differential']

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Scale the run plays: score_differential is standardized, the features in
# numeric_features are min-max scaled, every other column is passed through.
standardized_columns = ['score_differential']
normalization = ColumnTransformer(
    transformers=[
        ('standardization', StandardScaler(), standardized_columns),
        ('minmax', MinMaxScaler(), numeric_features),
    ],
    remainder='passthrough',  # include non-transformed columns
)
normalized_data = normalization.fit_transform(encoded_run_df)

# strip the transformer prefixes so the columns keep their plain names
feature_names = []
for raw_name in normalization.get_feature_names_out():
    for prefix in ('standardization__', 'minmax__', 'remainder__'):
        raw_name = raw_name.replace(prefix, '')
    feature_names.append(raw_name)

normalized_run_df = pd.DataFrame(normalized_data, columns=feature_names)

In [None]:
# Scale the pass plays: score_differential is standardized, the features in
# numeric_features are min-max scaled, every other column is passed through.
standardized_columns = ['score_differential']
normalization = ColumnTransformer(
    transformers=[
        ('standardization', StandardScaler(), standardized_columns),
        ('minmax', MinMaxScaler(), numeric_features),
    ],
    remainder='passthrough',  # include non-transformed columns
)
normalized_data = normalization.fit_transform(encoded_pass_df)

# strip the transformer prefixes so the columns keep their plain names
feature_names = []
for raw_name in normalization.get_feature_names_out():
    for prefix in ('standardization__', 'minmax__', 'remainder__'):
        raw_name = raw_name.replace(prefix, '')
    feature_names.append(raw_name)

normalized_pass_df = pd.DataFrame(normalized_data, columns=feature_names)

In [None]:
# Show the first rows of both normalized frames without column/row truncation.
# option_context restores the previous display settings on exit, even if display() raises,
# instead of resetting them to the pandas defaults.
with pd.option_context("display.max_columns", None, "display.max_rows", None):
    display(normalized_run_df.head())
    display(normalized_pass_df.head())