In [1]:
import pandas as pd
import numpy as np
from haversine import haversine, Unit
import data_agg_functions as daf

In [2]:
# Prerequisite: run the "NFLfastR data pull.R" file before this cell. The full
# play-by-play dataset is too large to be stored on GitHub, so the CSV read
# below has to be generated locally first.

# Fallback used when neither `temp` nor the `weather` text yields a value.
# NOTE(review): presumably degrees Fahrenheit -- confirm against the data source.
DEFAULT_TEMPERATURE = 70

data = pd.read_csv('2008-2022.csv', encoding='latin1', low_memory=False)
data = data.rename(columns={"Unnamed: 0": "index"})

# game_id is split on "_"; its first two pieces are stored as integer year and week.
data[["year", "week"]] = data['game_id'].str.split("_", expand=True)[[0, 1]].astype('int32')

# Keep week <= 18 for 2021 onward and week <= 17 for 2020 and earlier.
# NOTE(review): presumably this drops post-season games -- confirm.
keep_rows = (
    ((data['year'] >= 2021) & (data['week'] <= 18))
    | ((data['year'] <= 2020) & (data['week'] <= 17))
)
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not raise SettingWithCopyWarning.
data = data[keep_rows].copy()

# Secondary temperature source: the first run of digits in the text that
# follows the first "Temp" in the free-text `weather` column. The pattern is a
# raw string so that "\d" is not treated as an (invalid) string escape.
data['temp2'] = (
    data['weather']
    .str.split("Temp", expand=True)[1]
    .str.extract(r'(\d+)', expand=False)
)

# Prefer `temp`; fall back to the value parsed from `weather`, then to the
# default. Converting the parsed strings to numbers first avoids building a
# mixed float/str object column before the integer cast.
data['temperature'] = (
    data['temp']
    .fillna(pd.to_numeric(data['temp2']))
    .fillna(DEFAULT_TEMPERATURE)
    .astype('int32')
)

In [3]:
# Build every per-team feature table from the play-by-play frame `data`, then
# combine them into `modeling_dataset`. All heavy lifting lives in the
# project-local `data_agg_functions` module (imported as `daf`); statement
# order is kept as-is because the helpers' side effects are not visible here.

# Home/away lookup passed to most of the summary calls below.
home_away_data = daf.create_home_away_table(data)

# --- Points scored / allowed ---
score_data_final = daf.points_scored_allowed(data)
scored_data_for_mod = daf.summary_data(
    score_data_final, "points_scored", "points_allowed", home_away_data
)
# NOTE(review): qtrs=3 presumably selects a shorter "momentum" window -- confirm
# the exact meaning in data_agg_functions.summary_data.
scored_momen_data_for_mod = daf.summary_data(
    score_data_final,
    "points_scored",
    "points_allowed",
    home_away_table=home_away_data,
    qtrs=3,
)
# Rename so these columns do not collide with the ones in scored_data_for_mod.
scored_momen_data_for_mod = scored_momen_data_for_mod.rename(
    columns={
        'avg_points_scored': 'avg_points_scored_momen',
        'avg_points_allowed': 'avg_points_allowed_momen',
    }
)

# --- Yards gained / allowed ---
yards_data_final = daf.yards_gained_allowed(data)
yards_data_for_mod = daf.summary_data(
    yards_data_final, "yards_gained", "yards_allowed", home_away_data
)

# --- Performance relative to the spread ---
spread_diff_data_final = daf.spread_diff_data(data)
spread_ind_for_mod = daf.beat_spread_prior_wk(spread_diff_data_final, data)
# NOTE(review): spread_diff_data_for_mod is not passed to
# finalize_modeling_dataset in this cell -- confirm whether that is intentional.
spread_diff_data_for_mod = daf.summary_data(
    spread_diff_data_final,
    'diff_from_spread',
    defense_column=False,
    home_away_table=home_away_data,
)
spread_diff_capped_data_for_mod = daf.summary_data(
    spread_diff_data_final,
    'diff_from_spread_capped',
    defense_column=False,
    home_away_table=home_away_data,
)
spread_diff_momen_capped_data_for_mod = daf.summary_data(
    spread_diff_data_final,
    'diff_from_spread_capped',
    defense_column=False,
    home_away_table=home_away_data,
    qtrs=3,
)
spread_diff_momen_capped_data_for_mod = spread_diff_momen_capped_data_for_mod.rename(
    columns={'avg_diff_from_spread_capped': 'avg_diff_from_spread_3q'}
)

# --- Turnovers ---
turnover_data_final = daf.turnover_data(data)
turnover_data_for_mod = daf.summary_data(
    turnover_data_final, "turnovers", "turnovers_forced", home_away_data
)

# --- QB hits ---
qb_hit_data_final = daf.qb_hits(data)
qb_hit_data_for_mod = daf.summary_data(
    qb_hit_data_final, "qb_hits_allowed", "qb_hits", home_away_data
)

# --- Two-minute scoring ---
two_min_data_final = daf.two_min_data(data)
two_min_data_for_mod = daf.summary_data(
    two_min_data_final, "two_min_scored", "two_min_allowed", home_away_data
)

# --- Remaining feature tables ---
backup_data_for_mod = daf.backup_qb_data(data, home_away_data)
DVOA_for_mod = daf.DVOA_data(data, home_away_data)
distance_data = daf.distanced_traveled(data)
game_att = daf.game_attributes(data)

# --- Assemble the final table (argument order is positional and must not change) ---
modeling_dataset = daf.finalize_modeling_dataset(
    game_att,
    scored_data_for_mod,
    yards_data_for_mod,
    turnover_data_for_mod,
    qb_hit_data_for_mod,
    two_min_data_for_mod,
    scored_momen_data_for_mod,
    spread_diff_capped_data_for_mod,
    spread_diff_momen_capped_data_for_mod,
    DVOA_for_mod,
    distance_data,
    backup_data_for_mod,
    spread_ind_for_mod,
)

In [4]:
# Write the assembled modeling table to the working directory.
output_path = 'modeling_dataset.csv'
modeling_dataset.to_csv(output_path)