# 03 Data Prep

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# common imports
import numpy as np
import pandas as pd
from typing import List
# import matplotlib.pyplot as plt 
# import seaborn as sns
# import sys
# from datetime import datetime
# import sklearn

# will display all the columns in the df moving forward
pd.set_option('display.max_columns', 500)

In [3]:
from src.data_preparation import load_csv_data_from_disk
data = load_csv_data_from_disk(file_name='scraped_data.csv')

## Data Transformations

In [4]:
from src.data_preparation import fix_opponent_names
data = fix_opponent_names(data)

In [5]:
from src.data_preparation import map_team_abbreviations_to_names
data = map_team_abbreviations_to_names(data)

In [6]:
from src.data_preparation import add_home_or_away_column
data = add_home_or_away_column(data)

In [7]:

# super bowl games are not being included in the final df because both teams are being designated as the home team
# this is happening because we used '@' to determine away teams with everything else considered to be the home team
# this doesn't take into account for 'N' values in the @ column which designateds a neutral field
# even season yr = AFC team is the home team for the SB, odd season yr = NFC team is the home team for the SB
# the game is played on neutral ground where the location (typically a new stadium) is determined years in advance
# issue fixed 04/30/2023, utilized np.where to create conditions


In [8]:
from src.data_preparation import add_datetime_column
data = add_datetime_column(data)
# season = year is having issues 
# issue has been fixed 04/25/23 (added 1 to the year if games were played in Jan/Feb)

In [9]:
#data.drop(data[data['passyd'] == 'Canceled'].index, inplace = True)
#data.loc[data['passyd'] == 'Canceled']

# one game was canceled last year so I dropped the canceled game in the scraping.py file
# this line of code is verifying that it has been removed
# this was preventing me from engineering additional features from previous game stats

In [10]:
data

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,home_or_away,month,year,hour,date_time
0,2022,Buffalo Bills,1,8,September 8,8:20PM ET,W,,1-0,@,Los Angeles Rams,31.0,10.0,23.0,413.0,292.0,121.0,4.0,19.0,243.0,191.0,52.0,3.0,13.89,10.29,-3.96,AWAY,9,2022,20,2022-09-08 20:00:00
1,2022,Buffalo Bills,2,19,September 19,7:15PM ET,W,,2-0,,Tennessee Titans,41.0,7.0,23.0,414.0,313.0,101.0,,12.0,187.0,107.0,80.0,4.0,17.69,18.01,1.55,HOME,9,2022,19,2022-09-19 19:00:00
2,2022,Buffalo Bills,3,25,September 25,1:00PM ET,L,,2-1,@,Miami Dolphins,19.0,21.0,31.0,497.0,382.0,115.0,1.0,15.0,212.0,171.0,41.0,,15.88,-7.45,-4.86,AWAY,9,2022,13,2022-09-25 13:00:00
3,2022,Buffalo Bills,4,2,October 2,1:00PM ET,W,,3-1,@,Baltimore Ravens,23.0,20.0,22.0,326.0,201.0,125.0,2.0,22.0,296.0,134.0,162.0,2.0,2.10,2.66,-1.69,AWAY,10,2022,13,2022-10-02 13:00:00
4,2022,Buffalo Bills,5,9,October 9,1:00PM ET,W,,4-1,,Pittsburgh Steelers,38.0,3.0,21.0,552.0,432.0,120.0,2.0,23.0,364.0,310.0,54.0,2.0,20.66,9.42,3.54,HOME,10,2022,13,2022-10-09 13:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15315,1994,Los Angeles Rams,13,27,November 27,4:00PM ET,L,,4-8,@,Los Angeles Chargers,17.0,31.0,17.0,326.0,278.0,48.0,5.0,16.0,243.0,129.0,114.0,1.0,-15.08,12.27,-6.03,AWAY,11,1994,16,1994-11-27 16:00:00
15316,1994,Los Angeles Rams,14,4,December 4,4:00PM ET,L,,4-9,,New Orleans Saints,15.0,31.0,20.0,333.0,258.0,75.0,4.0,20.0,328.0,191.0,137.0,1.0,-1.17,-3.52,-13.22,HOME,12,1994,16,1994-12-04 16:00:00
15317,1994,Los Angeles Rams,15,11,December 11,1:00PM ET,L,,4-10,@,Tampa Bay Buccaneers,14.0,24.0,19.0,261.0,198.0,63.0,2.0,17.0,355.0,230.0,125.0,,-11.84,-12.38,5.25,AWAY,12,1994,13,1994-12-11 13:00:00
15318,1994,Los Angeles Rams,16,18,December 18,1:00PM ET,L,,4-11,@,Chicago Bears,13.0,27.0,13.0,243.0,206.0,37.0,1.0,19.0,298.0,135.0,163.0,,-1.42,-6.17,-8.99,AWAY,12,1994,13,1994-12-18 13:00:00


## Feature Engineering

In [11]:
from src.data_preparation import add_win_rates_last_n_games
data = add_win_rates_last_n_games(data, n_games=[1, 4, 8])

In [12]:
from src.data_preparation import add_passing_rates_last_n_games
data = add_passing_rates_last_n_games(data, n_games=[1, 4, 8])

In [13]:
from src.data_preparation import add_rushing_rates_last_n_games
data = add_rushing_rates_last_n_games(data, n_games=[1, 4, 8])

In [14]:
from src.data_preparation import add_passing_allowed_rates_last_n_games
data = add_passing_allowed_rates_last_n_games(data, n_games=[1, 4, 8])

In [15]:
from src.data_preparation import add_rushing_allowed_rates_last_n_games
data = add_rushing_allowed_rates_last_n_games(data, n_games=[1, 4, 8])

In [16]:
from src.data_preparation import add_ot_rates_last_n_games
data = add_ot_rates_last_n_games(data, n_games=[1, 4, 8])

In [17]:
from src.data_preparation import add_to_rates_last_n_games
data = add_to_rates_last_n_games(data, n_games=[1, 4, 8])

In [18]:
from src.data_preparation import add_to_forced_rates_last_n_games
data = add_to_forced_rates_last_n_games(data, n_games=[1, 4, 8])

In [19]:
from src.data_preparation import add_points_scored_rates_last_n_games
data = add_points_scored_rates_last_n_games(data, n_games=[1, 4, 8])

In [20]:
from src.data_preparation import add_points_allowed_rates_last_n_games
data = add_points_allowed_rates_last_n_games(data, n_games=[1, 4, 8])

In [21]:
from src.data_preparation import add_1st_down_rates_last_n_games
data = add_1st_down_rates_last_n_games(data, n_games=[1, 4, 8])

In [22]:
from src.data_preparation import add_1st_down_allowed_rates_last_n_games
data = add_1st_down_allowed_rates_last_n_games(data, n_games=[1, 4, 8])

In [23]:
data

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,home_or_away,month,year,hour,date_time,win,win_rate_last_1_games,win_rate_last_4_games,win_rate_last_8_games,pass_rate_last_1_games,pass_rate_last_4_games,pass_rate_last_8_games,rush_rate_last_1_games,rush_rate_last_4_games,rush_rate_last_8_games,pass_allowed_rate_last_1_games,pass_allowed_rate_last_4_games,pass_allowed_rate_last_8_games,rush_allowed_rate_last_1_games,rush_allowed_rate_last_4_games,rush_allowed_rate_last_8_games,ot_rate_last_1_games,ot_rate_last_4_games,ot_rate_last_8_games,to_rate_last_1_games,to_rate_last_4_games,to_rate_last_8_games,to_forced_rate_last_1_games,to_forced_rate_last_4_games,to_forced_rate_last_8_games,points_scored_rate_last_1_games,points_scored_rate_last_4_games,points_scored_rate_last_8_games,points_allowed_rate_last_1_games,points_allowed_rate_last_4_games,points_allowed_rate_last_8_games,1st_downs_rate_last_1_games,1st_downs_rate_last_4_games,1st_downs_rate_last_8_games,1st_downs_allowed_rate_last_1_games,1st_downs_allowed_rate_last_4_games,1st_downs_allowed_rate_last_8_games
0,1994,Arizona Cardinals,1,4,September 4,4:00PM ET,L,0,0-1,@,Los Angeles Rams,12.0,14.0,23.0,234.0,128.0,106.0,3.0,9.0,152.0,102.0,50.0,2.0,-15.09,17.92,1.36,AWAY,9,1994,16,1994-09-04 16:00:00,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1994,Arizona Cardinals,2,11,September 11,8:00PM ET,L,0,0-2,,New York Giants,17.0,20.0,11.0,174.0,135.0,39.0,3.0,19.0,206.0,88.0,118.0,2.0,-17.99,1.70,6.10,HOME,9,1994,20,1994-09-11 20:00:00,0,0.0,,,128.0,,,106.0,,,102.0,,,50.0,,,0.0,,,3.0,,,2.0,,,12.0,,,14.0,,,23.0,,,9.0,,
2,1994,Arizona Cardinals,3,18,September 18,1:00PM ET,L,0,0-3,@,Cleveland Browns,0.0,32.0,21.0,318.0,255.0,63.0,3.0,17.0,322.0,243.0,79.0,2.0,-23.88,-2.64,1.52,AWAY,9,1994,13,1994-09-18 13:00:00,0,0.0,,,135.0,,,39.0,,,88.0,,,118.0,,,0.0,,,3.0,,,2.0,,,17.0,,,20.0,,,11.0,,,19.0,,
3,1994,Arizona Cardinals,5,2,October 2,4:00PM ET,W,0,1-3,,Minnesota Vikings,17.0,7.0,21.0,309.0,200.0,109.0,2.0,19.0,358.0,340.0,18.0,4.0,0.47,13.72,2.86,HOME,10,1994,16,1994-10-02 16:00:00,1,0.0,,,255.0,,,63.0,,,243.0,,,79.0,,,0.0,,,3.0,,,2.0,,,0.0,,,32.0,,,21.0,,,17.0,,
4,1994,Arizona Cardinals,6,9,October 9,4:00PM ET,L,0,1-4,@,Dallas Cowboys,3.0,38.0,10.0,221.0,168.0,53.0,5.0,22.0,351.0,273.0,78.0,0.0,-26.39,-11.70,5.29,AWAY,10,1994,16,1994-10-09 16:00:00,0,1.0,0.25,,200.0,179.50,,109.0,79.25,,340.0,193.25,,18.0,66.25,,0.0,0.00,,2.0,2.75,,4.0,2.50,,17.0,11.50,,7.0,18.25,,21.0,19.00,,19.0,16.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15315,2022,Washington Commanders,13,4,December 4,1:00PM ET,T,1,7-5-1,@,New York Giants,20.0,20.0,25.0,411.0,246.0,165.0,1.0,20.0,316.0,182.0,134.0,1.0,2.87,2.71,-5.65,AWAY,12,2022,13,2022-12-04 13:00:00,0,1.0,0.75,0.750,138.0,158.25,190.625,176.0,154.50,131.375,165.0,176.75,172.50,167.0,84.50,106.625,0.0,0.00,0.000,1.0,1.00,0.875,1.0,2.00,1.625,19.0,22.75,20.000,13.0,16.00,16.125,20.0,20.75,19.500,18.0,16.00,16.250
15316,2022,Washington Commanders,15,18,December 18,8:15PM ET,L,0,7-6-1,,New York Giants,12.0,20.0,20.0,387.0,228.0,159.0,2.0,19.0,288.0,160.0,128.0,0.0,-3.08,-0.95,-2.25,HOME,12,2022,20,2022-12-18 20:00:00,0,0.0,0.75,0.750,246.0,188.25,178.625,165.0,161.50,146.625,182.0,161.00,178.25,134.0,104.00,110.250,1.0,0.25,0.125,1.0,1.00,0.875,1.0,2.00,1.750,20.0,23.50,20.375,20.0,16.00,16.000,25.0,22.50,20.500,20.0,16.75,16.875
15317,2022,Washington Commanders,16,24,December 24,4:05PM ET,L,0,7-7-1,@,San Francisco 49ers,20.0,37.0,21.0,349.0,270.0,79.0,2.0,14.0,371.0,218.0,153.0,1.0,-2.45,-9.41,-3.02,AWAY,12,2022,16,2022-12-24 16:00:00,0,0.0,0.50,0.625,228.0,200.75,196.375,159.0,163.25,150.500,160.0,158.50,179.00,128.0,112.50,96.625,0.0,0.25,0.125,2.0,1.00,1.125,0.0,1.00,1.500,12.0,18.50,20.375,20.0,15.75,17.625,20.0,21.25,21.250,19.0,17.00,16.750
15318,2022,Washington Commanders,17,1,January 1,1:00PM ET,L,0,7-8-1,,Cleveland Browns,10.0,24.0,17.0,260.0,124.0,136.0,3.0,16.0,301.0,155.0,146.0,0.0,-7.50,-7.26,0.54,HOME,1,2023,13,2023-01-01 13:00:00,0,0.0,0.25,0.500,270.0,220.50,205.375,79.0,144.75,139.625,218.0,181.25,182.00,153.0,145.50,111.000,0.0,0.25,0.125,2.0,1.50,1.250,1.0,0.75,1.500,20.0,17.75,20.000,37.0,22.50,19.625,21.0,21.50,21.125,14.0,17.75,16.500


In [38]:
# should engineered features (rolling averages) reset after each season? Yes
# added season to the .groupby method - .groupby(['team', 'season'])[col]
# they should also reset when iterating through the for loop after each iteration
# set the min_period to n - .rolling(n, min_periods=n).mean()
# this was applied on 05/03/23

In [39]:
data.loc[data['team'] == 'Philadelphia Eagles'].tail(42)

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,home_or_away,month,year,hour,date_time,win,win_rate_last_1_games,win_rate_last_4_games,win_rate_last_8_games,pass_rate_last_1_games,pass_rate_last_4_games,pass_rate_last_8_games,rush_rate_last_1_games,rush_rate_last_4_games,rush_rate_last_8_games,pass_allowed_rate_last_1_games,pass_allowed_rate_last_4_games,pass_allowed_rate_last_8_games,rush_allowed_rate_last_1_games,rush_allowed_rate_last_4_games,rush_allowed_rate_last_8_games,ot_rate_last_1_games,ot_rate_last_4_games,ot_rate_last_8_games,to_rate_last_1_games,to_rate_last_4_games,to_rate_last_8_games,to_forced_rate_last_1_games,to_forced_rate_last_4_games,to_forced_rate_last_8_games,points_scored_rate_last_1_games,points_scored_rate_last_4_games,points_scored_rate_last_8_games,points_allowed_rate_last_1_games,points_allowed_rate_last_4_games,points_allowed_rate_last_8_games,1st_downs_rate_last_1_games,1st_downs_rate_last_4_games,1st_downs_rate_last_8_games,1st_downs_allowed_rate_last_1_games,1st_downs_allowed_rate_last_4_games,1st_downs_allowed_rate_last_8_games
12343,2020,Philadelphia Eagles,14,13,December 13,4:25PM ET,W,0,4-8-1,,New Orleans Saints,24.0,21.0,21.0,413.0,167.0,246.0,1.0,20.0,358.0,262.0,96.0,2.0,6.77,-4.23,1.33,HOME,12,2020,16,2020-12-13 16:00:00,1,0.0,0.0,0.25,161.0,185.0,200.125,117.0,112.25,119.0,288.0,232.75,204.0,149.0,128.25,140.5,0.0,0.0,0.0,1.0,1.25,1.625,0.0,0.25,0.875,16.0,16.75,21.125,30.0,25.5,25.0,17.0,19.0,20.0,20.0,19.75,19.875
12344,2020,Philadelphia Eagles,15,20,December 20,4:05PM ET,L,0,4-9-1,@,Arizona Cardinals,26.0,33.0,26.0,422.0,305.0,117.0,0.0,26.0,526.0,423.0,103.0,3.0,10.01,-4.72,-12.58,AWAY,12,2020,16,2020-12-20 16:00:00,0,1.0,0.25,0.375,167.0,179.25,190.75,246.0,134.75,138.0,262.0,240.5,207.875,96.0,114.5,135.5,0.0,0.0,0.0,1.0,1.5,1.5,2.0,0.75,1.0,24.0,18.5,20.5,21.0,24.0,22.875,21.0,18.75,20.125,20.0,19.25,19.25
12345,2020,Philadelphia Eagles,16,27,December 27,4:25PM ET,L,0,4-10-1,@,Dallas Cowboys,17.0,37.0,24.0,477.0,326.0,151.0,3.0,22.0,513.0,362.0,151.0,1.0,5.42,-23.4,0.38,AWAY,12,2020,16,2020-12-27 16:00:00,0,0.0,0.25,0.375,305.0,203.25,207.625,117.0,137.5,128.375,423.0,299.5,239.125,103.0,106.0,125.625,0.0,0.0,0.0,0.0,0.75,1.375,3.0,1.25,1.375,26.0,20.75,20.25,33.0,26.75,23.25,26.0,20.5,21.0,26.0,21.5,20.375
12346,2020,Philadelphia Eagles,17,3,January 3,8:20PM ET,L,0,4-11-1,,Washington Commanders,14.0,20.0,16.0,216.0,98.0,118.0,3.0,16.0,248.0,145.0,103.0,2.0,-11.71,8.33,-3.56,HOME,1,2021,20,2021-01-03 20:00:00,0,0.0,0.25,0.25,326.0,239.75,205.125,151.0,157.75,135.25,362.0,333.75,263.75,151.0,124.75,124.5,0.0,0.0,0.0,3.0,1.25,1.625,1.0,1.5,1.125,17.0,20.75,19.625,37.0,30.25,25.25,24.0,22.0,20.625,22.0,22.0,21.0
12347,2021,Philadelphia Eagles,1,12,September 12,1:00PM ET,W,0,1-0,@,Atlanta Falcons,32.0,6.0,24.0,434.0,261.0,173.0,0.0,19.0,260.0,136.0,124.0,0.0,13.14,12.48,-2.36,AWAY,9,2021,13,2021-09-12 13:00:00,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
12348,2021,Philadelphia Eagles,2,19,September 19,1:00PM ET,L,0,1-1,,San Francisco 49ers,11.0,17.0,18.0,328.0,177.0,151.0,0.0,23.0,306.0,189.0,117.0,0.0,5.3,-7.86,-2.68,HOME,9,2021,13,2021-09-19 13:00:00,0,1.0,,,261.0,,,173.0,,,136.0,,,124.0,,,0.0,,,0.0,,,0.0,,,32.0,,,6.0,,,24.0,,,19.0,,
12349,2021,Philadelphia Eagles,3,27,September 27,8:15PM ET,L,0,1-2,@,Dallas Cowboys,21.0,41.0,12.0,367.0,303.0,64.0,2.0,27.0,380.0,220.0,160.0,1.0,-11.19,-9.13,0.01,AWAY,9,2021,20,2021-09-27 20:00:00,0,0.0,,,177.0,,,151.0,,,189.0,,,117.0,,,0.0,,,0.0,,,0.0,,,11.0,,,17.0,,,18.0,,,23.0,,
12350,2021,Philadelphia Eagles,4,3,October 3,1:00PM ET,L,0,1-3,,Kansas City Chiefs,30.0,42.0,30.0,461.0,358.0,103.0,0.0,31.0,471.0,271.0,200.0,1.0,21.32,-34.67,0.7,HOME,10,2021,13,2021-10-03 13:00:00,0,0.0,,,303.0,,,64.0,,,220.0,,,160.0,,,0.0,,,2.0,,,1.0,,,21.0,,,41.0,,,12.0,,,27.0,,
12351,2021,Philadelphia Eagles,5,10,October 10,1:00PM ET,W,0,2-3,@,Carolina Panthers,21.0,18.0,15.0,273.0,182.0,91.0,2.0,17.0,267.0,158.0,109.0,3.0,-15.13,16.27,3.99,AWAY,10,2021,13,2021-10-10 13:00:00,1,0.0,0.25,,358.0,274.75,,103.0,122.75,,271.0,204.0,,200.0,150.25,,0.0,0.0,,0.0,0.5,,1.0,0.5,,30.0,23.5,,42.0,26.5,,30.0,21.0,,31.0,25.0,
12352,2021,Philadelphia Eagles,6,14,October 14,8:20PM ET,L,0,2-4,,Tampa Bay Buccaneers,22.0,28.0,16.0,213.0,113.0,100.0,1.0,27.0,399.0,297.0,102.0,1.0,9.52,-15.69,-3.01,HOME,10,2021,20,2021-10-14 20:00:00,0,1.0,0.25,,182.0,255.0,,91.0,102.25,,158.0,209.5,,109.0,146.5,,0.0,0.0,,2.0,1.0,,3.0,1.25,,21.0,20.75,,18.0,29.5,,15.0,18.75,,17.0,24.5,


### Reducing the number of rows per game from two to one

In [25]:
print(f'{len(data)=}')

len(data)=15320


In [27]:
# splitting the data and then merging it
columns_to_keep = [
    # these are basically the ids we need for these columns to join the dataframes for home and away teams
    'season', 'week',
    'team', 'opp',
    'date_time',

    # features, aka info we can use to predict the target
    'win_rate_last_1_games', 'win_rate_last_4_games', 'win_rate_last_8_games', 
    'pass_rate_last_1_games', 'pass_rate_last_4_games', 'pass_rate_last_8_games', 
    'rush_rate_last_1_games', 'rush_rate_last_4_games', 'rush_rate_last_8_games', 
    'pass_allowed_rate_last_1_games', 'pass_allowed_rate_last_4_games', 'pass_allowed_rate_last_8_games', 
    'rush_allowed_rate_last_1_games', 'rush_allowed_rate_last_4_games', 'rush_allowed_rate_last_8_games', 
    'ot_rate_last_1_games', 'ot_rate_last_4_games', 'ot_rate_last_8_games', 
    'to_rate_last_1_games', 'to_rate_last_4_games', 'to_rate_last_8_games', 
    'to_forced_rate_last_1_games', 'to_forced_rate_last_4_games', 'to_forced_rate_last_8_games',
    'points_scored_rate_last_1_games', 'points_scored_rate_last_4_games', 'points_scored_rate_last_8_games', 
    'points_allowed_rate_last_1_games', 'points_allowed_rate_last_4_games', 'points_allowed_rate_last_8_games',
    '1st_downs_rate_last_1_games', '1st_downs_rate_last_4_games', '1st_downs_rate_last_8_games', 
    '1st_downs_allowed_rate_last_1_games', '1st_downs_allowed_rate_last_4_games', '1st_downs_allowed_rate_last_8_games',
    
    # target, aka what we want to predict
    'win',
]

home_team_data = data[data['home_or_away'] == 'HOME'][columns_to_keep]
away_team_data = data[data['home_or_away'] == 'AWAY'][columns_to_keep]

# add prefix to home team data columns
home_team_data = home_team_data.add_prefix('home_team_')

# add prefix to away team data columns
away_team_data = away_team_data.add_prefix('away_team_')

game_level_data = home_team_data.merge(
    away_team_data,
    how='right',
    left_on=['home_team_opp', 'home_team_date_time'],
    right_on=['away_team_team', 'away_team_date_time']#,
    #suffixes=('_home', '_away')
)

In [28]:
len(away_team_data)

7660

In [29]:
# we should have 15320/2 = 7660 rows in the game_level_data
# we don't because of SuperBowl games, where the `home_or_away` column is not properly defined
# instead, we get 7631, which has a difference of 29 when subtracted from 7660
# 29 also represents the number of seasons reflected in this dataset so 7631 is expected
# also note that one entire regular season game was canceled last year (bills & bengals 2022 season)
# issue fixed 04/30/2023
print(f'{len(game_level_data)=}')

len(game_level_data)=7660


In [30]:
game_level_data = game_level_data.sort_values(by=['home_team_date_time'], ascending=[True], ignore_index=True)#.reset_index(drop=True)

In [33]:
# drop the `win_away` columnn, because it's the same as `win_home`, and would cause data leakage
# also drop 'opp_home' and 'opp_away' columns since they are the same as 'team_home' and 'team_away'
game_level_data.drop(columns=['away_team_win', 'home_team_opp', 'away_team_opp', 
                              'away_team_season', 'away_team_week', 
                              'away_team_date_time'], inplace=True)

KeyError: "['away_team_win', 'home_team_opp', 'away_team_opp', 'away_team_season', 'away_team_week', 'away_team_date_time'] not found in axis"

In [34]:
game_level_data.columns

Index(['home_team_season', 'home_team_week', 'home_team_team',
       'home_team_date_time', 'home_team_win_rate_last_1_games',
       'home_team_win_rate_last_4_games', 'home_team_win_rate_last_8_games',
       'home_team_pass_rate_last_1_games', 'home_team_pass_rate_last_4_games',
       'home_team_pass_rate_last_8_games', 'home_team_rush_rate_last_1_games',
       'home_team_rush_rate_last_4_games', 'home_team_rush_rate_last_8_games',
       'home_team_pass_allowed_rate_last_1_games',
       'home_team_pass_allowed_rate_last_4_games',
       'home_team_pass_allowed_rate_last_8_games',
       'home_team_rush_allowed_rate_last_1_games',
       'home_team_rush_allowed_rate_last_4_games',
       'home_team_rush_allowed_rate_last_8_games',
       'home_team_ot_rate_last_1_games', 'home_team_ot_rate_last_4_games',
       'home_team_ot_rate_last_8_games', 'home_team_to_rate_last_1_games',
       'home_team_to_rate_last_4_games', 'home_team_to_rate_last_8_games',
       'home_team_to_forced

In [36]:
# verifying the data types in the df
game_level_data.dtypes

home_team_season                                          int64
home_team_week                                           object
home_team_team                                           object
home_team_date_time                              datetime64[ns]
home_team_win_rate_last_1_games                         float64
                                                      ...      
away_team_1st_downs_rate_last_4_games                   float64
away_team_1st_downs_rate_last_8_games                   float64
away_team_1st_downs_allowed_rate_last_1_games           float64
away_team_1st_downs_allowed_rate_last_4_games           float64
away_team_1st_downs_allowed_rate_last_8_games           float64
Length: 78, dtype: object

In [37]:
# rename some columns
# home_team_win is the target we will be predicting
new_col_names = {'home_team_season':'season', 'home_team_week':'week', 'home_team_team':'home_team',
                 'home_team_date_time':'date_time', 'away_team_team':'away_team'}

game_level_data.rename(columns=new_col_names, inplace=True)

In [40]:
game_level_data

Unnamed: 0,season,week,home_team,date_time,home_team_win_rate_last_1_games,home_team_win_rate_last_4_games,home_team_win_rate_last_8_games,home_team_pass_rate_last_1_games,home_team_pass_rate_last_4_games,home_team_pass_rate_last_8_games,home_team_rush_rate_last_1_games,home_team_rush_rate_last_4_games,home_team_rush_rate_last_8_games,home_team_pass_allowed_rate_last_1_games,home_team_pass_allowed_rate_last_4_games,home_team_pass_allowed_rate_last_8_games,home_team_rush_allowed_rate_last_1_games,home_team_rush_allowed_rate_last_4_games,home_team_rush_allowed_rate_last_8_games,home_team_ot_rate_last_1_games,home_team_ot_rate_last_4_games,home_team_ot_rate_last_8_games,home_team_to_rate_last_1_games,home_team_to_rate_last_4_games,home_team_to_rate_last_8_games,home_team_to_forced_rate_last_1_games,home_team_to_forced_rate_last_4_games,home_team_to_forced_rate_last_8_games,home_team_points_scored_rate_last_1_games,home_team_points_scored_rate_last_4_games,home_team_points_scored_rate_last_8_games,home_team_points_allowed_rate_last_1_games,home_team_points_allowed_rate_last_4_games,home_team_points_allowed_rate_last_8_games,home_team_1st_downs_rate_last_1_games,home_team_1st_downs_rate_last_4_games,home_team_1st_downs_rate_last_8_games,home_team_1st_downs_allowed_rate_last_1_games,home_team_1st_downs_allowed_rate_last_4_games,home_team_1st_downs_allowed_rate_last_8_games,home_team_win,away_team,away_team_win_rate_last_1_games,away_team_win_rate_last_4_games,away_team_win_rate_last_8_games,away_team_pass_rate_last_1_games,away_team_pass_rate_last_4_games,away_team_pass_rate_last_8_games,away_team_rush_rate_last_1_games,away_team_rush_rate_last_4_games,away_team_rush_rate_last_8_games,away_team_pass_allowed_rate_last_1_games,away_team_pass_allowed_rate_last_4_games,away_team_pass_allowed_rate_last_8_games,away_team_rush_allowed_rate_last_1_games,away_team_rush_allowed_rate_last_4_games,away_team_rush_allowed_rate_last_8_games,away_team_ot_rate_last_1_games,away_team_ot_rate_last_4_games,away_team_ot_rate_last_8_games,away_team_to_rate_last_1_games,away_team_to_rate_last_4_games,away_team_to_rate_last_8_games,away_team_to_forced_rate_last_1_games,away_team_to_forced_rate_last_4_games,away_team_to_forced_rate_last_8_games,away_team_points_scored_rate_last_1_games,away_team_points_scored_rate_last_4_games,away_team_points_scored_rate_last_8_games,away_team_points_allowed_rate_last_1_games,away_team_points_allowed_rate_last_4_games,away_team_points_allowed_rate_last_8_games,away_team_1st_downs_rate_last_1_games,away_team_1st_downs_rate_last_4_games,away_team_1st_downs_rate_last_8_games,away_team_1st_downs_allowed_rate_last_1_games,away_team_1st_downs_allowed_rate_last_4_games,away_team_1st_downs_allowed_rate_last_8_games
0,1994,1,Indianapolis Colts,1994-09-04 13:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,Tennessee Titans,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1994,1,Detroit Lions,1994-09-04 13:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,Atlanta Falcons,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1994,1,Chicago Bears,1994-09-04 13:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,Tampa Bay Buccaneers,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1994,1,Cincinnati Bengals,1994-09-04 13:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,Cleveland Browns,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1994,1,New Orleans Saints,1994-09-04 13:00:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,Kansas City Chiefs,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7655,2022,Division,Buffalo Bills,2023-01-22 15:00:00,1.0,1.0,1.000,316.0,255.25,224.625,107.0,150.25,146.250,189.0,192.25,216.250,42.0,104.25,91.125,0.0,0.00,0.000,3.0,2.50,1.500,2.0,1.75,1.500,34.0,34.00,29.875,31.0,24.00,20.750,25.0,23.00,22.250,16.0,16.75,18.625,0,Cincinnati Bengals,1.0,1.00,1.00,183.0,234.50,257.000,51.0,58.00,86.250,209.0,253.25,253.375,155.0,104.50,99.000,0.0,0.00,0.000,1.0,1.50,1.125,2.0,2.75,1.625,24.0,26.75,26.750,17.0,18.50,19.250,18.0,20.25,21.625,23.0,20.75,19.125
7656,2022,Division,San Francisco 49ers,2023-01-22 18:00:00,1.0,1.0,1.000,324.0,242.00,228.125,181.0,168.25,158.625,228.0,264.25,248.625,104.0,94.75,76.750,0.0,0.25,0.125,0.0,0.50,0.500,2.0,2.50,2.500,41.0,38.25,31.875,23.0,22.50,15.875,24.0,21.50,21.250,22.0,20.75,18.125,1,Dallas Cowboys,1.0,0.75,0.75,297.0,248.25,242.375,128.0,98.50,133.000,334.0,266.75,250.500,52.0,96.75,111.125,0.0,0.00,0.125,0.0,1.50,1.750,1.0,2.00,2.250,31.0,26.00,30.875,14.0,21.75,23.625,26.0,21.75,22.625,24.0,20.50,20.375
7657,2022,Conf. Champ.,Philadelphia Eagles,2023-01-29 15:00:00,1.0,0.5,0.750,148.0,239.00,246.500,268.0,139.25,169.000,109.0,188.00,172.750,118.0,122.75,120.500,0.0,0.00,0.000,0.0,1.50,1.250,1.0,0.75,0.875,38.0,26.00,31.500,7.0,20.75,21.000,26.0,21.25,23.375,13.0,18.75,17.250,1,San Francisco 49ers,1.0,1.00,1.00,199.0,237.25,225.375,113.0,158.25,160.750,206.0,248.25,249.750,76.0,94.00,78.375,0.0,0.25,0.125,1.0,0.50,0.625,2.0,2.50,2.500,19.0,33.75,32.625,12.0,20.50,17.375,21.0,23.25,21.250,15.0,19.25,18.250
7658,2022,Conf. Champ.,Kansas City Chiefs,2023-01-29 18:00:00,1.0,1.0,0.875,218.0,236.75,266.625,144.0,108.75,121.000,205.0,193.75,184.375,144.0,123.25,120.125,0.0,0.00,0.125,0.0,0.50,1.250,2.0,1.75,1.500,27.0,27.25,27.875,20.0,16.75,19.500,23.0,20.00,22.750,20.0,20.25,19.375,1,Cincinnati Bengals,1.0,1.00,1.00,240.0,248.50,243.750,172.0,87.75,100.000,261.0,242.50,254.875,64.0,97.50,94.250,0.0,0.00,0.000,0.0,1.25,0.875,1.0,2.00,1.750,27.0,25.00,25.500,10.0,15.25,16.750,30.0,22.75,22.625,19.0,19.25,19.375


In [42]:
game_level_data.loc[game_level_data['home_team'] == 'Kansas City Chiefs'].tail(21)

Unnamed: 0,season,week,home_team,date_time,home_team_win_rate_last_1_games,home_team_win_rate_last_4_games,home_team_win_rate_last_8_games,home_team_pass_rate_last_1_games,home_team_pass_rate_last_4_games,home_team_pass_rate_last_8_games,home_team_rush_rate_last_1_games,home_team_rush_rate_last_4_games,home_team_rush_rate_last_8_games,home_team_pass_allowed_rate_last_1_games,home_team_pass_allowed_rate_last_4_games,home_team_pass_allowed_rate_last_8_games,home_team_rush_allowed_rate_last_1_games,home_team_rush_allowed_rate_last_4_games,home_team_rush_allowed_rate_last_8_games,home_team_ot_rate_last_1_games,home_team_ot_rate_last_4_games,home_team_ot_rate_last_8_games,home_team_to_rate_last_1_games,home_team_to_rate_last_4_games,home_team_to_rate_last_8_games,home_team_to_forced_rate_last_1_games,home_team_to_forced_rate_last_4_games,home_team_to_forced_rate_last_8_games,home_team_points_scored_rate_last_1_games,home_team_points_scored_rate_last_4_games,home_team_points_scored_rate_last_8_games,home_team_points_allowed_rate_last_1_games,home_team_points_allowed_rate_last_4_games,home_team_points_allowed_rate_last_8_games,home_team_1st_downs_rate_last_1_games,home_team_1st_downs_rate_last_4_games,home_team_1st_downs_rate_last_8_games,home_team_1st_downs_allowed_rate_last_1_games,home_team_1st_downs_allowed_rate_last_4_games,home_team_1st_downs_allowed_rate_last_8_games,home_team_win,away_team,away_team_win_rate_last_1_games,away_team_win_rate_last_4_games,away_team_win_rate_last_8_games,away_team_pass_rate_last_1_games,away_team_pass_rate_last_4_games,away_team_pass_rate_last_8_games,away_team_rush_rate_last_1_games,away_team_rush_rate_last_4_games,away_team_rush_rate_last_8_games,away_team_pass_allowed_rate_last_1_games,away_team_pass_allowed_rate_last_4_games,away_team_pass_allowed_rate_last_8_games,away_team_rush_allowed_rate_last_1_games,away_team_rush_allowed_rate_last_4_games,away_team_rush_allowed_rate_last_8_games,away_team_ot_rate_last_1_games,away_team_ot_rate_last_4_games,away_team_ot_rate_last_8_games,away_team_to_rate_last_1_games,away_team_to_rate_last_4_games,away_team_to_rate_last_8_games,away_team_to_forced_rate_last_1_games,away_team_to_forced_rate_last_4_games,away_team_to_forced_rate_last_8_games,away_team_points_scored_rate_last_1_games,away_team_points_scored_rate_last_4_games,away_team_points_scored_rate_last_8_games,away_team_points_allowed_rate_last_1_games,away_team_points_allowed_rate_last_4_games,away_team_points_allowed_rate_last_8_games,away_team_1st_downs_rate_last_1_games,away_team_1st_downs_rate_last_4_games,away_team_1st_downs_rate_last_8_games,away_team_1st_downs_allowed_rate_last_1_games,away_team_1st_downs_allowed_rate_last_4_games,away_team_1st_downs_allowed_rate_last_8_games
7169,2021,5,Kansas City Chiefs,2021-10-10 20:00:00,1.0,0.5,,271.0,297.25,,200.0,130.25,,358.0,291.75,,103.0,146.0,,0.0,0.0,,1.0,1.75,,0.0,1.0,,42.0,33.5,,30.0,31.25,,31.0,26.75,,30.0,26.5,,0,Buffalo Bills,1.0,0.75,,251.0,258.75,,199.0,145.25,,61.0,148.75,,48.0,68.0,,0.0,0.0,,1.0,1.0,,5.0,2.75,,40.0,33.5,,0.0,11.0,,26.0,24.5,,6.0,12.0,
7212,2021,8,Kansas City Chiefs,2021-11-01 20:00:00,0.0,0.5,,257.0,297.5,,77.0,126.5,,266.0,280.25,,103.0,105.25,,0.0,0.0,,3.0,2.75,,1.0,0.75,,3.0,24.0,,27.0,27.0,,22.0,27.75,,24.0,22.25,,1,New York Giants,1.0,0.5,,199.0,274.0,,103.0,79.75,,117.0,225.0,,56.0,139.5,,0.0,0.25,,0.0,1.75,,1.0,1.5,,25.0,20.75,,3.0,26.5,,21.0,20.75,,11.0,20.0,
7224,2021,9,Kansas City Chiefs,2021-11-07 16:00:00,1.0,0.5,0.5,261.0,295.0,296.125,107.0,103.25,116.75,228.0,247.75,269.75,72.0,97.5,121.75,0.0,0.0,0.0,2.0,3.0,2.375,1.0,1.0,1.0,20.0,18.5,26.0,17.0,23.75,27.5,29.0,27.25,27.0,18.0,19.25,22.875,1,Green Bay Packers,1.0,1.0,0.875,184.0,233.25,229.375,151.0,123.75,108.125,260.0,224.0,216.75,74.0,128.0,115.0,0.0,0.25,0.125,0.0,0.5,0.75,3.0,2.0,1.75,24.0,24.25,24.0,21.0,16.75,20.875,24.0,21.5,21.125,22.0,21.5,21.25
7251,2021,11,Kansas City Chiefs,2021-11-21 16:00:00,1.0,0.75,0.625,422.0,275.0,285.5,94.0,88.75,121.25,249.0,230.5,256.5,50.0,86.75,92.75,0.0,0.0,0.0,1.0,1.5,2.25,2.0,1.5,1.0,41.0,19.25,24.25,14.0,16.25,22.0,29.0,23.5,27.0,15.0,19.0,20.25,1,Dallas Cowboys,1.0,0.75,0.875,317.0,328.75,282.25,114.0,98.0,149.5,111.0,180.0,237.0,103.0,128.5,107.375,0.0,0.25,0.125,1.0,1.75,1.375,3.0,1.25,1.625,43.0,28.5,31.875,3.0,19.5,20.5,22.0,22.75,24.125,11.0,17.0,18.5
7283,2021,13,Kansas City Chiefs,2021-12-05 20:00:00,1.0,1.0,0.75,244.0,271.75,284.625,126.0,101.0,113.75,194.0,212.5,246.375,82.0,81.5,93.375,0.0,0.0,0.0,2.0,1.25,2.0,3.0,2.0,1.375,19.0,23.25,23.625,9.0,11.75,19.375,22.0,23.5,25.625,16.0,17.0,19.625,1,Denver Broncos,1.0,0.75,0.375,155.0,193.5,210.125,147.0,129.0,110.125,285.0,224.25,247.375,72.0,119.5,124.375,0.0,0.0,0.0,1.0,0.75,1.25,2.0,1.75,1.0,28.0,22.0,19.0,13.0,17.25,21.25,23.0,20.0,19.0,20.0,19.25,19.375
7291,2021,14,Kansas City Chiefs,2021-12-12 13:00:00,1.0,1.0,0.75,178.0,251.0,273.0,89.0,96.5,99.875,250.0,218.0,232.875,154.0,102.0,99.75,0.0,0.0,0.0,1.0,1.0,2.0,3.0,2.5,1.75,22.0,23.75,21.125,9.0,9.75,16.75,15.0,20.0,23.625,22.0,18.0,18.625,1,Las Vegas Raiders,0.0,0.25,0.375,234.0,263.75,274.0,76.0,85.25,91.75,186.0,277.5,231.125,112.0,107.25,121.0,0.0,0.25,0.125,0.0,1.0,1.125,1.0,0.75,1.25,15.0,19.5,21.25,17.0,30.75,26.5,22.0,19.5,19.625,23.0,24.5,22.5
7326,2021,16,Kansas City Chiefs,2021-12-26 16:00:00,1.0,1.0,0.875,410.0,268.0,271.5,86.0,108.25,98.5,236.0,231.5,231.0,192.0,118.0,102.375,1.0,0.25,0.125,2.0,1.25,1.375,2.0,3.25,2.375,34.0,30.75,25.0,28.0,13.75,15.0,26.0,21.0,22.25,28.0,21.25,20.125,1,Pittsburgh Steelers,1.0,0.5,0.5,133.0,222.0,225.625,35.0,69.25,87.125,117.0,181.0,207.875,201.0,187.0,171.0,0.0,0.0,0.125,0.0,1.0,1.0,4.0,2.0,1.5,19.0,19.25,21.75,13.0,27.25,25.375,12.0,16.75,19.0,22.0,22.75,22.5
7367,2021,Wild Card,Kansas City Chiefs,2022-01-16 20:00:00,1.0,0.75,0.875,255.0,294.5,282.75,135.0,125.75,118.0,173.0,249.25,242.0,191.0,143.25,112.875,0.0,0.25,0.125,0.0,0.5,0.75,1.0,1.5,2.375,28.0,32.25,32.375,24.0,24.0,17.125,28.0,25.5,23.625,18.0,22.25,20.125,1,Pittsburgh Steelers,1.0,0.75,0.5,235.0,162.5,206.25,79.0,108.5,91.375,132.0,160.5,202.875,249.0,167.5,172.0,1.0,0.25,0.125,1.0,1.25,1.125,3.0,2.25,1.75,16.0,17.75,20.75,13.0,19.0,26.625,19.0,17.75,18.5,20.0,20.5,23.0
7372,2021,Division,Kansas City Chiefs,2022-01-23 18:00:00,1.0,0.75,0.875,372.0,285.0,276.5,106.0,130.75,119.5,201.0,240.5,236.0,56.0,109.25,113.625,0.0,0.0,0.125,2.0,0.5,0.875,1.0,1.25,2.25,42.0,34.25,32.5,21.0,22.25,18.0,26.0,25.5,23.25,19.0,20.0,20.625,1,Buffalo Bills,1.0,1.0,0.75,308.0,248.5,232.375,174.0,172.75,149.375,216.0,132.25,146.125,89.0,95.5,117.0,0.0,0.0,0.125,0.0,0.75,1.0,2.0,1.25,1.0,47.0,34.0,29.375,17.0,15.75,16.25,29.0,27.5,24.375,20.0,15.5,16.25
7373,2021,Conf. Champ.,Kansas City Chiefs,2022-01-30 15:00:00,1.0,0.75,0.875,370.0,314.0,292.25,182.0,144.5,126.5,313.0,275.5,250.875,109.0,104.0,117.0,1.0,0.25,0.25,0.0,0.5,0.625,0.0,0.5,1.875,42.0,35.75,35.375,36.0,28.75,21.375,30.0,26.75,24.25,23.0,20.75,21.5,0,Cincinnati Bengals,1.0,0.75,0.625,280.0,255.75,278.5,65.0,71.75,82.375,213.0,231.25,239.75,140.0,150.75,119.25,0.0,0.0,0.125,1.0,0.25,0.875,3.0,1.75,1.5,19.0,23.75,24.5,16.0,21.75,23.125,17.0,17.25,19.625,16.0,21.5,20.625


In [43]:
# Questions - 


## do you think this dataset is ready to be prepared for modeling? 
## do you have any suggestions on how to convert categorical data types?


##


In [44]:
from src.data_preparation import export_transformed_data_to_csv
export_transformed_data_to_csv(game_level_data)