# 02 Data Prep

In [1]:
# common imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import sys
from datetime import datetime
import sklearn 

sys.executable

'/opt/anaconda3/bin/python'

In [2]:
# will display all the columns in the df moving forward
pd.set_option('display.max_columns', 500)

In [3]:
# viewing the paths connected to this file
sys.path

['/Users/mirror/Desktop/GitHub/NFL_Game_Predictor',
 '/opt/anaconda3/lib/python38.zip',
 '/opt/anaconda3/lib/python3.8',
 '/opt/anaconda3/lib/python3.8/lib-dynload',
 '',
 '/Users/mirror/.local/lib/python3.8/site-packages',
 '/opt/anaconda3/lib/python3.8/site-packages',
 '/opt/anaconda3/lib/python3.8/site-packages/aeosa',
 '/opt/anaconda3/lib/python3.8/site-packages/IPython/extensions',
 '/Users/mirror/.ipython']

In [4]:
# Importing the data from a csv file
# avoid using hard coded paths like the next two lines
# path = "/Users/mirror/Desktop/GitHub/nfl_game_predictor/data/scraped_data.csv"
# df = pd.read_csv(path) # instead, use this

from src.config import DATA_DIR
path = DATA_DIR / 'scraped_data.csv'
df = pd.read_csv(path)

# previewing the data
df.head(30)

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts
0,2022,BUF,1,Thu,September 8,8:20PM ET,W,,1-0,@,Los Angeles Rams,31,10,23,413,292,121,4,19,243,191,52,3,13.89,10.29,-3.96
1,2022,BUF,2,Mon,September 19,7:15PM ET,W,,2-0,,Tennessee Titans,41,7,23,414,313,101,,12,187,107,80,4,17.69,18.01,1.55
2,2022,BUF,3,Sun,September 25,1:00PM ET,L,,2-1,@,Miami Dolphins,19,21,31,497,382,115,1,15,212,171,41,,15.88,-7.45,-4.86
3,2022,BUF,4,Sun,October 2,1:00PM ET,W,,3-1,@,Baltimore Ravens,23,20,22,326,201,125,2,22,296,134,162,2,2.10,2.66,-1.69
4,2022,BUF,5,Sun,October 9,1:00PM ET,W,,4-1,,Pittsburgh Steelers,38,3,21,552,432,120,2,23,364,310,54,2,20.66,9.42,3.54
5,2022,BUF,6,Sun,October 16,4:25PM ET,W,,5-1,@,Kansas City Chiefs,24,20,26,443,318,125,1,23,387,319,68,2,11.13,-6.45,0.56
6,2022,BUF,8,Sun,October 30,8:20PM ET,W,,6-1,,Green Bay Packers,27,17,20,369,216,153,2,21,398,190,208,1,12.79,-5.55,4.01
7,2022,BUF,9,Sun,November 6,1:00PM ET,L,,6-2,@,New York Jets,17,20,19,317,183,134,2,21,310,136,174,1,0.54,-7.64,2.11
8,2022,BUF,10,Sun,November 13,1:00PM ET,L,OT,6-3,,Minnesota Vikings,30,33,29,486,311,175,4,24,481,334,147,2,1.45,-4.80,-0.39
9,2022,BUF,11,Sun,November 20,1:00PM ET,W,,7-3,,Cleveland Browns,31,23,22,357,186,171,,27,396,316,80,1,6.00,-10.86,13.72


### Quick Exploration

In [5]:
df.shape

(15322, 26)

In [6]:
df.dtypes

season                int64
team                 object
week                 object
day                  object
date                 object
time                 object
result               object
ot                   object
record               object
@                    object
opp                  object
points_scored        object
points_allowed       object
1st_downs            object
totyd                object
passyd               object
rushyd               object
to                   object
1st_downs_allowed    object
totyd_allowed        object
passyd_allowed       object
rushyd_allowed       object
to_forced            object
off_exp_pts          object
def_exp_pts          object
sts_exp_pts          object
dtype: object

In [7]:
# determining how many missing values are in each column
df.isnull().sum()

season                   0
team                     0
week                     0
day                      0
date                     0
time                     0
result                   0
ot                   14374
record                   0
@                     7632
opp                      0
points_scored            0
points_allowed           0
1st_downs                0
totyd                    0
passyd                   2
rushyd                   0
to                    3378
1st_downs_allowed        0
totyd_allowed            0
passyd_allowed           2
rushyd_allowed           0
to_forced             3378
off_exp_pts              0
def_exp_pts              0
sts_exp_pts              0
dtype: int64

In [8]:
# determining how many missing values are in specific column(s)
df['ot'].isnull().sum()

14374

In [9]:
# discovering how many times an opponent played a game
df.opp.value_counts()

New England Patriots        515
Green Bay Packers           507
Pittsburgh Steelers         502
Philadelphia Eagles         499
Indianapolis Colts          498
San Francisco 49ers         496
Seattle Seahawks            495
Kansas City Chiefs          491
Denver Broncos              489
Dallas Cowboys              488
Minnesota Vikings           488
New Orleans Saints          485
New York Giants             485
Tampa Bay Buccaneers        484
Atlanta Falcons             484
Cincinnati Bengals          480
Buffalo Bills               480
New York Jets               480
Miami Dolphins              480
Arizona Cardinals           478
Chicago Bears               477
Detroit Lions               473
Carolina Panthers           467
Jacksonville Jaguars        466
Baltimore Ravens            462
Washington Redskins         423
Cleveland Browns            423
Oakland Raiders             408
Tennessee Titans            404
San Diego Chargers          382
Houston Texans              348
St. Loui

### SweeetViz

In [10]:
# # importing a new library that was recommended to me by my mentor
# import sweetviz as sv

# df_report = sv.analyze(df)
# df_report.show_html()
# #df_report.show_notebook()

In [11]:
# compares two dataframes via sweetviz
#df_comp = sv.compare(df)

In [12]:
# another way to get great insights is to use the comparison functionality to split your dataset into 2 sub-populations.
#df_comp_intra = sv.compare_intra(df)

### Initial Data Cleansing

In [13]:
# there are only supposed to be 32 teams in the NFL. some teams have changed their name and/or location.
len(df.opp.unique())

40

In [14]:
# some teams have changed their name and/or location, which created another opponent.
# manually replacing team names.
df.loc[df["opp"] == "Washington Redskins", "opp"] = "Washington Commanders"
df.loc[df["opp"] == "Washington Football Team", "opp"] = "Washington Commanders"
df.loc[df["opp"] == "Oakland Raiders", "opp"] = "Las Vegas Raiders"
df.loc[df["opp"] == "Los Angeles Raiders", "opp"] = "Las Vegas Raiders"
df.loc[df["opp"] == "Houston Oilers", "opp"] = "Tennessee Titans"
df.loc[df["opp"] == "Tennessee Oilers", "opp"] = "Tennessee Titans"
df.loc[df["opp"] == "San Diego Chargers", "opp"] = "Los Angeles Chargers"
df.loc[df["opp"] == "St. Louis Rams", "opp"] = "Los Angeles Rams"

In [15]:
# verifying that there are only 32 teams in the df.opp column now.
len(df.opp.unique())

32

In [16]:
# first sort the df by season and week
df = df.sort_values(by=['season', 'week'], ascending=[False, True])

In [17]:
# reset index ###########
df = df.reset_index(drop=True)

In [18]:
# right now, we have two records for each game played because we scraped the data for each team's season
# we need to reduce the df to only have one record for each game played
# there should only be 16 week 1 entries for a given season, but here we have 32 week 1 entries
df.head(33)

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts
0,2022,BUF,1,Thu,September 8,8:20PM ET,W,,1-0,@,Los Angeles Rams,31.0,10.0,23.0,413.0,292.0,121.0,4.0,19.0,243.0,191.0,52.0,3.0,13.89,10.29,-3.96
1,2022,MIA,1,Sun,September 11,1:00PM ET,W,,1-0,,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98
2,2022,NE,1,Sun,September 11,1:00PM ET,L,,0-1,@,Miami Dolphins,7.0,20.0,17.0,271.0,193.0,78.0,3.0,18.0,307.0,242.0,65.0,,-8.28,-6.08,-1.98
3,2022,NYJ,1,Sun,September 11,1:00PM ET,L,,0-1,,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04
4,2022,CIN,1,Sun,September 11,1:00PM ET,L,OT,0-1,,Pittsburgh Steelers,20.0,23.0,32.0,432.0,299.0,133.0,5.0,13.0,267.0,192.0,75.0,,-7.49,8.41,-4.85
5,2022,BAL,1,Sun,September 11,1:00PM ET,W,,1-0,@,New York Jets,24.0,9.0,13.0,274.0,211.0,63.0,1.0,24.0,380.0,297.0,83.0,2.0,3.65,8.07,5.04
6,2022,PIT,1,Sun,September 11,1:00PM ET,W,OT,1-0,@,Cincinnati Bengals,23.0,20.0,13.0,267.0,192.0,75.0,,32.0,432.0,299.0,133.0,5.0,-8.41,7.49,4.85
7,2022,CLE,1,Sun,September 11,1:00PM ET,W,,1-0,@,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66
8,2022,JAX,1,Sun,September 11,1:00PM ET,L,,0-1,@,Washington Commanders,22.0,28.0,24.0,383.0,260.0,123.0,1.0,26.0,390.0,305.0,85.0,3.0,3.64,-9.54,-0.26
9,2022,TEN,1,Sun,September 11,4:25PM ET,L,,0-1,,New York Giants,20.0,21.0,19.0,359.0,266.0,93.0,1.0,19.0,394.0,156.0,238.0,2.0,5.97,-3.72,-2.44


In [19]:
# Team name column was also abbreviated, so it needs to match to the opp name column too
# y will be the list of team names from the opp column
# z will be the list of team abbrviations from the team column
z = set(df['team'].values.tolist())
y = set(df['opp'].values.tolist())
z = list(z)
z.sort()
y = list(y)
y.sort()

In [20]:
# verifying that the opp list only has 32 unqiue teams now
len(y)
len(set(y))

32

In [21]:
y

['Arizona Cardinals',
 'Atlanta Falcons',
 'Baltimore Ravens',
 'Buffalo Bills',
 'Carolina Panthers',
 'Chicago Bears',
 'Cincinnati Bengals',
 'Cleveland Browns',
 'Dallas Cowboys',
 'Denver Broncos',
 'Detroit Lions',
 'Green Bay Packers',
 'Houston Texans',
 'Indianapolis Colts',
 'Jacksonville Jaguars',
 'Kansas City Chiefs',
 'Las Vegas Raiders',
 'Los Angeles Chargers',
 'Los Angeles Rams',
 'Miami Dolphins',
 'Minnesota Vikings',
 'New England Patriots',
 'New Orleans Saints',
 'New York Giants',
 'New York Jets',
 'Philadelphia Eagles',
 'Pittsburgh Steelers',
 'San Francisco 49ers',
 'Seattle Seahawks',
 'Tampa Bay Buccaneers',
 'Tennessee Titans',
 'Washington Commanders']

In [22]:
# we must reorder a couple of teams in z so that it matches y
z

['ARZ',
 'ATL',
 'BAL',
 'BUF',
 'CAR',
 'CHI',
 'CIN',
 'CLE',
 'DAL',
 'DEN',
 'DET',
 'GB',
 'HOU',
 'IND',
 'JAX',
 'KC',
 'LAC',
 'LAR',
 'LV',
 'MIA',
 'MIN',
 'NE',
 'NO',
 'NYG',
 'NYJ',
 'PHI',
 'PIT',
 'SEA',
 'SF',
 'TB',
 'TEN',
 'WAS']

In [23]:
z = z[0:16] + z[18:19] + z[16:17] + z[17:18] + z[19:-5] + z[-4:-3] + z[-5:-4] + z[-3:]
z

['ARZ',
 'ATL',
 'BAL',
 'BUF',
 'CAR',
 'CHI',
 'CIN',
 'CLE',
 'DAL',
 'DEN',
 'DET',
 'GB',
 'HOU',
 'IND',
 'JAX',
 'KC',
 'LV',
 'LAC',
 'LAR',
 'MIA',
 'MIN',
 'NE',
 'NO',
 'NYG',
 'NYJ',
 'PHI',
 'PIT',
 'SF',
 'SEA',
 'TB',
 'TEN',
 'WAS']

In [24]:
print(len(z))
print(len(set(z)))

32
32


In [25]:
# creating dictionary with the two lists (z & y)
team_dict = dict(zip(z, y))
len(team_dict)

32

In [26]:
# using the manually created team_dictionary to replace the team names.
df = df.replace({"team": team_dict})

In [27]:
df

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts
0,2022,Buffalo Bills,1,Thu,September 8,8:20PM ET,W,,1-0,@,Los Angeles Rams,31,10,23,413,292,121,4,19,243,191,52,3,13.89,10.29,-3.96
1,2022,Miami Dolphins,1,Sun,September 11,1:00PM ET,W,,1-0,,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98
2,2022,New England Patriots,1,Sun,September 11,1:00PM ET,L,,0-1,@,Miami Dolphins,7.0,20.0,17.0,271.0,193.0,78.0,3.0,18.0,307.0,242.0,65.0,,-8.28,-6.08,-1.98
3,2022,New York Jets,1,Sun,September 11,1:00PM ET,L,,0-1,,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04
4,2022,Cincinnati Bengals,1,Sun,September 11,1:00PM ET,L,OT,0-1,,Pittsburgh Steelers,20,23,32,432,299,133,5,13,267,192,75,,-7.49,8.41,-4.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15317,1994,Kansas City Chiefs,Wild Card,Sat,December 31,4:00PM ET,L,,9-8,@,Miami Dolphins,17.0,27.0,24.0,414.0,314.0,100.0,2.0,22.0,381.0,249.0,132.0,,8.31,-15.96,-1.16
15318,1994,Minnesota Vikings,Wild Card,Sun,January 1,4:00PM ET,L,,10-7,,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65
15319,1994,Green Bay Packers,Wild Card,Sat,December 31,12:30PM ET,W,,10-7,,Detroit Lions,16.0,12.0,18.0,336.0,255.0,81.0,,9.0,171.0,175.0,-4.0,,-2.93,11.53,0.69
15320,1994,Detroit Lions,Wild Card,Sat,December 31,12:30PM ET,L,,9-8,@,Green Bay Packers,12.0,16.0,9.0,171.0,175.0,-4.0,,18.0,336.0,255.0,81.0,,-11.53,2.93,-0.69


In [28]:
# create home team and away team columns. this will replace the "opp" and "team" columns.
# which will get me towards my goal of having one record per game instead of two.
# it was difficult for me to vectorize this transformation so I used a for loop instead
h = []
a = [] 
count = 0

# organizing teams by conference since the Super Bowl home/away teams are selected this way
afc = ['Baltimore Ravens', 'Buffalo Bills', 'Cincinnati Bengals', 'Cleveland Browns', 'Denver Broncos', 'Houston Texans', 
       'Indianapolis Colts', 'Jacksonville Jaguars', 'Kansas City Chiefs', 'Las Vegas Raiders','Los Angeles Chargers', 
       'Miami Dolphins', 'New England Patriots', 'New York Jets', 'Pittsburgh Steelers', 'Tennessee Titans']

nfc = ['Arizona Cardinals', 'Atlanta Falcons', 'Carolina Panthers', 'Chicago Bears', 'Dallas Cowboys', 'Detroit Lions',
       'Green Bay Packers', 'Los Angeles Rams', 'Minnesota Vikings', 'New Orleans Saints', 'New York Giants',
       'Philadelphia Eagles', 'San Francisco 49ers', 'Seattle Seahawks', 'Tampa Bay Buccaneers', 'Washington Commanders']


for i in df['@']:
    if '@' in str(i): # the presence of a @ symbol represents the team entry as the away team and the opp team entry as the home team
        h.append(df['opp'].iloc[count])
        a.append(df['team'].iloc[count])
        
            # we must determine which teams are the home teams and away teams for the SB. 
            # even year = AFC team is the home team for the SB, odd year = NFC team is the home team for the SB
            # the game is played on neutral ground where the location (typically a new stadium) is determined years in advance
            
    elif 'N' in str(i): # this entire elif block is to organize the home/away teams in a neutral 'N' stadium specifically for SuperBowls
        if int(df['season'].iloc[count])%2 == 0:
            if df['team'].iloc[count] in afc:
                h.append(df['team'].iloc[count])
                a.append(df['opp'].iloc[count])
            else: 
                h.append(df['opp'].iloc[count])
                a.append(df['team'].iloc[count])
        else:
            if df['team'].iloc[count] in nfc:
                h.append(df['team'].iloc[count])
                a.append(df['opp'].iloc[count])
            else:
                h.append(df['opp'].iloc[count])
                a.append(df['team'].iloc[count])
                
    else:    # the lack of @ symbol represents the team entry as the home team and the opp team entry as the away team
        h.append(df['team'].iloc[count])
        a.append(df['opp'].iloc[count])
    count += 1 # this counter allows me to iterate through the column one-by-one

# using the lists h & a to create new home_team and away_team columns in the dataframe
df = df.assign(away_t = a)
df = df.assign(home_t = h)

In [29]:
# attemping a vectorized version of the above ^^^, but this fails to drop duplicate records later on
# so keep the above code for now
###afc = ['Baltimore Ravens', 'Buffalo Bills', 'Cincinnati Bengals', 'Cleveland Browns', 
###       'Denver Broncos', 'Houston Texans', 'Indianapolis Colts', 'Jacksonville Jaguars', 
###       'Kansas City Chiefs', 'Las Vegas Raiders','Los Angeles Chargers', 'Miami Dolphins', 
###       'New England Patriots', 'New York Jets', 'Pittsburgh Steelers', 'Tennessee Titans']

###nfc = ['Arizona Cardinals', 'Atlanta Falcons', 'Carolina Panthers', 'Chicago Bears', 
###       'Dallas Cowboys', 'Detroit Lions', 'Green Bay Packers', 'Los Angeles Rams', 
###       'Minnesota Vikings', 'New Orleans Saints', 'New York Giants', 'Philadelphia Eagles', 
###       'San Francisco 49ers', 'Seattle Seahawks', 'Tampa Bay Buccaneers', 'Washington Commanders']

###def create_home_away_teams(df):
###    df['away_t'] = np.where(df['@'].str.contains('@'), df['team'], np.where(df['@'].str.contains('N'), 
###                           np.where(df['season'].astype(int) % 2 == 0, np.where(df['team'].isin(afc), df['opp'], df['team']), 
###                           np.where(df['team'].isin(nfc), df['opp'], df['team'])), df['opp']))
###    df['home_t'] = np.where(df['@'].str.contains('@'), df['opp'], np.where(df['@'].str.contains('N'), 
###                           np.where(df['season'].astype(int) % 2 == 0, np.where(df['team'].isin(afc), df['team'], df['opp']), 
###                           np.where(df['team'].isin(nfc), df['team'], df['opp'])), df['team']))
###    return df

###df = create_home_away_teams(df)

In [30]:
df

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t
0,2022,Buffalo Bills,1,Thu,September 8,8:20PM ET,W,,1-0,@,Los Angeles Rams,31,10,23,413,292,121,4,19,243,191,52,3,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams
1,2022,Miami Dolphins,1,Sun,September 11,1:00PM ET,W,,1-0,,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins
2,2022,New England Patriots,1,Sun,September 11,1:00PM ET,L,,0-1,@,Miami Dolphins,7.0,20.0,17.0,271.0,193.0,78.0,3.0,18.0,307.0,242.0,65.0,,-8.28,-6.08,-1.98,New England Patriots,Miami Dolphins
3,2022,New York Jets,1,Sun,September 11,1:00PM ET,L,,0-1,,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets
4,2022,Cincinnati Bengals,1,Sun,September 11,1:00PM ET,L,OT,0-1,,Pittsburgh Steelers,20,23,32,432,299,133,5,13,267,192,75,,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15317,1994,Kansas City Chiefs,Wild Card,Sat,December 31,4:00PM ET,L,,9-8,@,Miami Dolphins,17.0,27.0,24.0,414.0,314.0,100.0,2.0,22.0,381.0,249.0,132.0,,8.31,-15.96,-1.16,Kansas City Chiefs,Miami Dolphins
15318,1994,Minnesota Vikings,Wild Card,Sun,January 1,4:00PM ET,L,,10-7,,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings
15319,1994,Green Bay Packers,Wild Card,Sat,December 31,12:30PM ET,W,,10-7,,Detroit Lions,16.0,12.0,18.0,336.0,255.0,81.0,,9.0,171.0,175.0,-4.0,,-2.93,11.53,0.69,Detroit Lions,Green Bay Packers
15320,1994,Detroit Lions,Wild Card,Sat,December 31,12:30PM ET,L,,9-8,@,Green Bay Packers,12.0,16.0,9.0,171.0,175.0,-4.0,,18.0,336.0,255.0,81.0,,-11.53,2.93,-0.69,Detroit Lions,Green Bay Packers


### Reduce the number of rows per game from two to one

In [31]:
# we have two rows of data for each game so we will reduce it to one row for each game
df = df.drop_duplicates(subset=['season', 'week', 'home_t', 'away_t'])

In [32]:
# reset index
df = df.reset_index(drop=True)

In [33]:
df = df.sort_values(by=['season', 'week'], ascending=[False, True])
# thinks season and week columns are string datatypes

In [34]:
df

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t
0,2022,Buffalo Bills,1,Thu,September 8,8:20PM ET,W,,1-0,@,Los Angeles Rams,31,10,23,413,292,121,4,19,243,191,52,3,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams
1,2022,Miami Dolphins,1,Sun,September 11,1:00PM ET,W,,1-0,,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins
2,2022,New York Jets,1,Sun,September 11,1:00PM ET,L,,0-1,,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets
3,2022,Cincinnati Bengals,1,Sun,September 11,1:00PM ET,L,OT,0-1,,Pittsburgh Steelers,20,23,32,432,299,133,5,13,267,192,75,,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals
4,2022,Cleveland Browns,1,Sun,September 11,1:00PM ET,W,,1-0,@,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7656,1994,Los Angeles Chargers,SuperBowl,Sun,January 29,6:00PM ET,L,,13-6,N,San Francisco 49ers,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,,-3.94,-27.02,4.06,San Francisco 49ers,Los Angeles Chargers
7657,1994,Miami Dolphins,Wild Card,Sat,December 31,4:00PM ET,W,,11-6,,Kansas City Chiefs,27.0,17.0,22.0,381.0,249.0,132.0,,24.0,414.0,314.0,100.0,2.0,15.96,-8.31,1.16,Kansas City Chiefs,Miami Dolphins
7658,1994,New England Patriots,Wild Card,Sun,January 1,12:30PM ET,L,,10-7,@,Cleveland Browns,13.0,20.0,20.0,303.0,246.0,57.0,3.0,22.0,379.0,254.0,125.0,1.0,-11.86,-8.13,10.97,New England Patriots,Cleveland Browns
7659,1994,Minnesota Vikings,Wild Card,Sun,January 1,4:00PM ET,L,,10-7,,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings


In [35]:
df.head(30)

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t
0,2022,Buffalo Bills,1,Thu,September 8,8:20PM ET,W,,1-0,@,Los Angeles Rams,31.0,10.0,23.0,413.0,292.0,121.0,4.0,19.0,243.0,191.0,52.0,3.0,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams
1,2022,Miami Dolphins,1,Sun,September 11,1:00PM ET,W,,1-0,,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins
2,2022,New York Jets,1,Sun,September 11,1:00PM ET,L,,0-1,,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets
3,2022,Cincinnati Bengals,1,Sun,September 11,1:00PM ET,L,OT,0-1,,Pittsburgh Steelers,20.0,23.0,32.0,432.0,299.0,133.0,5.0,13.0,267.0,192.0,75.0,,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals
4,2022,Cleveland Browns,1,Sun,September 11,1:00PM ET,W,,1-0,@,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers
5,2022,Jacksonville Jaguars,1,Sun,September 11,1:00PM ET,L,,0-1,@,Washington Commanders,22.0,28.0,24.0,383.0,260.0,123.0,1.0,26.0,390.0,305.0,85.0,3.0,3.64,-9.54,-0.26,Jacksonville Jaguars,Washington Commanders
6,2022,Tennessee Titans,1,Sun,September 11,4:25PM ET,L,,0-1,,New York Giants,20.0,21.0,19.0,359.0,266.0,93.0,1.0,19.0,394.0,156.0,238.0,2.0,5.97,-3.72,-2.44,New York Giants,Tennessee Titans
7,2022,Indianapolis Colts,1,Sun,September 11,1:00PM ET,T,OT,0-0-1,@,Houston Texans,20.0,20.0,33.0,517.0,340.0,177.0,2.0,20.0,299.0,222.0,77.0,1.0,5.67,2.14,-9.43,Indianapolis Colts,Houston Texans
8,2022,Kansas City Chiefs,1,Sun,September 11,4:25PM ET,W,,1-0,@,Arizona Cardinals,44.0,21.0,33.0,488.0,360.0,128.0,1.0,18.0,282.0,179.0,103.0,,33.41,-2.29,-6.88,Kansas City Chiefs,Arizona Cardinals
9,2022,Los Angeles Chargers,1,Sun,September 11,4:25PM ET,W,,1-0,,Las Vegas Raiders,24.0,19.0,18.0,355.0,279.0,76.0,,18.0,320.0,256.0,64.0,3.0,6.3,2.73,-1.95,Las Vegas Raiders,Los Angeles Chargers


In [36]:
df.tail(30)

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t
7631,1994,Kansas City Chiefs,8,Sun,October 23,1:00PM ET,W,,5-2,,Seattle Seahawks,38.0,23.0,24.0,477.0,305.0,172.0,,16.0,263.0,96.0,167.0,2.0,16.95,-0.26,-1.67,Seattle Seahawks,Kansas City Chiefs
7632,1994,Las Vegas Raiders,8,Sun,October 23,4:00PM ET,W,,3-4,,Atlanta Falcons,30.0,17.0,21.0,313.0,197.0,116.0,1.0,15.0,255.0,185.0,70.0,2.0,1.44,1.86,4.1,Atlanta Falcons,Las Vegas Raiders
7633,1994,Dallas Cowboys,8,Sun,October 23,4:00PM ET,W,,6-1,@,Arizona Cardinals,28.0,21.0,14.0,312.0,237.0,75.0,,22.0,315.0,208.0,107.0,,10.71,-3.98,-3.24,Dallas Cowboys,Arizona Cardinals
7634,1994,Minnesota Vikings,8,Thu,October 20,8:00PM ET,W,OT,5-2,,Green Bay Packers,13.0,10.0,16.0,252.0,230.0,22.0,2.0,12.0,158.0,87.0,71.0,2.0,-22.19,34.26,-3.95,Green Bay Packers,Minnesota Vikings
7635,1994,Detroit Lions,8,Sun,October 23,1:00PM ET,W,,3-4,,Chicago Bears,21.0,16.0,8.0,232.0,49.0,183.0,3.0,22.0,402.0,285.0,117.0,4.0,-17.35,10.02,5.27,Chicago Bears,Detroit Lions
7636,1994,Tampa Bay Buccaneers,8,Sun,October 23,4:00PM ET,L,,2-5,@,San Francisco 49ers,16.0,41.0,17.0,235.0,125.0,110.0,1.0,28.0,451.0,270.0,181.0,1.0,-3.33,-24.24,-1.83,Tampa Bay Buccaneers,San Francisco 49ers
7637,1994,New Orleans Saints,8,Sun,October 23,1:00PM ET,W,,3-5,,Los Angeles Rams,37.0,34.0,18.0,314.0,196.0,118.0,1.0,13.0,242.0,157.0,85.0,3.0,-6.28,13.61,-10.3,Los Angeles Rams,New Orleans Saints
7638,1994,Miami Dolphins,9,Sun,October 30,4:00PM ET,W,,6-2,@,New England Patriots,23.0,3.0,20.0,338.0,198.0,140.0,2.0,11.0,188.0,142.0,46.0,3.0,-5.64,27.59,1.87,Miami Dolphins,New England Patriots
7639,1994,Indianapolis Colts,9,Sun,October 30,4:00PM ET,W,,4-5,,New York Jets,28.0,25.0,22.0,330.0,151.0,179.0,5.0,14.0,256.0,199.0,57.0,1.0,-0.71,14.87,-7.33,New York Jets,Indianapolis Colts
7640,1994,Buffalo Bills,9,Sun,October 30,1:00PM ET,W,,5-3,,Kansas City Chiefs,44.0,10.0,17.0,357.0,180.0,177.0,,16.0,266.0,179.0,87.0,5.0,9.49,23.88,4.11,Kansas City Chiefs,Buffalo Bills


In [37]:
#df.loc[df['team'] == 'New York Jets'].tail(50)

In [38]:
df[df.week == 'SuperBowl']
# we must determine which teams are the home teams and away teams for the SB. 
# even year = AFC team is the home team for the SB, odd year = NFC team is the home team for the SB
# the game is played on neutral ground where the location (typically a new stadium) is determined years in advance.
# the for loop above already determined this for us. this code is just to verify that the home/away teams of each SB are accurate.

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t
278,2022,Kansas City Chiefs,SuperBowl,Sun,February 12,6:30PM ET,W,,17-3,N,Philadelphia Eagles,38.0,35.0,21.0,340.0,182.0,158.0,,25.0,417.0,302.0,115.0,1.0,19.2,-16.09,0.1,Philadelphia Eagles,Kansas City Chiefs
563,2021,Cincinnati Bengals,SuperBowl,Sun,February 13,6:30PM ET,L,,13-8,N,Los Angeles Rams,20.0,23.0,15.0,305.0,226.0,79.0,,17.0,313.0,270.0,43.0,2.0,-5.48,-0.42,1.93,Cincinnati Bengals,Los Angeles Rams
832,2020,Kansas City Chiefs,SuperBowl,Sun,February 7,6:30PM ET,L,,16-3,N,Tampa Bay Buccaneers,9.0,31.0,22.0,350.0,243.0,107.0,2.0,26.0,340.0,195.0,145.0,,-7.9,-10.75,-1.18,Tampa Bay Buccaneers,Kansas City Chiefs
1101,2019,Kansas City Chiefs,SuperBowl,Sun,February 2,6:30PM ET,W,,15-4,N,San Francisco 49ers,31.0,20.0,26.0,397.0,268.0,129.0,2.0,21.0,351.0,210.0,141.0,2.0,18.54,-5.19,-1.58,Kansas City Chiefs,San Francisco 49ers
1368,2018,New England Patriots,SuperBowl,Sun,February 3,6:30PM ET,W,,14-5,N,Los Angeles Rams,13.0,3.0,22.0,407.0,253.0,154.0,1.0,14.0,260.0,198.0,62.0,1.0,2.46,11.82,-4.34,Los Angeles Rams,New England Patriots
1635,2017,New England Patriots,SuperBowl,Sun,February 4,6:30PM ET,L,,15-4,N,Philadelphia Eagles,33.0,41.0,29.0,613.0,500.0,113.0,1.0,25.0,538.0,374.0,164.0,1.0,27.61,-29.26,-4.65,New England Patriots,Philadelphia Eagles
1902,2016,New England Patriots,SuperBowl,Sun,February 5,6:30PM ET,W,OT,17-2,N,Atlanta Falcons,34.0,28.0,37.0,546.0,442.0,104.0,2.0,17.0,344.0,240.0,104.0,1.0,13.47,-7.42,-1.32,Atlanta Falcons,New England Patriots
2169,2015,Denver Broncos,SuperBowl,Sun,February 7,6:39PM ET,W,,15-4,N,Carolina Panthers,24.0,10.0,11.0,194.0,104.0,90.0,2.0,21.0,315.0,197.0,118.0,4.0,-20.81,20.91,11.64,Denver Broncos,Carolina Panthers
2436,2014,New England Patriots,SuperBowl,Sun,February 1,6:30PM ET,W,,15-4,N,Seattle Seahawks,28.0,24.0,25.0,377.0,320.0,57.0,2.0,20.0,396.0,234.0,162.0,1.0,13.67,-10.4,1.37,Seattle Seahawks,New England Patriots
2703,2013,Denver Broncos,SuperBowl,Sun,February 2,6:32PM ET,L,,15-4,N,Seattle Seahawks,8.0,43.0,18.0,306.0,279.0,27.0,4.0,17.0,341.0,206.0,135.0,,-16.32,-7.18,-9.66,Denver Broncos,Seattle Seahawks


### Converting Missing Values and Categorical Data Types

In [39]:
df.isnull().sum()

season                  0
team                    0
week                    0
day                     0
date                    0
time                    0
result                  0
ot                   7187
record                  0
@                    3836
opp                     0
points_scored           0
points_allowed          0
1st_downs               0
totyd                   0
passyd                  1
rushyd                  0
to                   1868
1st_downs_allowed       0
totyd_allowed           0
passyd_allowed          1
rushyd_allowed          0
to_forced            1510
off_exp_pts             0
def_exp_pts             0
sts_exp_pts             0
away_t                  0
home_t                  0
dtype: int64

In [40]:
df.dtypes

season                int64
team                 object
week                 object
day                  object
date                 object
time                 object
result               object
ot                   object
record               object
@                    object
opp                  object
points_scored        object
points_allowed       object
1st_downs            object
totyd                object
passyd               object
rushyd               object
to                   object
1st_downs_allowed    object
totyd_allowed        object
passyd_allowed       object
rushyd_allowed       object
to_forced            object
off_exp_pts          object
def_exp_pts          object
sts_exp_pts          object
away_t               object
home_t               object
dtype: object

In [41]:
# convert NaN values for columns "ot" (overtime), "to" (turnovers), and "to_forced" to the integer 0.
df['ot'] = df['ot'].fillna(0)
df['to'] = df['to'].fillna(0)
df['to_forced'] = df['to_forced'].fillna(0)

In [42]:
# convert misc values for columns "home/away" (home team?) and "ot" (overtime) to a binary integer
df.loc[df["@"] == "@", "@"] = 0
df.loc[df["@"] == "N", "@"] = 0
df.loc[df["ot"] == "OT", "ot"] = 1

In [43]:
# changing the data types for some of the object columns
df[["ot"]] = df[["ot"]].apply(pd.to_numeric)

In [44]:
# convert NaN values for columns "home/away" (home team?) to the integer 1.
df['@'] = df['@'].fillna(1)

In [45]:
# checking how many rows still have missing values in them
df[df.isnull().any(axis=1)]

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t
3637,2009,New York Jets,17,Sun,January 3,8:31PM ET,W,0,9-7,1,Cincinnati Bengals,37.0,0.0,21.0,320.0,63.0,257.0,0.0,5.0,72.0,,72.0,3.0,10.82,33.61,-6.11,Cincinnati Bengals,New York Jets
5246,2003,Las Vegas Raiders,17,Sun,December 28,4:15PM ET,L,0,4-12,0,Los Angeles Chargers,14.0,21.0,9.0,141.0,,141.0,1.0,20.0,337.0,74.0,263.0,0.0,-25.91,-1.64,16.75,Las Vegas Raiders,Los Angeles Chargers


In [46]:
# converting the last four missing values to zero. The net passing yards equaled zero due to sacks.
df['passyd'] = df['passyd'].fillna(0)
df['passyd_allowed'] = df['passyd_allowed'].fillna(0)

In [47]:
# verifying the data types in the df
df.dtypes

season                int64
team                 object
week                 object
day                  object
date                 object
time                 object
result               object
ot                    int64
record               object
@                     int64
opp                  object
points_scored        object
points_allowed       object
1st_downs            object
totyd                object
passyd               object
rushyd               object
to                   object
1st_downs_allowed    object
totyd_allowed        object
passyd_allowed       object
rushyd_allowed       object
to_forced            object
off_exp_pts          object
def_exp_pts          object
sts_exp_pts          object
away_t               object
home_t               object
dtype: object

In [48]:
# checking how many rows still have missing values in them
len(df[df.isnull().any(axis=1)])

0

### Converting Date and Time columns

In [49]:
# "day" of the week needs to be converted into an integer.
day_dict = {"Tue":1, "Wed":2, "Thu":3, "Fri":4, "Sat":5, "Sun":6, "Mon":7}

# using the manually created day_dictionary to replace the day of the week.
df["day_of_week"] = df["day"].map(day_dict)

In [50]:
df

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t,day_of_week
0,2022,Buffalo Bills,1,Thu,September 8,8:20PM ET,W,0,1-0,0,Los Angeles Rams,31,10,23,413,292,121,4,19,243,191,52,3,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams,3
1,2022,Miami Dolphins,1,Sun,September 11,1:00PM ET,W,0,1-0,1,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,0,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins,6
2,2022,New York Jets,1,Sun,September 11,1:00PM ET,L,0,0-1,1,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets,6
3,2022,Cincinnati Bengals,1,Sun,September 11,1:00PM ET,L,1,0-1,1,Pittsburgh Steelers,20,23,32,432,299,133,5,13,267,192,75,0,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals,6
4,2022,Cleveland Browns,1,Sun,September 11,1:00PM ET,W,0,1-0,0,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,0,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7656,1994,Los Angeles Chargers,SuperBowl,Sun,January 29,6:00PM ET,L,0,13-6,0,San Francisco 49ers,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,0,-3.94,-27.02,4.06,San Francisco 49ers,Los Angeles Chargers,6
7657,1994,Miami Dolphins,Wild Card,Sat,December 31,4:00PM ET,W,0,11-6,1,Kansas City Chiefs,27.0,17.0,22.0,381.0,249.0,132.0,0,24.0,414.0,314.0,100.0,2.0,15.96,-8.31,1.16,Kansas City Chiefs,Miami Dolphins,5
7658,1994,New England Patriots,Wild Card,Sun,January 1,12:30PM ET,L,0,10-7,0,Cleveland Browns,13.0,20.0,20.0,303.0,246.0,57.0,3.0,22.0,379.0,254.0,125.0,1.0,-11.86,-8.13,10.97,New England Patriots,Cleveland Browns,6
7659,1994,Minnesota Vikings,Wild Card,Sun,January 1,4:00PM ET,L,0,10-7,1,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings,6


In [51]:
# creating a year column out of the season column for future date transformations
df['year'] = df['season'] 

In [52]:
# Split the 'date' column into two new columns: 'month' and 'day'
df[['month', 'day']] = df['date'].str.split(' ', expand=True)

# Convert the 'day' column to integer type
df['day'] = df['day'].astype(int)

In [53]:
df

Unnamed: 0,season,team,week,day,date,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t,day_of_week,year,month
0,2022,Buffalo Bills,1,8,September 8,8:20PM ET,W,0,1-0,0,Los Angeles Rams,31,10,23,413,292,121,4,19,243,191,52,3,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams,3,2022,September
1,2022,Miami Dolphins,1,11,September 11,1:00PM ET,W,0,1-0,1,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,0,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins,6,2022,September
2,2022,New York Jets,1,11,September 11,1:00PM ET,L,0,0-1,1,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets,6,2022,September
3,2022,Cincinnati Bengals,1,11,September 11,1:00PM ET,L,1,0-1,1,Pittsburgh Steelers,20,23,32,432,299,133,5,13,267,192,75,0,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals,6,2022,September
4,2022,Cleveland Browns,1,11,September 11,1:00PM ET,W,0,1-0,0,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,0,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers,6,2022,September
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7656,1994,Los Angeles Chargers,SuperBowl,29,January 29,6:00PM ET,L,0,13-6,0,San Francisco 49ers,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,0,-3.94,-27.02,4.06,San Francisco 49ers,Los Angeles Chargers,6,1994,January
7657,1994,Miami Dolphins,Wild Card,31,December 31,4:00PM ET,W,0,11-6,1,Kansas City Chiefs,27.0,17.0,22.0,381.0,249.0,132.0,0,24.0,414.0,314.0,100.0,2.0,15.96,-8.31,1.16,Kansas City Chiefs,Miami Dolphins,5,1994,December
7658,1994,New England Patriots,Wild Card,1,January 1,12:30PM ET,L,0,10-7,0,Cleveland Browns,13.0,20.0,20.0,303.0,246.0,57.0,3.0,22.0,379.0,254.0,125.0,1.0,-11.86,-8.13,10.97,New England Patriots,Cleveland Browns,6,1994,January
7659,1994,Minnesota Vikings,Wild Card,1,January 1,4:00PM ET,L,0,10-7,1,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings,6,1994,January


In [54]:
# converted to vectorized solution instead ^
# date need to be converted into a numerical value (month and day_of_month)
###c = []
###v = []
###for i in df['date']:
###    c.append((i.split(' ')[0]))
###    v.append(int(i.split(' ')[1]))
###df = df.assign(month = c)
###df = df.assign(day = v)

In [55]:
# "month of the year needs to be converted into an integer. NFL games start in August (preseason) and end in February (super bowl).
month_dict = {"January":1, "February":2, "March":3, "April":4, "May":5, "June":6, 
            "July":7, "August":8, "September":9, "October":10, "November":11, "December":12}

# using the manually created day_dictionary to replace the day of the week.
df = df.replace({"month": month_dict}) 

In [56]:
# we can drop the date column after converting it
df = df.drop('date', axis=1)

In [57]:
# Extract the hour from the 'time' column
df['hour'] = df['time'].str[0].astype(int)

# Add 12 to the 'hour' column for times in the PM
df['hour'] += df['time'].str.contains('PM').astype(int) * 12

In [58]:
# converted into vectorized solution ^
# time needs to be converted into a numerical value
###b = []
###for i in df['time']:
###    if 'PM' in i:
###        b.append(int(i[0]) + 12)
###    else:
###        b.append(int(i[0]))
###df = df.assign(hour = b)

In [59]:
df

Unnamed: 0,season,team,week,day,time,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t,day_of_week,year,month,hour
0,2022,Buffalo Bills,1,8,8:20PM ET,W,0,1-0,0,Los Angeles Rams,31,10,23,413,292,121,4,19,243,191,52,3,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams,3,2022,9,20
1,2022,Miami Dolphins,1,11,1:00PM ET,W,0,1-0,1,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,0,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins,6,2022,9,13
2,2022,New York Jets,1,11,1:00PM ET,L,0,0-1,1,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets,6,2022,9,13
3,2022,Cincinnati Bengals,1,11,1:00PM ET,L,1,0-1,1,Pittsburgh Steelers,20,23,32,432,299,133,5,13,267,192,75,0,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals,6,2022,9,13
4,2022,Cleveland Browns,1,11,1:00PM ET,W,0,1-0,0,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,0,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers,6,2022,9,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7656,1994,Los Angeles Chargers,SuperBowl,29,6:00PM ET,L,0,13-6,0,San Francisco 49ers,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,0,-3.94,-27.02,4.06,San Francisco 49ers,Los Angeles Chargers,6,1994,1,18
7657,1994,Miami Dolphins,Wild Card,31,4:00PM ET,W,0,11-6,1,Kansas City Chiefs,27.0,17.0,22.0,381.0,249.0,132.0,0,24.0,414.0,314.0,100.0,2.0,15.96,-8.31,1.16,Kansas City Chiefs,Miami Dolphins,5,1994,12,16
7658,1994,New England Patriots,Wild Card,1,12:30PM ET,L,0,10-7,0,Cleveland Browns,13.0,20.0,20.0,303.0,246.0,57.0,3.0,22.0,379.0,254.0,125.0,1.0,-11.86,-8.13,10.97,New England Patriots,Cleveland Browns,6,1994,1,13
7659,1994,Minnesota Vikings,Wild Card,1,4:00PM ET,L,0,10-7,1,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings,6,1994,1,16


In [60]:
# we can drop the time column after converting it
df = df.drop('time', axis=1)

In [61]:
# creating an a datetime type column
df['date'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])

In [62]:
df

Unnamed: 0,season,team,week,day,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t,day_of_week,year,month,hour,date
0,2022,Buffalo Bills,1,8,W,0,1-0,0,Los Angeles Rams,31,10,23,413,292,121,4,19,243,191,52,3,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams,3,2022,9,20,2022-09-08 20:00:00
1,2022,Miami Dolphins,1,11,W,0,1-0,1,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,0,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins,6,2022,9,13,2022-09-11 13:00:00
2,2022,New York Jets,1,11,L,0,0-1,1,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets,6,2022,9,13,2022-09-11 13:00:00
3,2022,Cincinnati Bengals,1,11,L,1,0-1,1,Pittsburgh Steelers,20,23,32,432,299,133,5,13,267,192,75,0,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals,6,2022,9,13,2022-09-11 13:00:00
4,2022,Cleveland Browns,1,11,W,0,1-0,0,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,0,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers,6,2022,9,13,2022-09-11 13:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7656,1994,Los Angeles Chargers,SuperBowl,29,L,0,13-6,0,San Francisco 49ers,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,0,-3.94,-27.02,4.06,San Francisco 49ers,Los Angeles Chargers,6,1994,1,18,1994-01-29 18:00:00
7657,1994,Miami Dolphins,Wild Card,31,W,0,11-6,1,Kansas City Chiefs,27.0,17.0,22.0,381.0,249.0,132.0,0,24.0,414.0,314.0,100.0,2.0,15.96,-8.31,1.16,Kansas City Chiefs,Miami Dolphins,5,1994,12,16,1994-12-31 16:00:00
7658,1994,New England Patriots,Wild Card,1,L,0,10-7,0,Cleveland Browns,13.0,20.0,20.0,303.0,246.0,57.0,3.0,22.0,379.0,254.0,125.0,1.0,-11.86,-8.13,10.97,New England Patriots,Cleveland Browns,6,1994,1,13,1994-01-01 13:00:00
7659,1994,Minnesota Vikings,Wild Card,1,L,0,10-7,1,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings,6,1994,1,16,1994-01-01 16:00:00


In [63]:
# verifying the data types in the df
df.dtypes

season                        int64
team                         object
week                         object
day                           int64
result                       object
ot                            int64
record                       object
@                             int64
opp                          object
points_scored                object
points_allowed               object
1st_downs                    object
totyd                        object
passyd                       object
rushyd                       object
to                           object
1st_downs_allowed            object
totyd_allowed                object
passyd_allowed               object
rushyd_allowed               object
to_forced                    object
off_exp_pts                  object
def_exp_pts                  object
sts_exp_pts                  object
away_t                       object
home_t                       object
day_of_week                   int64
year                        

In [64]:
# we now have to create columns so that it is home_team_pts_scored, home_team_pts_allowed, etc.
# since we created home/away team columns, the stats for those games need to be converted accordingly
# those stats were based on the initial data source where we had two records for each game
# creating columns containing game stats from the home_team's point of view
ht_pts_scored = []
ht_pts_allowed = [] 
ht_1st_downs = []
ht_1st_downs_allowed = []
ht_totyd = []
ht_passyd = []
ht_rushyd = [] 
ht_to = []
ht_totyd_allowed = []
ht_passyd_allowed = []
ht_rushyd_allowed = []
ht_to_forced = []

counter = 0

for i in df['team']:
    if df['team'].iloc[counter] == df['home_t'].iloc[counter]:
        ht_pts_scored.append(df['points_scored'].iloc[counter])
        ht_pts_allowed.append(df['points_allowed'].iloc[counter])
        ht_1st_downs.append(df['1st_downs'].iloc[counter])
        ht_1st_downs_allowed.append(df['1st_downs_allowed'].iloc[counter])
        ht_totyd.append(df['totyd'].iloc[counter])
        ht_totyd_allowed.append(df['totyd_allowed'].iloc[counter])
        ht_passyd.append(df['passyd'].iloc[counter])
        ht_passyd_allowed.append(df['passyd_allowed'].iloc[counter])
        ht_rushyd.append(df['rushyd'].iloc[counter])
        ht_rushyd_allowed.append(df['rushyd_allowed'].iloc[counter])
        ht_to.append(df['to'].iloc[counter])
        ht_to_forced.append(df['to_forced'].iloc[counter])
    else:
        ht_pts_scored.append(df['points_allowed'].iloc[counter])
        ht_pts_allowed.append(df['points_scored'].iloc[counter])
        ht_1st_downs.append(df['1st_downs_allowed'].iloc[counter])
        ht_1st_downs_allowed.append(df['1st_downs'].iloc[counter])
        ht_totyd.append(df['totyd_allowed'].iloc[counter])
        ht_totyd_allowed.append(df['totyd'].iloc[counter])
        ht_passyd.append(df['passyd_allowed'].iloc[counter])
        ht_passyd_allowed.append(df['passyd'].iloc[counter])
        ht_rushyd.append(df['rushyd_allowed'].iloc[counter])
        ht_rushyd_allowed.append(df['rushyd'].iloc[counter])
        ht_to.append(df['to_forced'].iloc[counter])
        ht_to_forced.append(df['to'].iloc[counter])
    counter += 1

In [65]:
df

Unnamed: 0,season,team,week,day,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t,day_of_week,year,month,hour,date
0,2022,Buffalo Bills,1,8,W,0,1-0,0,Los Angeles Rams,31,10,23,413,292,121,4,19,243,191,52,3,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams,3,2022,9,20,2022-09-08 20:00:00
1,2022,Miami Dolphins,1,11,W,0,1-0,1,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,0,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins,6,2022,9,13,2022-09-11 13:00:00
2,2022,New York Jets,1,11,L,0,0-1,1,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets,6,2022,9,13,2022-09-11 13:00:00
3,2022,Cincinnati Bengals,1,11,L,1,0-1,1,Pittsburgh Steelers,20,23,32,432,299,133,5,13,267,192,75,0,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals,6,2022,9,13,2022-09-11 13:00:00
4,2022,Cleveland Browns,1,11,W,0,1-0,0,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,0,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers,6,2022,9,13,2022-09-11 13:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7656,1994,Los Angeles Chargers,SuperBowl,29,L,0,13-6,0,San Francisco 49ers,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,0,-3.94,-27.02,4.06,San Francisco 49ers,Los Angeles Chargers,6,1994,1,18,1994-01-29 18:00:00
7657,1994,Miami Dolphins,Wild Card,31,W,0,11-6,1,Kansas City Chiefs,27.0,17.0,22.0,381.0,249.0,132.0,0,24.0,414.0,314.0,100.0,2.0,15.96,-8.31,1.16,Kansas City Chiefs,Miami Dolphins,5,1994,12,16,1994-12-31 16:00:00
7658,1994,New England Patriots,Wild Card,1,L,0,10-7,0,Cleveland Browns,13.0,20.0,20.0,303.0,246.0,57.0,3.0,22.0,379.0,254.0,125.0,1.0,-11.86,-8.13,10.97,New England Patriots,Cleveland Browns,6,1994,1,13,1994-01-01 13:00:00
7659,1994,Minnesota Vikings,Wild Card,1,L,0,10-7,1,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings,6,1994,1,16,1994-01-01 16:00:00


In [66]:
ht_pts_scored[:5]

['10', '20.0', '9.0', '20', '24.0']

In [67]:
df = df.assign(ht_pts_scored = ht_pts_scored)
df = df.assign(ht_pts_allowed = ht_pts_allowed)
df = df.assign(ht_1st_downs = ht_1st_downs)
df = df.assign(ht_totyd = ht_totyd)
df = df.assign(ht_passyd = ht_passyd)
df = df.assign(ht_rushyd = ht_rushyd)
df = df.assign(ht_to = ht_to)
df = df.assign(ht_1st_downs_allowed = ht_1st_downs_allowed)
df = df.assign(ht_totyd_allowed = ht_totyd_allowed)
df = df.assign(ht_passyd_allowed = ht_passyd_allowed)
df = df.assign(ht_rushyd_allowed = ht_rushyd_allowed)
df = df.assign(ht_to_forced = ht_to_forced)

In [68]:
df

Unnamed: 0,season,team,week,day,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t,day_of_week,year,month,hour,date,ht_pts_scored,ht_pts_allowed,ht_1st_downs,ht_totyd,ht_passyd,ht_rushyd,ht_to,ht_1st_downs_allowed,ht_totyd_allowed,ht_passyd_allowed,ht_rushyd_allowed,ht_to_forced
0,2022,Buffalo Bills,1,8,W,0,1-0,0,Los Angeles Rams,31,10,23,413,292,121,4,19,243,191,52,3,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams,3,2022,9,20,2022-09-08 20:00:00,10,31,19,243,191,52,3,23,413,292,121,4
1,2022,Miami Dolphins,1,11,W,0,1-0,1,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,0,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins,6,2022,9,13,2022-09-11 13:00:00,20.0,7.0,18.0,307.0,242.0,65.0,0,17.0,271.0,193.0,78.0,3.0
2,2022,New York Jets,1,11,L,0,0-1,1,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets,6,2022,9,13,2022-09-11 13:00:00,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0
3,2022,Cincinnati Bengals,1,11,L,1,0-1,1,Pittsburgh Steelers,20,23,32,432,299,133,5,13,267,192,75,0,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals,6,2022,9,13,2022-09-11 13:00:00,20,23,32,432,299,133,5,13,267,192,75,0
4,2022,Cleveland Browns,1,11,W,0,1-0,0,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,0,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers,6,2022,9,13,2022-09-11 13:00:00,24.0,26.0,15.0,261.0,207.0,54.0,1.0,23.0,355.0,138.0,217.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7656,1994,Los Angeles Chargers,SuperBowl,29,L,0,13-6,0,San Francisco 49ers,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,0,-3.94,-27.02,4.06,San Francisco 49ers,Los Angeles Chargers,6,1994,1,18,1994-01-29 18:00:00,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,0
7657,1994,Miami Dolphins,Wild Card,31,W,0,11-6,1,Kansas City Chiefs,27.0,17.0,22.0,381.0,249.0,132.0,0,24.0,414.0,314.0,100.0,2.0,15.96,-8.31,1.16,Kansas City Chiefs,Miami Dolphins,5,1994,12,16,1994-12-31 16:00:00,27.0,17.0,22.0,381.0,249.0,132.0,0,24.0,414.0,314.0,100.0,2.0
7658,1994,New England Patriots,Wild Card,1,L,0,10-7,0,Cleveland Browns,13.0,20.0,20.0,303.0,246.0,57.0,3.0,22.0,379.0,254.0,125.0,1.0,-11.86,-8.13,10.97,New England Patriots,Cleveland Browns,6,1994,1,13,1994-01-01 13:00:00,20.0,13.0,22.0,379.0,254.0,125.0,1.0,20.0,303.0,246.0,57.0,3.0
7659,1994,Minnesota Vikings,Wild Card,1,L,0,10-7,1,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings,6,1994,1,16,1994-01-01 16:00:00,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0


In [69]:
# we probably need to develop 'home_team_record' and 'away_team_record' columns, but it might be too difficult to engineer


### Creating the Target feature

In [70]:
# Create the conditions
conditions = [
    (df['team'] == df['home_t']) & (df['result'] == 'W'),
    (df['team'] != df['home_t']) & (df['result'] == 'L')
]

# Create the choices
choices = [1, 0]

# Use np.select to create the 'home_team_wins' column
df['home_team_wins'] = np.select(conditions, choices)

In [71]:
# created a vectorized solution instead ^
# we have to create a target feature 'home_team_wins', where 0 = home team did not win, 1 = home team won
###ht_wins = []
###counter = 0
###
###for i in df['result']:
###    if df['team'].iloc[counter] == df['home_t'].iloc[counter]:
###        if i == 'W':
###            ht_wins.append(1)
###        else:
###            ht_wins.append(0)
###        counter += 1
###    else:
###        if i == 'W':
###            ht_wins.append(0)
###        else:
###            ht_wins.append(1)
###        counter += 1   

###df = df.assign(ht_wins = ht_wins)

In [72]:
df

Unnamed: 0,season,team,week,day,result,ot,record,@,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t,day_of_week,year,month,hour,date,ht_pts_scored,ht_pts_allowed,ht_1st_downs,ht_totyd,ht_passyd,ht_rushyd,ht_to,ht_1st_downs_allowed,ht_totyd_allowed,ht_passyd_allowed,ht_rushyd_allowed,ht_to_forced,home_team_wins
0,2022,Buffalo Bills,1,8,W,0,1-0,0,Los Angeles Rams,31,10,23,413,292,121,4,19,243,191,52,3,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams,3,2022,9,20,2022-09-08 20:00:00,10,31,19,243,191,52,3,23,413,292,121,4,0
1,2022,Miami Dolphins,1,11,W,0,1-0,1,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,0,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins,6,2022,9,13,2022-09-11 13:00:00,20.0,7.0,18.0,307.0,242.0,65.0,0,17.0,271.0,193.0,78.0,3.0,1
2,2022,New York Jets,1,11,L,0,0-1,1,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets,6,2022,9,13,2022-09-11 13:00:00,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,0
3,2022,Cincinnati Bengals,1,11,L,1,0-1,1,Pittsburgh Steelers,20,23,32,432,299,133,5,13,267,192,75,0,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals,6,2022,9,13,2022-09-11 13:00:00,20,23,32,432,299,133,5,13,267,192,75,0,0
4,2022,Cleveland Browns,1,11,W,0,1-0,0,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,0,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers,6,2022,9,13,2022-09-11 13:00:00,24.0,26.0,15.0,261.0,207.0,54.0,1.0,23.0,355.0,138.0,217.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7656,1994,Los Angeles Chargers,SuperBowl,29,L,0,13-6,0,San Francisco 49ers,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,0,-3.94,-27.02,4.06,San Francisco 49ers,Los Angeles Chargers,6,1994,1,18,1994-01-29 18:00:00,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,0,0
7657,1994,Miami Dolphins,Wild Card,31,W,0,11-6,1,Kansas City Chiefs,27.0,17.0,22.0,381.0,249.0,132.0,0,24.0,414.0,314.0,100.0,2.0,15.96,-8.31,1.16,Kansas City Chiefs,Miami Dolphins,5,1994,12,16,1994-12-31 16:00:00,27.0,17.0,22.0,381.0,249.0,132.0,0,24.0,414.0,314.0,100.0,2.0,1
7658,1994,New England Patriots,Wild Card,1,L,0,10-7,0,Cleveland Browns,13.0,20.0,20.0,303.0,246.0,57.0,3.0,22.0,379.0,254.0,125.0,1.0,-11.86,-8.13,10.97,New England Patriots,Cleveland Browns,6,1994,1,13,1994-01-01 13:00:00,20.0,13.0,22.0,379.0,254.0,125.0,1.0,20.0,303.0,246.0,57.0,3.0,0
7659,1994,Minnesota Vikings,Wild Card,1,L,0,10-7,1,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings,6,1994,1,16,1994-01-01 16:00:00,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,0


In [73]:
df.dtypes

season                           int64
team                            object
week                            object
day                              int64
result                          object
ot                               int64
record                          object
@                                int64
opp                             object
points_scored                   object
points_allowed                  object
1st_downs                       object
totyd                           object
passyd                          object
rushyd                          object
to                              object
1st_downs_allowed               object
totyd_allowed                   object
passyd_allowed                  object
rushyd_allowed                  object
to_forced                       object
off_exp_pts                     object
def_exp_pts                     object
sts_exp_pts                     object
away_t                          object
home_t                   

### Dropping unnecessary columns 

In [74]:
df = df.drop(['record', 'team', 'result', '@', 'opp', 'points_scored', 'points_allowed', '1st_downs', 
              'totyd', 'passyd', 'rushyd', 'to', '1st_downs_allowed', 'totyd_allowed', 'passyd_allowed', 
              'rushyd_allowed', 'to_forced', 'off_exp_pts', 'def_exp_pts', 'sts_exp_pts', 'year'], axis=1)

In [75]:
df.dtypes

season                           int64
week                            object
day                              int64
ot                               int64
away_t                          object
home_t                          object
day_of_week                      int64
month                            int64
hour                             int64
date                    datetime64[ns]
ht_pts_scored                   object
ht_pts_allowed                  object
ht_1st_downs                    object
ht_totyd                        object
ht_passyd                       object
ht_rushyd                       object
ht_to                           object
ht_1st_downs_allowed            object
ht_totyd_allowed                object
ht_passyd_allowed               object
ht_rushyd_allowed               object
ht_to_forced                    object
home_team_wins                   int64
dtype: object

### Feature Engineering

In [76]:
# the goal is to obtain a new column that contains rolling_averages for a team over the last four games.
# should the end of a season reset the rolling percentage??? Yes
# creating a df that is organized at the team and season level
grouped_teams = df.groupby("home_t")

In [77]:
# viewing one instance
group_arz = grouped_teams.get_group("Arizona Cardinals")
group_arz

Unnamed: 0,season,week,day,ot,away_t,home_t,day_of_week,month,hour,date,ht_pts_scored,ht_pts_allowed,ht_1st_downs,ht_totyd,ht_passyd,ht_rushyd,ht_to,ht_1st_downs_allowed,ht_totyd_allowed,ht_passyd_allowed,ht_rushyd_allowed,ht_to_forced,home_team_wins
8,2022,1,11,0,Kansas City Chiefs,Arizona Cardinals,6,9,16,2022-09-11 16:00:00,21.0,44.0,18.0,282.0,179.0,103.0,0,33.0,488.0,360.0,128.0,1.0,0
43,2022,11,21,0,San Francisco 49ers,Arizona Cardinals,7,11,20,2022-11-21 20:00:00,10.0,38.0,19.0,314.0,247.0,67.0,2.0,21.0,387.0,228.0,159.0,0,0
53,2022,12,27,0,Los Angeles Chargers,Arizona Cardinals,6,11,16,2022-11-27 16:00:00,24.0,25.0,20.0,366.0,185.0,181.0,2.0,22.0,311.0,246.0,65.0,0,0
77,2022,14,12,0,New England Patriots,Arizona Cardinals,7,12,20,2022-12-12 20:00:00,13.0,27.0,19.0,323.0,210.0,113.0,2.0,18.0,328.0,225.0,103.0,1.0,0
119,2022,16,25,1,Tampa Bay Buccaneers,Arizona Cardinals,6,12,20,2022-12-25 20:00:00,16.0,19.0,16.0,325.0,204.0,121.0,3.0,21.0,396.0,281.0,115.0,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7527,1994,16,18,0,Cincinnati Bengals,Arizona Cardinals,6,12,16,1994-12-18 16:00:00,28.0,7.0,24.0,364.0,212.0,152.0,0,12.0,189.0,125.0,64.0,3.0,0
7560,1994,2,11,0,New York Giants,Arizona Cardinals,6,9,20,1994-09-11 20:00:00,17.0,20.0,11.0,174.0,135.0,39.0,3.0,19.0,206.0,88.0,118.0,2.0,0
7599,1994,5,2,0,Minnesota Vikings,Arizona Cardinals,6,10,16,1994-10-02 16:00:00,17.0,7.0,21.0,309.0,200.0,109.0,2.0,19.0,358.0,340.0,18.0,4.0,1
7633,1994,8,23,0,Dallas Cowboys,Arizona Cardinals,6,10,16,1994-10-23 16:00:00,21.0,28.0,22.0,315.0,208.0,107.0,0,14.0,312.0,237.0,75.0,0,0


def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(4).mean() # closed='left' inside rolling parameters
    # the problem is that it is using the fourth games' info to predict the fourth game and beyond.
    # closed='left' would solve this problem, but that requires datetime. How do I resolve this???
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [79]:
def rolling_averages(group, cols, new_cols):
    """
    Compute rolling averages of the given columns for a pandas DataFrame grouped by some categorical variable.

    Parameters:
    group (pandas.DataFrame): The DataFrame to group by.
    cols (list of str): The names of the columns to compute rolling averages for.
    new_cols (list of str): The names of the new columns to create with the rolling averages.

    Returns:
    pandas.DataFrame: The original DataFrame with the new columns added, and rows with missing values dropped.
    """
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(4).mean() # Use the last 4 rows of each group to compute the rolling average.
    # Note that by default, rolling() uses a "right closed" window, meaning that the last value in the window is included.
    # We want to use a "left closed" window, meaning that the first value in the window is included, so we pass the parameter
    # closed='left' to rolling(). However, this requires that the 'date' column be of type datetime, so make sure to convert
    # it beforehand if it isn't already.
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols) # Drop rows with missing values in the new columns.
    return group

In [80]:
# creating a list of the columns I plan on finding the rolling averages for
cols = ["home_team_wins", "ht_pts_scored", "ht_pts_allowed",
        "ht_totyd", "ht_to", "ht_totyd_allowed", "ht_to_forced",
        "ht_passyd", "ht_rushyd", "ht_passyd_allowed", "ht_rushyd_allowed",
        "ht_1st_downs", "ht_1st_downs_allowed", "ot"]

new_cols = [f"{c}_rolling" for c in cols]

In [81]:
# first four weeks are being dropped because of na values from rolling(4)
# should we bring datetime back??
# testing the function on one group (arz)
rolling_averages(group_arz, cols, new_cols)

Unnamed: 0,season,week,day,ot,away_t,home_t,day_of_week,month,hour,date,ht_pts_scored,ht_pts_allowed,ht_1st_downs,ht_totyd,ht_passyd,ht_rushyd,ht_to,ht_1st_downs_allowed,ht_totyd_allowed,ht_passyd_allowed,ht_rushyd_allowed,ht_to_forced,home_team_wins,home_team_wins_rolling,ht_pts_scored_rolling,ht_pts_allowed_rolling,ht_totyd_rolling,ht_to_rolling,ht_totyd_allowed_rolling,ht_to_forced_rolling,ht_passyd_rolling,ht_rushyd_rolling,ht_passyd_allowed_rolling,ht_rushyd_allowed_rolling,ht_1st_downs_rolling,ht_1st_downs_allowed_rolling,ot_rolling
7641,1994,9,30,1,Pittsburgh Steelers,Arizona Cardinals,6,10,20,1994-10-30 20:00:00,20.0,17.0,16.0,335.0,236.0,99.0,1.0,12.0,317.0,232.0,85.0,3.0,0,0.25,18.75,18.00,283.25,1.50,298.25,2.25,194.75,88.50,224.25,74.00,17.50,16.00,0.25
7478,1994,12,20,0,Philadelphia Eagles,Arizona Cardinals,6,11,16,1994-11-20 16:00:00,12.0,6.0,16.0,281.0,123.0,158.0,1.0,14.0,185.0,110.0,75.0,2.0,1,0.50,17.50,14.50,310.00,1.00,293.00,2.25,191.75,118.25,229.75,63.25,18.75,14.75,0.25
7491,1994,13,27,1,Chicago Bears,Arizona Cardinals,6,11,16,1994-11-27 16:00:00,16.0,19.0,15.0,244.0,177.0,67.0,1.0,20.0,318.0,186.0,132.0,2.0,0,0.25,17.25,17.50,293.75,0.75,283.00,1.75,186.00,107.75,191.25,91.75,17.25,15.00,0.50
7519,1994,15,11,0,Washington Commanders,Arizona Cardinals,6,12,16,1994-12-11 16:00:00,17.0,15.0,14.0,278.0,194.0,84.0,1.0,19.0,406.0,283.0,123.0,2.0,1,0.50,16.25,14.25,284.50,1.00,306.50,2.25,182.50,102.00,202.75,103.75,15.25,16.25,0.50
7527,1994,16,18,0,Cincinnati Bengals,Arizona Cardinals,6,12,16,1994-12-18 16:00:00,28.0,7.0,24.0,364.0,212.0,152.0,0,12.0,189.0,125.0,64.0,3.0,0,0.50,18.25,11.75,291.75,0.75,274.50,2.25,176.50,115.25,176.00,98.50,17.25,16.25,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271,2022,9,6,0,Seattle Seahawks,Arizona Cardinals,6,11,16,2022-11-06 16:00:00,21.0,31.0,15.0,262.0,140.0,122.0,1.0,27.0,421.0,263.0,158.0,1.0,0,0.00,23.00,26.25,329.00,0.50,402.75,1.25,215.75,113.25,282.25,120.50,20.50,22.75,0.00
43,2022,11,21,0,San Francisco 49ers,Arizona Cardinals,7,11,20,2022-11-21 20:00:00,10.0,38.0,19.0,314.0,247.0,67.0,2.0,21.0,387.0,228.0,159.0,0,0,0.00,22.50,30.75,316.25,1.00,414.75,1.00,203.75,112.50,279.50,135.25,19.50,24.25,0.00
53,2022,12,27,0,Los Angeles Chargers,Arizona Cardinals,6,11,16,2022-11-27 16:00:00,24.0,25.0,20.0,366.0,185.0,181.0,2.0,22.0,311.0,246.0,65.0,0,0,0.00,24.25,32.00,317.00,1.25,403.25,1.00,190.25,126.75,286.50,116.75,18.75,23.75,0.00
77,2022,14,12,0,New England Patriots,Arizona Cardinals,7,12,20,2022-12-12 20:00:00,13.0,27.0,19.0,323.0,210.0,113.0,2.0,18.0,328.0,225.0,103.0,1.0,0,0.00,17.00,30.25,316.25,1.75,361.75,0.50,195.50,120.75,240.50,121.25,18.25,22.00,0.00


In [82]:
#apply the new rolling_averages columns to each group
df_rolling = df.groupby(["season", "week"]).apply(lambda x: rolling_averages(x, cols, new_cols))

       'ht_passyd_allowed', 'ht_pts_allowed', 'ht_pts_scored', 'ht_rushyd',
       'ht_rushyd_allowed', 'ht_to', 'ht_to_forced', 'ht_totyd',
       'ht_totyd_allowed'],
      dtype='object')
  rolling_stats = group[cols].rolling(4).mean() # Use the last 4 rows of each group to compute the rolling average.


ValueError: Columns must be same length as key

In [83]:
df_rolling

NameError: name 'df_rolling' is not defined

In [None]:
# dropping extra index levels
df_rolling = df_rolling.droplevel('season')
df_rolling = df_rolling.droplevel('week')
df_rolling

In [None]:
df_rolling.dtypes

In [None]:
from sklearn.preprocessing import OneHotEncoder

#creating instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='ignore')

#perform one-hot encoding on 'team' column 
encoder_df = pd.DataFrame(encoder.fit_transform(df_rolling[['away_t', 'home_t']]).toarray())

#merge one-hot encoded columns back with original DataFrame
a_df = df_rolling.join(encoder_df)

#view final df
a_df

#### Getting "NaN" values because the index doesnt match (i think)

In [None]:
df_rolling

In [None]:
# using pandas to convert the prepared dataframe into a csv file that is model ready.
df_rolling.to_csv("Data/transformed.csv", index=False)