# 02 Data Prep

In [1]:
# common imports
import requests
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import sys
from datetime import datetime

sys.executable

'/opt/anaconda3/bin/python'

In [2]:
# will display all the columns in the df now
pd.set_option('display.max_columns', 500)

In [3]:
sys.path

['/Users/mirror/Desktop/GitHub/NFL_Game_Predictor',
 '/opt/anaconda3/lib/python38.zip',
 '/opt/anaconda3/lib/python3.8',
 '/opt/anaconda3/lib/python3.8/lib-dynload',
 '',
 '/Users/mirror/.local/lib/python3.8/site-packages',
 '/opt/anaconda3/lib/python3.8/site-packages',
 '/opt/anaconda3/lib/python3.8/site-packages/aeosa',
 '/opt/anaconda3/lib/python3.8/site-packages/IPython/extensions',
 '/Users/mirror/.ipython']

In [4]:
# Importing data from csv file
# avoid using hard coded paths like the next two lines
# path = "/Users/mirror/Desktop/GitHub/nfl_game_predictor/data/scraped_data.csv"
# df = pd.read_csv(path) # instead, use this

from src.config import DATA_DIR
path = DATA_DIR / 'scraped_data.csv'
df = pd.read_csv(path)

df.head(30)

Unnamed: 0,season,team,week,day,date,time,result,ot,record,home_team,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts
0,2022,BUF,1,Thu,September 8,8:20PM ET,W,,1-0,@,Los Angeles Rams,31.0,10.0,23.0,413.0,292.0,121.0,4.0,19.0,243.0,191.0,52.0,3.0,13.89,10.29,-3.96
1,2022,BUF,2,Mon,September 19,7:15PM ET,W,,2-0,,Tennessee Titans,41.0,7.0,23.0,414.0,313.0,101.0,,12.0,187.0,107.0,80.0,4.0,17.69,18.01,1.55
2,2022,BUF,3,Sun,September 25,1:00PM ET,L,,2-1,@,Miami Dolphins,19.0,21.0,31.0,497.0,382.0,115.0,1.0,15.0,212.0,171.0,41.0,,15.88,-7.45,-4.86
3,2022,BUF,4,Sun,October 2,1:00PM ET,W,,3-1,@,Baltimore Ravens,23.0,20.0,22.0,326.0,201.0,125.0,2.0,22.0,296.0,134.0,162.0,2.0,2.1,2.66,-1.69
4,2022,BUF,5,Sun,October 9,1:00PM ET,W,,4-1,,Pittsburgh Steelers,38.0,3.0,21.0,552.0,432.0,120.0,2.0,23.0,364.0,310.0,54.0,2.0,20.66,9.42,3.54
5,2022,BUF,6,Sun,October 16,4:25PM ET,W,,5-1,@,Kansas City Chiefs,24.0,20.0,26.0,443.0,318.0,125.0,1.0,23.0,387.0,319.0,68.0,2.0,11.13,-6.45,0.56
6,2022,BUF,8,Sun,October 30,8:20PM ET,W,,6-1,,Green Bay Packers,27.0,17.0,20.0,369.0,216.0,153.0,2.0,21.0,398.0,190.0,208.0,1.0,12.79,-5.55,4.01
7,2022,NYJ,1,Sun,September 11,1:00PM ET,L,,0-1,,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04
8,2022,NYJ,2,Sun,September 18,1:00PM ET,W,,1-1,@,Cleveland Browns,31.0,30.0,20.0,402.0,309.0,93.0,1.0,29.0,405.0,221.0,184.0,1.0,15.19,-17.92,5.38
9,2022,NYJ,3,Sun,September 25,1:00PM ET,L,,1-2,,Cincinnati Bengals,12.0,27.0,19.0,328.0,252.0,76.0,4.0,20.0,330.0,261.0,69.0,1.0,-16.45,-3.99,6.98


### Quick Exploration

In [5]:
df.shape

(15000, 26)

In [6]:
df.dtypes

season                 int64
team                  object
week                  object
day                   object
date                  object
time                  object
result                object
ot                    object
record                object
home_team             object
opp                   object
points_scored        float64
points_allowed       float64
1st_downs            float64
totyd                float64
passyd               float64
rushyd               float64
to                   float64
1st_downs_allowed    float64
totyd_allowed        float64
passyd_allowed       float64
rushyd_allowed       float64
to_forced            float64
off_exp_pts          float64
def_exp_pts          float64
sts_exp_pts          float64
dtype: object

In [7]:
df.isnull().sum()

season                   0
team                     0
week                     0
day                      0
date                     0
time                     0
result                   0
ot                   14078
record                   0
home_team             7472
opp                      0
points_scored            0
points_allowed           0
1st_downs                0
totyd                    0
passyd                   2
rushyd                   0
to                    3287
1st_downs_allowed        0
totyd_allowed            0
passyd_allowed           2
rushyd_allowed           0
to_forced             3287
off_exp_pts              0
def_exp_pts              0
sts_exp_pts              0
dtype: int64

In [8]:
df['ot'].isnull().sum()

14078

In [9]:
# discovering how many times an opponent played a game
df.opp.value_counts()

New England Patriots        506
Green Bay Packers           498
Pittsburgh Steelers         493
Indianapolis Colts          489
Philadelphia Eagles         487
Seattle Seahawks            485
San Francisco 49ers         484
Denver Broncos              480
Kansas City Chiefs          478
Minnesota Vikings           477
Dallas Cowboys              477
New Orleans Saints          476
Atlanta Falcons             475
New York Giants             474
Tampa Bay Buccaneers        474
New York Jets               471
Miami Dolphins              470
Arizona Cardinals           469
Chicago Bears               468
Buffalo Bills               468
Cincinnati Bengals          468
Detroit Lions               463
Carolina Panthers           458
Jacksonville Jaguars        455
Baltimore Ravens            452
Washington Redskins         423
Cleveland Browns            414
Oakland Raiders             408
Tennessee Titans            394
San Diego Chargers          382
St. Louis Rams              346
Houston 

In [10]:
# # importing a new library that was recommended to me by my mentor
# import sweetviz as sv

# df_report = sv.analyze(df)
# df_report.show_html()
# #df_report.show_notebook()

In [11]:
# compares two dataframes via sweetviz
#df_comp = sv.compare(df)

In [12]:
# another way to get great insights is to use the comparison functionality to split your dataset into 2 sub-populations.
#df_comp_intra = sv.compare_intra(df)

### Initial Data Cleansing

In [13]:
# there are only supposed to be 32 teams in the NFL. some teams have changed their name and/or location.
len(df.opp.unique())

40

In [14]:
# some teams have changed their name and/or location, which created another opponent.
# manually replacing team names.
df.loc[df["opp"] == "Washington Redskins", "opp"] = "Washington Commanders"
df.loc[df["opp"] == "Washington Football Team", "opp"] = "Washington Commanders"
df.loc[df["opp"] == "Oakland Raiders", "opp"] = "Las Vegas Raiders"
df.loc[df["opp"] == "Los Angeles Raiders", "opp"] = "Las Vegas Raiders"
df.loc[df["opp"] == "Houston Oilers", "opp"] = "Tennessee Titans"
df.loc[df["opp"] == "Tennessee Oilers", "opp"] = "Tennessee Titans"
df.loc[df["opp"] == "San Diego Chargers", "opp"] = "Los Angeles Chargers"
df.loc[df["opp"] == "St. Louis Rams", "opp"] = "Los Angeles Rams"

In [15]:
# verifying that there are only 32 teams in the df.opp column now.
len(df.opp.unique())

32

In [16]:
# first sort the df by season and week
df = df.sort_values(by=['season', 'week'], ascending=[False, True])

In [17]:
# right now, we two records for each game played because we scraped the data for each team's season
# we need to reduce the df to only have one record for each game played
df.head(33)

Unnamed: 0,season,team,week,day,date,time,result,ot,record,home_team,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts
0,2022,BUF,1,Thu,September 8,8:20PM ET,W,,1-0,@,Los Angeles Rams,31.0,10.0,23.0,413.0,292.0,121.0,4.0,19.0,243.0,191.0,52.0,3.0,13.89,10.29,-3.96
7,2022,NYJ,1,Sun,September 11,1:00PM ET,L,,0-1,,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04
15,2022,MIA,1,Sun,September 11,1:00PM ET,W,,1-0,,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98
23,2022,NE,1,Sun,September 11,1:00PM ET,L,,0-1,@,Miami Dolphins,7.0,20.0,17.0,271.0,193.0,78.0,3.0,18.0,307.0,242.0,65.0,,-8.28,-6.08,-1.98
31,2022,BAL,1,Sun,September 11,1:00PM ET,W,,1-0,@,New York Jets,24.0,9.0,13.0,274.0,211.0,63.0,1.0,24.0,380.0,297.0,83.0,2.0,3.65,8.07,5.04
39,2022,CIN,1,Sun,September 11,1:00PM ET,L,OT,0-1,,Pittsburgh Steelers,20.0,23.0,32.0,432.0,299.0,133.0,5.0,13.0,267.0,192.0,75.0,,-7.49,8.41,-4.85
47,2022,CLE,1,Sun,September 11,1:00PM ET,W,,1-0,@,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66
55,2022,PIT,1,Sun,September 11,1:00PM ET,W,OT,1-0,@,Cincinnati Bengals,23.0,20.0,13.0,267.0,192.0,75.0,,32.0,432.0,299.0,133.0,5.0,-8.41,7.49,4.85
63,2022,TEN,1,Sun,September 11,4:25PM ET,L,,0-1,,New York Giants,20.0,21.0,19.0,359.0,266.0,93.0,1.0,19.0,394.0,156.0,238.0,2.0,5.97,-3.72,-2.44
70,2022,IND,1,Sun,September 11,1:00PM ET,T,OT,0-0-1,@,Houston Texans,20.0,20.0,33.0,514.0,340.0,174.0,2.0,20.0,299.0,222.0,77.0,1.0,5.67,2.14,-9.43


In [18]:
df

Unnamed: 0,season,team,week,day,date,time,result,ot,record,home_team,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts
0,2022,BUF,1,Thu,September 8,8:20PM ET,W,,1-0,@,Los Angeles Rams,31.0,10.0,23.0,413.0,292.0,121.0,4.0,19.0,243.0,191.0,52.0,3.0,13.89,10.29,-3.96
7,2022,NYJ,1,Sun,September 11,1:00PM ET,L,,0-1,,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04
15,2022,MIA,1,Sun,September 11,1:00PM ET,W,,1-0,,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98
23,2022,NE,1,Sun,September 11,1:00PM ET,L,,0-1,@,Miami Dolphins,7.0,20.0,17.0,271.0,193.0,78.0,3.0,18.0,307.0,242.0,65.0,,-8.28,-6.08,-1.98
31,2022,BAL,1,Sun,September 11,1:00PM ET,W,,1-0,@,New York Jets,24.0,9.0,13.0,274.0,211.0,63.0,1.0,24.0,380.0,297.0,83.0,2.0,3.65,8.07,5.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14716,1994,KC,Wild Card,Sat,December 31,4:00PM ET,L,,9-8,@,Miami Dolphins,17.0,27.0,24.0,414.0,314.0,100.0,2.0,22.0,381.0,249.0,132.0,,8.31,-15.96,-1.16
14863,1994,MIN,Wild Card,Sun,January 1,4:00PM ET,L,,10-7,,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65
14880,1994,GB,Wild Card,Sat,December 31,12:30PM ET,W,,10-7,,Detroit Lions,16.0,12.0,18.0,336.0,255.0,81.0,,9.0,171.0,175.0,-4.0,,-2.93,11.53,0.69
14898,1994,DET,Wild Card,Sat,December 31,12:30PM ET,L,,9-8,@,Green Bay Packers,12.0,16.0,9.0,171.0,175.0,-4.0,,18.0,336.0,255.0,81.0,,-11.53,2.93,-0.69


In [19]:
# Team name was abbreviated, making it match to opp name
z = set(df['team'].values.tolist())
y = set(df['opp'].values.tolist())
z = list(z)
z.sort()
y = list(y)
y.sort()

In [20]:
len(y)

32

In [21]:
y

['Arizona Cardinals',
 'Atlanta Falcons',
 'Baltimore Ravens',
 'Buffalo Bills',
 'Carolina Panthers',
 'Chicago Bears',
 'Cincinnati Bengals',
 'Cleveland Browns',
 'Dallas Cowboys',
 'Denver Broncos',
 'Detroit Lions',
 'Green Bay Packers',
 'Houston Texans',
 'Indianapolis Colts',
 'Jacksonville Jaguars',
 'Kansas City Chiefs',
 'Las Vegas Raiders',
 'Los Angeles Chargers',
 'Los Angeles Rams',
 'Miami Dolphins',
 'Minnesota Vikings',
 'New England Patriots',
 'New Orleans Saints',
 'New York Giants',
 'New York Jets',
 'Philadelphia Eagles',
 'Pittsburgh Steelers',
 'San Francisco 49ers',
 'Seattle Seahawks',
 'Tampa Bay Buccaneers',
 'Tennessee Titans',
 'Washington Commanders']

In [22]:
# we must reorder a couple of teams in z so that it matches y
z

['ARZ',
 'ATL',
 'BAL',
 'BUF',
 'CAR',
 'CHI',
 'CIN',
 'CLE',
 'DAL',
 'DEN',
 'DET',
 'GB',
 'HOU',
 'IND',
 'JAX',
 'KC',
 'LAC',
 'LAR',
 'LV',
 'MIA',
 'MIN',
 'NE',
 'NO',
 'NYG',
 'NYJ',
 'PHI',
 'PIT',
 'SEA',
 'SF',
 'TB',
 'TEN',
 'WAS']

In [23]:
z = z[0:16] + z[18:19] + z[16:17] + z[17:18] + z[19:-5] + z[-4:-3] + z[-5:-4] + z[-3:]
z

['ARZ',
 'ATL',
 'BAL',
 'BUF',
 'CAR',
 'CHI',
 'CIN',
 'CLE',
 'DAL',
 'DEN',
 'DET',
 'GB',
 'HOU',
 'IND',
 'JAX',
 'KC',
 'LV',
 'LAC',
 'LAR',
 'MIA',
 'MIN',
 'NE',
 'NO',
 'NYG',
 'NYJ',
 'PHI',
 'PIT',
 'SF',
 'SEA',
 'TB',
 'TEN',
 'WAS']

In [24]:
len(set(z))

32

In [25]:
# creating dictionary with the two lists (z & y)
team_dict = dict(zip(z, y))
len(team_dict)

32

In [28]:
# using the manually created team_dictionary to replace the team names.
df = df.replace({"team": team_dict})

In [29]:
df

Unnamed: 0,season,team,week,day,date,time,result,ot,record,home_team,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts
0,2022,Buffalo Bills,1,Thu,September 8,8:20PM ET,W,,1-0,@,Los Angeles Rams,31.0,10.0,23.0,413.0,292.0,121.0,4.0,19.0,243.0,191.0,52.0,3.0,13.89,10.29,-3.96
7,2022,New York Jets,1,Sun,September 11,1:00PM ET,L,,0-1,,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04
15,2022,Miami Dolphins,1,Sun,September 11,1:00PM ET,W,,1-0,,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98
23,2022,New England Patriots,1,Sun,September 11,1:00PM ET,L,,0-1,@,Miami Dolphins,7.0,20.0,17.0,271.0,193.0,78.0,3.0,18.0,307.0,242.0,65.0,,-8.28,-6.08,-1.98
31,2022,Baltimore Ravens,1,Sun,September 11,1:00PM ET,W,,1-0,@,New York Jets,24.0,9.0,13.0,274.0,211.0,63.0,1.0,24.0,380.0,297.0,83.0,2.0,3.65,8.07,5.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14716,1994,Kansas City Chiefs,Wild Card,Sat,December 31,4:00PM ET,L,,9-8,@,Miami Dolphins,17.0,27.0,24.0,414.0,314.0,100.0,2.0,22.0,381.0,249.0,132.0,,8.31,-15.96,-1.16
14863,1994,Minnesota Vikings,Wild Card,Sun,January 1,4:00PM ET,L,,10-7,,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65
14880,1994,Green Bay Packers,Wild Card,Sat,December 31,12:30PM ET,W,,10-7,,Detroit Lions,16.0,12.0,18.0,336.0,255.0,81.0,,9.0,171.0,175.0,-4.0,,-2.93,11.53,0.69
14898,1994,Detroit Lions,Wild Card,Sat,December 31,12:30PM ET,L,,9-8,@,Green Bay Packers,12.0,16.0,9.0,171.0,175.0,-4.0,,18.0,336.0,255.0,81.0,,-11.53,2.93,-0.69


In [30]:
# create home team and away team columns
h = []
a = [] 
count = 0

afc = ['Baltimore Ravens', 'Buffalo Bills', 'Cincinnati Bengals', 'Cleveland Browns', 'Denver Broncos', 'Houston Texans', 
       'Indianapolis Colts', 'Jacksonville Jaguars', 'Kansas City Chiefs', 'Las Vegas Raiders','Los Angeles Chargers', 
       'Miami Dolphins', 'New England Patriots', 'New York Jets', 'Pittsburgh Steelers', 'Tennessee Titans']

nfc = ['Arizona Cardinals', 'Atlanta Falcons', 'Carolina Panthers', 'Chicago Bears', 'Dallas Cowboys', 'Detroit Lions',
       'Green Bay Packers', 'Los Angeles Rams', 'Minnesota Vikings', 'New Orleans Saints', 'New York Giants',
       'Philadelphia Eagles', 'San Francisco 49ers', 'Seattle Seahawks', 'Tampa Bay Buccaneers', 'Washington Commanders']

for i in df['home_team']:
    if '@' in str(i):
        h.append(df['opp'].iloc[count])
        a.append(df['team'].iloc[count])
    elif 'N' in str(i): # this entire elif block is to compensate for games in a neutral 'N' stadium specifically for SuperBowls.
        if int(df['season'].iloc[count])%2 == 0:
            if df['team'].iloc[count] in afc:
                h.append(df['team'].iloc[count])
                a.append(df['opp'].iloc[count])
            else: 
                h.append(df['opp'].iloc[count])
                a.append(df['team'].iloc[count])
        else:
            if df['team'].iloc[count] in nfc:
                h.append(df['team'].iloc[count])
                a.append(df['opp'].iloc[count])
            else:
                h.append(df['opp'].iloc[count])
                a.append(df['team'].iloc[count])
    else:
        h.append(df['team'].iloc[count])
        a.append(df['opp'].iloc[count])
    count += 1
        
df = df.assign(away_t = a)
df = df.assign(home_t = h)

In [31]:
# we have two rows of data for each game so we will reduce it to one row for each game
df = df.drop_duplicates(subset=['season', 'week', 'home_t', 'away_t'])

In [32]:
# reset index
df = df.reset_index(drop=True)

In [33]:
df = df.sort_values(by=['season', 'week'], ascending=[False, True])
# thinks season and week columns are string datatypes

In [34]:
df

Unnamed: 0,season,team,week,day,date,time,result,ot,record,home_team,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t
0,2022,Buffalo Bills,1,Thu,September 8,8:20PM ET,W,,1-0,@,Los Angeles Rams,31.0,10.0,23.0,413.0,292.0,121.0,4.0,19.0,243.0,191.0,52.0,3.0,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams
1,2022,New York Jets,1,Sun,September 11,1:00PM ET,L,,0-1,,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets
2,2022,Miami Dolphins,1,Sun,September 11,1:00PM ET,W,,1-0,,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins
3,2022,Cincinnati Bengals,1,Sun,September 11,1:00PM ET,L,OT,0-1,,Pittsburgh Steelers,20.0,23.0,32.0,432.0,299.0,133.0,5.0,13.0,267.0,192.0,75.0,,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals
4,2022,Cleveland Browns,1,Sun,September 11,1:00PM ET,W,,1-0,@,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,1994,Los Angeles Chargers,SuperBowl,Sun,January 29,6:00PM ET,L,,13-6,N,San Francisco 49ers,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,,-3.94,-27.02,4.06,San Francisco 49ers,Los Angeles Chargers
7496,1994,Miami Dolphins,Wild Card,Sat,December 31,4:00PM ET,W,,11-6,,Kansas City Chiefs,27.0,17.0,22.0,381.0,249.0,132.0,,24.0,414.0,314.0,100.0,2.0,15.96,-8.31,1.16,Kansas City Chiefs,Miami Dolphins
7497,1994,New England Patriots,Wild Card,Sun,January 1,12:30PM ET,L,,10-7,@,Cleveland Browns,13.0,20.0,20.0,303.0,246.0,57.0,3.0,22.0,379.0,254.0,125.0,1.0,-11.86,-8.13,10.97,New England Patriots,Cleveland Browns
7498,1994,Minnesota Vikings,Wild Card,Sun,January 1,4:00PM ET,L,,10-7,,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings


In [35]:
df.tail(50)

Unnamed: 0,season,team,week,day,date,time,result,ot,record,home_team,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t
7450,1994,Green Bay Packers,6,Sun,October 9,1:00PM ET,W,,3-3,,Los Angeles Rams,24.0,17.0,21.0,292.0,208.0,84.0,2.0,13.0,211.0,147.0,64.0,2.0,-4.98,12.67,0.73,Los Angeles Rams,Green Bay Packers
7451,1994,Detroit Lions,6,Sun,October 9,1:00PM ET,L,,2-4,,San Francisco 49ers,21.0,27.0,21.0,340.0,242.0,98.0,2.0,20.0,233.0,143.0,90.0,,-1.85,-9.3,-0.83,San Francisco 49ers,Detroit Lions
7452,1994,Chicago Bears,6,Sun,October 9,1:00PM ET,W,,4-2,,New Orleans Saints,17.0,7.0,16.0,305.0,174.0,131.0,1.0,17.0,316.0,209.0,107.0,2.0,3.69,7.43,3.79,New Orleans Saints,Chicago Bears
7453,1994,Tampa Bay Buccaneers,6,Sun,October 9,1:00PM ET,L,,2-4,@,Atlanta Falcons,13.0,34.0,19.0,320.0,288.0,32.0,3.0,20.0,356.0,271.0,85.0,3.0,-17.94,6.57,-2.91,Tampa Bay Buccaneers,Atlanta Falcons
7454,1994,Miami Dolphins,7,Sun,October 16,1:00PM ET,W,OT,5-2,,Las Vegas Raiders,20.0,17.0,19.0,344.0,175.0,169.0,3.0,12.0,210.0,86.0,124.0,1.0,-24.79,26.41,0.39,Las Vegas Raiders,Miami Dolphins
7455,1994,New England Patriots,7,Sun,October 16,1:00PM ET,L,,3-4,@,New York Jets,17.0,24.0,19.0,324.0,220.0,104.0,4.0,17.0,242.0,70.0,172.0,2.0,-12.68,11.6,-6.53,New England Patriots,New York Jets
7456,1994,Indianapolis Colts,7,Sun,October 16,1:00PM ET,W,,3-4,@,Buffalo Bills,27.0,17.0,20.0,325.0,231.0,94.0,,22.0,329.0,250.0,79.0,2.0,13.96,-11.23,2.22,Indianapolis Colts,Buffalo Bills
7457,1994,Pittsburgh Steelers,7,Sun,October 16,1:00PM ET,W,,4-2,,Cincinnati Bengals,14.0,10.0,16.0,243.0,129.0,114.0,1.0,11.0,241.0,135.0,106.0,2.0,-9.26,15.45,-0.3,Cincinnati Bengals,Pittsburgh Steelers
7458,1994,Cleveland Browns,7,Thu,October 13,8:00PM ET,W,,5-1,@,Tennessee Titans,11.0,8.0,12.0,260.0,196.0,64.0,2.0,20.0,305.0,199.0,106.0,2.0,-4.03,6.25,0.38,Cleveland Browns,Tennessee Titans
7459,1994,Los Angeles Chargers,7,Sun,October 16,4:00PM ET,W,,6-0,@,New Orleans Saints,36.0,22.0,25.0,378.0,180.0,198.0,,24.0,284.0,180.0,104.0,2.0,15.49,-4.78,7.69,Los Angeles Chargers,New Orleans Saints


In [36]:
#df.loc[df['team'] == 'New York Jets'].tail(50)

In [37]:
df[df.week == 'SuperBowl']
# we must determine which teams are the home teams and away teams for the SB. 
# even year = AFC team is the home team for the SB, odd year = NFC team is the home team for the SB
# the game is played on neutral ground where the location is determined years in advance.

Unnamed: 0,season,team,week,day,date,time,result,ot,record,home_team,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t
402,2021,Cincinnati Bengals,SuperBowl,Sun,February 13,6:30PM ET,L,,13-8,N,Los Angeles Rams,20.0,23.0,15.0,305.0,226.0,79.0,,17.0,313.0,270.0,43.0,2.0,-5.48,-0.42,1.93,Cincinnati Bengals,Los Angeles Rams
671,2020,Kansas City Chiefs,SuperBowl,Sun,February 7,6:30PM ET,L,,16-3,N,Tampa Bay Buccaneers,9.0,31.0,22.0,350.0,243.0,107.0,2.0,26.0,340.0,195.0,145.0,,-7.9,-10.75,-1.18,Tampa Bay Buccaneers,Kansas City Chiefs
940,2019,Kansas City Chiefs,SuperBowl,Sun,February 2,6:30PM ET,W,,15-4,N,San Francisco 49ers,31.0,20.0,26.0,397.0,268.0,129.0,2.0,21.0,351.0,210.0,141.0,2.0,18.54,-5.19,-1.58,Kansas City Chiefs,San Francisco 49ers
1207,2018,New England Patriots,SuperBowl,Sun,February 3,6:30PM ET,W,,14-5,N,Los Angeles Rams,13.0,3.0,22.0,407.0,253.0,154.0,1.0,14.0,260.0,198.0,62.0,1.0,2.46,11.82,-4.34,Los Angeles Rams,New England Patriots
1474,2017,New England Patriots,SuperBowl,Sun,February 4,6:30PM ET,L,,15-4,N,Philadelphia Eagles,33.0,41.0,29.0,613.0,500.0,113.0,1.0,25.0,538.0,374.0,164.0,1.0,27.61,-29.26,-4.65,New England Patriots,Philadelphia Eagles
1741,2016,New England Patriots,SuperBowl,Sun,February 5,6:30PM ET,W,OT,17-2,N,Atlanta Falcons,34.0,28.0,37.0,546.0,442.0,104.0,2.0,17.0,344.0,240.0,104.0,1.0,13.47,-7.42,-1.32,Atlanta Falcons,New England Patriots
2008,2015,Denver Broncos,SuperBowl,Sun,February 7,6:39PM ET,W,,15-4,N,Carolina Panthers,24.0,10.0,11.0,194.0,104.0,90.0,2.0,21.0,315.0,197.0,118.0,4.0,-20.81,20.91,11.64,Denver Broncos,Carolina Panthers
2275,2014,New England Patriots,SuperBowl,Sun,February 1,6:30PM ET,W,,15-4,N,Seattle Seahawks,28.0,24.0,25.0,377.0,320.0,57.0,2.0,20.0,396.0,234.0,162.0,1.0,13.67,-10.4,1.37,Seattle Seahawks,New England Patriots
2542,2013,Denver Broncos,SuperBowl,Sun,February 2,6:32PM ET,L,,15-4,N,Seattle Seahawks,8.0,43.0,18.0,306.0,279.0,27.0,4.0,17.0,341.0,206.0,135.0,,-16.32,-7.18,-9.66,Denver Broncos,Seattle Seahawks
2809,2012,Baltimore Ravens,SuperBowl,Sun,February 3,6:31PM ET,W,,14-6,N,San Francisco 49ers,34.0,31.0,21.0,367.0,274.0,93.0,1.0,23.0,468.0,286.0,182.0,2.0,9.33,-8.82,1.8,San Francisco 49ers,Baltimore Ravens


### Converting Missing Values and Categorical Data Types

In [39]:
df.isnull().sum()

season                  0
team                    0
week                    0
day                     0
date                    0
time                    0
result                  0
ot                   7039
record                  0
home_team            3763
opp                     0
points_scored           0
points_allowed          0
1st_downs               0
totyd                   0
passyd                  1
rushyd                  0
to                   1818
1st_downs_allowed       0
totyd_allowed           0
passyd_allowed          1
rushyd_allowed          0
to_forced            1469
off_exp_pts             0
def_exp_pts             0
sts_exp_pts             0
away_t                  0
home_t                  0
dtype: int64

In [40]:
df.dtypes

season                 int64
team                  object
week                  object
day                   object
date                  object
time                  object
result                object
ot                    object
record                object
home_team             object
opp                   object
points_scored        float64
points_allowed       float64
1st_downs            float64
totyd                float64
passyd               float64
rushyd               float64
to                   float64
1st_downs_allowed    float64
totyd_allowed        float64
passyd_allowed       float64
rushyd_allowed       float64
to_forced            float64
off_exp_pts          float64
def_exp_pts          float64
sts_exp_pts          float64
away_t                object
home_t                object
dtype: object

In [41]:
# convert NaN values for columns "ot" (overtime), "to" (turnovers), and "to_forced" to the integer 0.
df['ot'] = df['ot'].fillna(0)
df['to'] = df['to'].fillna(0)
df['to_forced'] = df['to_forced'].fillna(0)

In [42]:
# convert misc values for columns "home/away" (home team?) and "ot" (overtime) to a binary integer
df.loc[df["home_team"] == "@", "home_team"] = 0
df.loc[df["home_team"] == "N", "home_team"] = 0
df.loc[df["ot"] == "OT", "ot"] = 1

In [105]:
# changing the data types for some of the object columns
df[["ot"]] = df[["ot"]].apply(pd.to_numeric)

In [43]:
# convert NaN values for columns "home/away" (home team?) to the integer 1.
df['home_team'] = df['home_team'].fillna(1)

In [44]:
# checking how many rows still have missing values in them
df[df.isnull().any(axis=1)]

Unnamed: 0,season,team,week,day,date,time,result,ot,record,home_team,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t
3476,2009,New York Jets,17,Sun,January 3,8:31PM ET,W,0,9-7,1,Cincinnati Bengals,37.0,0.0,21.0,320.0,63.0,257.0,0.0,5.0,72.0,,72.0,3.0,10.82,33.61,-6.11,Cincinnati Bengals,New York Jets
5085,2003,Las Vegas Raiders,17,Sun,December 28,4:15PM ET,L,0,4-12,0,Los Angeles Chargers,14.0,21.0,9.0,141.0,,141.0,1.0,20.0,337.0,74.0,263.0,0.0,-25.91,-1.64,16.75,Las Vegas Raiders,Los Angeles Chargers


In [45]:
# converting the last four missing values to zero. The net passing yards equaled zero due to sacks.
df['passyd'] = df['passyd'].fillna(0)
df['passyd_allowed'] = df['passyd_allowed'].fillna(0)

In [46]:
# verifying the data types in the df
df.dtypes

season                 int64
team                  object
week                  object
day                   object
date                  object
time                  object
result                object
ot                    object
record                object
home_team              int64
opp                   object
points_scored        float64
points_allowed       float64
1st_downs            float64
totyd                float64
passyd               float64
rushyd               float64
to                   float64
1st_downs_allowed    float64
totyd_allowed        float64
passyd_allowed       float64
rushyd_allowed       float64
to_forced            float64
off_exp_pts          float64
def_exp_pts          float64
sts_exp_pts          float64
away_t                object
home_t                object
dtype: object

In [47]:
# checking how many rows still have missing values in them
len(df[df.isnull().any(axis=1)])

0

### Converting Date and Time columns

In [48]:
# "day" of the week needs to be converted into an integer.
day_dict = {"Tue":1, "Wed":2, "Thu":3, "Fri":4, "Sat":5, "Sun":6, "Mon":7}

# using the manually created day_dictionary to replace the day of the week.
df = df.replace({"day_of_week": day_dict})

In [49]:
df['year'] = df['season'] 

In [50]:
# date need to be converted into a numerical value (month and day_of_month)
c = []
v = []
for i in df['date']:
    c.append((i.split(' ')[0]))
    v.append(int(i.split(' ')[1]))
df = df.assign(month = c)
df = df.assign(day = v)

In [51]:
# "month of the year needs to be converted into an integer. NFL games start in August (preseason) and end in February (super bowl).
month_dict = {"January":1, "February":2, "March":3, "April":4, "May":5, "June":6, 
            "July":7, "August":8, "September":9, "October":10, "November":11, "December":12}

# using the manually created day_dictionary to replace the day of the week.
df = df.replace({"month": month_dict}) 

In [52]:
# we can drop the date column after converting it
df = df.drop('date', axis=1)

In [53]:
# time needs to be converted into a numerical value
b = []
for i in df['time']:
    if 'PM' in i:
        b.append(int(i[0]) + 12)
    else:
        b.append(int(i[0]))
df = df.assign(hour = b)

In [54]:
# we can drop the time column after converting it
df = df.drop('time', axis=1)

In [57]:
df['date'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])

In [91]:
df

Unnamed: 0,season,team,week,day,result,ot,record,home_team,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t,year,month,hour,date,ht_pts_scored,ht_pts_allowed,ht_1st_downs,ht_1st_downs_allowed,ht_totyd,ht_totyd_allowed
0,2022,Buffalo Bills,1,8,W,0,1-0,0,Los Angeles Rams,31.0,10.0,23.0,413.0,292.0,121.0,4.0,19.0,243.0,191.0,52.0,3.0,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams,2022,9,20,2022-09-08 20:00:00,10.0,31.0,19.0,23.0,243.0,413.0
1,2022,New York Jets,1,11,L,0,0-1,1,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets,2022,9,13,2022-09-11 13:00:00,9.0,24.0,24.0,13.0,380.0,274.0
2,2022,Miami Dolphins,1,11,W,0,1-0,1,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,0.0,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins,2022,9,13,2022-09-11 13:00:00,20.0,7.0,18.0,17.0,307.0,271.0
3,2022,Cincinnati Bengals,1,11,L,1,0-1,1,Pittsburgh Steelers,20.0,23.0,32.0,432.0,299.0,133.0,5.0,13.0,267.0,192.0,75.0,0.0,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals,2022,9,13,2022-09-11 13:00:00,20.0,23.0,32.0,13.0,432.0,267.0
4,2022,Cleveland Browns,1,11,W,0,1-0,0,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,0.0,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers,2022,9,13,2022-09-11 13:00:00,24.0,26.0,15.0,23.0,261.0,355.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,1994,Los Angeles Chargers,SuperBowl,29,L,0,13-6,0,San Francisco 49ers,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,0.0,-3.94,-27.02,4.06,San Francisco 49ers,Los Angeles Chargers,1994,1,18,1994-01-29 18:00:00,26.0,49.0,20.0,28.0,354.0,455.0
7496,1994,Miami Dolphins,Wild Card,31,W,0,11-6,1,Kansas City Chiefs,27.0,17.0,22.0,381.0,249.0,132.0,0.0,24.0,414.0,314.0,100.0,2.0,15.96,-8.31,1.16,Kansas City Chiefs,Miami Dolphins,1994,12,16,1994-12-31 16:00:00,27.0,17.0,22.0,24.0,381.0,414.0
7497,1994,New England Patriots,Wild Card,1,L,0,10-7,0,Cleveland Browns,13.0,20.0,20.0,303.0,246.0,57.0,3.0,22.0,379.0,254.0,125.0,1.0,-11.86,-8.13,10.97,New England Patriots,Cleveland Browns,1994,1,13,1994-01-01 13:00:00,20.0,13.0,22.0,20.0,379.0,303.0
7498,1994,Minnesota Vikings,Wild Card,1,L,0,10-7,1,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings,1994,1,16,1994-01-01 16:00:00,18.0,35.0,22.0,18.0,389.0,308.0


In [92]:
# verifying the data types in the df
df.dtypes

season                           int64
team                            object
week                            object
day                              int64
result                          object
ot                              object
record                          object
home_team                        int64
opp                             object
points_scored                  float64
points_allowed                 float64
1st_downs                      float64
totyd                          float64
passyd                         float64
rushyd                         float64
to                             float64
1st_downs_allowed              float64
totyd_allowed                  float64
passyd_allowed                 float64
rushyd_allowed                 float64
to_forced                      float64
off_exp_pts                    float64
def_exp_pts                    float64
sts_exp_pts                    float64
away_t                          object
home_t                   

In [93]:
# we now have to create columns so that it is home_team_pts_scored, home_team_pts_allowed, etc.
ht_pts_scored = []
ht_pts_allowed = [] 
ht_1st_downs = []
ht_1st_downs_allowed = []
ht_totyd = []
ht_passyd = []
ht_rushyd = [] 
ht_to = []
ht_totyd_allowed = []
ht_passyd_allowed = []
ht_rushyd_allowed = []
ht_to_forced = []

counter = 0

for i in df['team']:
    if df['team'].iloc[counter] == df['home_t'].iloc[counter]:
        ht_pts_scored.append(df['points_scored'].iloc[counter])
        ht_pts_allowed.append(df['points_allowed'].iloc[counter])
        ht_1st_downs.append(df['1st_downs'].iloc[counter])
        ht_1st_downs_allowed.append(df['1st_downs_allowed'].iloc[counter])
        ht_totyd.append(df['totyd'].iloc[counter])
        ht_totyd_allowed.append(df['totyd_allowed'].iloc[counter])
        ht_passyd.append(df['passyd'].iloc[counter])
        ht_passyd_allowed.append(df['passyd_allowed'].iloc[counter])
        ht_rushyd.append(df['rushyd'].iloc[counter])
        ht_rushyd_allowed.append(df['rushyd_allowed'].iloc[counter])
        ht_to.append(df['to'].iloc[counter])
        ht_to_forced.append(df['to_forced'].iloc[counter])
    else:
        ht_pts_scored.append(df['points_allowed'].iloc[counter])
        ht_pts_allowed.append(df['points_scored'].iloc[counter])
        ht_1st_downs.append(df['1st_downs_allowed'].iloc[counter])
        ht_1st_downs_allowed.append(df['1st_downs'].iloc[counter])
        ht_totyd.append(df['totyd_allowed'].iloc[counter])
        ht_totyd_allowed.append(df['totyd'].iloc[counter])
        ht_passyd.append(df['passyd_allowed'].iloc[counter])
        ht_passyd_allowed.append(df['passyd'].iloc[counter])
        ht_rushyd.append(df['rushyd_allowed'].iloc[counter])
        ht_rushyd_allowed.append(df['rushyd'].iloc[counter])
        ht_to.append(df['to_forced'].iloc[counter])
        ht_to_forced.append(df['to'].iloc[counter])
    counter += 1

In [94]:
ht_pts_scored[:5]

[10.0, 9.0, 20.0, 20.0, 24.0]

In [95]:
df = df.assign(ht_pts_scored = ht_pts_scored)
df = df.assign(ht_pts_allowed = ht_pts_allowed)
df = df.assign(ht_1st_downs = ht_1st_downs)
df = df.assign(ht_totyd = ht_totyd)
df = df.assign(ht_passyd = ht_passyd)
df = df.assign(ht_rushyd = ht_rushyd)
df = df.assign(ht_to = ht_to)
df = df.assign(ht_1st_downs_allowed = ht_1st_downs_allowed)
df = df.assign(ht_totyd_allowed = ht_totyd_allowed)
df = df.assign(ht_passyd_allowed = ht_passyd_allowed)
df = df.assign(ht_rushyd_allowed = ht_rushyd_allowed)
df = df.assign(ht_to_forced = ht_to_forced)

In [96]:
df

Unnamed: 0,season,team,week,day,result,ot,record,home_team,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t,year,month,hour,date,ht_pts_scored,ht_pts_allowed,ht_1st_downs,ht_1st_downs_allowed,ht_totyd,ht_totyd_allowed,ht_passyd,ht_rushyd,ht_to,ht_passyd_allowed,ht_rushyd_allowed,ht_to_forced
0,2022,Buffalo Bills,1,8,W,0,1-0,0,Los Angeles Rams,31.0,10.0,23.0,413.0,292.0,121.0,4.0,19.0,243.0,191.0,52.0,3.0,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams,2022,9,20,2022-09-08 20:00:00,10.0,31.0,19.0,23.0,243.0,413.0,191.0,52.0,3.0,292.0,121.0,4.0
1,2022,New York Jets,1,11,L,0,0-1,1,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets,2022,9,13,2022-09-11 13:00:00,9.0,24.0,24.0,13.0,380.0,274.0,297.0,83.0,2.0,211.0,63.0,1.0
2,2022,Miami Dolphins,1,11,W,0,1-0,1,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,0.0,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins,2022,9,13,2022-09-11 13:00:00,20.0,7.0,18.0,17.0,307.0,271.0,242.0,65.0,0.0,193.0,78.0,3.0
3,2022,Cincinnati Bengals,1,11,L,1,0-1,1,Pittsburgh Steelers,20.0,23.0,32.0,432.0,299.0,133.0,5.0,13.0,267.0,192.0,75.0,0.0,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals,2022,9,13,2022-09-11 13:00:00,20.0,23.0,32.0,13.0,432.0,267.0,299.0,133.0,5.0,192.0,75.0,0.0
4,2022,Cleveland Browns,1,11,W,0,1-0,0,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,0.0,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers,2022,9,13,2022-09-11 13:00:00,24.0,26.0,15.0,23.0,261.0,355.0,207.0,54.0,1.0,138.0,217.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,1994,Los Angeles Chargers,SuperBowl,29,L,0,13-6,0,San Francisco 49ers,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,0.0,-3.94,-27.02,4.06,San Francisco 49ers,Los Angeles Chargers,1994,1,18,1994-01-29 18:00:00,26.0,49.0,20.0,28.0,354.0,455.0,287.0,67.0,3.0,316.0,139.0,0.0
7496,1994,Miami Dolphins,Wild Card,31,W,0,11-6,1,Kansas City Chiefs,27.0,17.0,22.0,381.0,249.0,132.0,0.0,24.0,414.0,314.0,100.0,2.0,15.96,-8.31,1.16,Kansas City Chiefs,Miami Dolphins,1994,12,16,1994-12-31 16:00:00,27.0,17.0,22.0,24.0,381.0,414.0,249.0,132.0,0.0,314.0,100.0,2.0
7497,1994,New England Patriots,Wild Card,1,L,0,10-7,0,Cleveland Browns,13.0,20.0,20.0,303.0,246.0,57.0,3.0,22.0,379.0,254.0,125.0,1.0,-11.86,-8.13,10.97,New England Patriots,Cleveland Browns,1994,1,13,1994-01-01 13:00:00,20.0,13.0,22.0,20.0,379.0,303.0,254.0,125.0,1.0,246.0,57.0,3.0
7498,1994,Minnesota Vikings,Wild Card,1,L,0,10-7,1,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings,1994,1,16,1994-01-01 16:00:00,18.0,35.0,22.0,18.0,389.0,308.0,340.0,49.0,4.0,214.0,94.0,2.0


In [None]:
# we probably need to develop 'home_team_record' and 'away_team_record' columns, but it might be too difficult to engineer


### Creating the Target feature

In [97]:
# we have to create a target feature 'home_team_wins', where 0 = home team did not win, 1 = home team won
ht_wins = []
counter = 0

for i in df['result']:
    if df['team'].iloc[counter] == df['home_t'].iloc[counter]:
        if i == 'W':
            ht_wins.append(1)
        else:
            ht_wins.append(0)
        counter += 1
    else:
        if i == 'W':
            ht_wins.append(0)
        else:
            ht_wins.append(1)
        counter += 1   

In [98]:
df = df.assign(ht_wins = ht_wins)

In [99]:
df

Unnamed: 0,season,team,week,day,result,ot,record,home_team,opp,points_scored,points_allowed,1st_downs,totyd,passyd,rushyd,to,1st_downs_allowed,totyd_allowed,passyd_allowed,rushyd_allowed,to_forced,off_exp_pts,def_exp_pts,sts_exp_pts,away_t,home_t,year,month,hour,date,ht_pts_scored,ht_pts_allowed,ht_1st_downs,ht_1st_downs_allowed,ht_totyd,ht_totyd_allowed,ht_passyd,ht_rushyd,ht_to,ht_passyd_allowed,ht_rushyd_allowed,ht_to_forced,ht_wins
0,2022,Buffalo Bills,1,8,W,0,1-0,0,Los Angeles Rams,31.0,10.0,23.0,413.0,292.0,121.0,4.0,19.0,243.0,191.0,52.0,3.0,13.89,10.29,-3.96,Buffalo Bills,Los Angeles Rams,2022,9,20,2022-09-08 20:00:00,10.0,31.0,19.0,23.0,243.0,413.0,191.0,52.0,3.0,292.0,121.0,4.0,0
1,2022,New York Jets,1,11,L,0,0-1,1,Baltimore Ravens,9.0,24.0,24.0,380.0,297.0,83.0,2.0,13.0,274.0,211.0,63.0,1.0,-8.07,-3.65,-5.04,Baltimore Ravens,New York Jets,2022,9,13,2022-09-11 13:00:00,9.0,24.0,24.0,13.0,380.0,274.0,297.0,83.0,2.0,211.0,63.0,1.0,0
2,2022,Miami Dolphins,1,11,W,0,1-0,1,New England Patriots,20.0,7.0,18.0,307.0,242.0,65.0,0.0,17.0,271.0,193.0,78.0,3.0,6.08,8.28,1.98,New England Patriots,Miami Dolphins,2022,9,13,2022-09-11 13:00:00,20.0,7.0,18.0,17.0,307.0,271.0,242.0,65.0,0.0,193.0,78.0,3.0,1
3,2022,Cincinnati Bengals,1,11,L,1,0-1,1,Pittsburgh Steelers,20.0,23.0,32.0,432.0,299.0,133.0,5.0,13.0,267.0,192.0,75.0,0.0,-7.49,8.41,-4.85,Pittsburgh Steelers,Cincinnati Bengals,2022,9,13,2022-09-11 13:00:00,20.0,23.0,32.0,13.0,432.0,267.0,299.0,133.0,5.0,192.0,75.0,0.0,0
4,2022,Cleveland Browns,1,11,W,0,1-0,0,Carolina Panthers,26.0,24.0,23.0,355.0,138.0,217.0,0.0,15.0,261.0,207.0,54.0,1.0,6.82,-5.38,1.66,Cleveland Browns,Carolina Panthers,2022,9,13,2022-09-11 13:00:00,24.0,26.0,15.0,23.0,261.0,355.0,207.0,54.0,1.0,138.0,217.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,1994,Los Angeles Chargers,SuperBowl,29,L,0,13-6,0,San Francisco 49ers,26.0,49.0,20.0,354.0,287.0,67.0,3.0,28.0,455.0,316.0,139.0,0.0,-3.94,-27.02,4.06,San Francisco 49ers,Los Angeles Chargers,1994,1,18,1994-01-29 18:00:00,26.0,49.0,20.0,28.0,354.0,455.0,287.0,67.0,3.0,316.0,139.0,0.0,0
7496,1994,Miami Dolphins,Wild Card,31,W,0,11-6,1,Kansas City Chiefs,27.0,17.0,22.0,381.0,249.0,132.0,0.0,24.0,414.0,314.0,100.0,2.0,15.96,-8.31,1.16,Kansas City Chiefs,Miami Dolphins,1994,12,16,1994-12-31 16:00:00,27.0,17.0,22.0,24.0,381.0,414.0,249.0,132.0,0.0,314.0,100.0,2.0,1
7497,1994,New England Patriots,Wild Card,1,L,0,10-7,0,Cleveland Browns,13.0,20.0,20.0,303.0,246.0,57.0,3.0,22.0,379.0,254.0,125.0,1.0,-11.86,-8.13,10.97,New England Patriots,Cleveland Browns,1994,1,13,1994-01-01 13:00:00,20.0,13.0,22.0,20.0,379.0,303.0,254.0,125.0,1.0,246.0,57.0,3.0,1
7498,1994,Minnesota Vikings,Wild Card,1,L,0,10-7,1,Chicago Bears,18.0,35.0,22.0,389.0,340.0,49.0,4.0,18.0,308.0,214.0,94.0,2.0,-9.54,-6.07,0.65,Chicago Bears,Minnesota Vikings,1994,1,16,1994-01-01 16:00:00,18.0,35.0,22.0,18.0,389.0,308.0,340.0,49.0,4.0,214.0,94.0,2.0,0


In [106]:
df.dtypes

season                           int64
week                            object
day                              int64
ot                               int64
away_t                          object
home_t                          object
year                             int64
month                            int64
hour                             int64
date                    datetime64[ns]
ht_pts_scored                  float64
ht_pts_allowed                 float64
ht_1st_downs                   float64
ht_1st_downs_allowed           float64
ht_totyd                       float64
ht_totyd_allowed               float64
ht_passyd                      float64
ht_rushyd                      float64
ht_to                          float64
ht_passyd_allowed              float64
ht_rushyd_allowed              float64
ht_to_forced                   float64
ht_wins                          int64
dtype: object

### Dropping unnecessary columns 

In [107]:
df = df.drop(['record', 'team', 'result', 'home_team', 'opp', 'points_scored', 'points_allowed', '1st_downs', 
              'totyd', 'passyd', 'rushyd', 'to', '1st_downs_allowed', 'totyd_allowed', 'passyd_allowed', 
              'rushyd_allowed', 'to_forced', 'off_exp_pts', 'def_exp_pts', 'sts_exp_pts', 'year'], axis=1)

KeyError: "['record', 'team', 'result', 'home_team', 'opp', 'points_scored', 'points_allowed', '1st_downs', 'totyd', 'passyd', 'rushyd', 'to', '1st_downs_allowed', 'totyd_allowed', 'passyd_allowed', 'rushyd_allowed', 'to_forced', 'off_exp_pts', 'def_exp_pts', 'sts_exp_pts'] not found in axis"

In [108]:
df.dtypes

season                           int64
week                            object
day                              int64
ot                               int64
away_t                          object
home_t                          object
year                             int64
month                            int64
hour                             int64
date                    datetime64[ns]
ht_pts_scored                  float64
ht_pts_allowed                 float64
ht_1st_downs                   float64
ht_1st_downs_allowed           float64
ht_totyd                       float64
ht_totyd_allowed               float64
ht_passyd                      float64
ht_rushyd                      float64
ht_to                          float64
ht_passyd_allowed              float64
ht_rushyd_allowed              float64
ht_to_forced                   float64
ht_wins                          int64
dtype: object

### Feature Engineering

In [109]:
# the goal is to obtain a new column that contains rolling_averages for a team over the last four games.
# should the end of a season reset the rolling percentage??? Yes
# creating a df that is organized at the team and season level
grouped_teams = df.groupby("home_t")

In [110]:
# viewing one instance
group_arz = grouped_teams.get_group("Arizona Cardinals")
group_arz

Unnamed: 0,season,week,day,ot,away_t,home_t,year,month,hour,date,ht_pts_scored,ht_pts_allowed,ht_1st_downs,ht_1st_downs_allowed,ht_totyd,ht_totyd_allowed,ht_passyd,ht_rushyd,ht_to,ht_passyd_allowed,ht_rushyd_allowed,ht_to_forced,ht_wins
8,2022,1,11,0,Kansas City Chiefs,Arizona Cardinals,2022,9,16,2022-09-11 16:00:00,21.0,44.0,18.0,33.0,282.0,488.0,179.0,103.0,0.0,360.0,128.0,1.0,0
47,2022,3,25,0,Los Angeles Rams,Arizona Cardinals,2022,9,16,2022-09-25 16:00:00,12.0,20.0,23.0,15.0,365.0,339.0,295.0,70.0,0.0,239.0,100.0,1.0,0
73,2022,5,9,0,Philadelphia Eagles,Arizona Cardinals,2022,10,16,2022-10-09 16:00:00,17.0,20.0,23.0,24.0,363.0,357.0,239.0,124.0,1.0,218.0,139.0,0.0,0
107,2022,7,20,0,New Orleans Saints,Arizona Cardinals,2022,10,20,2022-10-20 20:00:00,42.0,34.0,21.0,25.0,326.0,494.0,189.0,137.0,0.0,409.0,85.0,3.0,1
152,2021,10,14,0,Carolina Panthers,Arizona Cardinals,2021,11,16,2021-11-14 16:00:00,10.0,34.0,11.0,24.0,169.0,341.0,104.0,65.0,2.0,175.0,166.0,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7366,1994,16,18,0,Cincinnati Bengals,Arizona Cardinals,1994,12,16,1994-12-18 16:00:00,28.0,7.0,24.0,12.0,364.0,189.0,212.0,152.0,0.0,125.0,64.0,3.0,1
7399,1994,2,11,0,New York Giants,Arizona Cardinals,1994,9,20,1994-09-11 20:00:00,17.0,20.0,11.0,19.0,174.0,206.0,135.0,39.0,3.0,88.0,118.0,2.0,0
7438,1994,5,2,0,Minnesota Vikings,Arizona Cardinals,1994,10,16,1994-10-02 16:00:00,17.0,7.0,21.0,19.0,309.0,358.0,200.0,109.0,2.0,340.0,18.0,4.0,1
7472,1994,8,23,0,Dallas Cowboys,Arizona Cardinals,1994,10,16,1994-10-23 16:00:00,21.0,28.0,22.0,14.0,315.0,312.0,208.0,107.0,0.0,237.0,75.0,0.0,0
