In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl
from random import randint
import sportsdataverse.nfl as sdv_nfl
pd.set_option('display.max_columns',None)

In [3]:
games = pd.read_csv('./Data/spreadspoke_scores.csv')
teams = pd.read_csv('./Data/nfl_teams.csv')
odds = pd.read_excel('./Data/Data.xlsx')
sdv_pl = sdv_nfl.load_nfl_pfr_def()


clean the games df and get it ready to merge with the box scores df

In [4]:
games.loc[:,'schedule_date'] = pd.to_datetime(games['schedule_date']).dt.floor('D')
games.dropna(subset='spread_favorite', inplace=True)
games.dropna(subset='over_under_line', inplace=True)
def winner(df):
    if df['score_home'] > df['score_away']:
        return df['team_home']
    elif df['score_away'] > df['score_home']:
        return df['team_away']
    else:
        return 'Tie'
    
def over(df):
    if float(df['score_home'] + df['score_away']) > float(df['over_under_line']):
        return "Over"
    elif float(df['score_home'] + df['score_away']) < float(df['over_under_line']):
        return "Under"
    else:
        return "Push"
#Define functions to get the abbreviation for each team in the games dataset
def find_home_team_abbrev(df,):
    match = teams[teams['team_name'] == df['team_home']]
    abrev = match.iloc[0]['team_id']
    return abrev
def find_away_team_abbrev(df,):
    match = teams[teams['team_name'] == df['team_away']]
    abrev = match.iloc[0]['team_id']
    return abrev
games['home_abrev'] = games.apply(find_home_team_abbrev,axis = 1)
games['away_abrev'] = games.apply(find_away_team_abbrev,axis = 1)
def find_point_diff(df):
    '''
    function to find the actual point differential in a game. 
    define the point differential as favored team points - other team points
    This function is supposed to be applied to a dataframe
    '''
    if df['team_favorite_id'] == df['home_abrev']:
        return df['score_home'] - df['score_away']
    elif df['team_favorite_id'] == df['away_abrev']:
        return df['score_away'] - df['score_home']
    elif df['team_favorite_id'] == "PICK":
        return df['score_home'] - df['score_away']
    else:
        return np.nan

def make_game_id(teams_df):
    team1 = teams_df['home_abrev']
    team2 = teams_df['away_abrev']
    teams = [team1, team2]
    sorted_teams = sorted(teams)
    date_str = str(teams_df['schedule_date'])
    gameID = date_str[:10] + ' ' + sorted_teams[0] + ' vs ' + sorted_teams[1]
    return gameID
 
games['point_diff'] = games.apply(find_point_diff,axis=1)
games['point_total'] = games['score_home'] + games['score_away']
games['winner'] = games.apply(winner, axis = 1)
games['over'] = games.apply(over, axis = 1)
games['gameID'] = games.apply(make_game_id, axis = 1)

clean the scores dataframe and get it ready to merge with the games dataframe

In [23]:
scores = pd.read_csv("Data/box_scores_cleaned.csv")
scores['3rd-down-conv-rate'] = scores['3rd-down-conv-rate'].str[:-1].astype(float)
scores['4th-down-conv-rate'] = scores['4th-down-conv-rate'].str[:-1].astype(float)
scores['3rd-down-conv-rate'] = scores['3rd-down-conv-rate'] / 100
scores['4th-down-conv-rate'] = scores['4th-down-conv-rate'] / 100
cols = [
 'total-first-downs',
 'total-first-downs',
 'rushing-first-downs',
 'rushing-first-downs',
 'passing-first-downs',
 'passing-first-downs',
 'penalty-first-downs',
 'penalty-first-downs',
 'net-yards',
 'net-yards',
 'net-rushing-yds',
 'net-rushing-yds',
 'rushing-plays',
 'rushing-plays',
 'avg-gain-rushing',
 'avg-gain-rushing',
 'net-passing-yds',
 'net-passing-yds',
 'gross-passing-yds',
 'gross-passing-yds',
 'yds-per-att',
 'yds-per-att',
 'blocked-kicks-allowed',
 'blocked-kicks-allowed',
 'total-plays',
 'total-plays',
 'avg-gain-per-play',
 'passing-attempts',
 'passing-attempts',
 'completions',
 'completions',
 'int-thrown',
 'int-thrown',
 'fumbles',
 'fumbles',
 'fumbles-lost',
 'fumbles-lost',
 'fga',
 'fga',
 'fgm',
 '3rd-down-convs',
 '3rd-down-convs',
 '3rd-downs',
 '3rd-downs',
 '3rd-down-conv-rate',
 '3rd-down-conv-rate',
 'punts',
 'yards-per-punt',
 'yards-per-punt',
 'penalties',
 'penalties',
 'penalty-yards',
 'penalty-yards',
 'sacks_allowed',
 'sacks_allowed',
 'sack-yds-lost',
 'sack-yds-lost',
 'punts-returned',
 'punts-returned',
 'punt-return-yds',
 'punt-return-yds',
 'kicks-returned',
 'kicks-returned',
 'kick-return-yds',
 'kick-return-yds',
]
def find_season(df):
    if int(df['date'].split('-')[1]) > 6:
        return int(df['date'].split('-')[0])
    else:
        return int(df['date'].split('-')[0])-1

scores['season']  = scores.apply(find_season,axis=1)

def get_rolling(column,window_size):
    rolling = scores.groupby(['season','team-abrev'])[column].rolling(window = window_size,min_periods=1,closed = "left").mean()
    rolling = pd.DataFrame(rolling)
    rolling.reset_index(inplace=True)
    rolling.set_index('level_2',inplace=True)
    rolling.rename_axis(index=None,inplace=True)
    rolling.sort_index(ascending=True,inplace=True)
    rolling[column] = rolling[column].rename(f'{column}-rolling-{window_size}')
    return rolling[column]
def prev_season_avg(column,season):
    if season >= 2021:
        window_size = 17
    else:
        window_size = 16
    prev_season = scores[scores['season'] == season - 1]
    prev_season_avg = prev_season.groupby(['team-abrev'])[column].rolling(window = window_size,min_periods=1,closed = "left").mean()
    prev_season_avg = pd.DataFrame(prev_season_avg)
    #prev_season_avg.set_index('level_1',inplace=True)
    # prev_season_avg.rename_axis(index=None,inplace=True)
    # prev_season_avg.sort_index(ascending=True,inplace=True)
    return prev_season_avg

# rolling_df = pd.DataFrame()
# for i in range(len(cols)):
#     scores[f'{cols[i]}-4-game-avg'] = get_rolling(cols[i],4)

team1_box_scores = scores[scores.index % 2 == 0]
team2_box_scores = scores[scores.index % 2 == 1]
team1_box_scores.reset_index(inplace=True,drop=True)
team2_box_scores.reset_index(inplace=True,drop=True)
box_scores_by_game = team1_box_scores.join(team2_box_scores,how="outer",lsuffix="-1",rsuffix="-2")

def make_game_id_3(box_scores_DF):
    team1 = box_scores_DF['team-abrev-1']
    team2 = box_scores_DF['team-abrev-2']
    teams = [team1, team2]
    sorted_teams = sorted(teams)
    date_str = str(box_scores_DF['date-1'])
    gameID = date_str + ' ' + sorted_teams[0] + ' vs ' + sorted_teams[1]
    return gameID
box_scores_by_game['gameID'] = box_scores_by_game.apply(make_game_id_3,axis=1)

#TODO: fill in week 1 averages with previous season

In [24]:
prev_season_avg('total-first-downs',2017)

Unnamed: 0_level_0,Unnamed: 1_level_0,total-first-downs
team-abrev,Unnamed: 1_level_1,Unnamed: 2_level_1
ARI,18795,
ARI,18821,21.000000
ARI,18834,20.500000
ARI,18885,22.000000
ARI,18894,23.000000
...,...,...
WAS,19144,23.181818
WAS,19158,22.833333
WAS,19215,22.307692
WAS,19228,22.000000


In [6]:
scores

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,team,total-first-downs,rushing-first-downs,passing-first-downs,penalty-first-downs,net-yards,net-rushing-yds,rushing-plays,avg-gain-rushing,net-passing-yds,gross-passing-yds,yds-per-att,blocked-kicks-allowed,total-plays,avg-gain-per-play,date,matchup,Time of Possession,passing-attempts,completions,int-thrown,interceptions,int-return-yards,fumbles,fumbles-lost,fga,fgm,3rd-down-convs,3rd-downs,3rd-down-conv-rate,punts,yards-per-punt,penalties,penalty-yards,sacks_allowed,sack-yds-lost,punts-returned,punt-return-yds,kicks-returned,kick-return-yds,4th-down-convs,4th-downs,4th-down-conv-rate,team-abrev,season,total-first-downs-4-game-avg,rushing-first-downs-4-game-avg,passing-first-downs-4-game-avg,penalty-first-downs-4-game-avg,net-yards-4-game-avg,net-rushing-yds-4-game-avg,rushing-plays-4-game-avg,avg-gain-rushing-4-game-avg,net-passing-yds-4-game-avg,gross-passing-yds-4-game-avg,yds-per-att-4-game-avg,blocked-kicks-allowed-4-game-avg,total-plays-4-game-avg,avg-gain-per-play-4-game-avg,passing-attempts-4-game-avg,completions-4-game-avg,int-thrown-4-game-avg,fumbles-4-game-avg,fumbles-lost-4-game-avg,fga-4-game-avg,fgm-4-game-avg,3rd-down-convs-4-game-avg,3rd-downs-4-game-avg,3rd-down-conv-rate-4-game-avg,punts-4-game-avg,yards-per-punt-4-game-avg,penalties-4-game-avg,penalty-yards-4-game-avg,sacks_allowed-4-game-avg,sack-yds-lost-4-game-avg,punts-returned-4-game-avg,punt-return-yds-4-game-avg,kicks-returned-4-game-avg,kick-return-yds-4-game-avg
0,0,0.0,NY GiantsNYG,12,6,6,0,238,76,35,2.2,162,183,7.3,0.0,64,3.7,1978-09-02,new york giants vs tampa bay buccaneers,,25,12,1,3.0,46.0,0,0,2.0,2.0,5,18,0.27,9.0,43.3,7,64,4,21,4,11,4,103,,,,NYG,1978,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,1.0,Tampa BayTB,16,9,4,3,251,165,39,4.2,86,93,3.3,0.0,68,3.7,1978-09-02,new york giants vs tampa bay buccaneers,,28,10,3,1.0,3.0,4,1,2.0,2.0,4,17,0.23,7.0,44.3,8,55,1,7,5,76,5,86,,,,TB,1978,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,2.0,Green BayGB,16,11,2,3,212,181,55,3.3,31,31,3.4,0.0,64,3.3,1978-09-03,green bay packers vs detroit lions,,9,3,0,1.0,10.0,1,0,2.0,2.0,8,18,0.44,7.0,40.6,8,40,0,0,2,15,2,45,,,,GB,1978,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,3.0,DetroitDet,11,5,6,0,122,82,26,3.2,40,116,6.1,0.0,53,2.3,1978-09-03,green bay packers vs detroit lions,,19,10,1,,,3,0,0.0,0.0,4,15,0.26,7.0,38.7,9,31,8,76,5,46,4,91,,,,DET,1978,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,4.0,HoustonHou,13,2,9,2,261,156,21,7.4,105,136,3.8,1.0,61,4.3,1978-09-03,houston oilers vs atlanta falcons,,36,19,2,,,4,2,0.0,0.0,4,15,0.26,9.0,37.1,7,52,4,31,4,23,5,169,,,,HOU,1978,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22935,22935,22935.0,PhiladelphiaPhi,24,9,12,3,378,185,32,5.8,193,200,6.5,0.0,65,5.8,2023-11-26,buffalo bills vs philadelphia eagles,,31,18,1,1.0,5.0,1,1,1.0,1.0,4,11,0.36,5.0,53.8,4,30,2,7,1,18,0,0,0.0,0.0,0.00,PHI,2023,20.75,8.00,10.75,2.00,314.75,95.25,29.00,3.275,219.50,238.75,8.300,0.0,60.75,5.125,28.50,20.75,0.50,1.75,0.75,0.50,0.50,5.50,12.50,0.4275,3.75,53.750,5.00,43.25,3.25,19.25,1.25,20.50,0.50,11.25
22936,22936,22936.0,BaltimoreBal,20,10,9,1,361,197,35,5.6,164,177,5.5,0.0,69,5.2,2023-11-26,baltimore ravens vs los angeles chargers,,32,18,0,1.0,0.0,1,0,2.0,3.0,4,13,0.30,3.0,40.3,5,42,2,13,1,3,0,0,1.0,3.0,0.33,BAL,2023,22.25,9.50,9.25,3.50,373.50,172.75,31.25,5.325,200.75,217.25,8.125,0.0,61.25,6.075,27.25,18.25,0.50,1.25,0.50,1.75,2.25,4.50,11.00,0.3925,3.50,51.025,6.25,57.50,2.75,16.50,2.00,18.75,0.50,7.50
22937,22937,22937.0,LA ChargersLAC,16,5,11,0,279,86,19,4.5,193,217,4.9,0.0,66,4.2,2023-11-26,baltimore ravens vs los angeles chargers,,44,29,1,0.0,0.0,3,3,1.0,1.0,7,15,0.46,3.0,52.0,5,41,3,24,1,23,1,21,1.0,3.0,0.33,LAC,2023,18.75,4.00,13.00,1.75,339.50,96.50,24.50,3.975,243.00,254.25,6.825,0.0,62.75,5.350,36.50,23.75,0.25,1.50,0.50,2.00,2.00,7.00,14.25,0.4900,3.75,47.675,3.25,36.00,1.75,11.25,1.25,29.00,1.00,21.25
22938,22938,22938.0,ChicagoChi,18,6,12,0,317,118,28,4.2,199,217,5.9,0.0,68,4.7,2023-11-27,chicago bears vs minnesota vikings,,37,27,0,4.0,19.0,3,2,4.0,5.0,8,18,0.44,3.0,45.7,7,76,3,18,1,0,1,28,1.0,1.0,1.00,CHI,2023,20.75,8.75,10.75,1.25,323.00,136.25,35.00,3.850,186.75,195.75,6.450,0.0,67.00,4.850,30.75,19.75,1.25,1.50,0.75,2.00,2.25,5.75,13.50,0.4250,3.25,45.425,7.00,59.25,1.25,9.00,1.00,6.50,0.75,12.00


In [7]:
def make_game_id_4(df):
    team1 = df['away_team']
    team2 = df['home_team']
    teams = [team1, team2]
    sorted_teams = sorted(teams)
    date_str = str(df['Date'])
    gameID = date_str + ' ' + sorted_teams[0] + ' vs ' + sorted_teams[1]
    return gameID
odds['gameID'] = odds.apply(make_game_id_4,axis=1)

In [8]:
master_df = games.merge(box_scores_by_game,on='gameID',how='inner')
master_df = master_df.merge(odds,on='gameID',how='inner')

In [9]:
convert_columns = list(box_scores_by_game.columns)
convert_columns_1 = []
convert_columns_2 = []
for col in convert_columns:
    if col[-1] == '1':
        convert_columns_1.append(col)
    if col[-1] == '2':
        convert_columns_2.append(col)
for i in range(len(convert_columns_1)):
    new_col_name_home = convert_columns_1[i][:-1] + 'home'
    new_col_name_away = convert_columns_1[i][:-1] + 'away'
    def check_home_team(df):
        if df['home_abrev'] == df['team-abrev-1']:
            return df[convert_columns_1[i]]
        else:
            return df[convert_columns_2[i]]
    def check_away_team(df):
        if df['away_abrev'] == df['team-abrev-1']:
            return df[convert_columns_1[i]]
        else:
            return df[convert_columns_2[i]]
    master_df[new_col_name_home] = master_df.apply(check_home_team,axis=1)
    master_df[new_col_name_away] = master_df.apply(check_away_team,axis=1)
master_df.drop(convert_columns_1,axis=1,inplace=True)
master_df.drop(convert_columns_2,axis=1,inplace=True)

  master_df[new_col_name_away] = master_df.apply(check_away_team,axis=1)
  master_df[new_col_name_home] = master_df.apply(check_home_team,axis=1)
  master_df[new_col_name_away] = master_df.apply(check_away_team,axis=1)
  master_df[new_col_name_home] = master_df.apply(check_home_team,axis=1)
  master_df[new_col_name_away] = master_df.apply(check_away_team,axis=1)
  master_df[new_col_name_home] = master_df.apply(check_home_team,axis=1)
  master_df[new_col_name_away] = master_df.apply(check_away_team,axis=1)
  master_df[new_col_name_home] = master_df.apply(check_home_team,axis=1)
  master_df[new_col_name_away] = master_df.apply(check_away_team,axis=1)
  master_df[new_col_name_home] = master_df.apply(check_home_team,axis=1)
  master_df[new_col_name_away] = master_df.apply(check_away_team,axis=1)
  master_df[new_col_name_home] = master_df.apply(check_home_team,axis=1)
  master_df[new_col_name_away] = master_df.apply(check_away_team,axis=1)
  master_df[new_col_name_home] = master_df.apply(ch

In [10]:
def check_covered(df):
    if np.abs(df['spread_favorite']) > df['point_diff']:
        return "Home"
    elif np.abs(df['spread_favorite']) < df['point_diff']:
        return "Away"
    else:
        return "push"