In [48]:
import pandas as pd
import numpy as np
from sportsreference.nfl.boxscore import Boxscores
from sportsipy.nfl.boxscore import Boxscore
pd.set_option('display.max_columns', None)
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report
import statsmodels.api as sm

In [None]:
'''
Predicting NFL game outcomes 
Borrowed inspiration from https://www.activestate.com/blog/how-to-predict-nfl-winners-with-python/


Five Thirty Eight ELO Ratings can be grabbed from here: https://data.fivethirtyeight.com/#nfl-elo


'''

In [2]:
def get_schedule(year):
    weeks = list(range(1,18))
    schedule_df = pd.DataFrame()
    for w in weeks:
        date_string = str(w) + '-' + str(year)
        week_scores = Boxscores(w,year)
        week_games_df = pd.DataFrame()
        for idx,g in enumerate(week_scores.games[date_string]):
            game = pd.DataFrame(week_scores.games[date_string][idx], index = [0])[['away_name', 'away_abbr','home_name', 'home_abbr','winning_name', 'winning_abbr' ]]
            game['week'] = w
            week_games_df = pd.concat([week_games_df,game])
        schedule_df = pd.concat([schedule_df, week_games_df]).reset_index().drop(columns = 'index')
    return schedule_df

In [3]:
def game_data(game_df,game_stats):
    try:
        away_team_df = game_df[['away_name', 'away_abbr', 'away_score']].rename(columns = {'away_name': 'team_name', 'away_abbr': 'team_abbr', 'away_score': 'score'})
        home_team_df = game_df[['home_name','home_abbr', 'home_score']].rename(columns = {'home_name': 'team_name', 'home_abbr': 'team_abbr', 'home_score': 'score'})
        try:
            if game_df.loc[0,'away_score'] > game_df.loc[0,'home_score']:
                away_team_df = pd.merge(away_team_df, pd.DataFrame({'game_won' : [1], 'game_lost' : [0]}),left_index = True, right_index = True)
                home_team_df = pd.merge(home_team_df, pd.DataFrame({'game_won' : [0], 'game_lost' : [1]}),left_index = True, right_index = True)
            elif game_df.loc[0,'away_score'] < game_df.loc[0,'home_score']:
                away_team_df = pd.merge(away_team_df, pd.DataFrame({'game_won' : [0], 'game_lost' : [1]}),left_index = True, right_index = True)
                home_team_df = pd.merge(home_team_df, pd.DataFrame({'game_won' : [1], 'game_lost' : [0]}),left_index = True, right_index = True)
            else: 
                away_team_df = pd.merge(away_team_df, pd.DataFrame({'game_won' : [0], 'game_lost' : [0]}),left_index = True, right_index = True)
                home_team_df = pd.merge(home_team_df, pd.DataFrame({'game_won' : [0], 'game_lost' : [0]}),left_index = True, right_index = True)
        except TypeError:
                away_team_df = pd.merge(away_team_df, pd.DataFrame({'game_won' : [np.nan], 'game_lost' : [np.nan]}),left_index = True, right_index = True)
                home_team_df = pd.merge(home_team_df, pd.DataFrame({'game_won' : [np.nan], 'game_lost' : [np.nan]}),left_index = True, right_index = True)        

        away_stats_df = game_stats.dataframe[['away_first_downs', 'away_fourth_down_attempts',
               'away_fourth_down_conversions', 'away_fumbles', 'away_fumbles_lost',
               'away_interceptions', 'away_net_pass_yards', 'away_pass_attempts',
               'away_pass_completions', 'away_pass_touchdowns', 'away_pass_yards',
               'away_penalties', 'away_points', 'away_rush_attempts',
               'away_rush_touchdowns', 'away_rush_yards', 'away_third_down_attempts',
               'away_third_down_conversions', 'away_time_of_possession',
               'away_times_sacked', 'away_total_yards', 'away_turnovers',
               'away_yards_from_penalties', 'away_yards_lost_from_sacks']].reset_index().drop(columns ='index').rename(columns = {
               'away_first_downs': 'first_downs', 'away_fourth_down_attempts':'fourth_down_attempts',
               'away_fourth_down_conversions':'fourth_down_conversions' , 'away_fumbles': 'fumbles', 'away_fumbles_lost': 'fumbles_lost',
               'away_interceptions': 'interceptions', 'away_net_pass_yards':'net_pass_yards' , 'away_pass_attempts': 'pass_attempts',
               'away_pass_completions':'pass_completions' , 'away_pass_touchdowns': 'pass_touchdowns', 'away_pass_yards': 'pass_yards',
               'away_penalties': 'penalties', 'away_points': 'points', 'away_rush_attempts': 'rush_attempts',
               'away_rush_touchdowns': 'rush_touchdowns', 'away_rush_yards': 'rush_yards', 'away_third_down_attempts': 'third_down_attempts',
               'away_third_down_conversions': 'third_down_conversions', 'away_time_of_possession': 'time_of_possession',
               'away_times_sacked': 'times_sacked', 'away_total_yards': 'total_yards', 'away_turnovers': 'turnovers',
               'away_yards_from_penalties':'yards_from_penalties', 'away_yards_lost_from_sacks': 'yards_lost_from_sacks'})

        home_stats_df = game_stats.dataframe[['home_first_downs', 'home_fourth_down_attempts',
               'home_fourth_down_conversions', 'home_fumbles', 'home_fumbles_lost',
               'home_interceptions', 'home_net_pass_yards', 'home_pass_attempts',
               'home_pass_completions', 'home_pass_touchdowns', 'home_pass_yards',
               'home_penalties', 'home_points', 'home_rush_attempts',
               'home_rush_touchdowns', 'home_rush_yards', 'home_third_down_attempts',
               'home_third_down_conversions', 'home_time_of_possession',
               'home_times_sacked', 'home_total_yards', 'home_turnovers',
               'home_yards_from_penalties', 'home_yards_lost_from_sacks']].reset_index().drop(columns = 'index').rename(columns = {
               'home_first_downs': 'first_downs', 'home_fourth_down_attempts':'fourth_down_attempts',
               'home_fourth_down_conversions':'fourth_down_conversions' , 'home_fumbles': 'fumbles', 'home_fumbles_lost': 'fumbles_lost',
               'home_interceptions': 'interceptions', 'home_net_pass_yards':'net_pass_yards' , 'home_pass_attempts': 'pass_attempts',
               'home_pass_completions':'pass_completions' , 'home_pass_touchdowns': 'pass_touchdowns', 'home_pass_yards': 'pass_yards',
               'home_penalties': 'penalties', 'home_points': 'points', 'home_rush_attempts': 'rush_attempts',
               'home_rush_touchdowns': 'rush_touchdowns', 'home_rush_yards': 'rush_yards', 'home_third_down_attempts': 'third_down_attempts',
               'home_third_down_conversions': 'third_down_conversions', 'home_time_of_possession': 'time_of_possession',
               'home_times_sacked': 'times_sacked', 'home_total_yards': 'total_yards', 'home_turnovers': 'turnovers',
               'home_yards_from_penalties':'yards_from_penalties', 'home_yards_lost_from_sacks': 'yards_lost_from_sacks'})

        away_team_df = pd.merge(away_team_df, away_stats_df,left_index = True, right_index = True)
        home_team_df = pd.merge(home_team_df, home_stats_df,left_index = True, right_index = True)
        try:
            away_team_df['time_of_possession'] = (int(away_team_df['time_of_possession'].loc[0][0:2]) * 60) + int(away_team_df['time_of_possession'].loc[0][3:5])
            home_team_df['time_of_possession'] = (int(home_team_df['time_of_possession'].loc[0][0:2]) * 60) + int(home_team_df['time_of_possession'].loc[0][3:5])
        except TypeError:
            away_team_df['time_of_possession'] = np.nan
            home_team_df['time_of_possession'] = np.nan
    except TypeError:
        away_team_df = pd.DataFrame()
        home_team_df = pd.DataFrame()
    return away_team_df, home_team_df

In [4]:
def game_data_up_to_week(weeks,year):
    """
    Returns every teams stats per week up until the week number selected
    """
    weeks_games_df = pd.DataFrame()
    for w in weeks:
        date_string = str(w) + '-' + str(year)
        week_scores = Boxscores(w,year)
        week_games_df = pd.DataFrame()
        for g in range(len(week_scores.games[date_string])):
            game_str = week_scores.games[date_string][g]['boxscore']
            game_stats = Boxscore(game_str)
            game_df = pd.DataFrame(week_scores.games[date_string][g], index = [0])
            away_team_df, home_team_df = game_data(game_df,game_stats)
            away_team_df['week'] = w
            home_team_df['week'] = w
            week_games_df = pd.concat([week_games_df,away_team_df])
            week_games_df = pd.concat([week_games_df,home_team_df])
        weeks_games_df = pd.concat([weeks_games_df,week_games_df])
    return weeks_games_df

In [5]:
## Need to recreate the agg_weekly_data function to acccount for games that haven't been played yet
def unplayed_games(schedule_df,weeks_games_df,current_week):
    games_df = schedule_df[schedule_df['week'] == current_week]
    agg_games_df = pd.DataFrame()
    agg_weekly_df = weeks_games_df[weeks_games_df.week < current_week].drop(columns = ['score','week','game_won', 'game_lost']).groupby(["team_name", "team_abbr"]).mean().reset_index()
    # create a winning pct view for each team leading up to the week in question
    win_loss_df = weeks_games_df[weeks_games_df.week < current_week][["team_name", "team_abbr",'game_won', 'game_lost']].groupby(by=["team_name", "team_abbr"]).sum().reset_index()
    win_loss_df['win_perc'] = win_loss_df['game_won'] / (win_loss_df['game_won'] + win_loss_df['game_lost'])
    win_loss_df = win_loss_df.drop(columns = ['game_won', 'game_lost'])
    # create a new feature called "fourth down pct"
    try:
        agg_weekly_df['fourth_down_perc'] = agg_weekly_df['fourth_down_conversions'] / agg_weekly_df['fourth_down_attempts'] 
    except ZeroDivisionError:
        agg_weekly_df['fourth_down_perc'] = 0
    agg_weekly_df['fourth_down_perc'] = agg_weekly_df['fourth_down_perc'].fillna(0)
    # create a new feature called "third down pct"
    try:
        agg_weekly_df['third_down_perc'] = agg_weekly_df['third_down_conversions'] / agg_weekly_df['third_down_attempts'] 
    except ZeroDivisionError:
        agg_weekly_df['third_down_perc'] = 0
    agg_weekly_df['third_down_perc'] = agg_weekly_df['third_down_perc'].fillna(0) 

    # get rid of the old third/fourth down fields
    agg_weekly_df = agg_weekly_df.drop(columns = ['fourth_down_attempts', 'fourth_down_conversions', 'third_down_attempts', 'third_down_conversions'])
    # merge the winning pct feature into the final aggregate view
    agg_weekly_df = pd.merge(win_loss_df,agg_weekly_df,left_on = ['team_name', 'team_abbr'], right_on = ['team_name', 'team_abbr'])

    # create a new away team dataframe and rename fields to represent stats for away team
    away_df = pd.merge(games_df,agg_weekly_df,how = 'inner', left_on = ['away_name', 'away_abbr'], right_on = ['team_name', 'team_abbr']).drop(columns = ['team_name', 'team_abbr']).rename(columns = {
            'win_perc': 'away_win_perc',
          'first_downs': 'away_first_downs', 'fumbles': 'away_fumbles', 'fumbles_lost':'away_fumbles_lost', 'interceptions':'away_interceptions',
          'net_pass_yards': 'away_net_pass_yards', 'pass_attempts':'away_pass_attempts', 'pass_completions':'away_pass_completions',
          'pass_touchdowns':'away_pass_touchdowns', 'pass_yards':'away_pass_yards', 'penalties':'away_penalties', 'points':'away_points', 'rush_attempts':'away_rush_attempts',
          'rush_touchdowns':'away_rush_touchdowns', 'rush_yards':'away_rush_yards', 'time_of_possession':'away_time_of_possession', 'times_sacked':'away_times_sacked',
          'total_yards':'away_total_yards', 'turnovers':'away_turnovers', 'yards_from_penalties':'away_yards_from_penalties',
          'yards_lost_from_sacks': 'away_yards_lost_from_sacks', 'fourth_down_perc':'away_fourth_down_perc', 'third_down_perc':'away_third_down_perc'})
    # create a new home team dataframe and rename fields to represent stats for home team
    home_df = pd.merge(games_df,agg_weekly_df,how = 'inner', left_on = ['home_name', 'home_abbr'], right_on = ['team_name', 'team_abbr']).drop(columns = ['team_name', 'team_abbr']).rename(columns = {
            'win_perc': 'home_win_perc',
          'first_downs': 'home_first_downs', 'fumbles': 'home_fumbles', 'fumbles_lost':'home_fumbles_lost', 'interceptions':'home_interceptions',
          'net_pass_yards': 'home_net_pass_yards', 'pass_attempts':'home_pass_attempts', 'pass_completions':'home_pass_completions',
          'pass_touchdowns':'home_pass_touchdowns', 'pass_yards':'home_pass_yards', 'penalties':'home_penalties', 'points':'home_points', 'rush_attempts':'home_rush_attempts',
          'rush_touchdowns':'home_rush_touchdowns', 'rush_yards':'home_rush_yards', 'time_of_possession':'home_time_of_possession', 'times_sacked':'home_times_sacked',
          'total_yards':'home_total_yards', 'turnovers':'home_turnovers', 'yards_from_penalties':'home_yards_from_penalties',
          'yards_lost_from_sacks': 'home_yards_lost_from_sacks', 'fourth_down_perc':'home_fourth_down_perc', 'third_down_perc':'home_third_down_perc'})
    # create final aggregate view with both home and away dataframes
    agg_weekly_df = pd.merge(away_df,home_df,left_on = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'winning_name',
          'winning_abbr', 'week'], right_on = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'winning_name',
          'winning_abbr', 'week'])
    # create the features that will be used in the ML model to predict outcomes
    agg_weekly_df['win_perc_dif'] = agg_weekly_df['away_win_perc'] - agg_weekly_df['home_win_perc']
    agg_weekly_df['first_downs_dif'] = agg_weekly_df['away_first_downs'] - agg_weekly_df['home_first_downs']
    agg_weekly_df['fumbles_dif'] = agg_weekly_df['away_fumbles'] - agg_weekly_df['home_fumbles']
    agg_weekly_df['interceptions_dif'] = agg_weekly_df['away_interceptions'] - agg_weekly_df['home_interceptions']
    agg_weekly_df['net_pass_yards_dif'] = agg_weekly_df['away_net_pass_yards'] - agg_weekly_df['home_net_pass_yards']
    agg_weekly_df['pass_attempts_dif'] = agg_weekly_df['away_pass_attempts'] - agg_weekly_df['home_pass_attempts']
    agg_weekly_df['pass_completions_dif'] = agg_weekly_df['away_pass_completions'] - agg_weekly_df['home_pass_completions']
    agg_weekly_df['pass_touchdowns_dif'] = agg_weekly_df['away_pass_touchdowns'] - agg_weekly_df['home_pass_touchdowns']
    agg_weekly_df['pass_yards_dif'] = agg_weekly_df['away_pass_yards'] - agg_weekly_df['home_pass_yards']
    agg_weekly_df['penalties_dif'] = agg_weekly_df['away_penalties'] - agg_weekly_df['home_penalties']
    agg_weekly_df['points_dif'] = agg_weekly_df['away_points'] - agg_weekly_df['home_points']
    agg_weekly_df['rush_attempts_dif'] = agg_weekly_df['away_rush_attempts'] - agg_weekly_df['home_rush_attempts']
    agg_weekly_df['rush_touchdowns_dif'] = agg_weekly_df['away_rush_touchdowns'] - agg_weekly_df['home_rush_touchdowns']
    agg_weekly_df['rush_yards_dif'] = agg_weekly_df['away_rush_yards'] - agg_weekly_df['home_rush_yards']
    agg_weekly_df['time_of_possession_dif'] = agg_weekly_df['away_time_of_possession'] - agg_weekly_df['home_time_of_possession']
    agg_weekly_df['times_sacked_dif'] = agg_weekly_df['away_times_sacked'] - agg_weekly_df['home_times_sacked']
    agg_weekly_df['total_yards_dif'] = agg_weekly_df['away_total_yards'] - agg_weekly_df['home_total_yards']
    agg_weekly_df['turnovers_dif'] = agg_weekly_df['away_turnovers'] - agg_weekly_df['home_turnovers']
    agg_weekly_df['yards_from_penalties_dif'] = agg_weekly_df['away_yards_from_penalties'] - agg_weekly_df['home_yards_from_penalties']
    agg_weekly_df['yards_lost_from_sacks_dif'] = agg_weekly_df['away_yards_lost_from_sacks'] - agg_weekly_df['home_yards_lost_from_sacks']
    agg_weekly_df['fourth_down_perc_dif'] = agg_weekly_df['away_fourth_down_perc'] - agg_weekly_df['home_fourth_down_perc']
    agg_weekly_df['third_down_perc_dif'] = agg_weekly_df['away_third_down_perc'] - agg_weekly_df['home_third_down_perc']

    # get rid of the home/away fields that were used to create features
    agg_weekly_df = agg_weekly_df.drop(columns = ['away_win_perc',
          'away_first_downs', 'away_fumbles', 'away_fumbles_lost', 'away_interceptions',
          'away_net_pass_yards', 'away_pass_attempts','away_pass_completions',
          'away_pass_touchdowns', 'away_pass_yards', 'away_penalties', 'away_points', 'away_rush_attempts',
          'away_rush_touchdowns', 'away_rush_yards', 'away_time_of_possession', 'away_times_sacked',
          'away_total_yards', 'away_turnovers', 'away_yards_from_penalties',
          'away_yards_lost_from_sacks','away_fourth_down_perc', 'away_third_down_perc','home_win_perc',
          'home_first_downs', 'home_fumbles', 'home_fumbles_lost', 'home_interceptions',
          'home_net_pass_yards', 'home_pass_attempts','home_pass_completions',
          'home_pass_touchdowns', 'home_pass_yards', 'home_penalties', 'home_points', 'home_rush_attempts',
          'home_rush_touchdowns', 'home_rush_yards', 'home_time_of_possession', 'home_times_sacked',
          'home_total_yards', 'home_turnovers', 'home_yards_from_penalties',
          'home_yards_lost_from_sacks','home_fourth_down_perc', 'home_third_down_perc'])
    # check if any of the games have not had a winner yet (have not been played yet or completed)
    if (agg_weekly_df['winning_name'].isnull().values.any()):
        agg_weekly_df['result'] = np.nan
        print(f"Week {current_week} games have not finished yet.")
    else:
        # create boolean flag for if the away team won the game or not
        agg_weekly_df['result'] = agg_weekly_df['winning_name'] == agg_weekly_df['away_name']
        agg_weekly_df['result'] = agg_weekly_df['result'].astype('float')
    agg_weekly_df = agg_weekly_df.drop(columns = ['winning_name', 'winning_abbr'])
    agg_games_df = pd.concat([agg_games_df, agg_weekly_df])

    agg_games_df = agg_games_df.reset_index().drop(columns = 'index')
    return agg_games_df

In [6]:
def agg_weekly_data(schedule_df,weeks_games_df,current_week,weeks):
    # filter the schedule on only games up to the current week in question
    schedule_df = schedule_df[schedule_df.week < current_week]
    
    agg_games_df = pd.DataFrame()

    for w in range(1,len(weeks)):
        ## Retrieve the games for that week 'w'
        games_df = schedule_df[schedule_df.week == w]
        # get an aggregate view of each team leading up to the week in question (i.e. first downs, fourth down attempts, pass yards, etc.)
        agg_weekly_df = weeks_games_df[weeks_games_df.week < w].drop(columns = ['score','week','game_won', 'game_lost']).groupby(["team_name", "team_abbr"]).mean().reset_index()
        
        # create a winning pct view for each team leading up to the week in question
        win_loss_df = weeks_games_df[weeks_games_df.week < w][["team_name", "team_abbr",'game_won', 'game_lost']].groupby(by=["team_name", "team_abbr"]).sum().reset_index()
        win_loss_df['win_perc'] = win_loss_df['game_won'] / (win_loss_df['game_won'] + win_loss_df['game_lost'])
        win_loss_df = win_loss_df.drop(columns = ['game_won', 'game_lost'])
        
        # create a new feature called "fourth down pct"
        try:
            agg_weekly_df['fourth_down_perc'] = agg_weekly_df['fourth_down_conversions'] / agg_weekly_df['fourth_down_attempts'] 
        except ZeroDivisionError:
            agg_weekly_df['fourth_down_perc'] = 0
        agg_weekly_df['fourth_down_perc'] = agg_weekly_df['fourth_down_perc'].fillna(0)
        # create a new feature called "third down pct"
        try:
            agg_weekly_df['third_down_perc'] = agg_weekly_df['third_down_conversions'] / agg_weekly_df['third_down_attempts'] 
        except ZeroDivisionError:
            agg_weekly_df['third_down_perc'] = 0
        agg_weekly_df['third_down_perc'] = agg_weekly_df['third_down_perc'].fillna(0) 
        
        # get rid of the old third/fourth down fields
        agg_weekly_df = agg_weekly_df.drop(columns = ['fourth_down_attempts', 'fourth_down_conversions', 'third_down_attempts', 'third_down_conversions'])
        # merge the winning pct feature into the final aggregate view
        agg_weekly_df = pd.merge(win_loss_df,agg_weekly_df,left_on = ['team_name', 'team_abbr'], right_on = ['team_name', 'team_abbr'])
        
        # create a new away team dataframe and rename fields to represent stats for away team
        away_df = pd.merge(games_df,agg_weekly_df,how = 'inner', left_on = ['away_name', 'away_abbr'], right_on = ['team_name', 'team_abbr']).drop(columns = ['team_name', 'team_abbr']).rename(columns = {
                'win_perc': 'away_win_perc',
              'first_downs': 'away_first_downs', 'fumbles': 'away_fumbles', 'fumbles_lost':'away_fumbles_lost', 'interceptions':'away_interceptions',
              'net_pass_yards': 'away_net_pass_yards', 'pass_attempts':'away_pass_attempts', 'pass_completions':'away_pass_completions',
              'pass_touchdowns':'away_pass_touchdowns', 'pass_yards':'away_pass_yards', 'penalties':'away_penalties', 'points':'away_points', 'rush_attempts':'away_rush_attempts',
              'rush_touchdowns':'away_rush_touchdowns', 'rush_yards':'away_rush_yards', 'time_of_possession':'away_time_of_possession', 'times_sacked':'away_times_sacked',
              'total_yards':'away_total_yards', 'turnovers':'away_turnovers', 'yards_from_penalties':'away_yards_from_penalties',
              'yards_lost_from_sacks': 'away_yards_lost_from_sacks', 'fourth_down_perc':'away_fourth_down_perc', 'third_down_perc':'away_third_down_perc'})
        # create a new home team dataframe and rename fields to represent stats for home team
        home_df = pd.merge(games_df,agg_weekly_df,how = 'inner', left_on = ['home_name', 'home_abbr'], right_on = ['team_name', 'team_abbr']).drop(columns = ['team_name', 'team_abbr']).rename(columns = {
                'win_perc': 'home_win_perc',
              'first_downs': 'home_first_downs', 'fumbles': 'home_fumbles', 'fumbles_lost':'home_fumbles_lost', 'interceptions':'home_interceptions',
              'net_pass_yards': 'home_net_pass_yards', 'pass_attempts':'home_pass_attempts', 'pass_completions':'home_pass_completions',
              'pass_touchdowns':'home_pass_touchdowns', 'pass_yards':'home_pass_yards', 'penalties':'home_penalties', 'points':'home_points', 'rush_attempts':'home_rush_attempts',
              'rush_touchdowns':'home_rush_touchdowns', 'rush_yards':'home_rush_yards', 'time_of_possession':'home_time_of_possession', 'times_sacked':'home_times_sacked',
              'total_yards':'home_total_yards', 'turnovers':'home_turnovers', 'yards_from_penalties':'home_yards_from_penalties',
              'yards_lost_from_sacks': 'home_yards_lost_from_sacks', 'fourth_down_perc':'home_fourth_down_perc', 'third_down_perc':'home_third_down_perc'})
        # create final aggregate view with both home and away dataframes
        agg_weekly_df = pd.merge(away_df,home_df,left_on = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'winning_name',
              'winning_abbr', 'week'], right_on = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'winning_name',
              'winning_abbr', 'week'])
        # create the features that will be used in the ML model to predict outcomes
        agg_weekly_df['win_perc_dif'] = agg_weekly_df['away_win_perc'] - agg_weekly_df['home_win_perc']
        agg_weekly_df['first_downs_dif'] = agg_weekly_df['away_first_downs'] - agg_weekly_df['home_first_downs']
        agg_weekly_df['fumbles_dif'] = agg_weekly_df['away_fumbles'] - agg_weekly_df['home_fumbles']
        agg_weekly_df['interceptions_dif'] = agg_weekly_df['away_interceptions'] - agg_weekly_df['home_interceptions']
        agg_weekly_df['net_pass_yards_dif'] = agg_weekly_df['away_net_pass_yards'] - agg_weekly_df['home_net_pass_yards']
        agg_weekly_df['pass_attempts_dif'] = agg_weekly_df['away_pass_attempts'] - agg_weekly_df['home_pass_attempts']
        agg_weekly_df['pass_completions_dif'] = agg_weekly_df['away_pass_completions'] - agg_weekly_df['home_pass_completions']
        agg_weekly_df['pass_touchdowns_dif'] = agg_weekly_df['away_pass_touchdowns'] - agg_weekly_df['home_pass_touchdowns']
        agg_weekly_df['pass_yards_dif'] = agg_weekly_df['away_pass_yards'] - agg_weekly_df['home_pass_yards']
        agg_weekly_df['penalties_dif'] = agg_weekly_df['away_penalties'] - agg_weekly_df['home_penalties']
        agg_weekly_df['points_dif'] = agg_weekly_df['away_points'] - agg_weekly_df['home_points']
        agg_weekly_df['rush_attempts_dif'] = agg_weekly_df['away_rush_attempts'] - agg_weekly_df['home_rush_attempts']
        agg_weekly_df['rush_touchdowns_dif'] = agg_weekly_df['away_rush_touchdowns'] - agg_weekly_df['home_rush_touchdowns']
        agg_weekly_df['rush_yards_dif'] = agg_weekly_df['away_rush_yards'] - agg_weekly_df['home_rush_yards']
        agg_weekly_df['time_of_possession_dif'] = agg_weekly_df['away_time_of_possession'] - agg_weekly_df['home_time_of_possession']
        agg_weekly_df['times_sacked_dif'] = agg_weekly_df['away_times_sacked'] - agg_weekly_df['home_times_sacked']
        agg_weekly_df['total_yards_dif'] = agg_weekly_df['away_total_yards'] - agg_weekly_df['home_total_yards']
        agg_weekly_df['turnovers_dif'] = agg_weekly_df['away_turnovers'] - agg_weekly_df['home_turnovers']
        agg_weekly_df['yards_from_penalties_dif'] = agg_weekly_df['away_yards_from_penalties'] - agg_weekly_df['home_yards_from_penalties']
        agg_weekly_df['yards_lost_from_sacks_dif'] = agg_weekly_df['away_yards_lost_from_sacks'] - agg_weekly_df['home_yards_lost_from_sacks']
        agg_weekly_df['fourth_down_perc_dif'] = agg_weekly_df['away_fourth_down_perc'] - agg_weekly_df['home_fourth_down_perc']
        agg_weekly_df['third_down_perc_dif'] = agg_weekly_df['away_third_down_perc'] - agg_weekly_df['home_third_down_perc']
        
        # get rid of the home/away fields that were used to create features
        agg_weekly_df = agg_weekly_df.drop(columns = ['away_win_perc',
              'away_first_downs', 'away_fumbles', 'away_fumbles_lost', 'away_interceptions',
              'away_net_pass_yards', 'away_pass_attempts','away_pass_completions',
              'away_pass_touchdowns', 'away_pass_yards', 'away_penalties', 'away_points', 'away_rush_attempts',
              'away_rush_touchdowns', 'away_rush_yards', 'away_time_of_possession', 'away_times_sacked',
              'away_total_yards', 'away_turnovers', 'away_yards_from_penalties',
              'away_yards_lost_from_sacks','away_fourth_down_perc', 'away_third_down_perc','home_win_perc',
              'home_first_downs', 'home_fumbles', 'home_fumbles_lost', 'home_interceptions',
              'home_net_pass_yards', 'home_pass_attempts','home_pass_completions',
              'home_pass_touchdowns', 'home_pass_yards', 'home_penalties', 'home_points', 'home_rush_attempts',
              'home_rush_touchdowns', 'home_rush_yards', 'home_time_of_possession', 'home_times_sacked',
              'home_total_yards', 'home_turnovers', 'home_yards_from_penalties',
              'home_yards_lost_from_sacks','home_fourth_down_perc', 'home_third_down_perc'])
       # check if any of the games have not had a winner yet (have not been played yet or completed)
        if (agg_weekly_df['winning_name'].isnull().values.any()):
            agg_weekly_df['result'] = np.nan
            print(f"Week {w} games have not finished yet.")
        else:
            # create boolean flag for if the away team won the game or not
            agg_weekly_df['result'] = agg_weekly_df['winning_name'] == agg_weekly_df['away_name']
            agg_weekly_df['result'] = agg_weekly_df['result'].astype('float')
        agg_weekly_df = agg_weekly_df.drop(columns = ['winning_name', 'winning_abbr'])
        agg_games_df = pd.concat([agg_games_df, agg_weekly_df])
    
    agg_games_df = agg_games_df.reset_index().drop(columns = 'index')
    return agg_games_df

In [7]:
def get_elo():
    elo_df = pd.read_csv('nfl_elo_latest.csv')
    elo_df = elo_df.drop(columns = ['season','neutral' ,'playoff', 'elo_prob1', 'elo_prob2', 'elo1_post', 'elo2_post',
           'qbelo1_pre', 'qbelo2_pre', 'qb1', 'qb2', 'qb1_adj', 'qb2_adj', 'qbelo_prob1', 'qbelo_prob2',
           'qb1_game_value', 'qb2_game_value', 'qb1_value_post', 'qb2_value_post',
           'qbelo1_post', 'qbelo2_post', 'score1', 'score2'])
    elo_df.date = pd.to_datetime(elo_df.date)
#     elo_df = elo_df[elo_df.date < '01-05-2021']

    elo_df['team1'] = elo_df['team1'].replace(['KC', 'JAX', 'CAR', 'BAL', 'BUF', 'MIN', 'DET', 'ATL', 'NE', 'WSH',
           'CIN', 'NO', 'SF', 'LAR', 'NYG', 'DEN', 'CLE', 'IND', 'TEN', 'NYJ',
           'TB', 'MIA', 'PIT', 'PHI', 'GB', 'CHI', 'DAL', 'ARI', 'LAC', 'HOU',
           'SEA', 'OAK'],
            ['kan','jax','car', 'rav', 'buf', 'min', 'det', 'atl', 'nwe', 'was', 
            'cin', 'nor', 'sfo', 'ram', 'nyg', 'den', 'cle', 'clt', 'oti', 'nyj', 
             'tam','mia', 'pit', 'phi', 'gnb', 'chi', 'dal', 'crd', 'sdg', 'htx', 'sea', 'rai' ])
    elo_df['team2'] = elo_df['team2'].replace(['KC', 'JAX', 'CAR', 'BAL', 'BUF', 'MIN', 'DET', 'ATL', 'NE', 'WSH',
           'CIN', 'NO', 'SF', 'LAR', 'NYG', 'DEN', 'CLE', 'IND', 'TEN', 'NYJ',
           'TB', 'MIA', 'PIT', 'PHI', 'GB', 'CHI', 'DAL', 'ARI', 'LAC', 'HOU',
           'SEA', 'OAK'],
            ['kan','jax','car', 'rav', 'buf', 'min', 'det', 'atl', 'nwe', 'was', 
            'cin', 'nor', 'sfo', 'ram', 'nyg', 'den', 'cle', 'clt', 'oti', 'nyj', 
             'tam','mia', 'pit', 'phi', 'gnb', 'chi', 'dal', 'crd', 'sdg', 'htx', 'sea', 'rai' ])
    return elo_df

In [8]:
def merge_rankings(agg_games_df,elo_df):
    agg_games_df = pd.merge(agg_games_df, elo_df, how = 'inner', left_on = ['home_abbr', 'away_abbr'], right_on = ['team1', 'team2']).drop(columns = ['date','team1', 'team2'])
    agg_games_df['elo_dif'] = agg_games_df['elo2_pre'] - agg_games_df['elo1_pre']
    agg_games_df['qb_dif'] = agg_games_df['qb2_value_pre'] - agg_games_df['qb1_value_pre']
    agg_games_df = agg_games_df.drop(columns = ['elo1_pre', 'elo2_pre', 'qb1_value_pre', 'qb2_value_pre'])
    return agg_games_df

In [9]:
def prep_test_train(current_week,weeks,year):
    ## get the schedule for the year provided
    schedule_df  = get_schedule(year)
    # get the weekly games for schedule
    weeks_games_df = game_data_up_to_week(weeks,year)
    agg_games_df = agg_weekly_data(schedule_df,weeks_games_df,current_week,weeks)
    unplayed_df = unplayed_games(schedule_df,weeks_games_df,current_week)
    # get latest ELO ratings
    elo_df = get_elo()
    # merge ratings into agg dataframe
    agg_games_df = merge_rankings(agg_games_df, elo_df)
    unplayed_df = merge_rankings(unplayed_df, elo_df)
    # create training dataset based on previous weeks in current season
    train_df = agg_games_df[agg_games_df.result.notna()]
    test_df = unplayed_df.copy()
    return test_df, train_df

## Generate the training data

In [10]:
current_week = 7
weeks = list(range(1,current_week + 1))
year = 2022

pred_games_df, comp_games_df = prep_test_train(current_week,weeks,year)

Week 7 games have not finished yet.


In [19]:
pred_games_df.head()

Unnamed: 0,away_name,away_abbr,home_name,home_abbr,week,win_perc_dif,first_downs_dif,fumbles_dif,interceptions_dif,net_pass_yards_dif,pass_attempts_dif,pass_completions_dif,pass_touchdowns_dif,pass_yards_dif,penalties_dif,points_dif,rush_attempts_dif,rush_touchdowns_dif,rush_yards_dif,time_of_possession_dif,times_sacked_dif,total_yards_dif,turnovers_dif,yards_from_penalties_dif,yards_lost_from_sacks_dif,fourth_down_perc_dif,third_down_perc_dif,result,quality,importance,total_rating,elo_dif,qb_dif
0,New Orleans Saints,nor,Arizona Cardinals,crd,7,0.0,-0.833333,0.5,0.333333,4.166667,-8.666667,-6.166667,0.333333,1.0,1.0,4.5,1.333333,0.5,32.166667,-62.666667,0.166667,36.333333,1.333333,10.333333,-3.166667,-0.212121,0.071591,,21,41,31,17.608852,-97.231018
1,Tampa Bay Buccaneers,tam,Carolina Panthers,car,7,0.333333,7.333333,-0.833333,-0.666667,94.833333,11.166667,10.666667,0.666667,85.166667,0.833333,3.0,1.333333,-0.166667,-22.833333,201.166667,-1.666667,72.0,-0.333333,-3.333333,-9.666667,0.1,0.141938,,18,20,19,244.148052,211.276104
2,Atlanta Falcons,atl,Cincinnati Bengals,cin,7,0.0,-0.833333,0.666667,-0.166667,-89.5,-15.333333,-11.5,-1.0,-97.333333,-0.333333,1.333333,8.333333,0.833333,76.166667,-194.5,-1.333333,-13.333333,0.0,-1.166667,-7.833333,0.380952,-0.027849,,51,54,53,-87.034313,-60.361417
3,Detroit Lions,det,Dallas Cowboys,dal,7,-0.466667,4.733333,0.1,0.133333,78.4,6.566667,5.066667,1.366667,79.866667,-1.0,9.666667,0.466667,0.733333,32.566667,69.4,-0.1,110.966667,0.366667,0.333333,1.466667,-0.055556,0.048718,,45,10,28,-205.96636,-10.62439
4,New York Giants,nyg,Jacksonville Jaguars,jax,7,0.5,-1.333333,-0.333333,-0.166667,-66.833333,-6.833333,-4.166667,-0.666667,-60.833333,1.333333,-1.833333,4.5,0.333333,26.333333,49.166667,1.5,-40.5,-0.5,9.666667,6.0,0.115385,-0.026344,,29,53,41,69.946744,-7.614501


In [18]:
comp_games_df.head()

Unnamed: 0,away_name,away_abbr,home_name,home_abbr,week,win_perc_dif,first_downs_dif,fumbles_dif,interceptions_dif,net_pass_yards_dif,pass_attempts_dif,pass_completions_dif,pass_touchdowns_dif,pass_yards_dif,penalties_dif,points_dif,rush_attempts_dif,rush_touchdowns_dif,rush_yards_dif,time_of_possession_dif,times_sacked_dif,total_yards_dif,turnovers_dif,yards_from_penalties_dif,yards_lost_from_sacks_dif,fourth_down_perc_dif,third_down_perc_dif,result,quality,importance,total_rating,elo_dif,qb_dif
0,Los Angeles Chargers,sdg,Kansas City Chiefs,kan,2,0.0,-15.0,-4.0,0.0,-81.0,-5.0,-4.0,-2.0,-81.0,2.0,-20.0,4.0,-1.0,-52.0,-130.0,0.0,-133.0,-1.0,-4.0,0.0,-1.0,-0.196429,0.0,90,92,91,-135.251303,-54.582214
1,New York Jets,nyj,Cleveland Browns,cle,2,-1.0,1.0,1.0,1.0,159.0,25.0,19.0,0.0,162.0,-3.0,-17.0,-22.0,-1.0,-134.0,-356.0,2.0,25.0,2.0,10.0,3.0,0.25,-0.301587,1.0,9,38,24,-167.896418,-2.477047
2,Washington Commanders,was,Detroit Lions,det,2,1.0,3.0,0.0,1.0,100.0,4.0,6.0,2.0,98.0,-1.0,-7.0,0.0,-3.0,-96.0,307.0,0.0,4.0,2.0,-2.0,-2.0,-1.0,0.057143,0.0,21,63,42,81.076993,-16.155109
3,Indianapolis Colts,clt,Jacksonville Jaguars,jax,2,,9.0,4.0,0.0,80.0,8.0,8.0,0.0,77.0,-6.0,-2.0,20.0,0.0,54.0,790.0,0.0,134.0,1.0,-1.0,-3.0,0.0,0.15,0.0,26,60,43,197.967309,20.254966
4,Tampa Bay Buccaneers,tam,New Orleans Saints,nor,2,0.0,0.0,-1.0,1.0,-39.0,-7.0,-5.0,-1.0,-57.0,-3.0,-8.0,14.0,-1.0,1.0,386.0,-2.0,-38.0,0.0,-74.0,-18.0,0.0,0.049451,1.0,82,93,88,90.145645,74.081116


## Training a Model to Predict NFL Games

##### First fix the games that were 'tied' , as we cannot have Null values going into our logistic regression

In [39]:
comp_games_df['win_perc_dif'].fillna(0,inplace=True)

In [103]:
def display_prediction(y_pred,X_test):
    for g in range(len(y_pred)):
        win_prob = round(y_pred[g],2)
        away_team = X_test.reset_index().drop(columns = 'index').loc[g,'away_name']
        home_team = X_test.reset_index().drop(columns = 'index').loc[g,'home_name']
        print(f'The {away_team} have a probability of {round(win_prob*100,2)}% of beating the {home_team}.') 

In [97]:
## Running code from the prediction website
msk = np.random.rand(len(comp_games_df)) < 0.8

train_df = comp_games_df[msk]
test_df = comp_games_df[~msk]

X_train = train_df.drop(columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week','result'])
y_train = train_df[['result']] 
X_test = test_df.drop(columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week','result'])
y_test = test_df[['result']]

clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)
clf.fit(X_train, np.ravel(y_train.values))

y_pred = clf.predict_proba(X_test)
y_pred = y_pred[:,1]



In [104]:
display_prediction(y_pred,test_df)

The Indianapolis Colts have a probability of 12.0% of beating the Jacksonville Jaguars.
The Pittsburgh Steelers have a probability of 88.0% of beating the Cleveland Browns.
The Detroit Lions have a probability of 0.0% of beating the Minnesota Vikings.
The Philadelphia Eagles have a probability of 99.0% of beating the Washington Commanders.
The Minnesota Vikings have a probability of 99.0% of beating the New Orleans Saints.
The Los Angeles Chargers have a probability of 76.0% of beating the Houston Texans.
The New York Jets have a probability of 11.0% of beating the Pittsburgh Steelers.
The Pittsburgh Steelers have a probability of 21.0% of beating the Buffalo Bills.
The Tennessee Titans have a probability of 95.0% of beating the Washington Commanders.
The Dallas Cowboys have a probability of 31.0% of beating the Los Angeles Rams.
The Las Vegas Raiders have a probability of 68.0% of beating the Kansas City Chiefs.
The Washington Commanders have a probability of 4.0% of beating the Chica

In [105]:
accuracy_score(y_test,np.round(y_pred))

0.5333333333333333

## Now run model on unplayed games

In [116]:
X_pred = pred_games_df.drop(columns = ['away_name', 'away_abbr', 'home_name', 'home_abbr', 'week','result'])

In [118]:
y_pred_current = clf.predict_proba(X_pred)

In [119]:
y_pred_current = y_pred_current[:,1]

In [120]:
y_pred_current

array([0.18457058, 0.97892571, 0.81446934, 0.0489994 , 0.61426327,
       0.88137413, 0.34418307, 0.9601711 , 0.40293374, 0.24251018,
       0.6734478 , 0.15799469, 0.76782609, 0.92453205])

In [124]:
pred_games_df.shape

(14, 33)

In [123]:
display_prediction(y_pred,pred_games_df)

The New Orleans Saints have a probability of 12.0% of beating the Arizona Cardinals.
The Tampa Bay Buccaneers have a probability of 88.0% of beating the Carolina Panthers.
The Atlanta Falcons have a probability of 0.0% of beating the Cincinnati Bengals.
The Detroit Lions have a probability of 99.0% of beating the Dallas Cowboys.
The New York Giants have a probability of 99.0% of beating the Jacksonville Jaguars.
The Indianapolis Colts have a probability of 76.0% of beating the Tennessee Titans.
The Cleveland Browns have a probability of 11.0% of beating the Baltimore Ravens.
The Green Bay Packers have a probability of 21.0% of beating the Washington Commanders.
The New York Jets have a probability of 95.0% of beating the Denver Broncos.
The Houston Texans have a probability of 31.0% of beating the Las Vegas Raiders.
The Seattle Seahawks have a probability of 68.0% of beating the Los Angeles Chargers.
The Kansas City Chiefs have a probability of 4.0% of beating the San Francisco 49ers.


KeyError: 14