In [1]:
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('newData.csv')

In [3]:
df.drop(columns=['Unnamed: 0', 'posteam', 'defteam'], inplace=True)

In [4]:
df.home_team = df.home_team.apply(lambda x: 'SD' if x in ['LAC', 'LA'] else 'JAX' if x=='JAC' else x)
df.away_team = df.away_team.apply(lambda x: 'SD' if x in ['LAC', 'LA'] else 'JAX' if x=='JAC' else x)

In [5]:
# I noticed we had 35 teams but there are only 32 in the NFL. Some team abbreviations had changed
# over time, but they're still the same team. The next cell fixes the issue. (SD=LAC=LA) and (JAX=JAC) 
h_teams = sorted(list(df.home_team.unique()))
a_teams = sorted(list(df.away_team.unique()))
team_info=pd.DataFrame(data={'Home Teams': h_teams, 'Away Teams': a_teams})

In [6]:
# Our response column. Values = 1 if home team wins else 0.
result = list()
for i in df.index:
    if df.total_home_score[i] > df.total_away_score[i]:
        result.append(1)
    else:
        result.append(0)
Home_win = pd.Series(data=result, index=df.index, name='Home_win')
df = pd.concat([df, Home_win], axis=1)

In [7]:
df.columns

Index(['game_id', 'home_team', 'away_team', 'game_date', 'drive',
       'yards_gained', 'total_home_score', 'total_away_score', 'punt_blocked',
       'first_down_rush', 'first_down_pass', 'third_down_converted',
       'third_down_failed', 'fourth_down_converted', 'fourth_down_failed',
       'interception', 'punt_inside_twenty', 'punt_in_endzone',
       'kickoff_inside_twenty', 'kickoff_in_endzone', 'solo_tackle',
       'tackled_for_loss', 'own_kickoff_recovery', 'rush_attempt', 'sack',
       'punt_attempt', 'fumble', 'fourth_down_converted_away',
       'interception_away', 'own_kickoff_recovery_away', 'sack_away',
       'punt_blocked_away', 'punt_inside_twenty_away', 'punt_in_endzone_away',
       'yards_gained_away', 'Pass_comp_percentage',
       'Pass_comp_percentage_away', 'rush_attempt_away',
       'third_down_converted_away', 'away_drive', 'away_first_down_rush',
       'away_first_down_pass', 'away_third_down_failed',
       'away_fourth_down_failed', 'away_kickoff_ins

In [8]:
# Extract the year from the game ID.
df.game_date = df.game_id.apply(lambda x: int(str(x)[:4]))

In [9]:
# create list of unique NFL teams
teams = list(set(h_teams).union(set(a_teams)))
assert len(teams) == 32, len(teams)
# There are 32 teams total, so I map them to a numeric value, 0-31. 
l = [i for i in range(0, 32)]
dic = dict(zip(teams, l))

In [10]:
# A list to track each teams wins by year.
total_wins_per_team_by_year: dict[int, dict[str, int]] = dict((year,dict((key, value) for key, value in zip(
                                        dic.keys(), [0]*len(dic.keys())))) for year in df.game_date.unique())

In [11]:
# Get the total wins per team by year, and fill in the list.
for year in df.game_date.unique():
    temp_df = df[df.game_date==year]
    for i in temp_df.index:
        if temp_df.Home_win[i] == 1:
            total_wins_per_team_by_year[year][temp_df.home_team[i]] += 1
        else:
            total_wins_per_team_by_year[year][temp_df.away_team[i]] += 1 

In [12]:
# Here I am extreacting information to create a feature column. The values will be the ratio
# of the last seasons total wins for the home team against the away team. Since 2009 is the first
# year I will just set every value in that year to the average of all other years. 
result = []
for year in df.game_date.unique():
    df_temp = df[df.game_date==year]
    if not (year == 2009): 
        for i in df_temp.index:
            h_team, a_team = df_temp.home_team[i], df_temp.away_team[i]
            h_wins, a_wins = total_wins_per_team_by_year[year-1][h_team], total_wins_per_team_by_year[year-1][a_team]
            if not (a_wins == 0):
                result.append({'team': h_team, 'year': year, 'ratio': h_wins/a_wins})
            else: 
                result.append({'team': h_team, 'year': year, 'ratio': int(h_wins * 1.5)})
    else:
        for i in df_temp.index:
            h_team, a_team = df_temp.home_team[i], df_temp.away_team[i]
            result.append({'team': h_team, 'year': year, 'ratio': 1.0})

# Create a pd.Series from the list of dictionaries.
ratio = pd.Series(data=result, index=df.index, name='home_win_stat')
df = pd.concat([df, ratio], axis=1)
        



In [13]:
# There were some data entry errors such that the home team was also the away team. I removed
# these rows.
df.drop(df.loc[df['home_team']==df['away_team']].index, inplace=True)

In [14]:
# Remove games that ended in a tie.
idxs = df.loc[df.total_home_score == df.total_away_score].index
df.drop(idxs, axis=0, inplace=True)

In [15]:
df.drop(columns=['game_id', 'game_date', 'total_home_score', 'total_away_score'], inplace=True)

In [16]:
[column for column in df.columns if 'away' in column]

['away_team',
 'fourth_down_converted_away',
 'interception_away',
 'own_kickoff_recovery_away',
 'sack_away',
 'punt_blocked_away',
 'punt_inside_twenty_away',
 'punt_in_endzone_away',
 'yards_gained_away',
 'Pass_comp_percentage_away',
 'rush_attempt_away',
 'third_down_converted_away',
 'away_drive',
 'away_first_down_rush',
 'away_first_down_pass',
 'away_third_down_failed',
 'away_fourth_down_failed',
 'away_kickoff_inside_twenty',
 'away_kickoff_in_endzone',
 'away_solo_tackle',
 'away_tackled_for_loss',
 'away_punt_attempt',
 'away_fumble']

In [17]:
# Reorder the columns.
df = df.reindex(columns=['home_team', 'away_team',
       
       'home_win_stat', 'drive', 'yards_gained','punt_blocked',
       'first_down_rush', 'first_down_pass', 'third_down_converted',
       'third_down_failed', 'fourth_down_converted', 'fourth_down_failed',
       'interception', 'punt_inside_twenty', 'punt_in_endzone',
       'kickoff_inside_twenty', 'kickoff_in_endzone', 'solo_tackle',
       'tackled_for_loss', 'own_kickoff_recovery', 'rush_attempt', 'sack',
       'punt_attempt', 'fumble','Pass_comp_percentage',
        
        
       'away_drive', 'yards_gained_away','punt_blocked_away','away_first_down_rush',
       'away_first_down_pass','third_down_converted_away', 'away_third_down_failed', 'fourth_down_converted_away',
       'away_fourth_down_failed','interception_away','punt_inside_twenty_away', 'punt_in_endzone_away','away_kickoff_inside_twenty',
       'away_kickoff_in_endzone', 'away_solo_tackle', 'away_tackled_for_loss', 'own_kickoff_recovery_away',
       'rush_attempt_away','sack_away','away_punt_attempt','away_fumble','Pass_comp_percentage_away', 
       
       'Home_win'])
df.reset_index(drop=True, inplace=True)

In [18]:
# 3-D list with shape 32 (number of teams) X x (nuber of games each team played) X 8 (feature 
                                                                        # columns per team per game)  
team_stats = [[] for i in range(0, 32)]

# 2-D list to track the win (1) or loss (0) of each team each game.
team_recent_win_count = [[] for i in range(0, 32)]

In [19]:
# Transform home_team and away_team into numeric columns.
df.home_team = df.home_team.map(dic)
df.away_team = df.away_team.map(dic)

In [20]:
# New dataframe where I will combine the values of the last three games for every team into a
# single row. I will add the amount of recent wins for each team i.e, how many times did each
# team win in their last 3 games. The responce column will show who won the current game, data from
# the current game is not included in the current row. Thus, we are predicting who will win the
# current game based strictly on information from the previous three games for each team. 
df2 = pd.DataFrame(columns = np.insert(df.columns, [2,-1], ["Recent_wins", 'Recent_wins_away']))

In [21]:
df.columns

Index(['home_team', 'away_team', 'home_win_stat', 'drive', 'yards_gained',
       'punt_blocked', 'first_down_rush', 'first_down_pass',
       'third_down_converted', 'third_down_failed', 'fourth_down_converted',
       'fourth_down_failed', 'interception', 'punt_inside_twenty',
       'punt_in_endzone', 'kickoff_inside_twenty', 'kickoff_in_endzone',
       'solo_tackle', 'tackled_for_loss', 'own_kickoff_recovery',
       'rush_attempt', 'sack', 'punt_attempt', 'fumble',
       'Pass_comp_percentage', 'away_drive', 'yards_gained_away',
       'punt_blocked_away', 'away_first_down_rush', 'away_first_down_pass',
       'third_down_converted_away', 'away_third_down_failed',
       'fourth_down_converted_away', 'away_fourth_down_failed',
       'interception_away', 'punt_inside_twenty_away', 'punt_in_endzone_away',
       'away_kickoff_inside_twenty', 'away_kickoff_in_endzone',
       'away_solo_tackle', 'away_tackled_for_loss',
       'own_kickoff_recovery_away', 'rush_attempt_away', 'sac

In [22]:
home_cols=[col for col in df.columns[3:-1] if not 'away' in col]
away_cols=[col for col in df.columns[3:-1] if 'away' in col]
set(df.columns).difference(set(home_cols+away_cols))

{'Home_win', 'away_team', 'home_team', 'home_win_stat'}

In [23]:
len(away_cols)

22

In [24]:
# Fill the lists defined above, and add the rows to the newly created data frame (df2). 
all_data = []
for i in range(0, len(df)):
    idx = i
    home_team = df.home_team[idx] # Which team (0-31 is the home team)
    away_team = df.away_team[idx] # Which team (0-31 is the away team)
    
    if i > 47: # Dont add any game stats to the new data frame until each team has played 3 games.
                    # begin adding data on the first instance of a teams fourth game.
        length1 = len(team_stats[home_team])
        # Get recent win count (0-3) for the home team
        last3Home = [team_stats[home_team][length1-3][p] + team_stats[home_team][
                length1-2][p] + team_stats[home_team][length1-1][p] for p in range(22)]
        length2 = len(team_stats[away_team])
        # Get recent win count (0-3) for the away team
        last3Away = [team_stats[away_team][length2-3][p] + team_stats[away_team][
                length2-2][p] + team_stats[away_team][length2-1][p] for p in range(22)]
        
        length3 = len(team_recent_win_count[home_team])
        # Record number of recent wins for the current home team.
        last3HWs = sum(team_recent_win_count[home_team][length3-3:])
        length4 = len(team_recent_win_count[away_team])
        # Record number of recent wins for the current away team.
        last3AWs = sum(team_recent_win_count[away_team][length4-3:])
        
        # Add the stats of the last three games (summed) for each team to the dataframe as well
            # as the result of the current game to be used for the predictive model.
        all_data.append([home_team, away_team, last3HWs, df.home_win_stat[i]['ratio']  # type: ignore
                                  ] + last3Home + last3Away + [last3AWs, df.Home_win[idx]])
    # Store current game stats to be added to the data frame at the teams next "4th" game.
    team_stats[home_team].append(list(df[home_cols].iloc[idx])) # len = 22
    team_stats[away_team].append(list(df[away_cols].iloc[idx])) # len = 22

    # Store the result of the current game to be used in the recent win count at the teams
        # next "4th" game.
    if df.Home_win[idx] == 1:
        team_recent_win_count[home_team].append(1)
        team_recent_win_count[away_team].append(0)
    else:
        team_recent_win_count[home_team].append(0)
        team_recent_win_count[away_team].append(1)    

In [25]:
# Fill the rows of df2 with the data from all_data.
df2 = pd.DataFrame(np.array(all_data), columns = df2.columns)

In [28]:
df2.head()

Unnamed: 0,home_team,away_team,Recent_wins,home_win_stat,drive,yards_gained,punt_blocked,first_down_rush,first_down_pass,third_down_converted,third_down_failed,fourth_down_converted,fourth_down_failed,interception,punt_inside_twenty,punt_in_endzone,kickoff_inside_twenty,kickoff_in_endzone,solo_tackle,tackled_for_loss,own_kickoff_recovery,rush_attempt,sack,punt_attempt,fumble,Pass_comp_percentage,away_drive,yards_gained_away,punt_blocked_away,away_first_down_rush,away_first_down_pass,third_down_converted_away,away_third_down_failed,fourth_down_converted_away,away_fourth_down_failed,interception_away,punt_inside_twenty_away,punt_in_endzone_away,away_kickoff_inside_twenty,away_kickoff_in_endzone,away_solo_tackle,away_tackled_for_loss,own_kickoff_recovery_away,rush_attempt_away,sack_away,away_punt_attempt,away_fumble,Pass_comp_percentage_away,Recent_wins_away,Home_win
0,18.0,7.0,2.0,1.0,34.0,947.0,0.0,9.0,36.0,15.0,26.0,2.0,2.0,5.0,5.0,0.0,0.0,5.0,128.0,11.0,0.0,77.0,5.0,14.0,4.0,2.049469,31.0,877.0,0.0,19.0,28.0,18.0,27.0,4.0,0.0,5.0,6.0,0.0,5.0,4.0,145.0,12.0,0.0,90.0,5.0,15.0,5.0,1.696779,1.0,1.0
1,5.0,31.0,0.0,1.0,34.0,654.0,0.0,8.0,27.0,9.0,29.0,0.0,1.0,6.0,7.0,0.0,3.0,8.0,135.0,8.0,0.0,64.0,11.0,16.0,5.0,1.956777,31.0,901.0,0.0,16.0,34.0,17.0,24.0,4.0,1.0,4.0,8.0,0.0,2.0,3.0,111.0,7.0,1.0,80.0,7.0,16.0,3.0,1.932246,2.0,0.0
2,19.0,12.0,1.0,1.0,32.0,1000.0,0.0,11.0,39.0,12.0,24.0,3.0,1.0,2.0,3.0,0.0,2.0,10.0,106.0,10.0,0.0,64.0,4.0,16.0,6.0,1.968232,31.0,669.0,0.0,15.0,19.0,13.0,24.0,2.0,1.0,4.0,4.0,0.0,2.0,3.0,115.0,4.0,0.0,80.0,6.0,15.0,6.0,1.389912,1.0,1.0
3,3.0,24.0,3.0,1.0,29.0,1226.0,0.0,13.0,41.0,14.0,18.0,1.0,3.0,2.0,2.0,0.0,0.0,3.0,107.0,6.0,0.0,73.0,2.0,9.0,2.0,2.071335,36.0,1075.0,0.0,16.0,41.0,19.0,27.0,2.0,2.0,4.0,3.0,0.0,1.0,3.0,150.0,7.0,0.0,85.0,4.0,14.0,5.0,1.964945,1.0,1.0
4,23.0,29.0,1.0,1.0,30.0,998.0,0.0,22.0,27.0,17.0,25.0,3.0,2.0,1.0,4.0,0.0,3.0,4.0,102.0,7.0,0.0,78.0,5.0,12.0,7.0,1.647619,40.0,1055.0,0.0,17.0,33.0,13.0,25.0,0.0,3.0,4.0,3.0,0.0,2.0,6.0,114.0,14.0,0.0,81.0,4.0,17.0,4.0,1.73188,0.0,1.0


In [29]:
df2.drop(columns=['home_team', 'away_team'], inplace=True)
df2.Pass_comp_percentage = df2.Pass_comp_percentage.apply(lambda x: x / 3)
df2.Pass_comp_percentage_away = df2.Pass_comp_percentage_away.apply(lambda x: x / 3)

In [30]:
df2.columns

Index(['Recent_wins', 'home_win_stat', 'drive', 'yards_gained', 'punt_blocked',
       'first_down_rush', 'first_down_pass', 'third_down_converted',
       'third_down_failed', 'fourth_down_converted', 'fourth_down_failed',
       'interception', 'punt_inside_twenty', 'punt_in_endzone',
       'kickoff_inside_twenty', 'kickoff_in_endzone', 'solo_tackle',
       'tackled_for_loss', 'own_kickoff_recovery', 'rush_attempt', 'sack',
       'punt_attempt', 'fumble', 'Pass_comp_percentage', 'away_drive',
       'yards_gained_away', 'punt_blocked_away', 'away_first_down_rush',
       'away_first_down_pass', 'third_down_converted_away',
       'away_third_down_failed', 'fourth_down_converted_away',
       'away_fourth_down_failed', 'interception_away',
       'punt_inside_twenty_away', 'punt_in_endzone_away',
       'away_kickoff_inside_twenty', 'away_kickoff_in_endzone',
       'away_solo_tackle', 'away_tackled_for_loss',
       'own_kickoff_recovery_away', 'rush_attempt_away', 'sack_away',
  

In [27]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [31]:
# A function to normalize all the columns.
def min_max_scaling(dfIn):
    dfIn_norm = dfIn.copy(deep=True)
    for column in dfIn_norm.columns[:-1]:
        dfIn_norm[column] = (dfIn_norm[column] / dfIn_norm[column].max()) * 2 -1 
    return dfIn_norm

In [32]:
# Create new data frame containing all the normalized values.
df_normalized = min_max_scaling(df2)

In [36]:
df_normalized.tail()

Unnamed: 0,Recent_wins,Home_win_stat,Drive,Yards_gained,Punt_blocked,First_down_rush,First_down_pass,Third_down_converted,Third_down_failed,Fourth_down_converted,Fourth_down_failed,Interception,Punt_inside_twenty,Punt_in_endzone,Kickoff_inside_twenty,Kickoff_in_endzone,Solo_tackle,Tackled_for_loss,Own_kickoff_recovery,Rush_attempt,Sack,Punt_attempt,Fumble,Pass_comp_percentage,Away_drive,Yards_gained_away,Punt_blocked_away,Away_first_down_rush,Away_first_down_pass,Third_down_converted_away,Away_third_down_failed,Fourth_down_converted_away,Away_fourth_down_failed,Interception_away,Punt_inside_twenty_away,Punt_in_endzone_away,Away_kickoff_inside_twenty,Away_kickoff_in_endzone,Away_solo_tackle,Away_tackled_for_loss,Own_kickoff_recovery_away,Rush_attempt_away,Sack_away,Away_punt_attempt,Away_fumble,Pass_comp_percentage_away,Recent_wins_away,Home_win
2458,0.333333,-0.936508,0.478261,0.118529,-1.0,-0.576923,0.240506,-0.212121,0.318182,-0.636364,-0.142857,-0.2,-0.2,-1.0,-1.0,-1.0,0.434343,-0.565217,-1.0,-0.212121,-0.52,0.071429,-0.384615,0.760228,0.478261,0.369484,-1.0,-0.176471,0.111111,0.375,0.142857,-0.75,-0.75,-0.636364,-0.428571,-1.0,-0.6,-0.846154,0.536082,-0.130435,-1.0,0.246914,0.272727,-0.153846,0.166667,0.931701,1.0,1.0
2459,-0.333333,-0.942857,0.347826,0.144654,-1.0,-0.384615,0.139241,-0.272727,0.136364,-0.818182,-0.428571,-0.2,-0.2,-1.0,-0.333333,-0.571429,0.373737,-0.391304,-1.0,-0.054545,-0.2,0.142857,-0.230769,0.502324,0.304348,0.067517,-1.0,0.098039,-0.407407,-0.1875,0.0,-0.25,-1.0,-0.818182,-0.428571,-1.0,-1.0,-0.846154,-0.103093,-0.304348,-1.0,0.222222,-0.363636,-0.076923,-0.666667,0.53189,1.0,1.0
2460,-1.0,-0.904762,0.26087,0.164006,-1.0,-0.538462,0.291139,-0.151515,-0.045455,-0.636364,-0.714286,-0.2,-0.2,-1.0,-1.0,-1.0,0.090909,-0.826087,-1.0,-0.369697,-0.68,-0.285714,-0.692308,0.768053,0.347826,0.477937,-1.0,-0.019608,0.160494,0.3125,0.0,-0.5,-1.0,-0.818182,-0.428571,-1.0,-1.0,-0.846154,0.350515,-0.826087,-1.0,0.296296,-0.818182,-0.153846,-0.833333,0.665477,0.333333,1.0
2461,0.333333,-0.863946,0.26087,-0.118529,-1.0,-0.269231,-0.063291,-0.151515,-0.045455,-0.454545,-0.428571,0.2,-0.866667,-1.0,-0.555556,-0.285714,0.090909,-0.565217,-1.0,-0.248485,-0.2,-0.357143,-0.692308,0.523523,0.304348,0.104732,-1.0,-0.411765,0.012346,-0.3125,0.047619,-0.5,-0.75,-0.818182,-0.142857,-1.0,-1.0,-0.538462,0.309278,-0.043478,-1.0,-0.037037,-0.545455,-0.076923,-0.333333,0.718397,0.333333,0.0
2462,-1.0,-0.896104,0.26087,0.27044,-1.0,-0.076923,0.113924,-0.333333,-0.136364,-0.636364,-0.142857,0.2,-0.6,-1.0,-0.555556,-0.428571,0.282828,-0.652174,-1.0,-0.139394,-0.6,-0.5,-0.076923,0.807474,0.26087,-0.162148,-1.0,-0.254902,-0.308642,-0.125,-0.047619,-0.75,-0.75,-0.454545,-0.571429,-1.0,-0.8,-0.538462,0.113402,-0.478261,-1.0,0.0,-0.545455,-0.230769,-0.333333,0.774237,0.333333,0.0


In [35]:
df_normalized.rename(columns=lambda x: str.capitalize(x), inplace=True)

In [37]:
# Save new data frame as '.csv' file.
df_normalized.to_csv('normData.csv')

In [38]:
df_normalized.shape

(2463, 48)

In [6]:
dat = pd.read_csv('normData.csv')
dat.drop(columns=['Unnamed: 0'], inplace=True)
dat.head()


Unnamed: 0,Recent_wins,Home_win_stat,Drive,Yards_gained,Punt_blocked,First_down_rush,First_down_pass,Third_down_converted,Third_down_failed,Fourth_down_converted,...,Away_solo_tackle,Away_tackled_for_loss,Own_kickoff_recovery_away,Rush_attempt_away,Sack_away,Away_punt_attempt,Away_fumble,Pass_comp_percentage_away,Recent_wins_away,Home_win
0,0.333333,-0.904762,0.478261,-0.083696,-1.0,-0.653846,-0.088608,-0.090909,0.181818,-0.636364,...,0.494845,0.043478,-1.0,0.111111,-0.545455,0.153846,-0.166667,0.380354,-0.333333,1.0
1,-1.0,-0.904762,0.478261,-0.367199,-1.0,-0.692308,-0.316456,-0.454545,0.318182,-1.0,...,0.14433,-0.391304,0.0,-0.012346,-0.363636,0.230769,-0.5,0.57191,0.333333,0.0
2,-0.333333,-0.904762,0.391304,-0.032414,-1.0,-0.576923,-0.012658,-0.272727,0.090909,-0.454545,...,0.185567,-0.652174,-1.0,-0.012346,-0.454545,0.153846,0.0,0.130714,-0.333333,1.0
3,1.0,-0.904762,0.26087,0.18626,-1.0,-0.5,0.037975,-0.151515,-0.181818,-0.818182,...,0.546392,-0.391304,-1.0,0.049383,-0.636364,0.076923,-0.166667,0.598512,-0.333333,1.0
4,-0.333333,-0.904762,0.304348,-0.034349,-1.0,-0.153846,-0.316456,0.030303,0.136364,-0.454545,...,0.175258,0.217391,-1.0,0.0,-0.636364,0.307692,-0.333333,0.40891,-1.0,1.0


In [7]:
dat_arr = dat.to_numpy(dtype=np.float32, copy=True)
dat_arr.shape

(2463, 48)

In [8]:
np.save('normData.npy', dat_arr)