In [1]:
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw per-game data set from the working directory.
df = pd.read_csv(filepath_or_buffer='newData.csv')

In [3]:
# Discard the saved index column and the two team columns that are not used below.
unused_columns = ['Unnamed: 0', 'posteam', 'defteam']
df = df.drop(columns=unused_columns)

In [4]:
# The data contained 35 team abbreviations although the NFL has only 32 teams: some
# abbreviations changed over time while the franchise stayed the same. The next cell
# merges the aliases (SD=LAC=LA) and (JAX=JAC).
# Collect the sorted abbreviations seen on each side so they can be inspected side by side.
h_teams = sorted(df.home_team.unique())
a_teams = sorted(df.away_team.unique())
team_info = pd.DataFrame({'Home Teams': h_teams, 'Away Teams': a_teams})

In [5]:
# Collapse renamed abbreviations onto a single code per franchise.
# NOTE(review): 'LA' is folded into 'SD' here; if 'LA' denotes the Rams (formerly 'STL')
# in this data source, two different franchises are being merged - confirm against the data.
team_aliases = {'LAC': 'SD', 'LA': 'SD', 'JAC': 'JAX'}
df.home_team = df.home_team.map(lambda abbr: team_aliases.get(abbr, abbr))
df.away_team = df.away_team.map(lambda abbr: team_aliases.get(abbr, abbr))

In [6]:
# Our response column. Values = 1 if the home team scored strictly more points, else 0
# (so a tie, or a row with a missing score, is recorded as 0).
# The comparison is vectorized instead of looping over every row, and the column is
# assigned by name so re-running this cell overwrites 'Home_win' rather than appending
# a second column with the same name.
Home_win = (df.total_home_score > df.total_away_score).astype('int64').rename('Home_win')
df['Home_win'] = Home_win

In [7]:
# Replace each game ID by its leading four characters, converted to an integer year.
df.game_id = df.game_id.map(lambda raw_id: int(str(raw_id)[:4]))

In [8]:
# Unique team abbreviations, in order of first appearance as a home team.
teams = df.home_team.unique()
# Numeric codes 0-31, one per team (32 teams in total).
l = list(range(32))
# Abbreviation -> numeric code. zip() stops at the shorter input, so at most 32 teams
# receive a code.
dic = {team: code for team, code in zip(teams, l)}

In [9]:
# One {team: 0} win counter per distinct year, in the order the years appear in game_id.
total_wins_per_team_by_year = [dict.fromkeys(dic, 0) for _ in df.game_id.unique()]

In [10]:
# Get the total wins per team by year, and fill in the list.
# Ties are still present in df at this point and carry Home_win == 0, so they are
# filtered out first; otherwise every tie would be credited to the away team as a win.
for p, year in enumerate(df.game_id.unique()):
    temp_df = df[df.game_id == year]
    decided = temp_df[temp_df.total_home_score != temp_df.total_away_score]
    # Winner of each decided game: the home team where Home_win == 1, else the away team.
    winners = decided.home_team.where(decided.Home_win == 1, decided.away_team)
    for team, wins in winners.value_counts().items():
        total_wins_per_team_by_year[p][team] += int(wins)

In [11]:
# Here I am extracting information to create a feature column. Each value is the ratio of
# the home team's total wins last season to the away team's total wins last season.
# 2009 is the first year, so it has no previous season: every game in that year gets the
# neutral ratio 1. If the away team had zero wins last season the ratio is undefined, so
# 1.5 times the home team's wins is used instead.
#
# Each row looks up the previous year's counter directly. This replaces a while loop with
# manual year/position counters that never terminated if a row's year was lower than the
# counter, and that read the wrong season when the years were not consecutive.
first_year = 2009
wins_by_year = dict(zip(df.game_id.unique(), total_wins_per_team_by_year))
result = list()
for year, home, away in zip(df.game_id, df.home_team, df.away_team):
    last_season = wins_by_year.get(year - 1)
    if year == first_year or last_season is None:
        # No previous season available for this row.
        result.append(1)
    elif last_season[away] != 0:
        result.append(last_season[home] / last_season[away])
    else:
        result.append(last_season[home] * 1.5)

In [12]:
# Turn the ratios computed above into the "home_win_stat" feature column and attach it
# to df (the new Series has a default 0..n-1 index and is aligned on index).
home_win_stat = pd.Series(result, name='home_win_stat', dtype='float64')
df = pd.concat((df, home_win_stat), axis='columns')
df.head(3)

Unnamed: 0,game_id,home_team,away_team,yards_gained,total_home_score,total_away_score,punt_blocked,third_down_converted,fourth_down_converted,interception,...,punt_blocked_away,punt_inside_twenty_away,yards_gained_away,Pass_comp_percentage,Pass_comp_percentage_away,rush_attempt_away,qb_hit_away,third_down_converted_away,Home_win,home_win_stat
0,2009,PIT,TEN,357.0,13,10,0.0,4.0,0.0,2.0,...,0.0,2.0,320.0,0.804878,0.647059,25.0,7.0,4.0,1,1.0
1,2009,ATL,MIA,281.0,19,7,0.0,6.0,0.0,0.0,...,0.0,1.0,259.0,0.611111,0.724138,22.0,2.0,4.0,1,1.0
2,2009,BAL,KC,501.0,38,23,0.0,10.0,1.0,1.0,...,1.0,2.0,188.0,0.619048,0.666667,17.0,0.0,2.0,1,1.0


In [13]:
# Some rows list the same team as both home and away (data entry errors); remove them.
same_team = df['home_team'] == df['away_team']
df.drop(index=df.index[same_team], inplace=True)

In [14]:
# Remove games that ended in a tie.
# The mask is computed once; the former stand-alone index expression was never displayed
# (it was not the last statement of the cell) and had no effect.
tied = df.total_home_score == df.total_away_score
df.drop(index=df.index[tied], inplace=True)

In [15]:
# The year and the final scores have served their purpose (response and season feature).
df = df.drop(columns=['game_id', 'total_home_score', 'total_away_score'])

In [16]:
# Reorder the columns: identifiers and season feature first, then the twelve home-team
# stats, then the same twelve stats for the away team, and the response last.
per_team_stats = ['Pass_comp_percentage', 'rush_attempt', 'yards_gained',
                  'third_down_converted', 'fourth_down_converted', 'punt_inside_twenty',
                  'interception', 'qb_hit', 'sack', 'punt_blocked', 'own_kickoff_recovery',
                  'penalty_yards']
ordered_columns = (['home_team', 'away_team', 'home_win_stat']
                   + per_team_stats
                   + [stat + '_away' for stat in per_team_stats]
                   + ['Home_win'])
# Renumber the rows 0..n-1 after the row removals above.
df = df.reindex(columns=ordered_columns).reset_index(drop=True)

In [17]:
# 3-D list with shape 32 (number of teams) x (number of games each team played) x 12
# (feature columns per team per game).
team_stats = [[] for _ in range(32)]

# 2-D list to track the win (1) or loss (0) of each team in each game.
team_recent_win_count = [[] for _ in range(32)]

In [18]:
# Replace the team abbreviations in both columns by their numeric codes from dic.
for side in ('home_team', 'away_team'):
    df[side] = df[side].map(dic)

In [19]:
# New dataframe in which each row combines the values of the last three games of both
# teams. It also holds the number of recent wins for each team, i.e. how many of its last
# three games each team won. The response column shows who won the current game; data from
# the current game itself is not included in the row. Thus, we are predicting the winner
# of the current game strictly from each team's previous three games.
# Layout: every column of df except the last, then the two recent-win counters, then
# df's last column.
leading_columns = list(df.columns[:-1])
response_column = df.columns[-1]
df2 = pd.DataFrame(
    columns=leading_columns + ["Recent_wins", 'Recent_wins_away'] + [response_column])

In [20]:
# Fill the lists defined above, and add the rows to the newly created data frame (df2).
# Games are processed in row order. For each game, once enough history exists, one row is
# produced from the teams' previous three games; afterwards the current game's stats and
# result are appended to both teams' histories.
WARMUP_GAMES = 48    # the first 48 rows (i = 0..47) never produce an output row
MIN_PRIOR_GAMES = 3  # both teams need three recorded games before a row is produced

new_rows = []  # collected here and turned into a frame once, instead of growing df2 row by row
for i in range(0, len(df.index)):
    row_label = df.index[i]
    home_team = df.home_team[row_label]  # Which team (0-31) is the home team
    away_team = df.away_team[row_label]  # Which team (0-31) is the away team
    home_history = team_stats[home_team]
    away_history = team_stats[away_team]

    # Also require three recorded games per team: with fewer, the "last three" lookups
    # would wrap around to unrelated games or fail outright.
    if (i >= WARMUP_GAMES and len(home_history) >= MIN_PRIOR_GAMES
            and len(away_history) >= MIN_PRIOR_GAMES):
        # Column-wise sum of the 12 stats over the home team's last three games.
        last3Home = [a + b + c for a, b, c in zip(*home_history[-3:])]
        # Column-wise sum of the 12 stats over the away team's last three games.
        last3Away = [a + b + c for a, b, c in zip(*away_history[-3:])]

        # Number of wins (0-3) in the last three games of the home and the away team.
        last3HWs = sum(team_recent_win_count[home_team][-3:])
        last3AWs = sum(team_recent_win_count[away_team][-3:])

        # Summed stats of the last three games for each team, plus the result of the
        # current game to be used as the response for the predictive model.
        new_rows.append([home_team, away_team, df.home_win_stat[row_label]]
                        + last3Home + last3Away
                        + [last3HWs, last3AWs, df.Home_win[row_label]])

    # Store the current game's stats (columns 3-14 home, 15-26 away) for later rows.
    team_stats[home_team].append(list(df.iloc[i, 3:15]))
    team_stats[away_team].append(list(df.iloc[i, 15:27]))

    # Store the result of the current game for the recent-win counts of later rows.
    home_won = 1 if df.Home_win[row_label] == 1 else 0
    team_recent_win_count[home_team].append(home_won)
    team_recent_win_count[away_team].append(1 - home_won)

# Build the output frame in one step, keeping the column layout df2 was created with.
df2 = pd.DataFrame(new_rows, columns=df2.columns)

In [21]:
# The team codes are not used as features.
df2 = df2.drop(columns=['home_team', 'away_team'])
# The completion percentages were summed over three games; divide by 3 to get their mean.
for pct_column in ('Pass_comp_percentage', 'Pass_comp_percentage_away'):
    df2[pct_column] = df2[pct_column].apply(lambda total: total / 3)

In [22]:
# Reorder df2: season feature, then the home team's recent wins and twelve stats, then the
# away team's recent wins and the same twelve stats, and the response last.
per_team_stats = ['Pass_comp_percentage', 'rush_attempt', 'yards_gained',
                  'third_down_converted', 'fourth_down_converted', 'punt_inside_twenty',
                  'interception', 'qb_hit', 'sack', 'punt_blocked', 'own_kickoff_recovery',
                  'penalty_yards']
df2 = df2.reindex(columns=['home_win_stat', 'Recent_wins']
                          + per_team_stats
                          + ['Recent_wins_away']
                          + [stat + '_away' for stat in per_team_stats]
                          + ['Home_win'])
# Reset the index of df2, the frame being prepared here; the previous code reset df instead.
df2.reset_index(drop=True, inplace=True)

In [23]:
# A function to normalize all the columns.
def min_max_scaling(dfIn):
    """Return a copy of ``dfIn`` with every column min-max scaled to [0, 1].

    Each column is transformed as ``(x - min) / (max - min)``. A column whose
    values are all equal has a zero range; it is mapped to 0.0 instead of
    being divided by zero, which would turn the whole column into NaN.
    Missing values stay missing. The input frame is not modified.

    Parameters
    ----------
    dfIn : pandas.DataFrame
        Frame whose columns support subtraction and division.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``dfIn``.
    """
    dfIn_norm = dfIn.copy()
    for column in dfIn_norm.columns:
        col_min = dfIn_norm[column].min()
        col_range = dfIn_norm[column].max() - col_min
        shifted = dfIn_norm[column] - col_min
        if col_range == 0:
            # Constant column: every shifted value is already 0 (or NaN).
            dfIn_norm[column] = shifted.astype(float)
        else:
            dfIn_norm[column] = shifted / col_range
    return dfIn_norm

In [24]:
# Normalized copy of df2: every column passed through min_max_scaling.
df_normalized = df2.pipe(min_max_scaling)

In [25]:
# Preview the first five normalized rows.
df_normalized.head(n=5)

Unnamed: 0,home_win_stat,Recent_wins,Pass_comp_percentage,rush_attempt,yards_gained,third_down_converted,fourth_down_converted,punt_inside_twenty,interception,qb_hit,...,third_down_converted_away,fourth_down_converted_away,punt_inside_twenty_away,interception_away,qb_hit_away,sack_away,punt_blocked_away,own_kickoff_recovery_away,penalty_yards_away,Home_win
0,0.047619,0.666667,0.697985,0.301587,0.317489,0.419355,0.181818,0.333333,0.5,0.526316,...,0.517241,0.5,0.428571,0.454545,0.242424,0.25,0.0,0.0,0.441088,1.0
1,0.047619,0.0,0.633046,0.198413,0.13894,0.225806,0.0,0.466667,0.6,0.289474,...,0.482759,0.5,0.571429,0.363636,0.454545,0.5,0.0,0.5,0.362538,0.0
2,0.047619,0.333333,0.641071,0.198413,0.349787,0.322581,0.272727,0.2,0.2,0.368421,...,0.344828,0.25,0.285714,0.363636,0.242424,0.25,0.0,0.0,0.305136,1.0
3,0.047619,1.0,0.713304,0.269841,0.487508,0.387097,0.090909,0.133333,0.2,0.5,...,0.551724,0.25,0.214286,0.363636,0.424242,0.45,0.0,0.0,0.23565,1.0
4,0.047619,0.333333,0.416452,0.309524,0.348568,0.483871,0.272727,0.266667,0.1,0.289474,...,0.344828,0.0,0.214286,0.363636,0.393939,0.35,0.0,0.0,0.21148,1.0


In [29]:
# Capitalize every column name (first character upper-case, the rest lower-case).
df_normalized = df_normalized.rename(columns=str.capitalize)

In [30]:
# Save the new data frame as a '.csv' file. The row index is written too, as the first
# (unnamed) column of the file.
output_path = 'normData.csv'
df_normalized.to_csv(output_path)