# Major Leagues

In [1]:
import numpy as np
import pandas as pd

## Exploratory Data Analysis and Feature Engineering

In [2]:
# Load the games CSV into a DataFrame; the path is relative to the current working directory.
games_df = pd.read_csv('../data/nfl_games.csv')
games_df

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,score2,result1
0,1920-09-26,1920,0,0,RII,STP,1503.947000,1300.000000,0.824651,48,0,1.0
1,1920-10-03,1920,0,0,AKR,WHE,1503.420000,1300.000000,0.824212,43,0,1.0
2,1920-10-03,1920,0,0,RCH,ABU,1503.420000,1300.000000,0.824212,10,0,1.0
3,1920-10-03,1920,0,0,DAY,COL,1493.002000,1504.908000,0.575819,14,0,1.0
4,1920-10-03,1920,0,0,RII,MUN,1516.108000,1478.004000,0.644171,45,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16269,2019-01-13,2018,0,1,NE,LAC,1640.171960,1647.624483,0.582068,41,28,1.0
16270,2019-01-13,2018,0,1,NO,PHI,1669.105861,1633.114673,0.641378,20,14,1.0
16271,2019-01-20,2018,0,1,NO,LAR,1682.450194,1648.424105,0.638772,23,26,0.0
16272,2019-01-20,2018,0,1,KC,NE,1675.286412,1661.668566,0.611248,31,37,0.0


The date here is formatted as a string. Parsing it into a standard Python date object will make it convenient to analyze, extract additional features, and further reformat.

In [3]:
import datetime

def parse_date(s: str):
    """Turn a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Leading and trailing whitespace is ignored. If the text does not split
    into exactly three ``-``-separated fields, ``None`` is returned.

    Raises:
        ValueError: if a field is not an integer, or the three integers do
            not form a valid calendar date.
    """
    fields = s.strip().split('-')
    if len(fields) == 3:
        year, month, day = (int(field) for field in fields)
        return datetime.date(year, month, day)
    return None
        

# Swap each date string for the date object produced by parse_date.
games_df['date'] = games_df['date'].map(parse_date)
games_df

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,score2,result1
0,1920-09-26,1920,0,0,RII,STP,1503.947000,1300.000000,0.824651,48,0,1.0
1,1920-10-03,1920,0,0,AKR,WHE,1503.420000,1300.000000,0.824212,43,0,1.0
2,1920-10-03,1920,0,0,RCH,ABU,1503.420000,1300.000000,0.824212,10,0,1.0
3,1920-10-03,1920,0,0,DAY,COL,1493.002000,1504.908000,0.575819,14,0,1.0
4,1920-10-03,1920,0,0,RII,MUN,1516.108000,1478.004000,0.644171,45,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16269,2019-01-13,2018,0,1,NE,LAC,1640.171960,1647.624483,0.582068,41,28,1.0
16270,2019-01-13,2018,0,1,NO,PHI,1669.105861,1633.114673,0.641378,20,14,1.0
16271,2019-01-20,2018,0,1,NO,LAR,1682.450194,1648.424105,0.638772,23,26,0.0
16272,2019-01-20,2018,0,1,KC,NE,1675.286412,1661.668566,0.611248,31,37,0.0


While we have many opportunities with the date feature, I'd like to start by reducing it into a numerical format. Normally, I would choose the "Unix time" format for this use case. However, Unix time is negative for any date preceding 1970, which is awkward to work with and not handled gracefully by every tool. So, instead, I will simply count the seconds since the earliest date which appears in the dataset.

In [13]:
# Reference point: the earliest date present in the data.
first_game = min(games_df['date'])

# Elapsed time from that reference point to each game, expressed in seconds.
games_df['seconds'] = [
    (game_day - first_game).total_seconds()
    for game_day in games_df['date']
]
games_df

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,score2,result1,seconds
0,1920-09-26,1920,0,0,RII,STP,1503.947000,1300.000000,0.824651,48,0,1.0,0.000000e+00
1,1920-10-03,1920,0,0,AKR,WHE,1503.420000,1300.000000,0.824212,43,0,1.0,6.048000e+05
2,1920-10-03,1920,0,0,RCH,ABU,1503.420000,1300.000000,0.824212,10,0,1.0,6.048000e+05
3,1920-10-03,1920,0,0,DAY,COL,1493.002000,1504.908000,0.575819,14,0,1.0,6.048000e+05
4,1920-10-03,1920,0,0,RII,MUN,1516.108000,1478.004000,0.644171,45,0,1.0,6.048000e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16269,2019-01-13,2018,0,1,NE,LAC,1640.171960,1647.624483,0.582068,41,28,1.0,3.102019e+09
16270,2019-01-13,2018,0,1,NO,PHI,1669.105861,1633.114673,0.641378,20,14,1.0,3.102019e+09
16271,2019-01-20,2018,0,1,NO,LAR,1682.450194,1648.424105,0.638772,23,26,0.0,3.102624e+09
16272,2019-01-20,2018,0,1,KC,NE,1675.286412,1661.668566,0.611248,31,37,0.0,3.102624e+09


## Data Preprocessing

Now, we need to remove or transform the columns which the modeling algorithm can't digest - namely the non-numerical values. First, we need to one-hot encode the two team labels. Both features have the same set of values, so we differentiate the newly introduced columns by prefixing each column name with the team slot it represents (`team1_` or `team2_`).

In [26]:
# One indicator column per team code, built separately for each side of the matchup.
# The dtype is pinned to uint8 so the indicators are numeric 0/1 on every pandas
# version: pandas >= 2.0 defaults to bool, which would no longer match the 0/1
# values shown below and would stop the frame from being purely numeric.
team1 = pd.get_dummies(games_df['team1'], dtype='uint8')
team2 = pd.get_dummies(games_df['team2'], dtype='uint8')

def prepend(x):
    """Return a one-argument function that puts ``x`` in front of its argument (``x + y``)."""
    def add_prefix(y):
        return x + y
    return add_prefix

def listmap(f, l):
    """Apply ``f`` to every item of ``l`` and collect the results in a list."""
    return [f(item) for item in l]

# Column names for the indicator frames, tagged with the side of the matchup they describe.
team1_labels = ["team1_" + code for code in team1.columns]
team2_labels = ["team2_" + code for code in team2.columns]

# Attach both sets of indicator columns to the main frame under their prefixed names.
games_df[team1_labels] = team1
games_df[team2_labels] = team2
games_df

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,...,team2_THO,team2_TLM,team2_TOL,team2_TON,team2_TOR,team2_UTI,team2_WBU,team2_WHE,team2_WJA,team2_WSH
0,1920-09-26,1920,0,0,RII,STP,1503.947000,1300.000000,0.824651,48,...,0,0,0,0,0,0,0,0,0,0
1,1920-10-03,1920,0,0,AKR,WHE,1503.420000,1300.000000,0.824212,43,...,0,0,0,0,0,0,0,1,0,0
2,1920-10-03,1920,0,0,RCH,ABU,1503.420000,1300.000000,0.824212,10,...,0,0,0,0,0,0,0,0,0,0
3,1920-10-03,1920,0,0,DAY,COL,1493.002000,1504.908000,0.575819,14,...,0,0,0,0,0,0,0,0,0,0
4,1920-10-03,1920,0,0,RII,MUN,1516.108000,1478.004000,0.644171,45,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16269,2019-01-13,2018,0,1,NE,LAC,1640.171960,1647.624483,0.582068,41,...,0,0,0,0,0,0,0,0,0,0
16270,2019-01-13,2018,0,1,NO,PHI,1669.105861,1633.114673,0.641378,20,...,0,0,0,0,0,0,0,0,0,0
16271,2019-01-20,2018,0,1,NO,LAR,1682.450194,1648.424105,0.638772,23,...,0,0,0,0,0,0,0,0,0,0
16272,2019-01-20,2018,0,1,KC,NE,1675.286412,1661.668566,0.611248,31,...,0,0,0,0,0,0,0,0,0,0


The season feature could also use some work. Currently, it ranges from 1920 to 2018. Starting at 0 tends to achieve better results (it certainly won't hurt).

In [None]:
# TODO: shift the 'season' column so that it starts at 0 instead of 1920.

Now, we delete the unused columns.

In [27]:
# Remove the raw columns that have been superseded by the engineered numeric
# features ('seconds' and the one-hot team indicators).
for unused_column in ('date', 'team1', 'team2'):
    del games_df[unused_column]
games_df

Unnamed: 0,season,neutral,playoff,elo1,elo2,elo_prob1,score1,score2,result1,seconds,...,team2_THO,team2_TLM,team2_TOL,team2_TON,team2_TOR,team2_UTI,team2_WBU,team2_WHE,team2_WJA,team2_WSH
0,1920,0,0,1503.947000,1300.000000,0.824651,48,0,1.0,0.000000e+00,...,0,0,0,0,0,0,0,0,0,0
1,1920,0,0,1503.420000,1300.000000,0.824212,43,0,1.0,6.048000e+05,...,0,0,0,0,0,0,0,1,0,0
2,1920,0,0,1503.420000,1300.000000,0.824212,10,0,1.0,6.048000e+05,...,0,0,0,0,0,0,0,0,0,0
3,1920,0,0,1493.002000,1504.908000,0.575819,14,0,1.0,6.048000e+05,...,0,0,0,0,0,0,0,0,0,0
4,1920,0,0,1516.108000,1478.004000,0.644171,45,0,1.0,6.048000e+05,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16269,2018,0,1,1640.171960,1647.624483,0.582068,41,28,1.0,3.102019e+09,...,0,0,0,0,0,0,0,0,0,0
16270,2018,0,1,1669.105861,1633.114673,0.641378,20,14,1.0,3.102019e+09,...,0,0,0,0,0,0,0,0,0,0
16271,2018,0,1,1682.450194,1648.424105,0.638772,23,26,0.0,3.102624e+09,...,0,0,0,0,0,0,0,0,0,0
16272,2018,0,1,1675.286412,1661.668566,0.611248,31,37,0.0,3.102624e+09,...,0,0,0,0,0,0,0,0,0,0


Now, we shuffle our data so that we can split our training and testing data without bias (as it is now, the data is ordered chronologically).

In [28]:
# Shuffle every row (frac=1 samples the whole frame without replacement).
# A fixed random_state makes the row order, and therefore the train/test split
# derived from it, reproducible across kernel restarts.
shuffled = games_df.sample(frac=1, random_state=42)
shuffled

NameError: name 'labels' is not defined

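We also need to separate the input features from the output features. That is, the columns the model is supposed to predict (the labels) must be kept apart from the columns it is allowed to see.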

In [None]:
# Columns that record the outcome of a game; every other column is treated as model input.
# NOTE(review): this choice is assumed from the column names - confirm which
# outcome(s) the model should actually predict.
LABEL_COLUMNS = ['score1', 'score2', 'result1']

# `labels` was previously referenced without ever being defined (NameError on a
# fresh kernel). Derive both halves from the same shuffled frame so rows stay aligned.
labels = shuffled[LABEL_COLUMNS]
inputs = shuffled.drop(columns=LABEL_COLUMNS)

lenTotal = len(shuffled)
lenTrain = int(.9*lenTotal)
lenTest  = lenTotal - lenTrain

# The first 90% of the shuffled rows are used for training...
trainingInput  = inputs.iloc[:lenTrain].to_numpy()
trainingLabels = labels.iloc[:lenTrain].to_numpy()

# ...and the remaining rows are held out for testing. Slicing from lenTrain
# onward keeps the test rows disjoint from the training rows (taking the head
# again would re-use training rows).
testingInput  = inputs.iloc[lenTrain:].to_numpy()
testingLabels = labels.iloc[lenTrain:].to_numpy()

# Sanity checks: the split covers every row exactly once.
assert len(trainingInput) + len(testingInput) == lenTotal
assert len(testingInput) == lenTest

90% of the dataset will be used to train the model, and the last 10% for testing.