# Major Leagues

In [1]:
import numpy as np
import pandas as pd

## Exploratory Data Analysis and Feature Engineering

In [2]:
games_df = pd.read_csv('../data/nfl_games.csv')
games_df

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,score2,result1
0,1920-09-26,1920,0,0,RII,STP,1503.947000,1300.000000,0.824651,48,0,1.0
1,1920-10-03,1920,0,0,AKR,WHE,1503.420000,1300.000000,0.824212,43,0,1.0
2,1920-10-03,1920,0,0,RCH,ABU,1503.420000,1300.000000,0.824212,10,0,1.0
3,1920-10-03,1920,0,0,DAY,COL,1493.002000,1504.908000,0.575819,14,0,1.0
4,1920-10-03,1920,0,0,RII,MUN,1516.108000,1478.004000,0.644171,45,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16269,2019-01-13,2018,0,1,NE,LAC,1640.171960,1647.624483,0.582068,41,28,1.0
16270,2019-01-13,2018,0,1,NO,PHI,1669.105861,1633.114673,0.641378,20,14,1.0
16271,2019-01-20,2018,0,1,NO,LAR,1682.450194,1648.424105,0.638772,23,26,0.0
16272,2019-01-20,2018,0,1,KC,NE,1675.286412,1661.668566,0.611248,31,37,0.0


The date here is formatted as a string. Parsing it into a standard Python date object will make it convenient to analyze, extract additional features, and further reformat.

In [3]:
import datetime

def parse_date(s: str):
    ymd = s.strip().split(sep='-')
    if len(ymd) != 3:
        return None
    else:
        [y, m, d] = ymd
        return datetime.date(int(y), int(m), int(d))
        

games_df['date']= games_df['date'].apply(parse_date)
games_df

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,score2,result1
0,1920-09-26,1920,0,0,RII,STP,1503.947000,1300.000000,0.824651,48,0,1.0
1,1920-10-03,1920,0,0,AKR,WHE,1503.420000,1300.000000,0.824212,43,0,1.0
2,1920-10-03,1920,0,0,RCH,ABU,1503.420000,1300.000000,0.824212,10,0,1.0
3,1920-10-03,1920,0,0,DAY,COL,1493.002000,1504.908000,0.575819,14,0,1.0
4,1920-10-03,1920,0,0,RII,MUN,1516.108000,1478.004000,0.644171,45,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16269,2019-01-13,2018,0,1,NE,LAC,1640.171960,1647.624483,0.582068,41,28,1.0
16270,2019-01-13,2018,0,1,NO,PHI,1669.105861,1633.114673,0.641378,20,14,1.0
16271,2019-01-20,2018,0,1,NO,LAR,1682.450194,1648.424105,0.638772,23,26,0.0
16272,2019-01-20,2018,0,1,KC,NE,1675.286412,1661.668566,0.611248,31,37,0.0


While we have many opportunities with the date feature, I'd like to start by reducing it into a numerical format. Normally, I would choose the "unix time" format for this use case. However, unix time cannot represent a date preceding 1970. So, instead, I will simply count the seconds since the earliest date which appears in the dataset.

In [4]:
first_game = min(games_df['date'])

games_df['seconds'] = games_df['date'].apply(lambda d: (d - first_game).total_seconds())
games_df

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,score2,result1,seconds
0,1920-09-26,1920,0,0,RII,STP,1503.947000,1300.000000,0.824651,48,0,1.0,0.000000e+00
1,1920-10-03,1920,0,0,AKR,WHE,1503.420000,1300.000000,0.824212,43,0,1.0,6.048000e+05
2,1920-10-03,1920,0,0,RCH,ABU,1503.420000,1300.000000,0.824212,10,0,1.0,6.048000e+05
3,1920-10-03,1920,0,0,DAY,COL,1493.002000,1504.908000,0.575819,14,0,1.0,6.048000e+05
4,1920-10-03,1920,0,0,RII,MUN,1516.108000,1478.004000,0.644171,45,0,1.0,6.048000e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16269,2019-01-13,2018,0,1,NE,LAC,1640.171960,1647.624483,0.582068,41,28,1.0,3.102019e+09
16270,2019-01-13,2018,0,1,NO,PHI,1669.105861,1633.114673,0.641378,20,14,1.0,3.102019e+09
16271,2019-01-20,2018,0,1,NO,LAR,1682.450194,1648.424105,0.638772,23,26,0.0,3.102624e+09
16272,2019-01-20,2018,0,1,KC,NE,1675.286412,1661.668566,0.611248,31,37,0.0,3.102624e+09


## Data Preprocessing

Now, we need to remove or transform the columns which the modeling algorithm can't digest - i.e. the nonnumerical values. First, we need to one-hot encode the two team labels. Both features have the same set of values, so we differentiate the newly introduced columns by prepending which team they represent.

In [5]:
team1 = pd.get_dummies(games_df['team1'])
team2 = pd.get_dummies(games_df['team2'])

prepend = lambda x: lambda y: x + y
listmap = lambda f, l: list(map(f, l))

team1_labels = listmap(prepend("team1_"), team1.columns)
team2_labels = listmap(prepend("team2_"), team2.columns)

games_df[team1_labels] = team1
games_df[team2_labels] = team2
games_df

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,...,team2_THO,team2_TLM,team2_TOL,team2_TON,team2_TOR,team2_UTI,team2_WBU,team2_WHE,team2_WJA,team2_WSH
0,1920-09-26,1920,0,0,RII,STP,1503.947000,1300.000000,0.824651,48,...,0,0,0,0,0,0,0,0,0,0
1,1920-10-03,1920,0,0,AKR,WHE,1503.420000,1300.000000,0.824212,43,...,0,0,0,0,0,0,0,1,0,0
2,1920-10-03,1920,0,0,RCH,ABU,1503.420000,1300.000000,0.824212,10,...,0,0,0,0,0,0,0,0,0,0
3,1920-10-03,1920,0,0,DAY,COL,1493.002000,1504.908000,0.575819,14,...,0,0,0,0,0,0,0,0,0,0
4,1920-10-03,1920,0,0,RII,MUN,1516.108000,1478.004000,0.644171,45,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16269,2019-01-13,2018,0,1,NE,LAC,1640.171960,1647.624483,0.582068,41,...,0,0,0,0,0,0,0,0,0,0
16270,2019-01-13,2018,0,1,NO,PHI,1669.105861,1633.114673,0.641378,20,...,0,0,0,0,0,0,0,0,0,0
16271,2019-01-20,2018,0,1,NO,LAR,1682.450194,1648.424105,0.638772,23,...,0,0,0,0,0,0,0,0,0,0
16272,2019-01-20,2018,0,1,KC,NE,1675.286412,1661.668566,0.611248,31,...,0,0,0,0,0,0,0,0,0,0


The season feature could also use some work. Currently, it ranges from 1920 to 2018. Starting at 0 tends to achieve better results (or, at least, it certainly won't hurt).

In [6]:
games_df['season'] = games_df['season'].apply(lambda x: x - 1920)
games_df

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,...,team2_THO,team2_TLM,team2_TOL,team2_TON,team2_TOR,team2_UTI,team2_WBU,team2_WHE,team2_WJA,team2_WSH
0,1920-09-26,0,0,0,RII,STP,1503.947000,1300.000000,0.824651,48,...,0,0,0,0,0,0,0,0,0,0
1,1920-10-03,0,0,0,AKR,WHE,1503.420000,1300.000000,0.824212,43,...,0,0,0,0,0,0,0,1,0,0
2,1920-10-03,0,0,0,RCH,ABU,1503.420000,1300.000000,0.824212,10,...,0,0,0,0,0,0,0,0,0,0
3,1920-10-03,0,0,0,DAY,COL,1493.002000,1504.908000,0.575819,14,...,0,0,0,0,0,0,0,0,0,0
4,1920-10-03,0,0,0,RII,MUN,1516.108000,1478.004000,0.644171,45,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16269,2019-01-13,98,0,1,NE,LAC,1640.171960,1647.624483,0.582068,41,...,0,0,0,0,0,0,0,0,0,0
16270,2019-01-13,98,0,1,NO,PHI,1669.105861,1633.114673,0.641378,20,...,0,0,0,0,0,0,0,0,0,0
16271,2019-01-20,98,0,1,NO,LAR,1682.450194,1648.424105,0.638772,23,...,0,0,0,0,0,0,0,0,0,0
16272,2019-01-20,98,0,1,KC,NE,1675.286412,1661.668566,0.611248,31,...,0,0,0,0,0,0,0,0,0,0


Now, we delete the unused columns.

In [7]:
del games_df['date']
del games_df['team1']
del games_df['team2']
games_df

Unnamed: 0,season,neutral,playoff,elo1,elo2,elo_prob1,score1,score2,result1,seconds,...,team2_THO,team2_TLM,team2_TOL,team2_TON,team2_TOR,team2_UTI,team2_WBU,team2_WHE,team2_WJA,team2_WSH
0,0,0,0,1503.947000,1300.000000,0.824651,48,0,1.0,0.000000e+00,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1503.420000,1300.000000,0.824212,43,0,1.0,6.048000e+05,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,1503.420000,1300.000000,0.824212,10,0,1.0,6.048000e+05,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1493.002000,1504.908000,0.575819,14,0,1.0,6.048000e+05,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1516.108000,1478.004000,0.644171,45,0,1.0,6.048000e+05,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16269,98,0,1,1640.171960,1647.624483,0.582068,41,28,1.0,3.102019e+09,...,0,0,0,0,0,0,0,0,0,0
16270,98,0,1,1669.105861,1633.114673,0.641378,20,14,1.0,3.102019e+09,...,0,0,0,0,0,0,0,0,0,0
16271,98,0,1,1682.450194,1648.424105,0.638772,23,26,0.0,3.102624e+09,...,0,0,0,0,0,0,0,0,0,0
16272,98,0,1,1675.286412,1661.668566,0.611248,31,37,0.0,3.102624e+09,...,0,0,0,0,0,0,0,0,0,0


Next, we shuffle our data so that we can split our training and testing data without bias (as it is now, the data is ordered chronologically).

In [8]:
shuffled = games_df.sample(frac=1)
shuffled

Unnamed: 0,season,neutral,playoff,elo1,elo2,elo_prob1,score1,score2,result1,seconds,...,team2_THO,team2_TLM,team2_TOL,team2_TON,team2_TOR,team2_UTI,team2_WBU,team2_WHE,team2_WJA,team2_WSH
13682,89,0,0,1580.026,1553.606,0.628609,21,14,1.0,2.810506e+09,...,0,0,0,0,0,0,0,0,0,0
13432,88,0,0,1557.537,1618.557,0.505727,34,14,1.0,2.779056e+09,...,0,0,0,0,0,0,0,0,0,0
2155,29,0,0,1670.047,1599.119,0.686211,52,21,1.0,9.217152e+08,...,0,0,0,0,0,0,0,0,0,0
8341,68,0,0,1355.881,1442.656,0.468704,24,30,0.0,2.145226e+09,...,0,0,0,0,0,0,0,0,0,0
4072,47,0,0,1457.086,1432.270,0.626451,10,3,1.0,1.485994e+09,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,5,0,0,1461.288,1534.960,0.487523,3,6,0.0,1.630368e+08,...,0,0,0,0,0,0,0,0,0,0
13312,87,0,0,1516.037,1426.162,0.709206,23,31,0.0,2.753654e+09,...,0,0,0,0,0,0,0,0,0,0
11693,81,0,0,1364.174,1694.779,0.178148,32,38,0.0,2.563747e+09,...,0,0,0,0,0,0,0,0,0,0
10565,77,0,0,1506.227,1650.892,0.387321,20,23,0.0,2.432506e+09,...,0,0,0,0,0,0,0,0,0,0


We also need to separate the output features from the input features; the output being the scores of each team, and input being everything else.

In [9]:
labels = games_df[['score1','score2']]
del games_df['score1']
del games_df['score2']

Finally, we split the training and test data. 90% of the dataset will be used to train the model, and the last 10% to test.

In [10]:
len_total = len(shuffled)
len_train = int(.9 * len_total)
len_test  = len_total - len_train

train_input  = games_df.head(len_train).to_numpy()
train_labels = labels.head(len_train).to_numpy()

test_input  = games_df.tail(len_test).to_numpy()
test_labels = labels.tail(len_test).to_numpy()

# Modeling
Since I was so pleased with my results in the second assignment, in which I applied a clustering neural network to the Shakespeare dataset, here I will apply a regression neural network.

In [11]:
import tensorflow as tf
from tensorflow import keras

In [23]:
len_hidden = len(games_df.columns)

model = tf.keras.Sequential([
    keras.layers.Dense(len_hidden, activation='relu'),
    keras.layers.Dense(len_hidden, activation='relu'),
    keras.layers.Dense(len_hidden, activation='relu'),
    keras.layers.Dense(len_hidden, activation='relu'),
    keras.layers.Dense(2)
])

model.compile(loss = 'mse',
              optimizer = tf.keras.optimizers.RMSprop(0.001),
              metrics=['mae','mse']
             )

In [24]:
model.fit(train_input, train_labels, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f534536b2e8>

In [25]:
model.evaluate(test_input, test_labels)



[112.77937294400002, 8.365741, 112.779366]

## Conclusion
The neural network achieved admirable accuracy, averaging just 8.26 error in its predictions over the test data. Looking back to the training history, it appears that the mean absolute and square error both plateued around 35 epochs. The fact that both were even better on the testing data than the training data gives me confidence that the neural network did not overfit.