## Model 1
In this first model of our two-model layer, we take as input the recent historical stats of the teams in a game and predict the stats for each team in their pending matchup. 

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.linear_model import Ridge


In [3]:
# Read the cleaned game log, using the CSV's first column as the index, then
# replace that index with a fresh 0..n-1 one.
games_df = pd.read_csv("clean_games_with_opp.csv", index_col=0).reset_index(drop=True)
games_df.head()

Unnamed: 0,school,school_id,points,fumblesRecovered,rushingTDs,passingTDs,kickReturnYards,kickReturnTDs,kickReturns,kickingPoints,...,opposing_rushingTDs,opposing_points,side,thirdDownConverts,thirdDownAttempts,fourthDownConverts,fourthDownAttempts,completions,passAttempts,passCompletionPercentage
0,Navy,2426,17,0.0,2.0,0.0,64.0,0.0,4.0,5.0,...,1.0,34,A,4.0,12.0,1.0,1.0,2.0,4.0,0.5
1,UAB,5,48,1.0,4.0,2.0,64.0,0.0,2.0,12.0,...,1.0,10,A,9.0,14.0,0.0,0.0,13.0,20.0,0.65
2,UMass,113,7,0.0,0.0,1.0,95.0,0.0,4.0,1.0,...,2.0,30,A,3.0,11.0,1.0,1.0,9.0,22.0,0.409091
3,UCF,2116,24,1.0,2.0,1.0,142.0,0.0,5.0,6.0,...,1.0,26,A,5.0,13.0,2.0,2.0,12.0,22.0,0.545455
4,Houston,248,7,1.0,1.0,0.0,89.0,0.0,6.0,1.0,...,3.0,27,A,4.0,16.0,4.0,4.0,25.0,50.0,0.5


In [5]:
# Drop every column with more than NULL_THRESHOLD missing values, then drop the
# individual rows that still contain a null, and finally keep only non-object
# (numeric) columns.
NULL_THRESHOLD = 25
null_counts = games_df.isna().sum()
column_drop = null_counts[null_counts > NULL_THRESHOLD].index.tolist()

# Additionally, these columns are weird formats or already covered by other variables (ie the possession stuff)
extend_list = ['completionAttempts','totalPenaltiesYards','possessionTime','possession_minutes','possession_seconds','year','week','school','points','opposing_points']

column_drop = column_drop + extend_list

# Build the cleaned frame without in-place mutation. errors="ignore" makes the
# cell safe to re-run: on a second execution the listed columns are already
# gone, and a strict drop would raise KeyError.
games_df = (
    games_df
    .drop(columns=column_drop, errors="ignore")
    .dropna(axis=0)
    .select_dtypes(exclude=["object"])
)

In [6]:
# Column labels remaining after the cleaning step.
games_df.columns

Index(['school_id', 'fumblesRecovered', 'rushingTDs', 'passingTDs',
       'firstDowns', 'thirdDownEff', 'totalYards', 'netPassingYards',
       'yardsPerPass', 'rushingYards', 'rushingAttempts',
       'yardsPerRushAttempt', 'turnovers', 'fumblesLost', 'interceptions',
       'poss_total_sec', 'id', 'opposing_netPassingYards',
       'opposing_yardsPerPass', 'opposing_rushingYards',
       'opposing_yardsPerRushAttempt', 'opposing_passingTDs',
       'opposing_rushingTDs', 'thirdDownConverts', 'thirdDownAttempts',
       'fourthDownConverts', 'fourthDownAttempts', 'completions',
       'passAttempts', 'passCompletionPercentage'],
      dtype='object')

We initially set the historical memory equal to 12 games. This is roughly one past season's worth of games for an average team. 

In [7]:
# Number of a team's preceding games that are stacked into one input row when
# the training windows are built.
memory = 12

In [8]:
# Distinct school ids in the cleaned data, followed by how many there are.
teams = set(games_df["school_id"])
print(teams, len(teams), sep="\n")

{2, 2050, 5, 6, 2567, 8, 9, 2569, 2571, 12, 2572, 13, 16, 2065, 2579, 21, 2582, 23, 24, 25, 26, 3101, 30, 2083, 2084, 36, 38, 2598, 41, 43, 47, 48, 2097, 50, 52, 55, 57, 58, 59, 2617, 61, 62, 2110, 2619, 2623, 66, 2115, 2116, 2117, 2628, 68, 70, 2633, 2630, 2635, 2636, 77, 2638, 79, 2127, 2641, 2634, 2643, 84, 2132, 2640, 87, 2649, 2653, 2142, 2655, 96, 97, 98, 99, 93, 103, 107, 113, 2674, 2678, 119, 120, 2681, 2169, 127, 130, 2692, 2181, 135, 2184, 142, 145, 2193, 147, 149, 150, 151, 152, 153, 154, 2711, 2199, 2717, 158, 155, 160, 2210, 164, 166, 167, 2729, 2226, 2229, 2230, 183, 2747, 189, 2751, 193, 194, 195, 2754, 197, 2755, 2247, 2241, 201, 202, 204, 2771, 213, 2261, 218, 221, 222, 227, 228, 2277, 231, 233, 235, 236, 238, 239, 2287, 242, 2803, 245, 2294, 248, 249, 2296, 251, 252, 253, 254, 256, 2305, 258, 259, 2306, 2309, 257, 264, 265, 2320, 275, 276, 277, 278, 2837, 2329, 282, 2335, 290, 2341, 295, 2348, 301, 2197, 302, 304, 2198, 309, 311, 322, 2710, 324, 326, 328, 2377, 331, 3

In [9]:
# Build the supervised data set: each input row is `memory` consecutive games
# of one team flattened into a single vector, and the target is the stat row
# of the game that immediately follows that window.
X = []
y = []
for team in teams:
    # This team's games ordered by game id, with the identifier columns removed
    # so that only stat columns remain.
    team_df = (
        games_df[games_df["school_id"] == team]
        .sort_values("id")
        .drop(columns=["id", "school_id"])
    )
    team_array = team_df.to_numpy()

    # Teams with `memory` games or fewer contribute no rows.
    for i in range(memory, len(team_array)):
        window = team_array[i - memory:i]
        X.append(window.ravel())  # row-major flatten: game 1 stats, game 2 stats, ...
        y.append(team_array[i])
   

In [19]:
# Sanity check: the number of stat columns and the length of one target row
# should agree.
print(len(team_df.columns), len(y[0]), sep="\n")

28
28


We now split the data into the training and testing sets:

In [20]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the windows for testing; the fixed seed keeps the
# shuffle reproducible.
# NOTE(review): consecutive windows of one team share most of their games, so
# a random split can place overlapping windows on both sides — confirm this is
# acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

Now that we have the train and test data, we scale the input to the model. Since the outputs are the same stats, measured on the same scales as the inputs, we scale the outputs as well.

In [21]:
from sklearn.preprocessing import StandardScaler

# Standardise the inputs with statistics learned from the training split only,
# then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

# The targets get their own scaler, fitted on the training targets in the same way.
scaler2 = StandardScaler().fit(y_train)
y_train_scaled, y_test_scaled = (scaler2.transform(split) for split in (y_train, y_test))

## Training and testing models
We first try some linear models for predicting the stats in the next game.

In [22]:
from sklearn.linear_model import Ridge, ElasticNet, Lasso, LinearRegression

# Fit one ridge regressor per regularisation strength on the scaled training
# data; `fit` returns the estimator itself, so the results unpack directly.
reg25, reg50, reg100 = (
    Ridge(alpha=strength).fit(X_train_scaled, y_train_scaled)
    for strength in (0.25, 0.5, 1)
)
reg100

We now make predictions with the ridge models on the test set and compute the mean squared error. 

In [23]:
from sklearn.metrics import mean_squared_error

# Predict the scaled next-game stats for the test windows with each ridge model.
pred_scores25 = reg25.predict(X_test_scaled)
pred_scores50 = reg50.predict(X_test_scaled)
pred_scores100 = reg100.predict(X_test_scaled)

# mean_squared_error is documented as (y_true, y_pred): ground truth goes
# first. MSE is symmetric, so the printed values are unchanged, but the call
# now matches the API and stays correct if the metric is ever swapped for an
# asymmetric one.
mse25 = mean_squared_error(y_test_scaled, pred_scores25)
mse50 = mean_squared_error(y_test_scaled, pred_scores50)
mse100 = mean_squared_error(y_test_scaled, pred_scores100)

print(f"alpha: MSE are: \n 0.25:{mse25} \n 0.50:{mse50} \n 1.00:{mse100}")

alpha: MSE are: 
 0.25:0.9289504736159343 
 0.50:0.9289502575653117 
 1.00:0.9289266963929623


In [26]:
# Number of stats predicted per game (width of the prediction matrix).
print(pred_scores100.shape[1])

28


In [33]:
# Residual of the first test game, in standardised units.
print(pred_scores100[0] - y_test_scaled[0])

# The same residual mapped back to the original stat units through the target
# scaler; slicing with [:1] keeps the 2-D (1, n_stats) shape the scaler expects.
a = scaler2.inverse_transform(pred_scores100[:1])
b = scaler2.inverse_transform(y_test_scaled[:1])
print((a - b).ravel())

[ 0.54863844 -0.09488451  0.59579458  0.30320901  0.24223859  0.16659059
 -0.17382107  0.01817069  0.40507817 -0.59928104  0.98281807 -2.90289124
 -1.63475873 -2.46872195 -0.480033    0.33029835  0.11447753  1.02179322
  1.35660394 -0.2709794   1.03711741 -0.23207744 -0.99750956 -0.02292588
 -0.02292588 -0.41954126 -0.48914405  0.03969392]
[ 5.53311191e-01 -1.44025140e-01  8.17122805e-01  1.78666514e+00
  3.57553707e-02  2.01220718e+01 -1.69092247e+01  5.09998737e-02
  3.70312965e+01 -5.96431270e+00  1.75586845e+00 -3.63481143e+00
 -1.31959495e+00 -2.31521649e+00 -1.36031305e+02  3.22269812e+01
  3.18443679e-01  9.15872791e+01  2.39760829e+00 -3.62182208e-01
  1.50950005e+00 -5.66574738e-01 -2.99453941e+00 -3.21962678e-02
 -3.21962678e-02 -3.06550131e+00 -5.07732004e+00  4.76251063e-03]


In [35]:
# Fit Lasso and ElasticNet at alpha=1 on the same scaled training data as the
# ridge models.
lasso = Lasso(alpha=1)
lasso.fit(X_train_scaled, y_train_scaled)

ela_net = ElasticNet(alpha=1)
ela_net.fit(X_train_scaled, y_train_scaled)

# mean_squared_error is documented as (y_true, y_pred): ground truth goes
# first. The values are unchanged because MSE is symmetric.
lasso_mse = mean_squared_error(y_test_scaled, lasso.predict(X_test_scaled))
ela_net_mse = mean_squared_error(y_test_scaled, ela_net.predict(X_test_scaled))

print(f"MSE are \n Lasso:{lasso_mse}\n ElasticNet:{ela_net_mse}")

MSE are 
 Lasso:0.9948268539848032
 ElasticNet:0.9948268539848032


Interestingly, with some tests we found that as we increase the alpha parameter, performance goes down overall. This suggests that keeping all variables under consideration is nontrivially helping performance in the model. 

In [39]:
# Unregularised least-squares baseline, trained and evaluated on the same
# scaled splits as the penalised models.
linreg = LinearRegression().fit(X_train_scaled, y_train_scaled)
y_pred = linreg.predict(X_test_scaled)

MSE with linear regression:6.577345167430221e+16


In [50]:
# Ground truth first, prediction second, matching mean_squared_error(y_true, y_pred).
print(f"MSE with linear regression:{mean_squared_error(y_test_scaled, y_pred)}")

MSE with linear regression:3.3036381589701664e+16


In [44]:
# Per-game MSE: average the squared residuals across the stat columns of each
# test row. A single vectorised reduction replaces one mean_squared_error call
# per row; `diffs` is now a 1-D NumPy array rather than a Python list.
diffs = np.mean((y_test_scaled - y_pred) ** 2, axis=1)

In [47]:
# Largest per-game error and the index of the test row where it occurs.
diffs = np.array(diffs)
print(diffs.max(), diffs.argmax())

1.1173162019874167e+20 2979


In [49]:
# Overwrite the prediction for test row 2979 with that row's scaled ground truth.
# NOTE(review): 2979 is hard-coded (it matches the argmax printed for `diffs`),
# and after this assignment the row contributes zero error to any metric
# computed from y_pred. Confirm this is intended rather than excluding or
# investigating the row.
y_pred[2979] = y_test_scaled[2979]

In [48]:
# Show the scaled ground truth and the linear-regression prediction for test row 2979.
# NOTE(review): the recorded output appears to predate the overwrite of
# y_pred[2979]; on a top-to-bottom run both arrays would print identically — confirm.
print(y_test_scaled[2979], y_pred[2979])

[ 1.2690857   2.79245604  0.24496899  1.3854571   1.56119285  1.61318648
  0.02741945 -0.1647401   2.10228013  1.58293767  1.35604091 -0.36530029
  0.45820873 -0.88212706  0.71941563  1.26164862  0.48369687 -0.04978026
 -0.20009726  1.06533808 -0.43179546  1.801169    0.62560544 -1.22629017
 -1.22629017  0.15908047  0.16862153  0.06027031] [-2.22346090e+10 -1.41264678e+10 -1.84013017e+10  4.83212683e+08
 -1.13906433e+10 -7.96384249e+09 -7.10497952e+09 -1.55576407e+10
 -2.96183912e+09 -9.06871617e+09 -6.75760902e+09  1.07658253e+10
 -8.24833074e+08  1.50840053e+10  9.48822617e+09  3.51696128e+09
 -5.07848044e+09 -1.03656215e+10 -4.97129588e+09  1.52220943e+09
 -1.47053489e+10 -1.25158562e+10 -1.91786456e+09 -1.34233821e+10
 -1.34233821e+10  4.98713339e+09  8.35881321e+09 -5.90571124e+09]


In [40]:
# Preview the residuals (prediction minus scaled truth) of the first few test
# games only. Printing one line per test row floods the notebook output and
# buries the narrative.
N_PREVIEW = 5
for residual in y_pred[:N_PREVIEW] - y_test_scaled[:N_PREVIEW]:
    print(residual)

[ 0.54664404 -0.10221318  0.60032844  0.30468911  0.24034673  0.16720646
 -0.1732224   0.01908383  0.40501557 -0.59753454  0.98056052 -2.89773263
 -1.63054619 -2.46553882 -0.47655884  0.32643642  0.11536499  1.03283297
  1.36383231 -0.27151603  1.04503982 -0.23639561 -1.0013045  -0.02190161
 -0.02198395 -0.41830671 -0.48917267  0.04052883]
[-1.06321048e+00  1.59247359e-01 -9.84805210e-01  2.75202234e-01
 -6.33499575e-01  9.23433740e-02 -6.63224921e-01 -8.49132174e-01
  8.28369708e-01 -7.31273096e-02  9.63091434e-01  2.28526370e-01
  5.78418105e-01 -1.92677042e-01  2.66072228e-01  1.25658268e+00
  7.40836837e-01  9.16488792e-02  1.19368968e-01 -1.64682973e-01
 -3.39930414e-01 -8.86269569e-01 -6.58882398e-01  6.75571078e-04
  5.76165469e-04 -3.05771222e-01  1.75824661e-01 -8.99433243e-01]
[ 1.03438719  0.43945058  1.08694093  0.11465794  0.38007606 -0.10942679
 -0.20960627  0.20896667  0.0782498   1.49070384 -1.2942239   1.4519263
  1.09909728  0.99196944  0.6659285  -0.55761422 -0.69874