In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from vardata import years, team_abbr, columns, teams_idx
from results_processor import *

In [16]:
# read in data csv to df
df = pd.read_csv('stats/final_data.csv', index_col=[0])

In [17]:
# preview the data to ensure it was read correctly
print('Dimension of dataset= ', df.shape)
df.head() # To get first n rows from the dataset default value of n is 5

Dimension of dataset=  (3417, 75)


Unnamed: 0,home_team,away_team,ht_points_for,ht_yards,ht_plays,ht_yards_per_play,ht_turnovers,ht_fumbles_lost,ht_first_downs,ht_completions,...,at_4D_conversions,at_4D_conversion_percentage,at_red_zone_attempts,at_red_zone_tds,at_red_zone_td_percentage,ht_score,at_score,total_points,ht_pts_diff,ht_vegas_line
0,nor,min,366.0,5481.0,1022.0,5.35,28.5,10.0,313.5,391.5,...,11.5,56.15,45.5,20.0,43.95,14,9,23,5,-5.0
1,pit,atl,331.5,5422.0,975.0,5.6,24.5,9.0,289.5,331.0,...,10.5,67.9,49.0,25.5,50.15,15,9,24,6,1.5
2,buf,mia,308.0,4913.5,971.0,5.05,29.0,13.0,263.5,292.0,...,5.0,50.0,44.0,25.5,57.0,10,15,25,-5,3.0
3,chi,det,351.5,5064.0,970.5,5.2,30.0,12.5,291.5,303.0,...,8.0,54.35,43.5,24.5,56.0,19,14,33,5,-6.5
4,nwe,cin,456.5,5566.0,971.5,5.75,18.0,7.5,312.5,328.5,...,12.5,64.2,52.5,27.0,51.55,38,24,62,14,-5.5


In [18]:
X = df.values[:, 2:70]  # get input values from first two columns
y = df.values[:, 73]  # get output values from last coulmn
m = len(y) # number of training examples

In [19]:
# preview the data
print('Total no of samples (m) = %s \n' %(m))

# Show only first 5 records
for i in range(5):
    print('X =', X[i, ], ', y =', y[i])

Total no of samples (m) = 3417 

X = [366.0 5481.0 1022.0 5.35 28.5 10.0 313.5 391.5 595.0 3904.0 29.0 18.5
 6.25 207.0 398.5 1577.0 9.5 3.95 90.5 94.0 835.0 16.0 179.0 33.65 14.6
 294.0 4969.5 965.0 5.15 31.0 13.5 281.0 305.5 499.5 3100.0 13.5 17.5
 5.800000000000001 154.5 431.0 1869.5 14.5 4.35 99.0 94.0 735.0 27.5 181.0
 28.35 15.85 215.0 96.5 44.8 10.5 5.0 47.75 53.5 28.5 53.65 204.0 70.5
 34.55 20.0 11.5 56.150000000000006 45.5 20.0 43.95] , y = 5
X = [331.5 5422.0 975.0 5.6 24.5 9.0 289.5 331.0 519.5 3613.0 22.5 15.5 6.5
 179.0 418.5 1809.0 12.0 4.35 96.5 108.0 953.0 14.0 179.5 31.5 12.8 323.0
 4943.5 1035.5 4.75 26.0 11.0 312.5 362.0 585.0 3496.0 21.5 15.0 5.6 191.0
 415.0 1447.5 9.5 3.4 86.0 72.0 657.0 35.5 178.5 32.5 13.15 209.5 86.5
 41.2 11.0 5.5 46.45 46.0 23.5 51.4 224.5 91.0 40.1 15.5 10.5 67.9 49.0
 25.5 50.15] , y = 6
X = [308.0 4913.5 971.0 5.05 29.0 13.0 263.5 292.0 510.5 3253.0 23.0 16.0 5.95
 166.0 424.0 1660.5 7.0 3.95 83.0 83.5 681.5 14.5 181.5 28.4 14.55 349.0
 5

In [20]:
# split data into train and test sets
# 10 years of data for training, 3 years of data for testing
X_train = X[:2560]
X_test = X[2560:]
y_train = y[:2560]
y_test = y[2560:]

In [21]:
X_train = X
y_train = y

In [22]:
# create linear regression model with huber loss
model_r = linear_model.HuberRegressor(epsilon = 3, max_iter = 2000)

In [23]:
# define scaler and transform input data
scaler = preprocessing.MinMaxScaler().fit(X_train)
X_scaled_train = scaler.transform(X_train)
X_scaled_test = scaler.transform(X_test)

In [24]:
# fit model to training data
model_r.fit(X_scaled_train,y_train)

In [25]:
# get results of model on test data
results = np.array([])
for each in X_scaled_test:
    outcome = model_r.predict([each])
    results = np.append(results, [outcome])

In [26]:
results_df = pd.DataFrame(columns = ['ht_pts_diff', 'ht_vegas_line', 'model_pred'])
results_df['ht_pts_diff'] = df['ht_pts_diff'][2560:]
results_df['ht_vegas_line'] = df['ht_vegas_line'][2560:]
results_df = results_df.reset_index(drop = True)
results_df['model_pred'] = results.tolist()

In [27]:
# get results of model on training data
train_results = np.array([])
for each in X_scaled_train:
    outcome = model_r.predict([each])
    train_results = np.append(train_results, [outcome])

In [28]:
train_results_df = pd.DataFrame(columns = ['ht_pts_diff', 'ht_vegas_line', 'model_pred'])
train_results_df['ht_pts_diff'] = df['ht_pts_diff'][:2560]
train_results_df['ht_vegas_line'] = df['ht_vegas_line'][:2560]
train_results_df = train_results_df.reset_index(drop = True)
train_results_df['model_pred'] = train_results.tolist()

ValueError: Length of values (3417) does not match length of index (2560)

In [14]:
results_df.to_csv('results/lin_reg_huber_loss_results.csv')

*************************************************************
**The following are the results of the model on Training Data:**
*************************************************************

In [15]:
process_results(train_results_df)

Total Number of Games:
 2560
Winning Bets Count:
 1567
No Bet Placed Count:
 103
Winning Percentage:
 0.6378
______________________________________
Winner Correctly Predicted Count:
 1834
Accuracy in Classifaction:
 0.7164
______________________________________
Mean Squared Error of Results:
 141.7075
Root Mean Squared Error of Results:
 11.9041


*************************************************************
**The following are the results of the model on Testing Data:**
*************************************************************

In [16]:
process_results(results_df)

Total Number of Games:
 799
Winning Bets Count:
 487
No Bet Placed Count:
 28
Winning Percentage:
 0.6316
______________________________________
Winner Correctly Predicted Count:
 552
Accuracy in Classifaction:
 0.6909
______________________________________
Mean Squared Error of Results:
 139.6670
Root Mean Squared Error of Results:
 11.8181


In [16]:
year = '2023'
week = '9'

In [17]:
df_wkly = pd.read_csv('stats_' + year + '_week_' + week + '.csv', index_col=[0])
final_wkly = pd.read_csv('games_' + year + '_week_' + week +'.csv', index_col=[0])
model_lines = np.array([])

In [18]:
df_wkly.head()

Unnamed: 0,home_team,away_team,ht_points_for,ht_yards,ht_plays,ht_yards_per_play,ht_turnovers,ht_fumbles_lost,ht_first_downs,ht_completions,...,ht_red_zone_td_percentage,at_3D_attempts,at_3D_conversions,at_3D_conversion_percentage,at_4D_attempts,at_4D_conversions,at_4D_conversion_percentage,at_red_zone_attempts,at_red_zone_tds,at_red_zone_td_percentage
0,kan,den,154.5,2079.0,327.5,6.4,6.5,2.0,117.0,122.5,...,62.55,60.5,22.5,37.35,7.5,4.0,58.35,16.5,8.5,51.15
1,oti,rav,81.5,1400.0,315.0,4.5,5.5,1.0,86.0,105.0,...,30.15,68.0,28.5,41.8,5.0,2.5,45.85,17.5,9.5,53.75
2,mia,car,162.5,2098.5,303.0,6.9,6.0,1.5,112.0,106.5,...,72.3,70.0,29.0,41.4,7.0,3.0,41.65,16.0,9.5,58.5
3,jax,clt,110.0,1850.5,352.0,5.25,7.0,4.5,107.5,125.0,...,51.25,66.5,26.5,39.85,8.5,2.5,29.85,16.5,9.0,54.6
4,chi,min,118.5,1692.0,322.5,5.3,5.5,2.5,98.0,115.0,...,63.15,64.0,31.0,47.85,7.5,5.0,61.35,17.0,11.0,63.9


In [19]:
X = df_wkly.values[:, 2:]
X_scaled = scaler.transform(X)
for each in X_scaled:
    line = model_r.predict([each])
    model_lines = np.append(model_lines, [line])

In [20]:
final_wkly['ht_point_differential'] = model_lines
final_wkly.to_csv('games_'+ year + '_week_' + week +'.csv')