In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import sys
import pickle
from datetime import datetime

In [2]:
# From here on, everything printed is appended to the results log instead of
# being shown in the notebook.
sys.stdout = open(file='test_results.txt', mode='a')

In [3]:
# Stamp the log with the time this run started.
run_started = datetime.now()
print(run_started)

In [4]:
# Read the merged dataset (first CSV column becomes the index) and discard the
# kickoff timestamp column in the same expression.
data = pd.read_csv('data/mergedDataset.csv', index_col=0).drop(columns=['kickoff_time'])

# Names of the object-dtype columns; shown as the cell output.
is_object = data.dtypes == object
obj_cols = data.columns[is_object]
obj_cols

Index(['season_x', 'name', 'position', 'team_x', 'opp_team_name'], dtype='object')

In [5]:
# Object-dtype columns that are left out of the feature matrix ('position' is
# kept and one-hot encoded below).
excluded = ['name','team_x', 'opp_team_name', 'season_x']
data_refined = data.loc[:, [c for c in data.columns if c not in excluded]]

# One-hot encode position and gameweek; drop_first removes one level of each.
data_refined = pd.get_dummies(data_refined, columns = ['position', 'GW'], drop_first=True)

# Replace every missing value with 0 by reassignment.
# The original looped over the NaN-containing columns and called
# `data_refined[c].fillna(0, inplace=True)`. That is chained in-place
# assignment: it acts on a temporary column object, is deprecated in pandas 2.x
# and does not modify the frame at all under Copy-on-Write. Filling the whole
# frame touches exactly the same cells, because columns without NaN are left
# unchanged.
data_refined = data_refined.fillna(0)

In [6]:
# Sanity check: columns that still contain a missing value (expected: none).
data_refined.columns[data_refined.isna().any()]

Index([], dtype='object')

In [7]:

# Hold out 20% of the rows for testing. The target is 'total_points'; every
# other column of data_refined is used as a feature.
is_feature = data_refined.columns != 'total_points'
X_train, X_test, y_train, y_test = train_test_split(
    data_refined.loc[:, is_feature],
    data_refined['total_points'],
    test_size=0.2,
    random_state=123,
)


In [8]:
# Train and test MSE collected per model, keyed by model name.
results_train, results_test = {}, {}

In [9]:
#Random Forest
# The split criterion is left at its default (squared error). The original
# passed criterion='mse', a spelling scikit-learn deprecated in 1.0 and removed
# in 1.2, where it raises. The default is the same loss on every version.
reg = RandomForestRegressor(random_state=0)
#Apply grid search for best parameters
# Keys are '<pipeline step name>__<parameter>'; make_pipeline names the step
# after the lower-cased estimator class.
params = {'randomforestregressor__n_estimators' : range(100,500,200),       # 100, 300
          'randomforestregressor__min_samples_split' : range(2,10,3)}       # 2, 5, 8
pipe = make_pipeline(reg)
# 5-fold CV over the grid; the score is the negated MSE (higher is better).
grid = GridSearchCV(pipe, param_grid = params, scoring='neg_mean_squared_error', n_jobs=-1, cv=5)
# `reg` is rebound to whatever fit() returns and is used for prediction in the
# following cell.
reg = grid.fit(X_train, y_train)


In [10]:
# Section banner in the log.
print("*" * 30, "Random Forest", "*" * 30, sep="")

In [11]:

# The search was scored with 'neg_mean_squared_error', so best_score_ is the
# negated mean cross-validated MSE; flip the sign to report an actual MSE.
print('Best MSE for Random Forest Regressor: ', -grid.best_score_)

# Report the tuned values themselves. The original printed the whole pipeline
# under "Estimators" and grid.n_splits_ (the number of CV folds) under
# "Min Samples Split".
best_params = grid.best_params_
print(f"Best Parameters for Random Forest Regressor: Estimators: {best_params['randomforestregressor__n_estimators']} and Min Samples Split: {best_params['randomforestregressor__min_samples_split']} Params: {best_params}")

# Train/test MSE of the search's predictions.
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
# mean_squared_error takes (y_true, y_pred); MSE is symmetric, so the values
# are unchanged by putting the arguments in the documented order.
tr_err = mean_squared_error(y_train, y_train_pred)
ts_err = mean_squared_error(y_test, y_test_pred)
print("Training Error: " + str(tr_err)+ " Testing Error: " + str(ts_err))
results_train['random_forest'] = tr_err
results_test['random_forest'] = ts_err

In [12]:
# Section banner in the log.
print("*" * 30, "Ridge Linear Regression", "*" * 30, sep="")

In [13]:
#Ridge Linear Regression
reg = Ridge()
#Apply grid search for best parameters
# Keys are '<pipeline step name>__<parameter>'; only alpha is actually varied,
# the solver is pinned to 'lsqr'.
params = {'ridge__alpha':[0.0001, 0.01, 0.1, 0.5, 0.9],
          'ridge__solver':['lsqr']}
pipe = make_pipeline(reg)
# 5-fold CV over the grid; the score is the negated MSE (higher is better).
grid = GridSearchCV(pipe, param_grid = params, scoring='neg_mean_squared_error', n_jobs=-1, cv=5)
# `reg` is rebound to whatever fit() returns and is used for prediction below.
reg = grid.fit(X_train, y_train)
# best_score_ is the negated mean CV MSE under this scorer; flip the sign so
# the value printed under "Best MSE" really is an MSE.
print('Best MSE: ', -grid.best_score_)
print('Best Parameters: ', grid.best_estimator_)

# Train/test MSE of the search's predictions.
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
# Arguments in the documented (y_true, y_pred) order; the value is unchanged.
tr_err = mean_squared_error(y_train, y_train_pred)
ts_err = mean_squared_error(y_test, y_test_pred)
print("Training Error: " + str(tr_err)+ " Testing Error: " + str(ts_err))
results_train['Ridge_regression'] = tr_err
results_test['Ridge_regression'] = ts_err

In [14]:
# Section banner in the log.
print("*" * 30, "LinearRegression", "*" * 30, sep="")

In [15]:

#Linear Regression
reg = LinearRegression()
#Apply grid search for best parameters
# Empty grid: nothing is tuned, the search is used only to obtain a 5-fold
# cross-validated score through the same code path as the other models.
params = {}
pipe = make_pipeline(reg)
grid = GridSearchCV(pipe, param_grid = params, scoring='neg_mean_squared_error', n_jobs=-1,  cv=5)
# `reg` is rebound to whatever fit() returns and is used for prediction below.
reg = grid.fit(X_train, y_train)
# best_score_ is the negated mean CV MSE under this scorer; flip the sign so
# the value printed under "Best MSE" really is an MSE.
print('Best MSE: ', -grid.best_score_)
print('Best Parameters: ', grid.best_estimator_)

# Train/test MSE of the search's predictions.
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
# Arguments in the documented (y_true, y_pred) order; the value is unchanged.
tr_err = mean_squared_error(y_train, y_train_pred)
ts_err = mean_squared_error(y_test, y_test_pred)
print("Training Error: " + str(tr_err)+ " Testing Error: " + str(ts_err))
results_train['Linear_regression'] = tr_err
results_test['Linear_regression'] = ts_err

In [16]:
# Section banner in the log.
print("*" * 30, "Gradient Boosting", "*" * 30, sep="")

In [17]:
#GBR
# Seeded like the random forest above so that repeated runs give the same model.
reg = GradientBoostingRegressor(random_state=0)
# Keys are '<pipeline step name>__<parameter>'.
params = {'gradientboostingregressor__n_estimators' : range(100,300,50),     # 100, 150, 200, 250
          'gradientboostingregressor__min_samples_split' : range(2,10,2)}    # 2, 4, 6, 8
pipe = make_pipeline(reg)
# 5-fold CV over the grid; the score is the negated MSE (higher is better).
grid = GridSearchCV(pipe, param_grid = params, scoring='neg_mean_squared_error', n_jobs=-1,cv=5)
# `reg` is rebound to whatever fit() returns and is used for prediction below.
reg = grid.fit(X_train, y_train)
# best_score_ is the negated mean CV MSE under this scorer; flip the sign so
# the value printed under "Best MSE" really is an MSE.
print('Best MSE: ', -grid.best_score_)
print('Best Parameters: ', grid.best_estimator_)

# Train/test MSE of the search's predictions.
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
# Arguments in the documented (y_true, y_pred) order; the value is unchanged.
tr_err = mean_squared_error(y_train, y_train_pred)
ts_err = mean_squared_error(y_test, y_test_pred)
print("Training Error: " + str(tr_err)+ " Testing Error: " + str(ts_err))
# Record the errors like every other model does; the original omitted this, so
# gradient boosting was missing from the results dictionaries.
results_train['Gradient_boosting'] = tr_err
results_test['Gradient_boosting'] = ts_err

In [18]:
# Section banner in the log.
print("*" * 30, "LASSO Linear", "*" * 30, sep="")

In [19]:
#Lasso Linear Regression
# The recorded run of this cell emitted a warning from the coordinate-descent
# solver (presumably a ConvergenceWarning - confirm on re-run), so the solver is
# given more iterations than its default.
reg = linear_model.Lasso(max_iter=10000)
#Apply grid search for best parameters
# Keys are '<pipeline step name>__<parameter>'.
params = {'lasso__alpha':[0.001, 0.01, 0.1, 0.5, 0.9]}
pipe = make_pipeline(reg)
# 5-fold CV over the grid; the score is the negated MSE (higher is better).
grid = GridSearchCV(pipe, param_grid = params, scoring='neg_mean_squared_error', n_jobs=-1, cv=5)
# `reg` is rebound to whatever fit() returns and is used for prediction below.
reg = grid.fit(X_train, y_train)
# best_score_ is the negated mean CV MSE under this scorer; flip the sign so
# the value printed under "Best MSE" really is an MSE.
print('Best MSE: ', -grid.best_score_)
print('Best Parameters: ', grid.best_estimator_)

# Train/test MSE of the search's predictions.
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
# Arguments in the documented (y_true, y_pred) order; the value is unchanged.
tr_err = mean_squared_error(y_train, y_train_pred)
ts_err = mean_squared_error(y_test, y_test_pred)
print("Training Error: " + str(tr_err)+ " Testing Error: " + str(ts_err))
results_train['Lasso_regression'] = tr_err
results_test['Lasso_regression'] = ts_err

  model = cd_fast.enet_coordinate_descent(


In [20]:
# Stamp the log with the time this run finished.
run_finished = datetime.now()
print(run_finished)

In [None]:
# Close the results log and stop routing prints to it.
# The original only closed sys.stdout, leaving it bound to a closed file so any
# later print raised "I/O operation on closed file". The stream is swapped back
# before the log is closed. The guard keeps a re-run of this cell from closing
# the interpreter's own stream.
# NOTE(review): sys.__stdout__ is the interpreter's original stream; in a
# notebook kernel that may not be the stream that was active before the
# redirect - confirm this is acceptable, or save the previous sys.stdout when
# redirecting.
if sys.stdout is not sys.__stdout__:
    log_stream = sys.stdout
    sys.stdout = sys.__stdout__
    log_stream.close()