In [119]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats


import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RepeatedKFold
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor

In [75]:
from xgboost import XGBRegressor

In [2]:
# Loading in the Dataframe I prepared in Notebook 02, Data Preparation
df = pd.read_csv('C:\\Users\\12242\\103122\\NBA-Fantasy-Point-Projections\\data\\Prepared_Data.csv')

In [3]:
df.drop(['Date', 'FPTS/M', 'FPTS_Shtng'], axis=1, inplace=True)

In [5]:
df_new = df[(np.abs(stats.zscore(df['FPTS'])) < 3)]
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61921 entries, 0 to 62569
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        61921 non-null  object 
 1   Team        61921 non-null  object 
 2   Home        61921 non-null  int64  
 3   Spread      61921 non-null  int64  
 4   MP          61921 non-null  float64
 5   USG_perc    61921 non-null  float64
 6   FPTS        61921 non-null  float64
 7   PTS         61921 non-null  int64  
 8   Opp         61921 non-null  object 
 9   FPTS/M      61921 non-null  float64
 10  FPTS_Shtng  61921 non-null  float64
 11  Pace        61921 non-null  float64
 12  ORtg        61921 non-null  float64
 13  DRtg        61921 non-null  float64
 14  Off_eFG%    61921 non-null  float64
 15  DEF_eFG%    61921 non-null  float64
 16  Opp_pace    61921 non-null  float64
 17  Opp_DRtg    61921 non-null  float64
dtypes: float64(12), int64(3), object(3)
memory usage: 9.0+ MB


In [20]:
df_new.drop(['FPTS/M', 'FPTS_Shtng'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [21]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61921 entries, 0 to 62569
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      61921 non-null  object 
 1   Team      61921 non-null  object 
 2   Home      61921 non-null  int64  
 3   Spread    61921 non-null  int64  
 4   MP        61921 non-null  float64
 5   USG_perc  61921 non-null  float64
 6   FPTS      61921 non-null  float64
 7   PTS       61921 non-null  int64  
 8   Opp       61921 non-null  object 
 9   Pace      61921 non-null  float64
 10  ORtg      61921 non-null  float64
 11  DRtg      61921 non-null  float64
 12  Off_eFG%  61921 non-null  float64
 13  DEF_eFG%  61921 non-null  float64
 14  Opp_pace  61921 non-null  float64
 15  Opp_DRtg  61921 non-null  float64
dtypes: float64(10), int64(3), object(3)
memory usage: 8.0+ MB


## Train-Test Split Data for Modeling

In [43]:
y = df_new['FPTS']
X = df_new.drop(['FPTS', 'DEF_eFG%', 'DRtg'], axis=1)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Creating The pipeline

In [53]:
numeric_features = ['Spread','MP','USG_perc','Pace','Opp_pace','Opp_DRtg']
categorical_features = ['Name','Team', 'Opp', 'Home']

## The Pipeline Steps

In [54]:
# One hot encode all Categorical Features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [55]:
# Scale all the Numeric Features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [56]:
# Joining the two pipelines together
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
])

# First Simple Models
Using 3 different simple models to get baseline estimates
- Dummy regressor using median 
- Dummy Regressor using mean
- Linear regressor

In [57]:
reg_median = Pipeline([
     ('preprocessor', preprocessor),
     ('reg_median', DummyRegressor(strategy = 'median'))
])

In [58]:
reg_mean = Pipeline([
     ('preprocessor', preprocessor),
     ('reg_mean', DummyRegressor(strategy = 'mean'))
])

In [59]:
lr = Pipeline([
     ('preprocessor', preprocessor),
     ('lr', LinearRegression(normalize=True))
])

In [60]:
ridge = Pipeline([
     ('preprocessor', preprocessor),
     ('ridge', Ridge(normalize=True))
])

In [61]:
# Fit the training data to pipeline models
dummy_median = reg_median.fit(X_train, y_train)
dummy_mean = reg_mean.fit(X_train, y_train)
lm = lr.fit(X_train, y_train)


y_predict = lm.predict(X_test)
y_predict_dummy_mean = dummy_mean.predict(X_test)
y_predict_dummy_median = dummy_median.predict(X_test)

In [62]:
rm = ridge.fit(X_train, y_train)
y_predict_ridge = rm.predict(X_test)

### Error metrics to assess models

In [63]:
print("Mean squared error (dummy): {:.2f}".format(mean_squared_error(y_test, 
                                                                     y_predict_dummy_mean)))
print("Mean squared error (linear model): {:.2f}".format(mean_squared_error(y_test, y_predict_ridge)))

print("Mean squared error (ridge model): {:.2f}".format(mean_squared_error(y_test, y_predict)))

print("Mean absolute error (linear model): {:.2f}".format(mean_absolute_error(y_test, y_predict)))

print("Mean absolute error (ridge model): {:.2f}".format(mean_absolute_error(y_test, y_predict_ridge)))

print("Median absolute error (dummy): {:.2f}".format(median_absolute_error(y_test, 
                                                                    y_predict_dummy_median)))
print("Median absolute error (linear model): {:.2f}".format(median_absolute_error(y_test, y_predict)))
  
print("r2_score (dummy mean): {:.2f}".format(r2_score(y_test, y_predict_dummy_mean)))
print("r2_score (dummy median): {:.2f}".format(r2_score(y_test, y_predict_dummy_median)))
print("r2_score (linear model): {:.2f}".format(r2_score(y_test, y_predict)))
print("r2_score (ridge model): {:.2f}".format(r2_score(y_test, y_predict_ridge)))

Mean squared error (dummy): 130.89
Mean squared error (linear model): 60.08
Mean squared error (ridge model): 43.08
Mean absolute error (linear model): 5.07
Mean absolute error (ridge model): 6.15
Median absolute error (dummy): 7.75
Median absolute error (linear model): 4.06
r2_score (dummy mean): -0.00
r2_score (dummy median): -0.02
r2_score (linear model): 0.67
r2_score (ridge model): 0.54


- Linear regression model performed by far the best with a decent r2 score as well as a mean absolute error of about 5. While, Draftkings stock predictions have a mean absolute error closer to 6

In [72]:
print(cross_val_score(lr, X_train, y_train, cv=5))

[0.65635499 0.66279356 0.66318683 0.66460632 0.65051829]


 The model is not overfit with cross val scores on training data being very close to r2 of test data

In [146]:
lr_poly = Pipeline([
     ('preprocessor', preprocessor),
     ('poly', PolynomialFeatures(=)),
     ('lr', LinearRegression(normalize=True))
])

In [147]:
lm_poly = lr_poly.fit(X_train, y_train)
y_predict_final = lm_poly.predict(X_test)

TypeError: 'float' object cannot be interpreted as an integer

In [129]:
print("Mean squared error (linear model): {:.2f}".format(mean_squared_error(y_test, y_predict_final)))
print("Mean absolute error (linear model): {:.2f}".format(mean_absolute_error(y_test, y_predict_final)))
print("Median absolute error (linear model): {:.2f}".format(median_absolute_error(y_test, y_predict_final))

SyntaxError: invalid syntax (<ipython-input-129-ecdb719a7915>, line 4)

In [131]:
print("r2_score (linear model): {:.2f}".format(r2_score(y_test, y_predict_final)))

r2_score (linear model): 0.26


# More Advanced Modeling

In [79]:
# Function to get performance of models
def performance(y_true, y_predict):
    """ 
    Calculates and returns the two performance scores between 
    true and predicted values - first R-Squared, then RMSE
    """

    # Calculate the r2 score between 'y_true' and 'y_predict'
    r2 = r2_score(y_true, y_predict)

    # Calculate the root mean squared error between 'y_true' and 'y_predict'
    rmse = mean_squared_error(y_true, y_predict, squared=False)

    #Calculate the mean absolute error
    mae = mean_absolute_error(y_true, y_predict)

    
    # Return the score
    return [r2, rmse, mae]

### Let's Try a Decison Tree Regressor

In [None]:
dtr_processed = Pipeline([
     ('preprocessor', preprocessor),
     ('dtr', DecisionTreeRegressor(random_state=42))
])

### Use a Grid Search to get the best parameters

In [None]:
dtr_param_grid = [{'dtr__max_depth': [4, 5, 6, 10, 20, 40, 50,], 
         'dtr__min_samples_split': [2, 5, 10, 20, 40, 50], 
         'dtr__min_samples_leaf': [1, 3, 5, 9, 15, 25, 40]}]

In [None]:
dtr_gs = GridSearchCV(dtr_processed, dtr_param_grid, scoring='r2', cv=10)
dtr_gs.fit(X_train, y_train)

best_dtr_parameters = dtr_gs.best_params_

print('Grid Search found the following optimal parameters: ')
for param_name in sorted(rf_best_parameters.keys()):
    print('%s: %r' % (param_name, rf_best_parameters[param_name]))
    
y_pred_dtr = dtr_gs.predict(X_test)
score = performance(y_test, y_pred_dtr)
print(score)

The Decision tree model is worse than the linear regression model by both metrics

## Let's try a Random Forest Regressor

In [64]:
RF_processed = Pipeline([
     ('preprocessor', preprocessor),
     ('RF', RandomForestRegressor(random_state=42,max_depth=20, min_samples_leaf=1, min_samples_split=60, n_estimators=400))
])

### Once again use a Grid Search to get the best parameters

In [65]:
rf_param_grid = [{ 
    'n_estimators': [200,300,400,500],
    'max_features': ['sqrt','log2']}]

In [66]:
rf_gs = GridSearchCV(RF_processed, rf_param_grid, scoring='r2', cv=10)
rf_gs.fit(X_train, y_train)

best_rf_parameters = grid_clf.best_params_

print('Grid Search found the following optimal parameters: ')
for param_name in sorted(rf_best_parameters.keys()):
    print('%s: %r' % (param_name, rf_best_parameters[param_name]))

y_pred_rf = rg_gs.predict(X_test)
score = performance(y_test, y_pred_rf)
print(score)    

 Better than the decision tree model but still not as good as the linear regression model

## Lets Try a XGB Boost model

In [76]:
XGB_model2 = Pipeline([
     ('preprocessor', preprocessor),
     ('XGB', XGBRegressor())
])

In [77]:
XGB_model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Spread', 'MP', 'USG_perc',
                                                   'Pace', 'Opp_pace',
                                                   'Opp_DRtg']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Name', 'Team', 'Opp',
                                

In [80]:
y_pred_1 = XGB_model.predict(X_test)
score = performance(y_test, y_pred_1)
score

[0.6419531919730985, 6.845407127683802, 5.2516704141982]

In [81]:
print(cross_val_score(XGB_model, X_train, y_train, cv=5))

[0.63554846 0.63543662 0.6375313  0.63680606 0.62442764]


- Very similar to the linear regression model scores, but very slightly worse. However it is not overfit. So let's try running a grid search to find the optimal hyper parameters 

In [101]:
XGB_model = Pipeline([
     ('preprocessor', preprocessor),
     ('XGB', XGBRegressor())
])

In [106]:
XGB_param_grid ={
    "XGB__learning_rate": (0.05, 0.10, 0.15, 0.2),
    "XGB__max_depth": [ 3, 6, 8],
    "XGB__min_child_weight": [ 1, 3, 5],
    "XGB__subsample": [0.5, 0.7],
    "XGB__n_estimators": [100, 200],
    "XGB__gamma":[ 0.0, 0.1, 0.2],
    "XGB__colsample_bytree":[ 0.3, 0.4]}

In [107]:
XGB_gs = GridSearchCV(XGB_model, XGB_param_grid, scoring='r2', cv=5)
XGB_gs.fit(X_train, y_train)

XGB_best_parameters = XGB_gs.best_params_

print('Grid Search found the following optimal parameters: ')
for param_name in sorted(XGB_best_parameters.keys()):
    print('%s: %r' % (param_name, XGB_best_parameters[param_name]))

y_pred_XGB = XGB_gs.predict(X_test)
score = performance(y_test, y_pred_XGB)
print(score)

Grid Search found the following optimal parameters: 
XGB__colsample_bytree: 0.4
XGB__gamma: 0.0
XGB__learning_rate: 0.2
XGB__max_depth: 8
XGB__min_child_weight: 5
XGB__n_estimators: 200
XGB__subsample: 0.7
[0.6569865901997829, 6.700156221943865, 5.14442894708609]


## Final Model

In [115]:
XGB_model_final = Pipeline([
     ('preprocessor', preprocessor),
     ('XGB', XGBRegressor(learning_rate=0.3, max_depth=8, min_child_weight=7, n_estimators=350, subsample=0.9))
])

In [116]:
XGB_model_final.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Spread', 'MP', 'USG_perc',
                                                   'Pace', 'Opp_pace',
                                                   'Opp_DRtg']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Name', 'Team', 'Opp',
                                

In [117]:
y_pred_final = XGB_model_final.predict(X_test)
score = performance(y_test, y_pred_final)
print(score)

[0.6479778812726678, 6.787570536183406, 5.184198102047566]


## Saving a Final Model

In [149]:
import joblib

In [150]:
joblib.dump(lr, "lr_model.pkl") 

['lr_model.pkl']