# Regression Modeling

Author: Holly Bok

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.decomposition import PCA

In [2]:
# Loading the clustered player data into 'df'.
# 'df_numeric' is a copy of 'df' with all of the numeric columns used for modeling;
# the identifier / grouping columns listed below are dropped.

df = pd.read_csv('datasets/NBAPlayersClustered.csv')
non_feature_cols = ['player_id', 'player_name', 'team', 'height', 'draft_number_group']
df_numeric = df.drop(columns=non_feature_cols)

In [3]:
# Spot check: the 19/20 salary recorded for one well-known player
df.loc[df['player_name'] == 'Stephen Curry', '19_20_salary']

123    40231758
Name: 19_20_salary, dtype: int64

In [4]:
# The target (y) is each player's 19/20 Season salary; the features (X) are every
# other column of df_numeric.
# The data is then split into training and testing sets (33% held out for testing,
# fixed random_state so the split is repeatable).

y = df_numeric['19_20_salary']
X = df_numeric.drop(columns=['19_20_salary'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=13
)

## Modeling

### LassoCV

In [5]:
# Scaling the features with a StandardScaler: the scaler is fit on the training
# set only and then applied to both the training and the testing set

ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [6]:
# Building a LassoCV model (5-fold cross-validation, up to 5000 iterations) and
# fitting it to the scaled training data

ls = LassoCV(cv=5, max_iter=5000).fit(X_train_sc, y_train)

In [7]:
# Scoring the LassoCV model on the scaled training and testing sets, then printing
# both scores

ls_train_score = ls.score(X_train_sc, y_train)
ls_test_score = ls.score(X_test_sc, y_test)
print('LassoCV training score: ', ls_train_score)
print('LassoCV testing score: ', ls_test_score)

LassoCV training score:  0.672365682486175
LassoCV testing score:  0.5730709666193523


### RidgeCV

In [8]:
# Scaling the data with a StandardScaler (fit on the training set only).
# This repeats the scaling done in the LassoCV section on the same split.

ss = StandardScaler()
ss.fit(X_train)
X_train_sc, X_test_sc = ss.transform(X_train), ss.transform(X_test)

In [9]:
# Instantiating a RidgeCV model (5-fold cross-validation, scored by R2) and fitting it
# to the scaled training data. Scores are printed in the next cell.

# NOTE: 'ridge_alphas' is built here but is not passed to RidgeCV below, so the model
# searches scikit-learn's default alpha grid rather than this one.
ridge_alphas = np.logspace(0,5,1000)
rg = RidgeCV(scoring='r2',
    cv=5)
rg.fit(X_train_sc, y_train);

# The value for alpha was changed and tested for the RidgeCV model but the default
# value resulted in the highest R2 testing score
# NOTE(review): choosing alpha by the testing score lets the test set influence the model



In [10]:
# Scoring the RidgeCV model on the scaled training and testing sets, then printing
# both scores

rg_train_score = rg.score(X_train_sc, y_train)
rg_test_score = rg.score(X_test_sc, y_test)
print('RidgeCV training score: ', rg_train_score)
print('RidgeCV testing score: ', rg_test_score)

RidgeCV training score:  0.700472017238835
RidgeCV testing score:  0.5808824670160447


### LassoCV and RidgeCV with PCA

In [11]:
# Instantiating a PCA model to reduce the scaled features to 10 principal components.
# Per the author's analysis, the first 10 PCA features are able to account for 95% of the
# variability in the data (this is not computed in this notebook; it can be checked with
# pca.explained_variance_ratio_.cumsum()).
# The PCA is fit on the training set only and then applied to the testing set.

pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_sc)
X_test_pca = pca.transform(X_test_sc)



###### TODO: double-check the meaning of the 95% figure above (e.g. confirm it with `pca.explained_variance_ratio_.cumsum()`)

In [12]:
# Fitting a LassoCV model on the PCA-transformed features, then printing its
# training and testing scores.

ls_pca = LassoCV(cv=5, max_iter=5000).fit(X_train_pca, y_train)
ls_pca_train_score = ls_pca.score(X_train_pca, y_train)
ls_pca_test_score = ls_pca.score(X_test_pca, y_test)
print('LassoCV with PCA training score: ', ls_pca_train_score)
print('LassoCV with PCA testing score: ', ls_pca_test_score)

LassoCV with PCA training score:  0.6618077313759201
LassoCV with PCA testing score:  0.5715324573631061


In [13]:
# Fitting a RidgeCV model (5-fold cross-validation, scored by R2) on the
# PCA-transformed features, then printing its training and testing scores.

rg_pca = RidgeCV(scoring='r2', cv=5).fit(X_train_pca, y_train)
rg_pca_train_score = rg_pca.score(X_train_pca, y_train)
rg_pca_test_score = rg_pca.score(X_test_pca, y_test)
print('RidgeCV with PCA training score: ', rg_pca_train_score)
print('RidgeCV with PCA testing score: ', rg_pca_test_score)

RidgeCV with PCA training score:  0.6631539836677258
RidgeCV with PCA testing score:  0.5697290540156192




### LassoCV and RidgeCV with Polynomial Features

In [14]:
# Instantiating a polynomial features object (default settings), fitting it on the
# scaled training data and transforming both the scaled X_train and X_test with it

pf = PolynomialFeatures().fit(X_train_sc)
X_train_pf = pf.transform(X_train_sc)
X_test_pf = pf.transform(X_test_sc)

In [15]:
# Fitting a LassoCV model on the Polynomial Features version of the data, then
# printing its training and testing scores.

ls_pf = LassoCV(cv=5, max_iter=5000).fit(X_train_pf, y_train)
ls_pf_train_score = ls_pf.score(X_train_pf, y_train)
ls_pf_test_score = ls_pf.score(X_test_pf, y_test)
print('LassoCV with PF training score: ', ls_pf_train_score)
print('LassoCV with PF testing score: ', ls_pf_test_score)

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


LassoCV with PF training score:  0.7701446948263703
LassoCV with PF testing score:  0.5971776723410374


In [16]:
# Fitting a RidgeCV model (5-fold cross-validation, scored by R2) on the Polynomial
# Features version of the data, then printing its training and testing scores.

rg_pf = RidgeCV(scoring='r2', cv=5).fit(X_train_pf, y_train)
rg_pf_train_score = rg_pf.score(X_train_pf, y_train)
rg_pf_test_score = rg_pf.score(X_test_pf, y_test)
print('RidgeCV with PF training score: ', rg_pf_train_score)
print('RidgeCV with PF testing score: ', rg_pf_test_score)

RidgeCV with PF training score:  0.9397494118942956
RidgeCV with PF testing score:  0.6344305160835713




### LassoCV and RidgeCV with Polynomial Features AND PCA

In [17]:
# Instantiating a PCA object and fitting it to the X_train data that has already been
# transformed using the Polynomial Features object; X_test is transformed with the same fit.
# The PCA model is set at n_components = 15.
# NOTE(review): this comment previously cited n_components = 10 and 95% of the variability,
# which does not match the code below. The share of variance explained by 15 components is
# not computed here -- confirm with pca.explained_variance_ratio_.cumsum().
# This reassigns 'pca', replacing the 10-component PCA fitted on the scaled features.

pca = PCA(n_components = 15)
X_train_pf_pca = pca.fit_transform(X_train_pf)
X_test_pf_pca = pca.transform(X_test_pf)

In [18]:
# Fitting a LassoCV model on the features produced by Polynomial Features followed
# by PCA, then printing its training and testing scores.

ls_pf_pca = LassoCV(cv=5, max_iter=5000).fit(X_train_pf_pca, y_train)
ls_pf_pca_train_score = ls_pf_pca.score(X_train_pf_pca, y_train)
ls_pf_pca_test_score = ls_pf_pca.score(X_test_pf_pca, y_test)
print('LassoCV with PF and PCA training score: ', ls_pf_pca_train_score)
print('LassoCV with PF and PCA testing score: ', ls_pf_pca_test_score)

LassoCV with PF and PCA training score:  0.7339618393085113
LassoCV with PF and PCA testing score:  0.5364411974771343


In [19]:
# Fitting a RidgeCV model (5-fold cross-validation, scored by R2) on the features
# produced by Polynomial Features followed by PCA, then printing its training and
# testing scores.

rg_pf_pca = RidgeCV(scoring='r2', cv=5).fit(X_train_pf_pca, y_train)
rg_pf_pca_train_score = rg_pf_pca.score(X_train_pf_pca, y_train)
rg_pf_pca_test_score = rg_pf_pca.score(X_test_pf_pca, y_test)
print('RidgeCV with PF and PCA training score: ', rg_pf_pca_train_score)
print('RidgeCV with PF and PCA testing score: ', rg_pf_pca_test_score)

RidgeCV with PF and PCA training score:  0.7392025012336042
RidgeCV with PF and PCA testing score:  0.5263036852871661




### Random Forest Model

In [20]:
# Instantiating a Random Forest model and grid searching over its hyperparameters with
# 5-fold cross-validation; 'ran_f' is then set to the best model found by the GridSearch.
# The unscaled features (X_train / X_test) are used for this model.
# random_state is fixed (same seed as the train/test split) so that the selected
# hyperparameters and the printed scores are reproducible when the notebook is re-run.

ran_f = RandomForestRegressor(random_state=13)

grid_params = {
    'n_estimators': [10, 100, 150],
    'max_depth': [2, 4, 6, 10, 15],
    'min_samples_split': [2, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4],
}

gs = GridSearchCV(ran_f, param_grid=grid_params, cv=5)
gs.fit(X_train, y_train)
ran_f = gs.best_estimator_

print('Random Forest Regressor train_score: ', ran_f.score(X_train, y_train))
print('Random Forest Regressor testing_score: ', ran_f.score(X_test, y_test))

Random Forest Regressor train_score:  0.901931195232552
Random Forest Regressor testing_score:  0.6265861314264569




### SVR Model

In [21]:
# Fitting a Support Vector Regressor with default hyperparameters on the unscaled
# features and printing its training and testing scores.
# (A bare expression that computed both scores and discarded the result was removed;
# the scores are only computed once, in the print calls below.)
# NOTE(review): the recorded scores for this model are negative. SVR is sensitive to the
# scale of the features and target, so scaling may be worth trying -- confirm.

svr = SVR()
svr.fit(X_train, y_train)

print('SVR train_score: ', svr.score(X_train, y_train))
print('SVR testing_score: ', svr.score(X_test, y_test))

SVR train_score:  -0.21924112631411205
SVR testing_score:  -0.25406736101280236




### Making Predictions

In [22]:
# Making predictions on the testing set using the model chosen as best_model, 'ls_pf'
# (LassoCV fit on the Polynomial Features data), and computing the residuals as
# actual salary minus predicted salary.
# NOTE(review): in the recorded outputs, RidgeCV with PF (0.634) and the Random Forest
# (0.627) have higher testing scores than 'ls_pf' (0.597) -- confirm the choice of model.

predictions = ls_pf.predict(X_test_pf)
residuals = y_test - predictions

In [23]:
# Collecting the actual test-set salaries, the predictions and the absolute residuals,
# then left-joining the full player data from 'df' on the row index.
# 'df' also contains '19_20_salary', so pandas suffixes the two copies as
# '19_20_salary_x' (from y_test) and '19_20_salary_y' (from df).
test_df = (
    y_test.to_frame()
    .assign(predicted=predictions, residuals=residuals.abs())
    .merge(df, how='left', left_index=True, right_index=True)
)

In [24]:
# Exporting the test-set predictions to 'datasets/SalaryPredictions.csv'.
# index=False leaves the DataFrame index out of the file.
test_df.to_csv('datasets/SalaryPredictions.csv', index=False)

### Isolating Coefficients

In [25]:
# Creating a new DataFrame, _coef, that pairs each polynomial feature name with the
# coefficient that the 'ls_pf' model assigned to it.
# PolynomialFeatures.get_feature_names was deprecated in scikit-learn 1.0 and removed in
# 1.2 in favour of get_feature_names_out, so the newer method is used when it exists and
# the older one is kept as a fallback for older scikit-learn versions.

if hasattr(pf, 'get_feature_names_out'):
    pf_feature_names = pf.get_feature_names_out(X_train.columns)
else:
    pf_feature_names = pf.get_feature_names(X_train.columns)

_coef = pd.DataFrame({
    'feature': pf_feature_names,
    'coef': ls_pf.coef_
})

In [26]:
# Keeping only the features whose coefficient is non-zero (i.e. above or below 0) and
# exporting them to 'datasets/Coefficients.csv' for use in analysis

nonzero_mask = _coef['coef'].ne(0)
coefficients = _coef.loc[nonzero_mask]
coefficients.to_csv('datasets/Coefficients.csv', index=False)