In [5]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, Ridge

from sklearn.linear_model import LassoCV, Lasso
import numpy as np
from sklearn.datasets import make_regression

from csv_reader import *
from csv_setup import *

bike_sharing_df = get_bike_sharing_df_clean()

In [6]:
# Train and test data preperation
# We split the data first, then we take the log of the cnt, which represents the number of bikes on a given day


# Count variable that we are going to be predicting
Y = bike_sharing_df.cnt
# Features that we are going to be fitting
X = bike_sharing_df.loc[:].drop(columns=["cnt"])

# Split the data
X_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.33, random_state = 1)

# log of each of the cnt
# this will normalize the variable that we are trying to predict
y_train_log = y_train.apply(lambda x:np.log1p(x))
y_test_log = y_test.apply(lambda x:np.log1p(x))

# y_train_log = y_train
# y_test_log = y_test

# y_test = y_test.reset_index().drop('index',axis = 1)
# y_train = y_train.reset_index().drop('index',axis = 1)

parameters = {
         'max_iter':[5000],        
         'alpha':np.logspace(0, 9, 100),
         'solver':['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    }


In [20]:
def LassoLog(x_train, y_train):
    # Here we are cross validating as well as fitting the model
    # We are using the neg_mean_squared_log_error to find the best alpha
    gs = GridSearchCV(Lasso(),refit=True, param_grid = parameters, scoring='neg_mean_squared_log_error', cv=5)
    gs.fit(X_train,y_train_log)

    return gs

In [21]:
# This function uses the original count values
# The MSE will be extremely high

def LassoNormal(x_train, y_train):
    # 0-9 to show all iterations that are done
    # Now we are using just the neg_mean_squared_error to find the best coeff
    gs = GridSearchCV(Lasso(),refit=True, param_grid = parameters, scoring='neg_mean_squared_error', cv=5)
    gs.fit(X_train,y_train)
    return gs

In [22]:
gsLassoLog = LassoLog(X_train, y_train_log)
gsLasso = LassoNormal(X_train, y_train)

ValueError: Invalid parameter solver for estimator Lasso(max_iter=5000). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
coefs = []

for a in parametersLog['alpha']:
        pipe.set_params(lasso__alpha=a)
        pipe.fit(X_train, y_train_log)
        coefs.append(pipe.named_steps['lasso'].coef_)

plt.figure()
ax = plt.gca()
ax.plot(parameters['alpha'], coefs)
ax.set_xscale('log')

plt.axis('tight')
plt.xlabel('Alpha')
plt.ylabel('Coefficients');

In [None]:
coefs = []


for a in parametersNormal['alpha']:
        pipe.set_params(lasso__alpha=a)
        pipe.fit(X_train, y_train)
        coefs.append(pipe.named_steps['lasso'].coef_)

plt.figure().clear()
ax = plt.gca()
ax.plot(parameters['alpha'], coefs)
ax.set_xscale('log')

plt.axis('tight')
plt.xlabel('Alpha')
plt.ylabel('Coefficients');

In [None]:
print("Best alpha Log: " + str(gsLassoLog.best_params_['alpha']))
print("Best coeff Log: " + str(gsLassoLog.best_estimator_.coef_))

print("\nBest alpha Normal: " + str(gsLasso.best_params_['alpha']))
print("Best coeff Normal: " + str(gsLasso.best_estimator_.coef_))



In [None]:
# Average of Y's
np.mean(y_train_log)

In [None]:
print('\nCoefficients  [ridge] Log')

yvalues1 = gsLassoLog.best_estimator_.coef_
for i in range(0,10):
    print('   Feature %f:  %.2f'%(i,\
                    best_coef_lasso[i]))

print('\nCoefficients  [ridge] Normal')

yvalues2 = gsLasso.best_estimator_.coef_
for i in range(0,10):
    print('   Feature %f:  %.2f'%(i,\
                    best_coef_lasso[i]))
    
xvalues = [1,2,3,4,5,6,7,8,9,10]
   

In [None]:
# This is a plot showing the difference between the coefficient values.
# The largest difference being between the value of the coefficients for features 5 and 6

plt.figure()

plt.scatter(xvalues,yvalues1,color="red", label="Lasso Log")
plt.scatter(xvalues,yvalues2,color="blue", label="Lasso Normal")
ax = plt.gca()

ax.legend(("Lasso Log", "Lasso Normal"))
plt.xlabel('Feature #')
plt.ylabel('Coefficient Value');
plt.show()

In [None]:
y_pred = gsLassoLog.predict(x_test)

mse_test_using_lasso = mean_squared_error(y_test_log,y_pred)
print('\nTest set MSE using Log Lasso coefficients: %.2f'%mse_test_using_lasso)
print('Test set RMLSE using Log Lasso coefficients %.2f'%rmsle(y_test_log, y_pred))


# One reason for the MSE being unreasonably high here is the high variance in the data
# Compared to that of the log transformed MSE of the Log Ridge
y_pred = gslasso.predict(x_test)
mse_test_using_lasso = mean_squared_error(y_test,y_pred)
print('\nTest set MSE using Normal Lasso coefficients: %.2f'%mse_test_using_lasso)
print('Once again, we cannot use the Normal Lasso coefficients to calculate RMLSE because it predicts negative numbers')

In [None]:
lm = gsLassoLog

y_train_pred = lm.predict(X_train)
y_test_pred = lm.predict(x_test)


print('R squared statistic for Log Ridge')
print('Train Score: ')
print(r2_score(y_train_log, y_train_pred))

print('\nTest Score: ')
print(r2_score(y_test_log, y_test_pred))

In [None]:
lm = gsLasso

y_train_pred = lm.predict(X_train)
y_test_pred = lm.predict(x_test)

print('R squared statistic for Normal Ridge')

print('Train Score: ')
print(r2_score(y_train, y_train_pred))

print('\nTest Score: ')
print(r2_score(y_test, y_test_pred))