In [5]:
# import all depedencies first cell
import numpy as np
import pandas as pd
from sklearn import linear_model, metrics, cross_validation

In [6]:
# define functions in one cell, call when you need
def read_csv(path):
    return pd.read_csv(path)

In [7]:
path = '../../assets/datasets/bikeshare.csv'
bikeshare = read_csv(path)

In [8]:
# get the data dictionary of bikeshare to see possible feature candidates
bikeshare.head(2)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40


In [16]:
# assume we want 2 features, 'temp' and 'hum' and weathersit, because
# weathersit is categorical we must make dummy variables
# create a new dataframe with only features temp and hum, 
# and all dummy variables required for weathersit (hint n - 1 classes)
# hint (pd.get_dummies(df_column)) will transform your variable 
# into n dummy classes
# hint, use .join to merge two dataframes on a common key (default inner join on index)
# call your features dataframe 'modeldata' and your y response column 'y'

y = bikeshare['cnt']

weathersit_dummies = pd.get_dummies(bikeshare['weathersit'])
modeldata = bikeshare[['temp','hum']].join(weathersit_dummies)
modeldata.head(3)

Unnamed: 0,temp,hum,1,2,3,4
0,0.24,0.81,1.0,0.0,0.0,0.0
1,0.22,0.8,1.0,0.0,0.0,0.0
2,0.22,0.8,1.0,0.0,0.0,0.0


In [19]:
kf = cross_validation.KFold(len(modeldata), n_folds=5, shuffle=True)

# you must define modeldata in cell above for kf assignment to run
# kf returns a dictionary composed of key-value pairs

In [20]:
lm_obj = linear_model.LinearRegression() # instaniate only one time
scores = [] # we will append mse scores from each iteration in kf
for train_index, test_index in kf: # for (key,value) in dictionary
    x_train = modeldata.iloc[train_index] # get new set each iteration
    y_train = y.iloc[train_index]
    
    x_test = modeldata.iloc[test_index] # get new test data each iteration
    y_test = y.iloc[test_index]
        
    lm = lm_obj.fit(x_train, y_train) # fit new model each iteration
    x_test_pred = lm.predict(x_test)

    mse = metrics.mean_squared_error(y_test,x_test_pred) 
    # get new mse each iteration
    scores.append(mse) # append mse scores from each model to scores list

In [25]:
# print the mean mse score from all iterations, explain output
np.mean(scores)

24578.423856193498

Here we get the expected MSE for unseen test set

In [30]:
# fit a regression model on all the model and outcome data (modeldata and y)
# hint use, linear_model.LinearRegression().fit()
# get predictions from fitted model using same model data
# calculate MSE and interpret 
# hint MSE = SSE/n = (y - y_est)/n, y_est = lm.predict(modeldata)

model = linear_model.LinearRegression().fit(modeldata,y)
predictions = model.predict(modeldata)
mse = metrics.mean_squared_error(y,predictions)
print(mse)

24567.5188864


Here we get the MSE for data already seen during the training. It is better than previous MSE, but less realistic to represent what to expect of unseen data.

In [42]:
# fit a lasso regression model on all the model and outcome data (modeldata and y)
# hint use, linear_model.Lasso().fit()
# get predictions from fitted model using same model data
# calculate MSE and interpret
# hint MSE = SSE/n = (y - y_est)/n, y_est = lm.predict(modeldata)

model2 = linear_model.Lasso(alpha = .1,normalize=True).fit(modeldata,y)
predictions2 = model2.predict(modeldata)
print "\nalpha: 0.1"
mse2 = metrics.mean_squared_error(y,predictions2)
print(mse2)

llmcv = linear_model.LassoCV(normalize=True)
model3 = llmcv.fit(modeldata, y)
predictions3 = model3.predict(modeldata)
print "\nalpha:", llmcv.alpha_
mse3 = metrics.mean_squared_error(y,predictions3)
print(mse3)


alpha: 0.1
24964.3412556

alpha: 0.000597167954614
24567.548839


In [48]:
# fit a Ridge regression model on all the model and outcome data (modeldata and y)
# hint use, linear_model.Ridge().fit()
# get predictions from fitted model using same model data
# calculate MSE and interpret
# hint MSE = SSE/n = (y - y_est)/n, y_est = lm.predict(modeldata)

rlmcv = linear_model.RidgeCV(normalize=True)
model4 = rlmcv.fit(modeldata, y)
predictions4 = model4.predict(modeldata)
print "\nalpha:", rlmcv.alpha_
mse4 = metrics.mean_squared_error(y,predictions4)
print(mse4)


alpha: 0.1
24637.0273161
