In [16]:
%pylab inline
import numpy as np
import sklearn.linear_model as linear
from sklearn.model_selection import KFold
from scipy.stats import ttest_rel

Populating the interactive namespace from numpy and matplotlib


In [5]:
def mape(actual, pred, eps=0.00001):
    """Return the mean absolute percentage error (MAPE), in percent.

    Each element contributes ``|actual - pred| / (|actual| + eps) * 100`` and
    the result is the mean of those contributions.

    Parameters
    ----------
    actual : array-like
        Ground-truth values.
    pred : array-like
        Predicted values; must broadcast against ``actual``.
    eps : float, optional
        Small constant added to the denominator so that targets equal to
        zero do not cause a division by zero.

    Returns
    -------
    float
        Mean percentage error; non-negative for finite inputs.
    """
    # Accept plain Python sequences as well as ndarrays.
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)

    abs_errors = np.abs(actual - pred)
    # Normalise by the magnitude of the target: dividing by a signed value
    # would give negative "absolute" percentages for negative targets, which
    # then cancel against positive ones in the mean.
    pct_errors = abs_errors / (np.abs(actual) + eps) * 100
    return np.mean(pct_errors)
    

In [10]:
# Read the feature matrix and the target vector from the .npy files that sit
# next to the notebook; np.load opens and closes each file itself.
X = np.load('X.npy')
y = np.load('y.npy')

In [14]:
# 5-fold cross-validation splitter. No shuffle/random_state is passed, so
# scikit-learn's defaults for those options apply.
kf = KFold(n_splits=5)
# Trailing expression: its value (the number of splits) is the cell output.
kf.get_n_splits(X)

5

In [17]:
# Per-fold MAPE scores: lr1 = LinearRegression, lr2 = Ridge, lr3 = Lasso.
lr1_res, lr2_res, lr3_res = [], [], []

for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Fresh, default-configured estimators for every fold so that nothing
    # carries over from the previous split.
    lr1, lr2, lr3 = linear.LinearRegression(), linear.Ridge(), linear.Lasso()

    # Fit each model on the training part and record its MAPE on the
    # held-out part in the matching result list.
    for model, fold_scores in zip((lr1, lr2, lr3), (lr1_res, lr2_res, lr3_res)):
        model.fit(X_train, y_train)
        fold_scores.append(mape(y_test, model.predict(X_test)))

In [18]:
# Convert the per-fold score lists to NumPy arrays for the statistics below.
lr1_res, lr2_res, lr3_res = (
    np.array(fold_scores) for fold_scores in (lr1_res, lr2_res, lr3_res)
)

In [22]:
# Mean and standard deviation of the per-fold MAPE for each model.
for mean_label, std_label, fold_scores in (
    ('Mean for Linear Regression ', ' Std for Linear Regression ', lr1_res),
    ('Mean for Ridge Regression ', ' Std for Ridge Regression ', lr2_res),
    ('Mean for LASSO Regression ', ' Std for LASSO Regression ', lr3_res),
):
    print(mean_label, np.mean(fold_scores), std_label, np.std(fold_scores))

Mean for Linear Regression  37.044292502576255  Std for Linear Regression  9.050565869904354
Mean for Ridge Regression  37.09350033190795  Std for Ridge Regression  9.067611505669358
Mean for LASSO Regression  37.35624334279797  Std for LASSO Regression  9.136222831898865


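Comparing the means alone is not conclusive, so in addition to the summary above we run paired Student's t-tests on the per-fold scores, which is a more statistically robust check.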

In [23]:
# Paired t-tests on the per-fold MAPE scores, in the order:
# Linear vs Ridge, Ridge vs LASSO, Linear vs LASSO.
for first_scores, second_scores in (
    (lr1_res, lr2_res),
    (lr2_res, lr3_res),
    (lr1_res, lr3_res),
):
    print(ttest_rel(first_scores, second_scores))

Ttest_relResult(statistic=-4.159691947089001, pvalue=0.01414870668655993)
Ttest_relResult(statistic=-2.9674534363061396, pvalue=0.04124828824419589)
Ttest_relResult(statistic=-3.1435782149384717, pvalue=0.034727891890525206)


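Using p_value < 0.05 as the indication of significance, all three pairwise differences are statistically significant: Linear vs Ridge (p ≈ 0.014), Ridge vs LASSO (p ≈ 0.041), and Linear vs LASSO (p ≈ 0.035). Note that no multiple-comparison correction is applied; with a Bonferroni-corrected threshold of 0.05 / 3 ≈ 0.017, only the Linear vs Ridge difference would remain significant.

Based on the mean MAPE values above (lower is better), the order of performance is loosely Linear > Ridge > LASSO. Although the difference between Linear and Ridge is statistically significant, its magnitude (about 0.05 percentage points of MAPE, against a fold-to-fold standard deviation of about 9) is negligible in practice; hence either Linear or Ridge should be used.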