In [1]:
# Standard includes
%matplotlib inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Routines for linear regression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [2]:
# Read the comma-separated data file into a 2-D float array.
data = np.genfromtxt('mystery.dat', delimiter=',')
# Columns 0..99 are the predictors; column 100 is the response variable.
x, y = data[:, :100], data[:, 100]

In [6]:
def feature_subset_regression(x,y,flist):
    """Fit an ordinary least-squares model using only selected columns of x.

    Parameters
    ----------
    x : 2-D numpy array of predictors, one column per feature.
    y : response values, handed to ``LinearRegression.fit`` unchanged.
    flist : sequence of 0-based column indices of ``x`` to use as features.

    Returns
    -------
    The fitted ``linear_model.LinearRegression`` object, or ``None`` (after
    printing a message) when ``flist`` is empty or contains an index that is
    not a valid column of ``x``.
    """
    if len(flist) < 1:
        print ("Need at least one feature")
        return
    # Take the valid index range from x itself instead of assuming exactly
    # 100 columns, so the check stays correct for any predictor matrix.
    n_features = x.shape[1]
    if any((f < 0) or (f >= n_features) for f in flist):
        print ("Feature index is out of bounds")
        return
    regr = linear_model.LinearRegression()
    regr.fit(x[:,flist], y)
    return regr

In [8]:
# Fit a least-squares model on each candidate feature subset and report its
# weights, intercept and mean squared error on the same data it was fit on.
# NOTE(review): these are 0-based column indices, while the Lasso cells print
# 1-based feature numbers (argsort + 1) -- confirm the subsets use the
# intended convention.
for flist in ([1,5,7,19,44], [2,3,13,17,29], [3,7,13,19,44], [5,23,24,51,61]):
    regr = feature_subset_regression(x, y, flist)
    mse = mean_squared_error(y, regr.predict(x[:, flist]))
    print(flist)
    print("w = ", regr.coef_)
    print("b = ", regr.intercept_)
    print("Mean squared error: ", mse)

[1, 5, 7, 19, 44]
w =  [ 0.83324318 -0.3459277  -0.4309913   0.2854153   0.36972746]
b =  0.0270325204452
Mean squared error:  8.27394094044
[2, 3, 13, 17, 29]
w =  [ 0.3876896   0.52133709  0.07247709 -0.17708642  0.15894952]
b =  -0.0279658310462
Mean squared error:  9.00043415662
[3, 7, 13, 19, 44]
w =  [ 0.60441966 -0.55227452  0.15870556  0.26222865  0.35794696]
b =  -0.0836955209554
Mean squared error:  8.69316022791
[5, 23, 24, 51, 61]
w =  [-0.30301849  0.22507339 -0.44556407  0.15784941  0.29273196]
b =  0.0605051636704
Mean squared error:  8.95410912144


In [15]:
# Fit a Lasso model with regularization strength 0.5 on all predictors and
# show the full coefficient vector.
regr = linear_model.Lasso(0.5)
regr.fit(x, y)
print(regr.coef_)
# Ten largest coefficients by magnitude, shown as 1-based feature numbers.
# Ranking on the absolute value keeps strongly negative weights in the list;
# a signed sort would place them first and never report them.
print(np.argsort(np.abs(regr.coef_))[-10:]+1)

[ 0.          0.24841889  0.14858335  0.          0.42592769 -0.
  0.39656748 -0.         -0.         -0.          0.51638403 -0.
  0.34624688 -0.         -0.         -0.          0.62423963 -0.          0.4637213
  0.          0.         -0.          0.49472784  0.         -0.         -0.
 -0.          0.          0.10301208  0.         -0.          0.          0.
 -0.         -0.         -0.         -0.          0.         -0.          0.
 -0.          0.         -0.         -0.          0.         -0.         -0.
 -0.          0.          0.         -0.          0.          0.          0.
 -0.          0.         -0.         -0.          0.          0.          0.
  0.          0.         -0.         -0.          0.         -0.         -0.
  0.          0.          0.          0.          0.          0.          0.
  0.         -0.          0.         -0.         -0.          0.10048512
 -0.         -0.         -0.         -0.         -0.         -0.          0.
 -0.         -0.    

In [16]:
def best_ten_features(alpha):
    """Fit Lasso on the notebook-level ``x``, ``y`` and print the top ten features.

    Parameters
    ----------
    alpha : regularization strength passed to ``linear_model.Lasso``.

    Prints one line containing ``alpha`` and the 1-based numbers of the ten
    features whose coefficients have the largest magnitude, weakest first.
    When fewer than ten coefficients are non-zero, the remaining slots are
    filled with features whose coefficient is zero. Returns ``None``.
    """
    regr = linear_model.Lasso(alpha)
    regr.fit(x, y)
    # Rank by |coefficient|: a large negative weight is as influential as a
    # large positive one, but a signed sort would push it to the bottom.
    strongest = np.argsort(np.abs(regr.coef_))[-10:] + 1
    print('alpha: %s; %s' % (alpha, strongest))

In [19]:
# Sweep the Lasso regularization strength and print the ten top-ranked
# features reported at each setting.
for alpha in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1):
    best_ten_features(alpha)

alpha: 0.1; [29  3 13 19  2 23 17 11  7  5]
alpha: 0.2; [29  3 13 19  2  7 23 17 11  5]
alpha: 0.3; [29  3 13  2 19  7 23  5 11 17]
alpha: 0.4; [29  3  2 13  7 19  5 23 11 17]
alpha: 0.5; [29  3  2 13  7  5 19 23 11 17]
alpha: 0.6; [36  2 81 13  5  7 19 23 11 17]
alpha: 0.7; [ 30 100  81  13   5   7  23  11  19  17]
alpha: 0.8; [ 30 100  81  13   5   7  23  11  19  17]
alpha: 0.9; [ 32  31  30  29  37 100  23  11  19  17]
alpha: 1; [ 32  31  30  29  37 100  23  11  19  17]
