In this assignment, you will use LASSO to select features, building on a pre-implemented solver for LASSO (using GraphLab Create, though you can use other solvers). You will:

- Run LASSO with different L1 penalties.
- Choose best L1 penalty using a validation set.
- Choose best L1 penalty using a validation set, with additional constraint on the size of subset.
- In the second assignment, you will implement your own LASSO solver, using coordinate descent.

In [1]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import Lasso

In [2]:
def Coeff3(x, features):
    '''Prints intercept and coeff in a better format
    
    Paramaters:
    ------------
    x : name of model
    power: int,  degree of polynomial function
    
    Returns: Coefficients table'''
    
    coef_df=pd.DataFrame()
    coef_df=coef_df.append({"Name":'Intercept', 'Value':x.intercept_}, ignore_index=True)
    coef_df=coef_df.append({'Name':{'name':features,'Value':x.coef_}}, ignore_index=True)
    
    #if power > 1:
     #   for power in range(2, power+1):
      #      name= 'Power_' + str(power)
       #     coef_df=coef_df.append({"Name":name, 'Value':'{:.3e}'.format(x.coef_[power-1])}, ignore_index=True)
    
    return coef_df

In [3]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [4]:
from math import log, sqrt
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = sales['floors']*sales['floors']

# learn regression

In [5]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

In [6]:
X=sales[all_features]
y=sales['price']
model_all=Lasso(alpha=5e2, normalize=True).fit(X,y)

In [7]:
coef=model_all.coef_
coef

array([    0.        ,     0.        ,     0.        ,   134.43931396,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        , 24750.00458561,     0.        ,
       61749.10309071,     0.        ,     0.        ,    -0.        ,
           0.        ])

In [8]:
df=pd.DataFrame(data={'features':all_features, 'Coeff':coef.round(2)})

In [9]:
df

Unnamed: 0,Coeff,features
0,0.0,bedrooms
1,0.0,bedrooms_square
2,0.0,bathrooms
3,134.44,sqft_living
4,0.0,sqft_living_sqrt
5,0.0,sqft_lot
6,0.0,sqft_lot_sqrt
7,0.0,floors
8,0.0,floors_square
9,0.0,waterfront


# find best L1 penalty

In [10]:
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [11]:
testing['sqft_living_sqrt'] = testing['sqft_living'].apply(sqrt)
testing['sqft_lot_sqrt'] = testing['sqft_lot'].apply(sqrt)
testing['bedrooms_square'] = testing['bedrooms']*testing['bedrooms']
testing['floors_square'] = testing['floors']*testing['floors']

training['sqft_living_sqrt'] = training['sqft_living'].apply(sqrt)
training['sqft_lot_sqrt'] = training['sqft_lot'].apply(sqrt)
training['bedrooms_square'] = training['bedrooms']*training['bedrooms']
training['floors_square'] = training['floors']*training['floors']

validation['sqft_living_sqrt'] = validation['sqft_living'].apply(sqrt)
validation['sqft_lot_sqrt'] = validation['sqft_lot'].apply(sqrt)
validation['bedrooms_square'] = validation['bedrooms']*validation['bedrooms']
validation['floors_square'] = validation['floors']*validation['floors']

In [12]:
L1_penalties=np.logspace(1,7,num=13)
X_train= training[all_features]
y_train = training['price']

X_valid= validation[all_features]
y_valid= validation['price']

X_test= testing[all_features]
y_test = testing['price']

In [13]:
for L1 in L1_penalties:
    model =Lasso(alpha= L1, normalize=True).fit(X_train, y_train)
    prediction = model.predict(X_valid)
    RSS= ( (prediction - y_valid)**2 ).sum()
    print('For this L1: {}, the RSS is: {:.3e}\n'.format(L1, round(RSS,3)))

For this L1: 10.0, the RSS is: 3.982e+14

For this L1: 31.622776601683793, the RSS is: 3.990e+14

For this L1: 100.0, the RSS is: 4.298e+14

For this L1: 316.22776601683796, the RSS is: 4.637e+14

For this L1: 1000.0, the RSS is: 6.459e+14

For this L1: 3162.2776601683795, the RSS is: 1.223e+15

For this L1: 10000.0, the RSS is: 1.223e+15

For this L1: 31622.776601683792, the RSS is: 1.223e+15

For this L1: 100000.0, the RSS is: 1.223e+15

For this L1: 316227.7660168379, the RSS is: 1.223e+15

For this L1: 1000000.0, the RSS is: 1.223e+15

For this L1: 3162277.6601683795, the RSS is: 1.223e+15

For this L1: 10000000.0, the RSS is: 1.223e+15



In [14]:
best_model=Lasso(alpha=10, normalize=True).fit(X_train,y_train)
prediction_test=best_model.predict(X_test)
RSS_test = ( (prediction_test-y_test)**2 ).sum()
print('{:.3e}'.format(RSS_test))

9.847e+13


In [15]:
# non zero features

np.count_nonzero(best_model.coef_) + np.count_nonzero(best_model.intercept_), best_model.intercept_


(15, 6630155.668628358)

In [16]:
coef2=best_model.coef_
df=pd.DataFrame(data={'features':all_features, 'Coeff':coef2.round(2)})


In [17]:
df

Unnamed: 0,Coeff,features
0,-16144.56,bedrooms
1,373.25,bedrooms_square
2,50841.24,bathrooms
3,617.85,sqft_living
4,-44411.35,sqft_living_sqrt
5,0.79,sqft_lot
6,-701.19,sqft_lot_sqrt
7,-0.0,floors
8,5014.2,floors_square
9,619488.75,waterfront


# limit ourself to only 7 features 

In [18]:
max_nonzeros=7

In [41]:
L1_large=np.logspace(1,4, num=20)
coef_count= []
for L1 in L1_large:
    model=Lasso(alpha=L1, normalize=True).fit(X_train, y_train)
    count_nonZ=np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    coef_count.append({'L1_value':L1, 'NonZ': count_nonZ})
    NonZ=pd.DataFrame(data=coef_count)
    
    

In [37]:
NonZ

Unnamed: 0,L1_value,NonZ
0,10.0,15
1,14.384499,15
2,20.691381,15
3,29.763514,15
4,42.813324,13
5,61.584821,12
6,88.586679,11
7,127.427499,10
8,183.298071,7
9,263.66509,6


In [47]:
l1_min=127.427499
l1_max=263.665090

# Exploring narrower range of l1_penalty

    find best L1  with lowest RSS and spasity == max_nonzeros

In [62]:
L1_range=np.linspace(l1_min, l1_max,20)
coef_count2 =[]
rss=[]
for L1 in L1_range:
    model=Lasso(alpha=L1, normalize=True).fit(X_train,y_train)
    prediction=model.predict(X_valid)
    RSS2 = ( (prediction-y_valid)**2 ).sum()
    rss.append(RSS2)
    count_nonZ2=np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    coef_count2.append({'L1_value':L1, 'NonZ': count_nonZ2})
    NonZ2=pd.DataFrame(data=coef_count2)
    NonZ2['RSS']=rss


In [63]:
NonZ2

Unnamed: 0,L1_value,NonZ,RSS
0,127.427499,10,435374700000000.0
1,134.597899,10,437009200000000.0
2,141.768298,8,438236100000000.0
3,148.938698,8,439158900000000.0
4,156.109097,7,440037400000000.0
5,163.279497,7,440777500000000.0
6,170.449896,7,441566700000000.0
7,177.620296,7,442406400000000.0
8,184.790695,7,443296700000000.0
9,191.961095,7,444239800000000.0


In [73]:
NonZ2[NonZ2['NonZ'] ==7]['RSS'].min()


AttributeError: 'list' object has no attribute 'min'

# We now explore the region of l1_penalty we found:

In [74]:
best_model2=Lasso(alpha=156.109097, normalize=True).fit(X_train,y_train)

coef3=best_model2.coef_
df=pd.DataFrame(data={'features':all_features, 'Coeff':coef3.round(2)})
df

Unnamed: 0,Coeff,features
0,-0.0,bedrooms
1,-0.0,bedrooms_square
2,10610.89,bathrooms
3,163.38,sqft_living
4,0.0,sqft_living_sqrt
5,-0.0,sqft_lot
6,-0.0,sqft_lot_sqrt
7,0.0,floors
8,0.0,floors_square
9,506451.69,waterfront
