# import data

In [15]:
import pandas as pd
import numpy as np

dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 
              'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 
              'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 
              'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

# add new features, verify that lasso leads to sparsity indeed

In [2]:
from math import log, sqrt
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = sales['floors']*sales['floors']

In [3]:
from sklearn import linear_model

all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

model_all = linear_model.Lasso(alpha=5e2, normalize=True)
model_all.fit(sales[all_features], sales['price'])

Lasso(alpha=500.0, normalize=True)

In [5]:
for coef,feature in zip(model_all.coef_, all_features):
    print(coef,feature)

0.0 bedrooms
0.0 bedrooms_square
0.0 bathrooms
134.43931395541438 sqft_living
0.0 sqft_living_sqrt
0.0 sqft_lot
0.0 sqft_lot_sqrt
0.0 floors
0.0 floors_square
0.0 waterfront
24750.004585609488 view
0.0 condition
61749.10309070811 grade
0.0 sqft_above
0.0 sqft_basement
-0.0 yr_built
0.0 yr_renovated


In [6]:
model_all.coef_

array([    0.        ,     0.        ,     0.        ,   134.43931396,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        , 24750.00458561,     0.        ,
       61749.10309071,     0.        ,     0.        ,    -0.        ,
           0.        ])

# validation set to select l1-penelty strength hyperparameter 

In [8]:
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

testing['sqft_living_sqrt'] = testing['sqft_living'].apply(sqrt)
testing['sqft_lot_sqrt'] = testing['sqft_lot'].apply(sqrt)
testing['bedrooms_square'] = testing['bedrooms']*testing['bedrooms']
testing['floors_square'] = testing['floors']*testing['floors']

training['sqft_living_sqrt'] = training['sqft_living'].apply(sqrt)
training['sqft_lot_sqrt'] = training['sqft_lot'].apply(sqrt)
training['bedrooms_square'] = training['bedrooms']*training['bedrooms']
training['floors_square'] = training['floors']*training['floors']

validation['sqft_living_sqrt'] = validation['sqft_living'].apply(sqrt)
validation['sqft_lot_sqrt'] = validation['sqft_lot'].apply(sqrt)
validation['bedrooms_square'] = validation['bedrooms']*validation['bedrooms']
validation['floors_square'] = validation['floors']*validation['floors']

In [9]:
l1_penalty = list(map(lambda i: 10**i ,[i/2 for i in range(2,15)] ))

In [21]:
for para in l1_penalty:
    model = linear_model.Lasso(alpha= para, normalize=True)
    model.fit(training[all_features], training['price'])
    pred = model.predict(validation[all_features])
    RSS = np.linalg.norm(pred - validation['price'].to_numpy())**2
    print('%.2e, %.5e'%(para,RSS))

1.00e+01, 3.98213e+14
3.16e+01, 3.99042e+14
1.00e+02, 4.29792e+14
3.16e+02, 4.63740e+14
1.00e+03, 6.45899e+14
3.16e+03, 1.22251e+15
1.00e+04, 1.22251e+15
3.16e+04, 1.22251e+15
1.00e+05, 1.22251e+15
3.16e+05, 1.22251e+15
1.00e+06, 1.22251e+15
3.16e+06, 1.22251e+15
1.00e+07, 1.22251e+15


In [22]:
best_l1_penalty = l1_penalty[0]

In [28]:
model = linear_model.Lasso(alpha= best_l1_penalty, normalize=True)
model.fit(training[all_features], training['price'])
pred = model.predict(validation[all_features])
RSS = np.linalg.norm(pred - validation['price'].to_numpy())**2
print('%.2e, %.5e'%(para,RSS))
print(model.coef_ , np.count_nonzero(model.intercept_))

1.00e+07, 3.98213e+14
[-1.61445628e+04  3.73245384e+02  5.08412433e+04  6.17853560e+02
 -4.44113549e+04  7.85623065e-01 -7.01194765e+02 -0.00000000e+00
  5.01420046e+03  6.19488752e+05  3.80418557e+04  2.49987718e+04
  1.28716235e+05  0.00000000e+00  0.00000000e+00 -3.29383118e+03
  1.00573209e+01] 1


In [30]:
l1_penalty = np.logspace(1,4,num=20)
nnz = []
for para in l1_penalty:
    model = linear_model.Lasso(alpha=para, normalize=True)
    model.fit(training[all_features], training['price'])
    nnz.append(np.count_nonzero(model.intercept_) + np.count_nonzero(model.coef_))
    print( para,  nnz[-1])

10.0 15
14.38449888287663 15
20.6913808111479 15
29.76351441631318 15
42.81332398719393 13
61.58482110660264 12
88.58667904100822 11
127.42749857031335 10
183.29807108324357 7
263.6650898730358 6
379.26901907322497 6
545.5594781168514 6
784.7599703514607 5
1128.8378916846884 3
1623.776739188721 3
2335.7214690901214 2
3359.818286283781 1
4832.930238571752 1
6951.927961775606 1
10000.0 1


In [37]:
max_nnz = 7
l1_penalty_min = max([l1_penalty[i] for i in range(len(nnz)) if nnz[i] > max_nnz])
l1_penalty_max = min([l1_penalty[i] for i in range(len(nnz)) if nnz[i] < max_nnz])

In [38]:
print(l1_penalty_min, l1_penalty_max)

127.42749857031335 263.6650898730358


In [43]:
l1_penalty = np.linspace(l1_penalty_min, l1_penalty_max, num=20)
nnz = []
dic = {}
for para in l1_penalty:
    model = linear_model.Lasso(alpha=para, normalize=True)
    model.fit(training[all_features], training['price'])
    nnz = np.count_nonzero(model.intercept_) + np.count_nonzero(model.coef_)
    if nnz == max_nnz:
        pred = model.predict(validation[all_features])
        rss = (np.linalg.norm(validation['price'].to_numpy() - pred) **2)
        dic[para] = rss
        print( para,  nnz , rss)

156.10909673930755 7 440037365263316.5
163.2794962815561 7 440777489641605.25
170.44989582380464 7 441566698090139.9
177.6202953660532 7 442406413188666.3
184.79069490830176 7 443296716874315.0
191.96109445055032 7 444239780526141.6
199.13149399279888 7 445230739842614.2


In [44]:
for para in dic:
    print(para,  '%.5e'%dic[para])

156.10909673930755 4.40037e+14
163.2794962815561 4.40777e+14
170.44989582380464 4.41567e+14
177.6202953660532 4.42406e+14
184.79069490830176 4.43297e+14
191.96109445055032 4.44240e+14
199.13149399279888 4.45231e+14


In [56]:
para = l1_penalty[4]
model = linear_model.Lasso(alpha=para, normalize=True)
model.fit(training[all_features], training['price'])
nnz = np.count_nonzero(model.intercept_) + np.count_nonzero(model.coef_)
if nnz == max_nnz:
    pred = model.predict(validation[all_features])
    rss = (np.linalg.norm(validation['price'].to_numpy() - pred) **2)
    dic[para] = rss
    print( para,  nnz , rss)
for feature, coef in zip(all_features, model.coef_):
    if coef != 0:
        print(feature, coef)

156.10909673930755 7 440037365263316.5
bathrooms 10610.890284398312
sqft_living 163.3802516476289
waterfront 506451.6871148493
view 41960.04355485289
grade 116253.55369970748
yr_built -2612.2348803574882
