In [1]:
import numpy as np
from math import sqrt, log
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import turicreate as tc
from sklearn import linear_model

In [7]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

#sales = tc.SFrame('m_1ce96d9d245ca490.frame_idx')
sales= pd.read_csv('kc_house_data.csv', dtype = dtype_dict)
sales_test_pd = pd.read_csv('kc_house_test_data.csv', dtype = dtype_dict)
sales_train_pd = pd.read_csv('kc_house_train_data.csv', dtype = dtype_dict)


In [8]:
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']
sales['floors_square'] = np.square(sales['floors'])

In [9]:
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,sqft_living_sqrt,sqft_lot_sqrt,bedrooms_square,floors_square
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1.0,0,0,...,0,98178,47.5112,-122.257,1340.0,5650.0,34.351128,75.166482,9.0,1.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,...,1991,98125,47.721,-122.319,1690.0,7639.0,50.695167,85.099941,9.0,4.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1.0,0,0,...,0,98028,47.7379,-122.233,2720.0,8062.0,27.748874,100.0,4.0,1.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1.0,0,0,...,0,98136,47.5208,-122.393,1360.0,5000.0,44.271887,70.710678,16.0,1.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,...,0,98074,47.6168,-122.045,1800.0,7503.0,40.987803,89.88882,9.0,1.0


In [10]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

model_all = linear_model.Lasso(alpha=5e2, normalize=True) # set parameters
model_all.fit(sales[all_features], sales['price']) # learn weights

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=True, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [13]:
model_all.coef_

array([    0.        ,     0.        ,     0.        ,   134.43931396,
           0.        ,     0.        ,     0.        ,     0.        ,
           0.        ,     0.        , 24750.00458561,     0.        ,
       61749.10309071,     0.        ,     0.        ,    -0.        ,
           0.        ])

In [12]:
testing = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)
training = pd.read_csv('wk3_kc_house_train_data.csv', dtype=dtype_dict)
validation = pd.read_csv('wk3_kc_house_valid_data.csv', dtype=dtype_dict)

In [14]:
testing['sqft_living_sqrt'] = testing['sqft_living'].apply(sqrt)
testing['sqft_lot_sqrt'] = testing['sqft_lot'].apply(sqrt)
testing['bedrooms_square'] = testing['bedrooms']*testing['bedrooms']
testing['floors_square'] = testing['floors']*testing['floors']

training['sqft_living_sqrt'] = training['sqft_living'].apply(sqrt)
training['sqft_lot_sqrt'] = training['sqft_lot'].apply(sqrt)
training['bedrooms_square'] = training['bedrooms']*training['bedrooms']
training['floors_square'] = training['floors']*training['floors']

validation['sqft_living_sqrt'] = validation['sqft_living'].apply(sqrt)
validation['sqft_lot_sqrt'] = validation['sqft_lot'].apply(sqrt)
validation['bedrooms_square'] = validation['bedrooms']*validation['bedrooms']
validation['floors_square'] = validation['floors']*validation['floors']

In [20]:
l1_penalty_lst = np.logspace(1, 7, num=13)
model_lst = []
RSSs = []
for l1_penalty in l1_penalty_lst:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model_lst.append(model)
    model.fit(training[all_features], training['price'])
    RSS = np.sum(np.square(np.subtract(model.predict(validation[all_features]), validation['price'])))
    RSSs.append(RSS)

print('The minimum RSS = %.4E, of penalty = %.4f, of index %d' % (np.amin(RSSs), l1_penalty_lst[np.argmin(RSSs)], np.argmin(RSSs)))

The minimum RSS = 3.9821E+14, of penalty = 10.0000, of index 0


In [22]:
np.count_nonzero(model_lst[0].coef_) + np.count_nonzero(model_lst[0].intercept_)

15

In [27]:
l1_penalty_lst_2 = np.logspace(1, 4, num=20)
model_lst_2 = []
RSSs_2 = []
non_zero_lst = []
max_nonzeros = 7

for l1_penalty in l1_penalty_lst_2:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model_lst_2.append(model)
    model.fit(training[all_features], training['price'])
    RSS = np.sum(np.square(np.subtract(model.predict(validation[all_features]), validation['price'])))
    RSSs_2.append(RSS)
    non_zero_lst.append(np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_))

print('The minimum RSS = %.4E, of penalty = %.4f, of index %d' 
      % (np.amin(RSSs_2), l1_penalty_lst[np.argmin(RSSs_2)], np.argmin(RSSs_2)))


The minimum RSS = 3.9621E+14, of penalty = 100.0000, of index 2


In [39]:
mask_2

[9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [44]:
mask_1 = [idx for idx, val in enumerate(non_zero_lst) if val > max_nonzeros]
mask_2 = [idx for idx, val in enumerate(non_zero_lst) if val < max_nonzeros]
l1_penalty_min = 0
l1_penalty_max = 0

for idx in mask_1:
    if l1_penalty_min == 0 or l1_penalty_lst_2[idx] > l1_penalty_min:
        l1_penalty_min = l1_penalty_lst_2[idx]
        
for idx in mask_2:
    if l1_penalty_max == 0 or l1_penalty_lst_2[idx] < l1_penalty_max:
        l1_penalty_max = l1_penalty_lst_2[idx]
    

In [45]:
print('The Smallest L1 penalty of more non zeroes than max_nonzeros is %.4f' % l1_penalty_min)
print('The largest L1 penalty of less non zeroes than max_nonzeros is %.4f' % l1_penalty_max)

The Smallest L1 penalty of more non zeroes than max_nonzeros is 127.4275
The largest L1 penalty of less non zeroes than max_nonzeros is 263.6651


In [43]:
non_zero_lst

[15, 15, 15, 15, 13, 12, 11, 10, 7, 6, 6, 6, 5, 3, 3, 2, 1, 1, 1, 1]

In [47]:
print('%.4E' % RSSs_2[8])

4.4311E+14


In [49]:
model_lst_2[8].coef_

array([-0.00000000e+00, -0.00000000e+00,  4.84964317e+03,  1.65210126e+02,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  4.84780808e+05,  4.14997727e+04,  0.00000000e+00,
        1.13406888e+05,  0.00000000e+00,  0.00000000e+00, -2.41386679e+03,
        0.00000000e+00])

In [51]:
model_lst_2[8].intercept_

4061731.6258036634