In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt, log
from sklearn import linear_model

In [2]:
# Column -> dtype mapping handed to every pd.read_csv call in this notebook.
dtype_dict = {
    # floating-point columns
    'bathrooms': float,
    'sqft_living15': float,
    'price': float,
    'bedrooms': float,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'lat': float,
    # integer columns
    'waterfront': int,
    'sqft_above': int,
    'grade': int,
    'yr_renovated': int,
    'condition': int,
    'sqft_basement': int,
    'yr_built': int,
    'sqft_lot': int,
    'view': int,
    # columns kept as text
    'zipcode': str,
    'date': str,
    'id': str,
}

# Full data set, read with the explicit dtypes above.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)

In [4]:
# Derived columns on the full data set: square roots of the two area
# columns, then squares of the bedroom and floor counts.
for base_col in ('sqft_living', 'sqft_lot'):
    sales[base_col + '_sqrt'] = sales[base_col].apply(sqrt)
for base_col in ('bedrooms', 'floors'):
    sales[base_col + '_square'] = sales[base_col] * sales[base_col]

In [5]:
# Feature columns used for every model below. The order matters: the fitted
# coefficient arrays are positional and line up with this list.
all_features = [
    # room counts
    'bedrooms',
    'bedrooms_square',
    'bathrooms',
    # living / lot area
    'sqft_living',
    'sqft_living_sqrt',
    'sqft_lot',
    'sqft_lot_sqrt',
    # floors
    'floors',
    'floors_square',
    # categorical-style integer columns
    'waterfront',
    'view',
    'condition',
    'grade',
    # area split
    'sqft_above',
    'sqft_basement',
    # years
    'yr_built',
    'yr_renovated',
]

In [8]:
# Lasso fit on the full data set with a fixed L1 penalty of 500, using all
# engineered features to predict price.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2 -- confirm the scikit-learn version this notebook targets.
model_all = linear_model.Lasso(alpha=500.0, normalize=True)
model_all.fit(X=sales[all_features], y=sales['price'])

Lasso(alpha=500.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [9]:
# Fitted weights of model_all, one per column of sales[all_features].
model_all.coef_

array([     0.        ,      0.        ,      0.        ,    134.43931396,
            0.        ,      0.        ,      0.        ,      0.        ,
            0.        ,      0.        ,  24750.00458561,      0.        ,
        61749.10309071,      0.        ,      0.        ,     -0.        ,
            0.        ])

In [10]:
# Fitted intercept term of model_all.
model_all.intercept_

-218136.21403515921

In [11]:
# Read the pre-made test / train / validation splits with the same explicit
# dtypes that were used for the full data set.
testing, training, validation = [
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_test_data.csv',
        'wk3_kc_house_train_data.csv',
        'wk3_kc_house_valid_data.csv',
    )
]

In [12]:
def add_engineered_features(frame):
    """Add the four derived feature columns to ``frame`` in place.

    Creates ``sqft_living_sqrt`` and ``sqft_lot_sqrt`` (square roots of the
    corresponding columns) followed by ``bedrooms_square`` and
    ``floors_square`` (the corresponding columns multiplied by themselves).

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'sqft_living', 'sqft_lot', 'bedrooms' and
        'floors'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (mutated), to allow chaining.
    """
    frame['sqft_living_sqrt'] = frame['sqft_living'].apply(sqrt)
    frame['sqft_lot_sqrt'] = frame['sqft_lot'].apply(sqrt)
    frame['bedrooms_square'] = frame['bedrooms'] * frame['bedrooms']
    frame['floors_square'] = frame['floors'] * frame['floors']
    return frame


# One code path for all three splits, so they cannot drift apart the way
# three copy-pasted stanzas can.
for split_frame in (testing, training, validation):
    add_engineered_features(split_frame)

In [13]:
# Coarse sweep: 13 log-spaced L1 penalties from 1e1 to 1e7. Each model is
# fitted on the training split and scored by RSS on the validation split.
for l1_penalty in np.logspace(1, 7, num=13):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    predictions = model.predict(validation[all_features])
    residuals = predictions - validation['price']
    RSS = sum(residuals * residuals)
    print(l1_penalty, " : ", RSS)

10.0  :  3.982133273e+14
31.6227766017  :  3.99041900253e+14
100.0  :  4.29791604073e+14
316.227766017  :  4.63739831045e+14
1000.0  :  6.45898733634e+14
3162.27766017  :  1.22250685943e+15
10000.0  :  1.22250685943e+15
31622.7766017  :  1.22250685943e+15
100000.0  :  1.22250685943e+15
316227.766017  :  1.22250685943e+15
1000000.0  :  1.22250685943e+15
3162277.66017  :  1.22250685943e+15
10000000.0  :  1.22250685943e+15


In [14]:
# Refit with L1 penalty 10.0 on the training split and report the RSS on the
# held-out test split.
best_model = linear_model.Lasso(alpha=10.0, normalize=True)
best_model.fit(training[all_features], training['price'])
best_predictions = best_model.predict(testing[all_features])
test_residuals = best_predictions - testing['price']
RSS = sum(test_residuals * test_residuals)
print(RSS)

9.84674025527e+13


In [16]:
# Weights of best_model, one per column of training[all_features].
best_model.coef_

array([ -1.61445628e+04,   3.73245384e+02,   5.08412433e+04,
         6.17853560e+02,  -4.44113549e+04,   7.85623065e-01,
        -7.01194765e+02,  -0.00000000e+00,   5.01420046e+03,
         6.19488752e+05,   3.80418557e+04,   2.49987718e+04,
         1.28716235e+05,   0.00000000e+00,   0.00000000e+00,
        -3.29383118e+03,   1.00573209e+01])

In [20]:
# Number of nonzero weights in best_model; the intercept adds one to the
# count when it is nonzero.
np.count_nonzero(best_model.coef_) + np.count_nonzero(best_model.intercept_)

15

In [22]:
# Target sparsity, and the number of nonzero weights (intercept included)
# obtained at each of 13 log-spaced L1 penalties from 1e1 to 1e4.
max_nonzeros = 7
penalties = list(np.logspace(1, 4, 13))
myList = []  # nonzero counts, aligned index-for-index with `penalties`
for l1_penalty in penalties:
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    nonzeros = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    myList.append(nonzeros)
print(myList)
print(penalties)

[15, 15, 15, 13, 11, 7, 6, 6, 4, 3, 1, 1, 1]
[10.0, 17.782794100389228, 31.622776601683793, 56.234132519034908, 100.0, 177.82794100389228, 316.22776601683796, 562.34132519034904, 1000.0, 1778.2794100389228, 3162.2776601683795, 5623.4132519034911, 10000.0]


In [23]:
# Bracket the target sparsity using the sweep results instead of indices read
# off the printed output, so the bounds stay correct if the penalty grid or
# the data changes.
#   l1_penalty_min: largest penalty that still leaves MORE than max_nonzeros
#   l1_penalty_max: smallest penalty that leaves FEWER than max_nonzeros
# max()/min() raise ValueError if no penalty in the sweep falls on the
# required side of the target, which means the sweep range must be widened.
l1_penalty_min = max(
    penalty for penalty, count in zip(penalties, myList) if count > max_nonzeros
)
l1_penalty_max = min(
    penalty for penalty, count in zip(penalties, myList) if count < max_nonzeros
)

In [32]:
# Narrow search: 20 evenly spaced penalties between the two bounds. Only
# models with exactly max_nonzeros nonzero weights (intercept included) are
# scored, by RSS on the validation split.
for l1_penalty in np.linspace(l1_penalty_min, l1_penalty_max, 20):
    model = linear_model.Lasso(alpha=l1_penalty, normalize=True)
    model.fit(training[all_features], training['price'])
    nonzeros = np.count_nonzero(model.coef_) + np.count_nonzero(model.intercept_)
    if nonzeros != max_nonzeros:
        continue
    predictions = model.predict(validation[all_features])
    residuals = predictions - validation['price']
    RSS = sum(residuals * residuals)
    print(RSS, " : ", l1_penalty)

4.40118263243e+14  :  156.902043689
4.41322817704e+14  :  168.282452426
4.42654859676e+14  :  179.662861164
4.44116566258e+14  :  191.043269902
4.45702397073e+14  :  202.42367864


In [34]:
# Refit on the training split with a hard-coded L1 penalty and display the
# resulting weights (one per column of training[all_features]).
best_model = linear_model.Lasso(alpha=156.902043689, normalize=True)
best_model.fit(X=training[all_features], y=training['price'])
best_model.coef_

array([ -0.00000000e+00,  -0.00000000e+00,   1.04313641e+04,
         1.63441034e+02,   0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         5.05817774e+05,   4.19474087e+04,   0.00000000e+00,
         1.16169193e+05,   0.00000000e+00,   0.00000000e+00,
        -2.60634807e+03,   0.00000000e+00])

In [35]:
# Feature names in the column order used for every model fit above.
all_features

['bedrooms',
 'bedrooms_square',
 'bathrooms',
 'sqft_living',
 'sqft_living_sqrt',
 'sqft_lot',
 'sqft_lot_sqrt',
 'floors',
 'floors_square',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated']

In [36]:
# Lower and upper ends of the penalty range used by the narrow search.
print(l1_penalty_min)
print(l1_penalty_max)

100.0
316.227766017
