In [1]:
import graphlab

In [3]:
sales = graphlab.SFrame('kc_house_data.gl/')

In [4]:
from math import log, sqrt
sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)

In [10]:
sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)
sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']

In [13]:
sales['floors'] = sales['floors'].astype(int)
sales['floors_square'] = sales['floors']*sales['floors']

In [101]:
all_features = ['bedrooms', 'bedrooms_square',
            'bathrooms',
            'sqft_living', 'sqft_living_sqrt',
            'sqft_lot', 'sqft_lot_sqrt',
            'floors', 'floors_square',
            'waterfront', 'view', 'condition', 'grade',
            'sqft_above',
            'sqft_basement',
            'yr_built', 'yr_renovated']

In [102]:
model_all = graphlab.linear_regression.create(sales, target = 'price', features = all_features,
                                             validation_set = None, 
                                              l2_penalty = 0., l1_penalty = 1e10)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 21613
PROGRESS: Number of features          : 17
PROGRESS: Number of unpacked features : 17
PROGRESS: Number of coefficients    : 18
PROGRESS: Starting Accelerated Gradient (FISTA)
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+-----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Step size | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+-----------+--------------+--------------------+---------------+
PROGRESS: Tuning step size. First iteration could take longer than subsequent iterations.
PROGRESS: | 1         | 2        | 0.000002  | 0.125147     | 6981926.539042     | 427729.501760 |
PROGRESS: | 2         | 3        | 0.000002  | 0.136161     | 6861970.573528     | 393259.158552 |
PROGRESS: | 3         | 4      

In [103]:
model_all.get('coefficients').print_rows(num_rows = 16)

+------------------+-------+---------------+
|       name       | index |     value     |
+------------------+-------+---------------+
|   (intercept)    |  None |  274952.62044 |
|     bedrooms     |  None |      0.0      |
| bedrooms_square  |  None |      0.0      |
|    bathrooms     |  None | 8483.95148798 |
|   sqft_living    |  None | 24.4238022551 |
| sqft_living_sqrt |  None | 351.097833343 |
|     sqft_lot     |  None |      0.0      |
|  sqft_lot_sqrt   |  None |      0.0      |
|      floors      |  None |      0.0      |
|  floors_square   |  None |      0.0      |
|    waterfront    |  None |      0.0      |
|       view       |  None |      0.0      |
|    condition     |  None |      0.0      |
|      grade       |  None | 850.427363977 |
|    sqft_above    |  None | 20.0777654516 |
|  sqft_basement   |  None |      0.0      |
+------------------+-------+---------------+
[18 rows x 3 columns]



In [104]:
(training_and_validation, testing) = sales.random_split(.9, seed = 1)
(training, validation) = training_and_validation.random_split(0.5, seed = 1)

In [105]:
import numpy as np
l1_values = np.logspace(1, 7, num = 13)

In [106]:
RSS_list = []
for l1_value in np.logspace(1, 7, num = 13):
    print "l1_value: " + str(l1_value)
    model = graphlab.linear_regression.create(training, target = 'price', features = all_features, validation_set = None,
                                             l1_penalty = l1_value, l2_penalty = 0., verbose = False)
    RSS = ((validation['price'] - model.predict(validation))**2).sum()
    RSS_list.append(RSS)
    print RSS

l1_value: 10.0
6.28412158085e+14
l1_value: 31.6227766017
6.28412158283e+14
l1_value: 100.0
6.28412158907e+14
l1_value: 316.227766017
6.28412160882e+14
l1_value: 1000.0
6.28412167128e+14
l1_value: 3162.27766017
6.2841218688e+14
l1_value: 10000.0
6.28412249343e+14
l1_value: 31622.7766017
6.28412446908e+14
l1_value: 100000.0
6.2841307204e+14
l1_value: 316227.766017
6.28415052663e+14
l1_value: 1000000.0
6.2842135376e+14
l1_value: 3162277.66017
6.28441657748e+14
l1_value: 10000000.0
6.28509646259e+14


In [27]:
model = graphlab.linear_regression.create(training, target = 'price', features = all_features, validation_set = None,
                                             l1_penalty = l1_values[0], l2_penalty = 0., verbose = False)
RSS = ((validation['price'] - model.predict(validation))**2).sum()
print RSS

6.24975431996e+14


In [26]:
l1_values[0]

10.0

In [28]:
model = graphlab.linear_regression.create(training, target = 'price', features = all_features, validation_set = None,
                                             l1_penalty = l1_values[1], l2_penalty = 0., verbose = False)
RSS = ((validation['price'] - model.predict(validation))**2).sum()
print RSS

6.24975432277e+14


In [108]:
RSS_list

[628412158085184.9,
 628412158282694.2,
 628412158907274.1,
 628412160882372.2,
 628412167128218.9,
 628412186879697.5,
 628412249343142.6,
 628412446907719.4,
 628413072039907.9,
 628415052663291.8,
 628421353760493.2,
 628441657747716.5,
 628509646259482.0]

In [107]:
min(RSS_list)

628412158085184.9

In [109]:
model_low_rss = graphlab.linear_regression.create(testing, target = 'price', features = all_features, validation_set = None,
                                                 l1_penalty = 10, l2_penalty = 0., verbose = False)
low_rss = ((testing['price'] - model.predict(testing))**2).sum()
print low_rss

1.57662866594e+14


In [110]:
model_l1_10 = graphlab.linear_regression.create(training, target = 'price', features = all_features, validation_set = None,
                                               l1_penalty = 10, l2_penalty = 0., verbose = False)

In [111]:
model_l1_10['coefficients']['value'].nnz()

18

In [43]:
model_l1_10.get('coefficients').print_rows(num_rows = 16)

+------------------+-------+---------------+
|       name       | index |     value     |
+------------------+-------+---------------+
|   (intercept)    |  None | 21862.6301741 |
|     bedrooms     |  None | 8536.55114934 |
| bedrooms_square  |  None | 951.779014251 |
|    bathrooms     |  None |  26071.312855 |
|   sqft_living    |  None | 40.3726918916 |
| sqft_living_sqrt |  None |  1187.7350301 |
|      floors      |  None | 20269.7857343 |
|  floors_square   |  None | 11116.3308822 |
|    waterfront    |  None | 605410.063799 |
|       view       |  None | 94701.1677273 |
|    condition     |  None |  7488.9837845 |
|      grade       |  None | 6551.52816569 |
|    sqft_above    |  None | 45.0575913705 |
|  sqft_basement   |  None | 121.043415784 |
|     yr_built     |  None |  10.858168471 |
|   yr_renovated   |  None | 56.9039466586 |
+------------------+-------+---------------+
[16 rows x 3 columns]



In [72]:
max_nonzeros = 7

In [121]:
l1_penalty_values = np.logspace(8, 10, num = 20)

In [122]:
l1_penalty_values

array([  1.00000000e+08,   1.27427499e+08,   1.62377674e+08,
         2.06913808e+08,   2.63665090e+08,   3.35981829e+08,
         4.28133240e+08,   5.45559478e+08,   6.95192796e+08,
         8.85866790e+08,   1.12883789e+09,   1.43844989e+09,
         1.83298071e+09,   2.33572147e+09,   2.97635144e+09,
         3.79269019e+09,   4.83293024e+09,   6.15848211e+09,
         7.84759970e+09,   1.00000000e+10])

In [123]:
non_zero = []
l1_penalty_min = 0
l1_penalty_max = 1e20
for l1_penalty in l1_penalty_values:
    print "l1_penalty: " + str(l1_penalty)
    model = graphlab.linear_regression.create(training, target = 'price', features = all_features, validation_set = None,
                                             l1_penalty = l1_penalty, l2_penalty = 0., verbose = False)
    non_zero.append(model['coefficients']['value'].nnz())
    if (model['coefficients']['value'].nnz() > max_nonzeros) & (l1_penalty > l1_penalty_min):
        l1_penalty_min = l1_penalty
    if (model['coefficients']['value'].nnz() < max_nonzeros) & (l1_penalty < l1_penalty_max):
        l1_penalty_max = l1_penalty
    print model['coefficients']['value'].nnz()
    

l1_penalty: 100000000.0
18
l1_penalty: 127427498.57
18
l1_penalty: 162377673.919
18
l1_penalty: 206913808.111
18
l1_penalty: 263665089.873
17
l1_penalty: 335981828.628
17
l1_penalty: 428133239.872
17
l1_penalty: 545559478.117
17
l1_penalty: 695192796.178
17
l1_penalty: 885866790.41
16
l1_penalty: 1128837891.68
15
l1_penalty: 1438449888.29
15
l1_penalty: 1832980710.83
13
l1_penalty: 2335721469.09
11
l1_penalty: 2976351441.63
10
l1_penalty: 3792690190.73
6
l1_penalty: 4832930238.57
5
l1_penalty: 6158482110.66
3
l1_penalty: 7847599703.51
1
l1_penalty: 10000000000.0
1


In [124]:
l1_penalty_min

2976351441.6313128

In [126]:
l1_penalty_max.round()

3792690191.0

In [93]:
l1_penalty_values = np.linspace(l1_penalty_min, l1_penalty_max, 20)

In [94]:
l1_penalty_values

array([  2.97635144e+09,   3.01931664e+09,   3.06228184e+09,
         3.10524703e+09,   3.14821223e+09,   3.19117743e+09,
         3.23414263e+09,   3.27710782e+09,   3.32007302e+09,
         3.36303822e+09,   3.40600341e+09,   3.44896861e+09,
         3.49193381e+09,   3.53489901e+09,   3.57786420e+09,
         3.62082940e+09,   3.66379460e+09,   3.70675980e+09,
         3.74972499e+09,   3.79269019e+09])

In [117]:
low_rss = 1e25
best_l1_pen = 0
for l1_penalty in l1_penalty_values:
    print "l1_penalty: " + str(l1_penalty)
    model = graphlab.linear_regression.create(training, target = 'price', features = all_features, validation_set = None,
                                             l1_penalty = l1_penalty, l2_penalty = 0., verbose = False)
    RSS = ((validation['price'] - model.predict(validation))**2).sum()
    print RSS
    print model['coefficients']['value'].nnz()
    if (model['coefficients']['value'].nnz() == 7) & (RSS < low_rss):
        low_rss = RSS
        best_l1_pen = l1_penalty
        

l1_penalty: 2976351441.63
9.64876826273e+14
10
l1_penalty: 3019316638.95
9.72840378296e+14
10
l1_penalty: 3062281836.27
9.81196599802e+14
10
l1_penalty: 3105247033.59
9.89653612279e+14
10
l1_penalty: 3148212230.92
9.98361723081e+14
10
l1_penalty: 3191177428.24
1.00822970436e+15
9
l1_penalty: 3234142625.56
1.01702550003e+15
9
l1_penalty: 3277107822.88
1.02492450555e+15
8
l1_penalty: 3320073020.2
1.0299794723e+15
7
l1_penalty: 3363038217.52
1.03293901937e+15
7
l1_penalty: 3406003414.84
1.03646285326e+15
7
l1_penalty: 3448968612.16
1.0401993584e+15
7
l1_penalty: 3491933809.48
1.04511992951e+15
7
l1_penalty: 3534899006.81
1.05027158412e+15
7
l1_penalty: 3577864204.13
1.05541689503e+15
7
l1_penalty: 3620829401.45
1.06058634457e+15
7
l1_penalty: 3663794598.77
1.06475371829e+15
6
l1_penalty: 3706759796.09
1.06876205082e+15
6
l1_penalty: 3749724993.41
1.07276526773e+15
6
l1_penalty: 3792690190.73
1.07695101957e+15
6


In [118]:
low_rss

1029979472296641.5

In [119]:
best_l1_pen

3320073020.20013

In [120]:
final_model = graphlab.linear_regression.create(training, target = 'price', features = all_features, validation_set = None,
                                               l1_penalty = best_l1_pen, l2_penalty = 0., verbose = False)
final_model.get('coefficients').print_rows(num_rows = 16)

+------------------+-------+---------------+
|       name       | index |     value     |
+------------------+-------+---------------+
|   (intercept)    |  None | 215675.924417 |
|     bedrooms     |  None | 1228.59582293 |
| bedrooms_square  |  None |      0.0      |
|    bathrooms     |  None | 16669.4449433 |
|   sqft_living    |  None | 33.0891011485 |
| sqft_living_sqrt |  None |  728.74471267 |
|     sqft_lot     |  None |      0.0      |
|  sqft_lot_sqrt   |  None |      0.0      |
|      floors      |  None |      0.0      |
|  floors_square   |  None |      0.0      |
|    waterfront    |  None |      0.0      |
|       view       |  None |      0.0      |
|    condition     |  None |      0.0      |
|      grade       |  None | 3155.81538376 |
|    sqft_above    |  None |  31.080485027 |
|  sqft_basement   |  None |      0.0      |
+------------------+-------+---------------+
[18 rows x 3 columns]

