In [1]:
import graphlab

In [2]:
# Load the data set stored in the local 'kc_house_data.gl/' directory into an SFrame.
sales = graphlab.SFrame('kc_house_data.gl/')
# Re-type the 'floors' column as int.
# NOTE(review): presumably 'floors' is stored as a non-integer type and any
# fractional values are truncated by this cast -- confirm against the data.
sales['floors'] = sales['floors'].astype(int)

[INFO] 1449676714 : INFO:     (initialize_globals_from_environment:282): Setting configuration variable GRAPHLAB_FILEIO_ALTERNATIVE_SSL_CERT_FILE to C:\Users\<user>\Anaconda\lib\site-packages\certifi\cacert.pem
1449676714 : INFO:     (initialize_globals_from_environment:282): Setting configuration variable GRAPHLAB_FILEIO_ALTERNATIVE_SSL_CERT_DIR to 
This non-commercial license of GraphLab Create is assigned to <redacted e-mail> and will expire on September 21, 2016. For commercial licensing options, visit https://dato.com/buy/.

[INFO] Start server at: ipc:///tmp/graphlab_server-6336 - Server binary: C:\Users\<user>\Anaconda\lib\site-packages\graphlab\unity_server.exe - Server log: C:\Users\<user>\AppData\Local\Temp\graphlab_server_1449676714.log.0
[INFO] GraphLab Server Version: 1.7.1


In [3]:
import numpy as np
def get_numpy_data(data_sframe, features, output):
    """Build a feature matrix (with a leading intercept column) and an output vector.

    Side effect: a column named 'constant' holding the value 1 is written into
    ``data_sframe`` itself, so the caller's frame is modified.

    Parameters
    ----------
    data_sframe : frame object
        Must support column assignment, selection by a list of column names,
        and ``to_numpy()`` on the selected frame / column.
    features : list of str
        Names of the feature columns. The list itself is not modified.
    output : str
        Name of the target column.

    Returns
    -------
    (feature_matrix, output_array)
        ``feature_matrix`` has the 'constant' column first, followed by the
        requested features in order; ``output_array`` is the target column.
    """
    # The intercept is modelled as an extra all-ones feature stored in the frame.
    data_sframe['constant'] = 1

    # Concatenation creates a new list, leaving the caller's `features` untouched.
    selected_columns = ['constant'] + features

    # Convert the selected columns, then the target column, straight to numpy.
    feature_matrix = data_sframe[selected_columns].to_numpy()
    output_array = data_sframe[output].to_numpy()

    return (feature_matrix, output_array)

In [4]:
def predict_output(feature_matrix, weights):
    """Return the linear-model predictions for every row of ``feature_matrix``.

    Parameters
    ----------
    feature_matrix : numpy array with one column per feature.
    weights : numpy array with one weight per column of ``feature_matrix``.

    Returns
    -------
    The result of ``np.dot(feature_matrix, weights)``.
    """
    return np.dot(feature_matrix, weights)

In [5]:
X = np.array([[3., 5., 8.], [4., 12., 15.]])
print X

[[  3.   5.   8.]
 [  4.  12.  15.]]


In [6]:
norms = np.linalg.norm(X, axis = 0) # gives [norm(X[:, 0]), norm(X[:, 1]), norm(X[:, 2])]
print norms

[  5.  13.  17.]


In [7]:
print X/norms # gives [X[:, 0]/norm(X[:,0]), X[:, 1]/norm(X[:, 1]), X[:, 2]/norm(X[:, 2])]

[[ 0.6         0.38461538  0.47058824]
 [ 0.8         0.92307692  0.88235294]]


In [8]:
def normalize_features(feature_matrix):
    """Scale every column of ``feature_matrix`` to unit 2-norm.

    Parameters
    ----------
    feature_matrix : 2-D numpy array, one column per feature.

    Returns
    -------
    (normalized_features, norms)
        ``normalized_features`` is ``feature_matrix`` divided column-wise by
        ``norms``. ``norms`` holds the divisor used for each column: its
        2-norm, or 1.0 when that norm is exactly 0.

    Notes
    -----
    An all-zero column has norm 0; dividing by it would produce NaN (0/0) and
    poison every later prediction. Such columns are divided by 1.0 instead, so
    they stay all-zero, and the returned divisor is 1.0 so that rescaling
    weights with ``weights / norms`` stays finite as well.
    """
    norms = np.linalg.norm(feature_matrix, axis=0)
    # Guard against 0/0 for columns that are identically zero.
    norms = np.where(norms == 0, 1.0, norms)
    normalized_features = feature_matrix / norms
    return (normalized_features, norms)

In [9]:
features, norms = normalize_features(np.array([[3., 6., 9.], [4., 8., 12.]]))
print features
print norms

[[ 0.6  0.6  0.6]
 [ 0.8  0.8  0.8]]
[  5.  10.  15.]


In [10]:
simple_features = ['sqft_living', 'bedrooms']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(sales, simple_features, my_output)

In [11]:
simple_feature_matrix, norms = normalize_features(simple_feature_matrix)

In [12]:
weights = np.array([1., 4., 1.])

In [13]:
prediction = predict_output(simple_feature_matrix, weights)

In [14]:
prediction

array([ 0.02675867,  0.04339256,  0.01990703, ...,  0.02289873,
        0.03178473,  0.02289873])

In [15]:
ro =  (simple_feature_matrix[:, 0] * (output - prediction + weights[0]* simple_feature_matrix[:, 0])).sum()

In [16]:
# ro[i] = SUM[ feature_i * (output - prediction + weights[i] * feature_i) ] for every column.
residual = output - prediction
ros = []
# Rows of the transpose are the columns of the feature matrix.
for i, column in enumerate(simple_feature_matrix.T):
    ro = (column * (residual + weights[i] * column)).sum()
    ros.append(ro)

In [17]:
ros

[79400300.034929156, 87939470.772991076, 80966698.675965652]

In [18]:
1.73e8/2

86500000.0

In [19]:
1.4e8/2

70000000.0

In [20]:
1.9e8/2

95000000.0

In [21]:
2.3e8/2

115000000.0

In [22]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    """Return the updated value of ``weights[i]`` for one LASSO coordinate step.

    Parameters
    ----------
    i : int
        Index of the coordinate (column of ``feature_matrix``) to update.
        Index 0 is treated as the intercept and is not regularized.
    feature_matrix : 2-D numpy array, one column per feature.
        NOTE(review): the update has no division by the column's squared norm,
        so it presumably expects columns already scaled to unit 2-norm.
    output : 1-D numpy array of targets, one per row of ``feature_matrix``.
    weights : 1-D numpy array of current weights (not modified).
    l1_penalty : number
        L1 regularization strength; may be an int or a float.

    Returns
    -------
    float
        ``ro_i`` for the intercept; otherwise ``ro_i`` soft-thresholded at
        ``l1_penalty / 2`` (shrunk toward zero by that amount, or exactly 0.0
        when ``|ro_i| <= l1_penalty / 2``).
    """
    # compute prediction with the current weights
    prediction = predict_output(feature_matrix, weights)

    # ro[i] = SUM[ feature_i * (output - prediction + weights[i] * feature_i) ]
    feature_i = feature_matrix[:, i]
    ro_i = (feature_i * (output - prediction + weights[i] * feature_i)).sum()

    if i == 0:
        # intercept term: no regularization
        return ro_i

    # Always divide by a float: under Python 2, `l1_penalty/2` with an integer
    # penalty is floor division, which would make the shrinkage amount differ
    # from the threshold used in the comparisons.
    threshold = l1_penalty / 2.
    if ro_i < -threshold:
        return ro_i + threshold
    if ro_i > threshold:
        return ro_i - threshold
    return 0.

In [23]:
import math
print lasso_coordinate_descent_step(1, np.array([[3./math.sqrt(13),1./math.sqrt(10)],[2./math.sqrt(13),3./math.sqrt(10)]]), 
                                   np.array([1., 1.]), np.array([1., 4.]), 0.1)

0.425558846691


In [24]:
np.array([[3./math.sqrt(13), 1./math.sqrt(10)], [2./math.sqrt(13), 3./math.sqrt(10)]])[:, 1]

array([ 0.31622777,  0.9486833 ])

In [76]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance,
                                      max_iterations=None):
    """Fit LASSO weights by cyclical coordinate descent.

    Each sweep updates every coordinate in column order with
    ``lasso_coordinate_descent_step``, always using the most recent values of
    the other weights. Sweeping stops once the largest absolute change of any
    single weight during a full sweep is strictly below ``tolerance``.

    Parameters
    ----------
    feature_matrix : 2-D numpy array, one column per feature.
    output : 1-D numpy array of targets.
    initial_weights : sequence of numbers, one per column.
        Copied and converted to float, so the caller's object is never
        modified and integer starting values do not truncate later updates.
    l1_penalty : number
        L1 regularization strength passed to every coordinate step.
    tolerance : number
        Convergence threshold on the largest per-sweep weight change.
    max_iterations : int or None, optional
        Upper bound on the number of full sweeps. ``None`` (the default)
        means unbounded, i.e. the loop only ends on convergence.

    Returns
    -------
    1-D float numpy array of fitted weights.

    Raises
    ------
    ValueError
        If a coordinate update is NaN or infinite. Without this check a NaN
        weight makes the convergence test fail forever and the loop never ends.
    """
    weights = np.array(initial_weights, dtype=float)
    # Column count from the shape, so a matrix with a single row works too.
    num_features = feature_matrix.shape[1]

    sweeps = 0
    while True:
        max_change = 0.
        for j in range(num_features):
            updated = lasso_coordinate_descent_step(j, feature_matrix, output, weights, l1_penalty)
            if not np.isfinite(updated):
                raise ValueError('non-finite weight for feature %d; check the inputs for NaN/inf' % j)
            max_change = max(max_change, abs(updated - weights[j]))
            # Write back immediately so later coordinates in this sweep see it.
            weights[j] = updated
        sweeps += 1

        if max_change < tolerance:
            break
        if max_iterations is not None and sweeps >= max_iterations:
            break

    return weights

In [65]:
simple_features = ['sqft_living', 'bedrooms']
my_output = 'price'
initial_weights = np.zeros(3)
l1_penalty = 1e7
tolerance = 1.0

In [66]:
(simple_feature_matrix, output) = get_numpy_data(sales, simple_features, my_output)
(normalized_simple_feature_matrix, simple_norms) = normalize_features(simple_feature_matrix)

In [77]:
weights = lasso_cyclical_coordinate_descent(normalized_simple_feature_matrix, output, initial_weights,
                                           l1_penalty, tolerance)

In [78]:
weights

array([ 21624998.36636292,  63157246.78545421,         0.        ])

In [79]:
simple_predictions = predict_output(normalized_simple_feature_matrix, weights)

In [80]:
simple_rss = ((output - simple_predictions)**2).sum()

In [81]:
print simple_rss

1.63049248148e+15


In [82]:
simple_feature_matrix

array([[  1.00000000e+00,   1.18000000e+03,   3.00000000e+00],
       [  1.00000000e+00,   2.57000000e+03,   3.00000000e+00],
       [  1.00000000e+00,   7.70000000e+02,   2.00000000e+00],
       ..., 
       [  1.00000000e+00,   1.02000000e+03,   2.00000000e+00],
       [  1.00000000e+00,   1.60000000e+03,   3.00000000e+00],
       [  1.00000000e+00,   1.02000000e+03,   2.00000000e+00]])

In [83]:
train_data, test_data = sales.random_split(.8, seed = 0)

In [84]:
all_features = ['bedrooms',
                'bathrooms',
                'sqft_living',
                'sqft_lot',
                'floors',
                'waterfront', 
                'view', 
                'condition', 
                'grade',
                'sqft_above',
                'sqft_basement',
                'yr_built', 
                'yr_renovated']

In [85]:
(all_feature_matrix, output) = get_numpy_data(train_data, all_features, my_output)
(normalized_all_feature_matrix, all_norms) = normalize_features(all_feature_matrix)

In [92]:
initial_weights = np.zeros(len(normalized_all_feature_matrix[1]))
l1_penalty = 1e7
tolerance = 1

In [93]:
weights1e7 = lasso_cyclical_coordinate_descent(normalized_all_feature_matrix, output, initial_weights, 
                                              l1_penalty, tolerance)

In [94]:
weights1e7

array([ 24429600.60933314,         0.        ,         0.        ,
        48389174.35227978,         0.        ,         0.        ,
         3317511.16271981,   7329961.9848964 ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ])

In [95]:
initial_weights = np.zeros(len(normalized_all_feature_matrix[1]))
l1_penalty = 1e8
tolerance = 1
weights1e8 = lasso_cyclical_coordinate_descent(normalized_all_feature_matrix, output, initial_weights,
                                               l1_penalty, tolerance)

In [96]:
weights1e8

array([ 71114625.75280938,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ,         0.        ,
               0.        ,         0.        ])

In [97]:
initial_weights = np.zeros(len(normalized_all_feature_matrix[1]))
l1_penalty = 1e4
tolerance = 5e5
weights1e4 = lasso_cyclical_coordinate_descent(normalized_all_feature_matrix, output, initial_weights,
                                               l1_penalty, tolerance)

In [98]:
weights1e4

array([ 77779073.91265222, -22884012.25023359,  15348487.08089997,
        92166869.69883074,  -2139328.0824278 ,  -8818455.54409495,
         6494209.73310655,   7065162.05053197,   4119079.21006764,
        18436483.5261878 , -14566678.54514342,  -5528348.75179427,
       -83591746.20730537,   2784276.46012858])

In [110]:
all_features

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated']

In [103]:
# Rescale the weights learned on the normalized training features by the
# training-set column norms (all_norms). Since (X / norms).dot(w) equals
# X.dot(w / norms), the rescaled weights can be applied directly to feature
# matrices that have NOT been normalized.
# NOTE(review): despite the "normalized_" prefix, these are weights for raw
# (un-normalized) features; the names are kept because later cells use them.
normalized_weights1e7 = weights1e7/all_norms
normalized_weights1e4 = weights1e4/all_norms
normalized_weights1e8 = weights1e8/all_norms

In [102]:
normalized_weights1e7[3]

161.31745624837859

In [104]:
(test_feature_matrix, test_output) = get_numpy_data(test_data, all_features, 'price')

In [107]:
norm_1e4_preds = predict_output(test_feature_matrix, normalized_weights1e4)
norm_1e4_rss = ((test_output - norm_1e4_preds)**2).sum()
print norm_1e4_rss

2.2778100476e+14


In [108]:
norm_1e7_preds = predict_output(test_feature_matrix, normalized_weights1e7)
norm_1e7_rss = ((test_output - norm_1e7_preds)**2).sum()
print norm_1e7_rss

2.75962079909e+14


In [109]:
norm_1e8_preds = predict_output(test_feature_matrix, normalized_weights1e8)
norm_1e8_rss = ((test_output - norm_1e8_preds)**2).sum()
print norm_1e8_rss

5.37166150034e+14
