In [2]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
import math

In [3]:
# Load kc_house_data.csv into a pandas DataFrame; the path is relative to the working directory.
sales = pd.read_csv('kc_house_data.csv')

In [4]:
def get_numpy_data(data_frame, features, output):
    """Extract a feature matrix (with a leading intercept column) and an output vector.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Table holding both the feature columns and the output column.
        It is not modified by this function.
    features : sequence of str
        Names of the columns to use as features, in the desired column order.
    output : str
        Name of the column to use as the regression target.

    Returns
    -------
    (feature_matrix, output_array) : tuple of numpy.ndarray
        ``feature_matrix`` has one row per row of ``data_frame``; its first
        column is the constant 1 and the remaining columns follow the order
        of ``features``. ``output_array`` holds the values of the ``output``
        column.
    """
    # Select only the requested columns and copy them, so that adding the
    # intercept column below does not touch the caller's DataFrame.
    features_frame = data_frame[list(features)].copy()
    # Put the intercept ('constant') column in front of the other features.
    features_frame.insert(0, 'constant', 1)
    # as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
    feature_matrix = features_frame.to_numpy()
    output_array = data_frame[output].to_numpy()
    return (feature_matrix, output_array)

In [5]:
def predict_output(feature_matrix, weights):
    """Return the linear-model predictions ``feature_matrix . weights``.

    ``feature_matrix`` holds one feature per column and ``weights`` holds the
    matching coefficient for each column; the product is taken with np.dot().
    """
    return np.dot(feature_matrix, weights)

In [33]:
def normalize_features(features):
    """Scale every column of ``features`` by its 2-norm.

    Returns a tuple ``(normalized_features, norms)`` where ``norms`` holds the
    per-column norms that were divided out.
    """
    column_norms = np.linalg.norm(features, axis=0)
    return (features / column_norms, column_norms)

In [34]:
# Build a small example problem from `sales`: features are the intercept,
# 'sqft_living' and 'bedrooms'; the output is 'price'.
example_features, example_output = get_numpy_data(sales, ['sqft_living', 'bedrooms'], 'price')
# Rescale each feature column by its norm; `norms` keeps the per-column norms.
example_features, norms = normalize_features(example_features)

In [35]:
# Hand-picked weights, one per column of example_features
# (intercept, sqft_living, bedrooms), used to produce trial predictions.
initial_weights = [1,4,1]
predictions = predict_output(example_features, initial_weights)
print(predictions)

[ 0.02675867  0.04339256  0.01990703 ...,  0.02289873  0.03178473
  0.02289873]


In [36]:
# For each of the three features compute
# ro[i] = SUM[ feature_i * (price - predictions + weight_i * feature_i) ]
ro = []
for i in range(3):
    feature_i = example_features[:, i]
    # Residual with feature i's own contribution added back in.
    partial_residual = sales['price'] - predictions + initial_weights[i] * feature_i
    ro.append(sum(feature_i * partial_residual))
    print(ro[i])

79400300.0145
87939470.8233
80966698.6662


In [37]:
def lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty):
    """Return the updated value of weight ``i`` for one LASSO coordinate-descent step.

    Parameters
    ----------
    i : int
        Index of the coordinate (column of ``feature_matrix``) to update.
        Index 0 is treated as the intercept and is not regularized.
    feature_matrix : numpy.ndarray
        2-D array with one feature per column. The update does not divide by
        the column's squared norm, so it presumably expects unit-norm columns
        such as those returned by ``normalize_features`` -- verify at the caller.
    output : array-like
        Target values, one per row of ``feature_matrix``.
    weights : sequence of float
        Current weights, one per column of ``feature_matrix``. Not modified.
    l1_penalty : int or float
        L1 penalty; the soft-threshold applied to ``ro_i`` is ``l1_penalty / 2``.

    Returns
    -------
    float
        The new value for ``weights[i]``.
    """
    feature_i = feature_matrix[:, i]
    # compute prediction
    prediction = predict_output(feature_matrix, weights)
    # compute ro[i] = SUM[ [feature_i]*(output - prediction + weight[i]*[feature_i]) ]
    # as a dot product instead of a Python-level sum over the array elements.
    ro_i = np.dot(feature_i, np.asarray(output) - prediction + weights[i] * feature_i)

    # Always use true division so an integer penalty is never floor-divided.
    threshold = l1_penalty / 2.0

    if i == 0: # intercept -- do not regularize
        new_weight_i = ro_i
    elif ro_i < -threshold:
        new_weight_i = ro_i + threshold
    elif ro_i > threshold:
        new_weight_i = ro_i - threshold
    else:
        new_weight_i = 0.

    return new_weight_i

In [38]:
# Sanity check for lasso_coordinate_descent_step: should print 0.425558846691
x = np.array([[3.,1.],[2.,3.]])
x,y = normalize_features(x)  # y receives the column norms; it is not used below
print(x)
step_result = lasso_coordinate_descent_step(1, x, np.array([1., 1.]), np.array([1., 4.]), 0.1)
print(step_result)
# Fail loudly on a regression instead of relying on reading the printed value.
np.testing.assert_allclose(step_result, 0.425558846691, rtol=1e-9)

[[ 0.83205029  0.31622777]
 [ 0.5547002   0.9486833 ]]
0.425558846691


In [39]:
def lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights, l1_penalty, tolerance):
    """Run cyclical coordinate descent for LASSO until the weights stop moving.

    Each sweep updates every weight in index order with
    ``lasso_coordinate_descent_step``. The loop ends after the first full
    sweep in which the largest absolute change of any weight is below
    ``tolerance``.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        2-D array with one feature per column.
    output : array-like
        Target values, one per row of ``feature_matrix``.
    initial_weights : sequence of numbers
        Starting weights, one per column. The sequence is copied, so the
        caller's object is left untouched and can be reused for another run.
    l1_penalty : int or float
        L1 penalty forwarded to each coordinate step.
    tolerance : float
        Convergence threshold on the largest per-sweep weight change.

    Returns
    -------
    numpy.ndarray
        Float array with the fitted weights.
    """
    # Copy as float: avoids overwriting the caller's starting point and avoids
    # truncating updates when the starting weights are integers.
    weights = np.array(initial_weights, dtype=float)
    converged = False
    while not converged:
        max_change = 0.0
        for i in range(len(weights)):
            previous_weight = weights[i]
            weights[i] = lasso_coordinate_descent_step(i, feature_matrix, output, weights, l1_penalty)
            max_change = max(max_change, abs(weights[i] - previous_weight))
        converged = max_change < tolerance
    return weights

In [40]:
# Settings for the first LASSO run: three zero starting weights,
# an L1 penalty of 1e7 and a convergence tolerance of 1.0.
initial_weights, l1_penalty, tolerance = [0,0,0], 1e7, 1.0

In [41]:
# Fit LASSO by cyclical coordinate descent on the normalized example features,
# using the 'price' column of `sales` as the target.
weights = lasso_cyclical_coordinate_descent(example_features, sales['price'], initial_weights, l1_penalty, tolerance)
print(weights)

[21624997.959518719, 63157247.20788978, 0.0]


In [42]:
# Residual sum of squares of the fitted model on the example data.
predictions = predict_output(example_features, weights)
residuals = predictions - sales['price']
RSS = sum(residuals * residuals)
print(RSS)

1.63049247672e+15


In [43]:
# Load the separate train and test CSV files; paths are relative to the working directory.
train_data = pd.read_csv('kc_house_train_data.csv')
test_data = pd.read_csv('kc_house_test_data.csv')

In [44]:
# Columns of train_data used as model features; get_numpy_data prepends the intercept column.
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']
feature_matrix , output = get_numpy_data(train_data, features, 'price')
# Normalize the columns; `norms` is kept so fitted weights can later be divided by it.
feature_matrix , norms = normalize_features(feature_matrix)

In [62]:
# All-zero starting point: one weight per feature plus the intercept.
initial_weights = np.zeros(len(features)+1)
# Pass a copy: the solver assigns into the weight array it is given, and
# `initial_weights` must stay all-zero so it remains a valid starting point.
weights1e7 = lasso_cyclical_coordinate_descent(feature_matrix, output, initial_weights.copy(), 1e7, 1)
# Divide by the column norms to express the weights on the un-normalized feature scale.
weights1e7_normalized = weights1e7/norms
print(weights1e7)
weights1e7_normalized[3]

[ 24429600.23440336         0.                 0.          48389174.77154855
         0.                 0.           3317511.21492165
   7329961.81171433         0.                 0.                 0.
         0.                 0.                 0.        ]


161.31745764611625

In [65]:
# Start this run from fresh zeros rather than reusing `initial_weights`, which an
# earlier solver call may have overwritten with its own solution.
# NOTE(review): re-run this cell; the previously recorded output may have come from a warm start.
weights1e8 = lasso_cyclical_coordinate_descent(feature_matrix, output, np.zeros(len(features)+1), 1e8, 1)
# Divide by the column norms to express the weights on the un-normalized feature scale.
weights1e8_normalized = weights1e8/norms
print(weights1e8)
weights1e8_normalized[3]

[ 71114625.71488713         0.                 0.                 0.
         0.                 0.                 0.                 0.
         0.                 0.                 0.                 0.
         0.                 0.        ]


0.0

In [66]:
# Start this run from fresh zeros rather than reusing `initial_weights`, which an
# earlier solver call may have overwritten with its own solution. With the loose
# tolerance of 5e5 used here, the starting point can change where the solver stops.
# NOTE(review): re-run this cell; the previously recorded output may have come from a warm start.
weights1e4 = lasso_cyclical_coordinate_descent(feature_matrix, output, np.zeros(len(features)+1), 1e4, 5e5)
# Divide by the column norms to express the weights on the un-normalized feature scale.
weights1e4_normalized = weights1e4/norms
print(weights1e4)
weights1e4_normalized[3]

[ 78564738.34156857 -22097398.92430511  12791071.87278493
  93808088.09281243  -2013172.75704975  -4219184.93265008
   6482842.81753504   7127408.53480684   5001664.85469703
  14327518.43714108 -15770959.15237425  -5159591.22213155
 -84495341.76843894   2824439.4970369 ]


312.73280334330298