In [1]:
import numpy as np
import turicreate as tc
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math
from sklearn import linear_model

In [2]:
# Column dtypes handed to every pandas CSV load in this notebook.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# pandas copy of the housing CSV, parsed with the dtypes above.
house_data = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
# Turi Create SFrame loaded from a saved frame index
# (presumably the same housing data -- TODO confirm).
sf = tc.SFrame('m_1ce96d9d245ca490.frame_idx')
# Rows ordered by living area, with price as the secondary sort key.
sf_sort = sf.sort(['sqft_living', 'price'])

In [3]:
l2_small_penalty = 1.5e-5

In [4]:
def polynomial_sframe(feature, degree):
    """Build an SFrame whose columns are the powers 1..degree of ``feature``.

    Parameters
    ----------
    feature : column of numeric values (an SFrame column at this notebook's call site)
    degree : highest power to generate; must be >= 1 (checked with ``assert``)

    Returns
    -------
    (poly_sframe, features_name)
        The SFrame with columns 'power_1' .. 'power_<degree>' and the list of
        those column names in the same order.
    """
    assert degree >= 1

    # Higher-order columns exist only when degree > 1; degree == 1 yields just 'power_1'.
    higher_powers = range(2, degree + 1) if degree > 1 else ()
    features_name = ['power_1'] + ['power_' + str(exponent) for exponent in higher_powers]

    poly_sframe = tc.SFrame()
    poly_sframe['power_1'] = feature
    for exponent, column in zip(higher_powers, features_name[1:]):
        # column 'power_k' holds feature ** k
        poly_sframe[column] = np.power(feature, exponent)

    return poly_sframe, features_name

In [23]:
# Degree-15 polynomial expansion of living area, with the target column
# attached so Turi Create can look it up by name.
poly15_data, my_features = polynomial_sframe(sf_sort['sqft_living'], 15)
poly15_data['price'] = sf_sort['price']

# scikit-learn ridge estimator: only constructed here, fitted in a later cell.
model_pd = linear_model.Ridge(alpha=l2_small_penalty, normalize=True)

# Turi Create ridge fit with the same penalty; validation_set=None means no
# validation split is held out.
model = tc.linear_regression.create(
    poly15_data,
    target='price',
    features=my_features,
    l2_penalty=l2_small_penalty,
    validation_set=None,
    verbose=True,
)

"model.fit(poly15_data, sf_sort['price'])"

In [30]:
# Sort on the same keys as the SFrame (`sf_sort`) so both libraries see the
# rows in the same order; price breaks ties between equal living areas.
house_data = house_data.sort_values(['sqft_living', 'price'])

In [31]:
house_data

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
19452,3980300371,20140926T000000,142000.0,0.0,0.00,290.0,20875,1.0,0,0,...,1,290,0,1963,0,98024,47.5308,-121.888,1620.0,22850.0
15381,2856101479,20140701T000000,276000.0,1.0,0.75,370.0,1801,1.0,0,0,...,5,370,0,1923,0,98117,47.6778,-122.389,1340.0,5000.0
860,1723049033,20140620T000000,245000.0,1.0,0.75,380.0,15000,1.0,0,0,...,5,380,0,1963,0,98168,47.4810,-122.323,1170.0,15000.0
18379,1222029077,20141029T000000,265000.0,0.0,0.75,384.0,213444,1.0,0,0,...,4,384,0,2003,0,98070,47.4177,-122.491,1920.0,224341.0
4868,6896300380,20141002T000000,228000.0,0.0,1.00,390.0,5900,1.0,0,0,...,4,390,0,1953,0,98118,47.5260,-122.261,2170.0,6000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8092,1924059029,20140617T000000,4668000.0,5.0,6.75,9640.0,13068,1.0,1,4,...,12,4820,4820,1983,2009,98040,47.5570,-122.210,3270.0,10454.0
9254,9208900037,20140919T000000,6885000.0,6.0,7.75,9890.0,31374,2.0,0,4,...,13,8860,1030,2001,0,98039,47.6305,-122.240,4540.0,42730.0
3914,9808700762,20140611T000000,7062500.0,5.0,4.50,10040.0,37325,2.0,1,2,...,11,7680,2360,1940,2001,98004,47.6500,-122.214,3930.0,25449.0
7252,6762700020,20141013T000000,7700000.0,6.0,8.00,12050.0,27600,2.5,0,3,...,13,8570,3480,1910,1987,98102,47.6298,-122.323,3940.0,8800.0


In [32]:

# NOTE(review): polynomial_pd is defined in a cell that sits further down this
# notebook (the one marked In [5]). That cell must have been run before this
# one; a top-to-bottom "Run All" on a fresh kernel raises NameError here.
# Degree-15 polynomial features of living area, plus the matching price column.
house_data_15, _ = polynomial_pd(house_data['sqft_living'], 15)
house_data_price = house_data['price']

In [None]:
house_data_15

In [33]:
model_pd.fit(house_data_15, house_data_price)

Ridge(alpha=1.5e-05, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=True, random_state=None, solver='auto', tol=0.001)

In [34]:
# Coefficients of the scikit-learn ridge fit, one per polynomial column.
print(model_pd.coef_)
# print_rows() writes the table itself and returns None, so wrapping it in
# print() would emit a stray "None" after the table.
model.coefficients.print_rows(num_rows=16)

[ 1.24873306e+02 -4.77376011e-02  3.01446238e-05 -2.44419942e-09
 -1.94153675e-13  8.54085686e-18  1.51142121e-21  8.27979094e-26
  6.52603100e-31 -3.27895017e-34 -3.87962315e-38 -2.72437650e-42
 -1.07790800e-46  3.78242694e-51  1.39790296e-54]
+-------------+-------+-------------------------+--------+
|     name    | index |          value          | stderr |
+-------------+-------+-------------------------+--------+
| (intercept) |  None |    156896.98133571132   |  nan   |
|   power_1   |  None |    137.41238230525002   |  nan   |
|   power_2   |  None |   0.09434556534476389   |  nan   |
|   power_3   |  None | -0.00010600647237643056 |  nan   |
|   power_4   |  None |  4.504952596646743e-08  |  nan   |
|   power_5   |  None | -6.8127727296296215e-12 |  nan   |
|   power_6   |  None |  1.4634062137839945e-16 |  nan   |
|   power_7   |  None |  3.8054508415461626e-20 |  nan   |
|   power_8   |  None | -2.1542846446346125e-25 |  nan   |
|   power_9   |  None | -1.8479510818346008e-28

In [15]:
# Load the four data subsets with the shared column dtypes.
set_1, set_2, set_3, set_4 = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in (
        'wk3_kc_house_set_1_data.csv',
        'wk3_kc_house_set_2_data.csv',
        'wk3_kc_house_set_3_data.csv',
        'wk3_kc_house_set_4_data.csv',
    )
)

In [16]:
l2_small_penalty2=1e-9

In [17]:
model_pd2 = linear_model.Ridge(alpha=l2_small_penalty2, normalize=True)

In [18]:
# Keep only the single input feature and the target from each subset.
set_1_data, set_2_data, set_3_data, set_4_data = (
    subset[['sqft_living', 'price']]
    for subset in (set_1, set_2, set_3, set_4)
)

In [5]:
def polynomial_pd(feature, degree):
    """Build a pandas DataFrame whose columns are the powers 1..degree of ``feature``.

    Parameters
    ----------
    feature : pandas.Series or other 1-D sequence of numbers
        The base column. It is stored unchanged as 'power_1'.
    degree : int
        Highest power to generate; must be >= 1 (checked with ``assert``).

    Returns
    -------
    (poly_dataframe, features_name)
        The DataFrame with columns 'power_1' .. 'power_<degree>' and the list
        of those column names in the same order.
    """
    assert degree >= 1
    poly_dataframe = pd.DataFrame({"power_1": feature})
    features_name = ['power_1']
    if degree > 1:
        # Raise a float copy of the base column: integer input would silently
        # wrap around int64 at high powers (1000 ** 7 already exceeds it).
        base = poly_dataframe['power_1'].astype(float)
        for power in range(2, degree + 1):
            name = 'power_' + str(power)
            # column 'power_k' holds feature ** k
            poly_dataframe[name] = np.power(base, power)
            features_name.append(name)
    return poly_dataframe, features_name

In [19]:
# Degree-15 polynomial features for each subset. The column-name list depends
# only on the degree, so it is kept once (from the first subset) as my_features.
(set_1_X, my_features), (set_2_X, _), (set_3_X, _), (set_4_X, _) = (
    polynomial_pd(subset['sqft_living'], 15)
    for subset in (set_1_data, set_2_data, set_3_data, set_4_data)
)


In [None]:
set_1_X

In [20]:
# One independent ridge estimator per subset, all with the same small penalty.
model_pd2_1, model_pd2_2, model_pd2_3, model_pd2_4 = (
    linear_model.Ridge(alpha=l2_small_penalty2, normalize=True)
    for _ in range(4)
)

# Fit each estimator on its own subset. fit() returns the estimator it was
# called on (scikit-learn convention), so fit_N and model_pd2_N are the same object.
fit_1, fit_2, fit_3, fit_4 = (
    estimator.fit(features, subset['price'])
    for estimator, features, subset in (
        (model_pd2_1, set_1_X, set_1_data),
        (model_pd2_2, set_2_X, set_2_data),
        (model_pd2_3, set_3_X, set_3_data),
        (model_pd2_4, set_4_X, set_4_data),
    )
)

In [None]:
# Draw each subset's observations and its fitted curve in one panel of a 2x2 grid.
fig, axes = plt.subplots(2, 2)
panels = (
    (set_1_X, set_1_data, model_pd2_1),
    (set_2_X, set_2_data, model_pd2_2),
    (set_3_X, set_3_data, model_pd2_3),
    (set_4_X, set_4_data, model_pd2_4),
)
for set_number, (ax, (features, observed, estimator)) in enumerate(zip(axes.flat, panels), start=1):
    living_area = features['power_1'].to_numpy()
    # Sort by living area so the '-' curve is drawn left to right; in raw row
    # order the line would jump back and forth if the rows are not already sorted.
    order = np.argsort(living_area)
    ax.plot(living_area[order], observed['price'].to_numpy()[order], '.', label='observed')
    ax.plot(living_area[order], estimator.predict(features)[order], '-', label='ridge fit')
    ax.set(title='Set %d' % set_number, xlabel='sqft_living', ylabel='price')
    ax.legend()
fig.tight_layout()

In [21]:
print(model_pd2_1.coef_)

[ 5.44669398e+02 -3.55447605e-01  1.22446382e-04 -1.17175318e-08
 -3.90512295e-13 -1.39076633e-17  1.47860337e-20  6.87491612e-25
 -7.57203949e-29 -1.04097310e-32 -3.71844032e-37  3.39989217e-41
  5.56592061e-45  2.53761389e-49 -3.35152920e-53]


In [None]:
print(model_pd2_2.coef_)

In [None]:
print(model_pd2_3.coef_)

In [None]:
print(model_pd2_4.coef_)

In [None]:
l2_large_penalty=1.23e2

In [None]:
# One independent ridge estimator per subset, all with the same large penalty.
model_pd2_5, model_pd2_6, model_pd2_7, model_pd2_8 = (
    linear_model.Ridge(alpha=l2_large_penalty, normalize=True)
    for _ in range(4)
)

# Fit each estimator on its own subset. fit() returns the estimator it was
# called on (scikit-learn convention), so fit_N and the matching model are the same object.
fit_5, fit_6, fit_7, fit_8 = (
    estimator.fit(features, subset['price'])
    for estimator, features, subset in (
        (model_pd2_5, set_1_X, set_1_data),
        (model_pd2_6, set_2_X, set_2_data),
        (model_pd2_7, set_3_X, set_3_data),
        (model_pd2_8, set_4_X, set_4_data),
    )
)

In [None]:
print(model_pd2_5.coef_)

In [None]:
print(model_pd2_6.coef_)

In [None]:
print(model_pd2_7.coef_)

In [None]:
print(model_pd2_8.coef_)

In [None]:
plt.plot(set_1_X['power_1'], set_1_data['price'],'.', set_1_X['power_1'], model_pd2_5.predict(set_1_X),'-')

In [None]:
train_valid_shuffled = pd.read_csv('wk3_kc_house_train_valid_shuffled.csv', dtype=dtype_dict)
test = pd.read_csv('wk3_kc_house_test_data.csv', dtype=dtype_dict)

In [None]:
test.shape

In [None]:
n = len(train_valid_shuffled)
k = 10 # 10-fold cross-validation

# Print the inclusive (start, end) row bounds of every fold. Floor division
# keeps the bounds integral so they can be used as slice indices; true
# division (/) yields floats in Python 3.
for i in range(k):
    start = (n*i)//k
    end = (n*(i+1))//k-1
    print (i, (start, end))

In [None]:
def k_fold_cross_validation(k, l2_penalty, data, output, model_factory=None):
    """Estimate the validation error of ridge regression by k-fold cross-validation.

    The rows are split, in the order given, into ``k`` contiguous folds. Each
    fold is held out once while a fresh model is trained on the remaining
    rows; the residual sum of squares (RSS) on the held-out fold is added up.

    Parameters
    ----------
    k : int
        Number of folds; must satisfy ``2 <= k <= len(data)``.
    l2_penalty : float
        Regularisation strength handed to the model.
    data : pandas.DataFrame
        Feature rows. They are not shuffled here.
    output : pandas.Series
        Target values, one per row of ``data``.
    model_factory : callable, optional
        ``model_factory(l2_penalty)`` must return a new object exposing
        ``fit(X, y)`` and ``predict(X)``. When omitted,
        ``linear_model.Ridge(alpha=l2_penalty, normalize=True)`` is used.
        Note that ``normalize`` is only accepted by older scikit-learn
        releases (it was removed in 1.2); pass a factory to use another
        estimator.

    Returns
    -------
    float
        The summed held-out RSS divided by the number of rows. This is a
        per-row average (divide by n), not a per-fold average (divide by k);
        both rank candidate penalties identically.

    Raises
    ------
    ValueError
        If ``output`` and ``data`` differ in length, or ``k`` is outside
        ``2 <= k <= len(data)`` (a fold or the training set would be empty).
    """
    n = len(data)
    if len(output) != n:
        raise ValueError(
            "data and output must have the same number of rows (%d != %d)" % (n, len(output))
        )
    if not 2 <= k <= n:
        raise ValueError("k must satisfy 2 <= k <= len(data); got k=%r, len(data)=%d" % (k, n))

    if model_factory is None:
        def model_factory(penalty):
            return linear_model.Ridge(alpha=penalty, normalize=True)

    error_total = 0
    for i in range(k):
        # Fold i covers rows [start, stop); the folds partition 0..n-1 exactly.
        start = (n * i) // k
        stop = (n * (i + 1)) // k

        validation_set = data.iloc[start:stop]
        output_validation = output.iloc[start:stop]

        # Everything outside the held-out fold is training data.
        # pd.concat replaces DataFrame.append / Series.append (removed in pandas 2.0).
        training_set = pd.concat([data.iloc[:start], data.iloc[stop:]])
        output_training = pd.concat([output.iloc[:start], output.iloc[stop:]])

        model = model_factory(l2_penalty)
        model.fit(training_set, output_training)
        fit = model.predict(validation_set)
        error_total += np.sum(np.square(np.subtract(fit, output_validation)))

    avg_valid_err = error_total / n
    return avg_valid_err

In [None]:
# The polynomial features and the targets do not depend on the penalty, so
# build them once instead of once per candidate.
data_loop, my_features_loop = polynomial_pd(train_valid_shuffled['sqft_living'], 15)
data_price = train_valid_shuffled['price']

# Track the smallest cross-validation error seen so far and the penalty that
# produced it. Starting from +inf means the first finite error always wins.
lowest = np.inf
lowest_l2 = None
for l2_penalty_loop in np.logspace(3, 9, num=13):
    avg_valid_err = k_fold_cross_validation(10, l2_penalty_loop, data_loop, data_price)

    if avg_valid_err < lowest:
        lowest = avg_valid_err
        lowest_l2 = l2_penalty_loop

In [None]:
print(lowest)
print(lowest_l2)

In [None]:
# Retrain on all of the training/validation rows with the penalty selected by
# cross-validation, then score once on the held-out test set. Fitting on the
# test rows themselves would report training error, not generalisation error.
train_data, _ = polynomial_pd(train_valid_shuffled['sqft_living'], 15)
train_price = train_valid_shuffled['price']
model_last = linear_model.Ridge(alpha=lowest_l2, normalize=True)
model_last.fit(train_data, train_price)

# Residual sum of squares of the final model on the test set.
test_data, _ = polynomial_pd(test['sqft_living'], 15)
test_price = test['price']
RSS_test = np.sum(np.square(np.subtract(model_last.predict(test_data), test_price)))
print("%.4E" % RSS_test)