In [1]:
import graphlab

In [3]:
def polynomial_sframe(feature, degree):
    """Build an SFrame holding successive integer powers of `feature`.

    Column 'power_1' is `feature` itself.  For every exponent p in
    2..degree a column 'power_p' is added, computed element-wise through
    ``feature.apply``.  Callers are expected to pass degree >= 1; any
    degree below 2 yields only the 'power_1' column.
    """
    powers = graphlab.SFrame()
    powers['power_1'] = feature
    # range(2, degree + 1) is empty whenever degree <= 1, so the loop is
    # simply skipped in that case.
    for exponent in range(2, degree + 1):
        powers['power_' + str(exponent)] = feature.apply(lambda value: value ** exponent)
    return powers

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Load the house-sales data stored in the local 'kc_house_data.gl/' directory.
sales = graphlab.SFrame('kc_house_data.gl/')

This non-commercial license of GraphLab Create for academic use is assigned to dragonflarex@mail.bg and will expire on October 28, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\KONSTA~1\AppData\Local\Temp\graphlab_server_1481125654.log.0


In [6]:
# Order the rows by sqft_living, using price as the secondary sort key.
sales = sales.sort(['sqft_living','price'])

In [7]:
# Very small L2 penalty passed as `l2_penalty` to the first set of regressions.
l2_small_penalty = 1e-5

In [8]:
# Degree-15 polynomial in sqft_living, fitted on the full data set with the small L2 penalty.
poly1_data = polynomial_sframe(sales['sqft_living'], 15)
my_features = poly1_data.column_names()  # taken before 'price' is added, so it lists features only
poly1_data['price'] = sales['price']
model_1 = graphlab.linear_regression.create(
    poly1_data,
    target='price',
    features=my_features,
    validation_set=None,
    l2_penalty=l2_small_penalty
)
model_1.get("coefficients")

name,index,value,stderr
(intercept),,167924.863088,
power_1,,103.090934871,
power_2,,0.13460457027,
power_3,,-0.000129071375318,
power_4,,5.18928995405e-08,
power_5,,-7.771693847e-12,
power_6,,1.71144973107e-16,
power_7,,4.51177797327e-20,
power_8,,-4.788384934879999e-25,
power_9,,-2.33343501458e-28,


In [9]:
# Two-stage seeded random split: halve the data, then halve each half, giving four subsets.
semi_split1, semi_split2 = sales.random_split(0.5, seed=0)
set_1, set_2 = semi_split1.random_split(0.5, seed=0)
set_3, set_4 = semi_split2.random_split(0.5, seed=0)

In [10]:
# Degree-15 fit on set_1 with the small L2 penalty.
poly1_data = polynomial_sframe(set_1['sqft_living'], 15)
my_features = poly1_data.column_names()  # feature names only; 'price' is added afterwards
poly1_data['price'] = set_1['price']
model1 = graphlab.linear_regression.create(
    poly1_data,
    target='price',
    features=my_features,
    verbose=False,
    validation_set=None,
    l2_penalty=l2_small_penalty
)
model1.get("coefficients")

name,index,value,stderr
(intercept),,9306.4666294,810199.413847
power_1,,585.865804883,3442.10422346
power_2,,-0.397305874315,6.02291333786
power_3,,0.000141470888681,0.0057279761005
power_4,,-1.52945955404e-08,3.29578977423e-06
power_5,,-3.79756819869e-13,1.2133445418e-09
power_6,,5.97481991752e-17,3.00417024061e-13
power_7,,1.06888523817e-20,5.5009491761400004e-17
power_8,,1.5934378094299998e-25,8.364241557819999e-21
power_9,,-6.92834881362e-29,1.00031911962e-24


In [11]:
# Degree-15 fit on set_2 with the small L2 penalty.
poly1_data = polynomial_sframe(set_2['sqft_living'], 15)
my_features = poly1_data.column_names()  # feature names only; 'price' is added afterwards
poly1_data['price'] = set_2['price']
model1 = graphlab.linear_regression.create(
    poly1_data,
    target='price',
    features=my_features,
    verbose=False,
    validation_set=None,
    l2_penalty=l2_small_penalty
)
model1.get("coefficients")

name,index,value,stderr
(intercept),,-25115.8993262,1461908.2824
power_1,,783.493780749,8141.54862382
power_2,,-0.767759273211,19.1599476904
power_3,,0.000438766345759,0.0252348495278
power_4,,-1.15169156529e-07,2.07713736269e-05
power_5,,6.842811259260001e-12,1.1267705289e-08
power_6,,2.51195199365e-15,4.11595289605e-12
power_7,,-2.0644054241000003e-19,1.00652509326e-15
power_8,,-4.5967316974600005e-23,1.56967454249e-19
power_9,,-2.7127203920799997e-29,1.2650332543900001e-23


In [12]:
# Degree-15 fit on set_3 with the small L2 penalty.
poly1_data = polynomial_sframe(set_3['sqft_living'], 15)
my_features = poly1_data.column_names()  # feature names only; 'price' is added afterwards
poly1_data['price'] = set_3['price']
model1 = graphlab.linear_regression.create(
    poly1_data,
    target='price',
    features=my_features,
    verbose=False,
    validation_set=None,
    l2_penalty=l2_small_penalty
)
model1.get("coefficients")

name,index,value,stderr
(intercept),,462426.567754,1264112.98769
power_1,,-759.251854206,6160.31746987
power_2,,1.02867006808,12.6575879778
power_3,,-0.00052826454477,0.0145109950169
power_4,,1.15422915761e-07,1.03360324025e-05
power_5,,-2.2609610825300003e-12,4.79760227635e-09
power_6,,-2.08214269714e-15,1.4667300238900001e-12
power_7,,4.08770281089e-20,2.86235422373e-16
power_8,,2.5707916981600003e-23,3.1670830313200003e-20
power_9,,1.2431125129e-27,2.07861266245e-24


In [13]:
# Degree-15 fit on set_4 with the small L2 penalty.
poly1_data = polynomial_sframe(set_4['sqft_living'], 15)
my_features = poly1_data.column_names()  # feature names only; 'price' is added afterwards
poly1_data['price'] = set_4['price']
model1 = graphlab.linear_regression.create(
    poly1_data,
    target='price',
    features=my_features,
    verbose=False,
    validation_set=None,
    l2_penalty=l2_small_penalty
)
model1.get("coefficients")

name,index,value,stderr
(intercept),,-170240.034765,1038851.01649
power_1,,1247.59034807,5068.70200366
power_2,,-1.22460911823,5.25072710262
power_3,,0.000555254617758,
power_4,,-6.38262314137e-08,
power_5,,-2.20216008557e-11,
power_6,,4.818347075499999e-15,
power_7,,4.2146163971099997e-19,
power_8,,-7.99880731487e-23,
power_9,,-1.3236591845300002e-26,


In [14]:
# Degree-15 fit on set_1 again, this time with a large L2 penalty (1e5).
poly1_data = polynomial_sframe(set_1['sqft_living'], 15)
my_features = poly1_data.column_names()  # feature names only; 'price' is added afterwards
poly1_data['price'] = set_1['price']
model1 = graphlab.linear_regression.create(
    poly1_data,
    target='price',
    features=my_features,
    verbose=False,
    validation_set=None,
    l2_penalty=1e5
)
model1.get("coefficients")

name,index,value,stderr
(intercept),,530317.024516,1219250.91739
power_1,,2.58738875673,5179.94540664
power_2,,0.00127414400592,9.06374713073
power_3,,1.74934226932e-07,0.00861990270049
power_4,,1.06022119097e-11,4.95976007523e-06
power_5,,5.42247604482e-16,1.82593497407e-09
power_6,,2.8956382834300004e-20,4.5209083829e-13
power_7,,1.6500066635100001e-24,8.27825497639e-17
power_8,,9.86081528409e-29,1.2587159430600001e-20
power_9,,6.0658934825399995e-33,1.50535779641e-24


In [15]:
# Degree-15 fit on set_2 again, this time with a large L2 penalty (1e5).
poly1_data = polynomial_sframe(set_2['sqft_living'], 15)
my_features = poly1_data.column_names()  # feature names only; 'price' is added afterwards
poly1_data['price'] = set_2['price']
model1 = graphlab.linear_regression.create(
    poly1_data,
    target='price',
    features=my_features,
    verbose=False,
    validation_set=None,
    l2_penalty=1e5
)
model1.get("coefficients")

name,index,value,stderr
(intercept),,519216.897383,2014830.5895
power_1,,2.04470474182,11220.8415608
power_2,,0.0011314362684,26.4066146726
power_3,,2.93074277549e-07,0.034779163209
power_4,,4.43540598453e-11,2.86275134176e-05
power_5,,4.80849112204e-15,1.5529371824e-08
power_6,,4.530917078259999e-19,5.67268678875e-12
power_7,,4.16042910575e-23,1.38721257101e-15
power_8,,3.90094635128e-27,2.16335615706e-19
power_9,,3.7773187602e-31,1.74349357506e-23


In [16]:
# Degree-15 fit on set_3 again, this time with a large L2 penalty (1e5).
poly1_data = polynomial_sframe(set_3['sqft_living'], 15)
my_features = poly1_data.column_names()  # feature names only; 'price' is added afterwards
poly1_data['price'] = set_3['price']
model1 = graphlab.linear_regression.create(
    poly1_data,
    target='price',
    features=my_features,
    verbose=False,
    validation_set=None,
    l2_penalty=1e5
)
model1.get("coefficients")

name,index,value,stderr
(intercept),,522911.518048,1762190.05962
power_1,,2.26890421877,8587.5632283
power_2,,0.00125905041842,17.6448434043
power_3,,2.77552918155e-07,0.0202285170891
power_4,,3.2093309779e-11,1.44085645294e-05
power_5,,2.87573572364e-15,6.68792040245e-09
power_6,,2.50076112671e-19,2.04464086154e-12
power_7,,2.24685265906e-23,3.99015927317e-16
power_8,,2.09349983135e-27,4.41495522166e-20
power_9,,2.00435383296e-31,2.89761327289e-24


In [17]:
# Degree-15 fit on set_4 again, this time with a large L2 penalty (1e5).
poly1_data = polynomial_sframe(set_4['sqft_living'], 15)
my_features = poly1_data.column_names()  # feature names only; 'price' is added afterwards
poly1_data['price'] = set_4['price']
model1 = graphlab.linear_regression.create(
    poly1_data,
    target='price',
    features=my_features,
    verbose=False,
    validation_set=None,
    l2_penalty=1e5
)
model1.get("coefficients")

name,index,value,stderr
(intercept),,513667.087087,1373753.86668
power_1,,1.91040938244,6702.74068754
power_2,,0.00110058029175,6.94344669789
power_3,,3.12753987879e-07,
power_4,,5.50067886825e-11,
power_5,,7.20467557825e-15,
power_6,,8.2497724938399995e-19,
power_7,,9.06503223498e-23,
power_8,,9.95683160453e-27,
power_9,,1.10838127982e-30,


In [18]:
# Seeded 90/10 random split into train+validation and test, then shuffle the larger part.
train_valid, test = sales.random_split(0.9, seed=1)
train_valid_shuffled = graphlab.toolkits.cross_validation.shuffle(
    train_valid, random_seed=1)

In [19]:
# Print the inclusive (start, end) row range of every cross-validation fold.
n = len(train_valid_shuffled)
k = 10 # 10-fold cross-validation

for i in range(k):
    # Explicit floor division keeps the boundaries integral regardless of
    # how `/` is defined for ints (Python 3 or `from __future__ import division`).
    start = (n * i) // k
    end = (n * (i + 1)) // k - 1
    # Formatted as "<fold> (<start>, <end>)", the same text the two-argument
    # print statement produced.
    print('%s %s' % (i, (start, end)))

0 (0, 1938)
1 (1939, 3878)
2 (3879, 5817)
3 (5818, 7757)
4 (7758, 9697)
5 (9698, 11636)
6 (11637, 13576)
7 (13577, 15515)
8 (15516, 17455)
9 (17456, 19395)


In [20]:
# Display a small slice of the shuffled frame (the slice end is exclusive).
train_valid_shuffled[0:10] # rows 0 to 9

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront
2780400035,2014-05-05 00:00:00+00:00,665000.0,4.0,2.5,2800.0,5900,1,0
1703050500,2015-03-21 00:00:00+00:00,645000.0,3.0,2.5,2490.0,5978,2,0
5700002325,2014-06-05 00:00:00+00:00,640000.0,3.0,1.75,2340.0,4206,1,0
475000510,2014-11-18 00:00:00+00:00,594000.0,3.0,1.0,1320.0,5000,1,0
844001052,2015-01-28 00:00:00+00:00,365000.0,4.0,2.5,1904.0,8200,2,0
2781280290,2015-04-27 00:00:00+00:00,305000.0,3.0,2.5,1610.0,3516,2,0
2214800630,2014-11-05 00:00:00+00:00,239950.0,3.0,2.25,1560.0,8280,2,0
2114700540,2014-10-21 00:00:00+00:00,366000.0,3.0,2.5,1320.0,4320,1,0
2596400050,2014-07-30 00:00:00+00:00,375000.0,3.0,1.0,1960.0,7955,1,0
4140900050,2015-01-26 00:00:00+00:00,440000.0,4.0,1.75,2180.0,10200,1,0

view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat
0,3,8,1660,1140,1963,0,98115,47.68093246
0,3,9,2490,0,2003,0,98074,47.62984888
0,5,7,1170,1170,1917,0,98144,47.57587004
0,4,7,1090,230,1920,0,98107,47.66737217
0,5,7,1904,0,1999,0,98010,47.31068733
0,3,8,1610,0,2006,0,98055,47.44911017
0,4,7,1560,0,1979,0,98001,47.33933392
0,3,6,660,660,1918,0,98106,47.53271982
0,4,7,1260,700,1963,0,98177,47.76407345
2,3,8,2000,180,1966,0,98028,47.76382378

long,sqft_living15,sqft_lot15
-122.28583258,2580.0,5900.0
-122.02177564,2710.0,6629.0
-122.28796,1360.0,4725.0
-122.36472902,1700.0,5000.0
-122.0012452,1560.0,12426.0
-122.1878086,1610.0,3056.0
-122.25864364,1920.0,8120.0
-122.34716948,1190.0,4200.0
-122.36361517,1850.0,8219.0
-122.27022456,2590.0,10445.0


In [21]:
# Validation rows 5818..7757 (slice end is exclusive), i.e. the fold printed with index 3 above.
validation4 = train_valid_shuffled[5818:7758] 

In [22]:
# Mean price of the validation rows, rounded to the nearest whole number.
print(int(round(validation4['price'].mean(), 0)))

536234


In [23]:
# Show the first two and the last two rows of the shuffled frame in a single table.
n = len(train_valid_shuffled)
first_two = train_valid_shuffled[0:2]
last_two = train_valid_shuffled[n - 2:n]
print(first_two.append(last_two))

+------------+---------------------------+-----------+----------+-----------+
|     id     |            date           |   price   | bedrooms | bathrooms |
+------------+---------------------------+-----------+----------+-----------+
| 2780400035 | 2014-05-05 00:00:00+00:00 |  665000.0 |   4.0    |    2.5    |
| 1703050500 | 2015-03-21 00:00:00+00:00 |  645000.0 |   3.0    |    2.5    |
| 4139480190 | 2014-09-16 00:00:00+00:00 | 1153000.0 |   3.0    |    3.25   |
| 7237300290 | 2015-03-26 00:00:00+00:00 |  338000.0 |   5.0    |    2.5    |
+------------+---------------------------+-----------+----------+-----------+
+-------------+----------+--------+------------+------+-----------+-------+------------+
| sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above |
+-------------+----------+--------+------------+------+-----------+-------+------------+
|    2800.0   |   5900   |   1    |     0      |  0   |     3     |   8   |    1660    |
|    2490.0   |   59

In [24]:
# Training rows: everything outside the validation range 5818..7757.
# The upper bound comes from the frame itself instead of a hard-coded row
# count, so the slice stays correct if the number of rows changes.
train4 = train_valid_shuffled[0:5818].append(
    train_valid_shuffled[7758:len(train_valid_shuffled)])

In [25]:
# Mean price of the training rows, rounded to the nearest whole number.
print(int(round(train4['price'].mean(), 0)))

539450


In [26]:
def get_RSS(prediction, output):
    """Return the residual sum of squares between `output` and `prediction`.

    The arguments must support element-wise ``-`` and ``*``, and the
    product must expose a ``.sum()`` method.
    """
    errors = output - prediction
    return (errors * errors).sum()

In [32]:
def k_fold_cross_validation(k, l2_penalty, data, features_list, target='price'):
    """Return the average validation RSS over `k` cross-validation folds.

    `data` is cut into `k` contiguous row blocks in the order given.  Each
    block is used once as the validation set while a linear regression with
    the given `l2_penalty` is trained on the remaining rows.

    Parameters:
        k: number of folds; must be at least 1.
        l2_penalty: L2 penalty forwarded to graphlab.linear_regression.create.
        data: frame supporting len(), row slicing, ``append`` and column lookup.
        features_list: names of the feature columns to train on.
        target: name of the column to predict (defaults to 'price').

    Returns:
        The sum of the per-fold validation RSS values divided by `k`.

    Raises:
        ValueError: if `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError('k must be at least 1, got %r' % (k,))
    n = len(data)
    total_rss = 0
    for i in range(k):
        # Explicit floor division: slice bounds must be integers whatever
        # `/` means for ints in the running interpreter.
        start = (n * i) // k
        stop = (n * (i + 1)) // k  # exclusive end of the validation block
        valid = data[start:stop]
        train = data[0:start].append(data[stop:n])
        model = graphlab.linear_regression.create(train, target=target, features=features_list,
                                                  l2_penalty=l2_penalty, validation_set=None,
                                                  verbose=False)
        total_rss = total_rss + get_RSS(model.predict(valid), valid[target])
    return total_rss / k

In [33]:
import numpy as np
# Degree-15 polynomial features of sqft_living for the shuffled train/validation rows.
poly_data = polynomial_sframe(train_valid_shuffled['sqft_living'], 15)
my_features = poly_data.column_names()  # feature names only; 'price' is added afterwards
poly_data['price'] = train_valid_shuffled['price']

# Sweep 13 log-spaced L2 penalties from 1e1 to 1e7 and print each penalty
# followed by its 10-fold cross-validation error.
for l2_penalty in np.logspace(1, 7, num=13):
    error = k_fold_cross_validation(10, l2_penalty, poly_data, my_features)
    print(l2_penalty)
    print(error)

10.0
4.91826427769e+14
31.6227766017
2.87504229919e+14
100.0
1.60908965822e+14
316.227766017
1.22090967326e+14
1000.0
1.21192264451e+14
3162.27766017
1.2395000929e+14
10000.0
1.36837175248e+14
31622.7766017
1.71728094842e+14
100000.0
2.2936143126e+14
316227.766017
2.52940568729e+14
1000000.0
2.58682548441e+14
3162277.66017
2.62819399742e+14
10000000.0
2.64889015378e+14


In [34]:
# Fit the degree-15 model on all train/validation rows with l2_penalty=1000.
poly1_data = polynomial_sframe(train_valid_shuffled['sqft_living'], 15)
my_features = poly1_data.column_names()  # feature names only; 'price' is added afterwards
poly1_data['price'] = train_valid_shuffled['price']
model1 = graphlab.linear_regression.create(
    poly1_data,
    target='price',
    features=my_features,
    verbose=False,
    validation_set=None,
    l2_penalty=1000
)
# NOTE(review): model1 is not used again in this cell, and the held-out `test`
# split is not evaluated in the visible cells. The value printed below is the
# 10-fold cross-validation error for l2_penalty=1000 - confirm that is intended.
error = k_fold_cross_validation(10, 1000, poly1_data, my_features)
print(error)

1.21192264451e+14
