In [1]:
import graphlab
# Set the GraphLab Canvas target to 'ipynb' (presumably so that `.show()`
# output is rendered inside this notebook -- confirm against the Canvas docs).
graphlab.canvas.set_target('ipynb')

In [3]:
# Load the house-sales data set.
#
# `zipcode` is pinned to `str` instead of being left to type inference: later
# cells compare it against the string '98039' and feed it to the regression
# models as a feature. The inference log printed by the plain constructor
# lists only a single `str` column, so relying on inference does not guarantee
# that `zipcode` is the string column. All other column types are still
# inferred.
sales = graphlab.SFrame.read_csv(
    '../data/home_data.csv.gz',
    column_type_hints={'zipcode': str})

------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[int,str,int,int,float,int,int,float,int,int,int,int,int,int,int,int,int,float,float,int,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


## Filter Data

In [38]:
sales_zip = sales[sales["zipcode"] == '98039']
print sales_zip["price"].mean()
sales_zip.show(view="Scatter Plot", x="sqft_living", y="price")

2160606.6


In [52]:
sales_large = sales[(sales["sqft_living"] > 2000) & (sales["sqft_living"] <= 4000)]
print float(len(sales_large))/len(sales)

0.421875722945


## Advanced Model

In [53]:
# Random split with fraction 0.8 for the first part; the fixed seed keeps the
# split identical between runs.
train_data, test_data = sales.random_split(0.8, seed=0)

In [54]:
# Baseline model: predict price from living area only.
# No validation set is passed (validation_set=None).
sqft_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=['sqft_living'],
    validation_set=None)

In [55]:
# Feature columns used by the six-feature model.
my_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]

In [56]:
# Linear regression on the columns listed in `my_features`.
# No validation set is passed (validation_set=None).
my_features_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=my_features,
    validation_set=None)

In [57]:
# Feature columns used by the larger model: the same first six names as
# `my_features`, followed by twelve additional columns.
advanced_features = [
    # the six columns also used by my_features
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
    # additional columns
    'condition',
    'grade',
    'waterfront',
    'view',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'lat',
    'long',
    'sqft_living15',
    'sqft_lot15',
]

In [58]:
# Linear regression on the columns listed in `advanced_features`.
# No validation set is passed (validation_set=None).
advanced_features_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=advanced_features,
    validation_set=None)

In [59]:
print sqft_model.evaluate(test_data)
print my_features_model.evaluate(test_data)
print advanced_features_model.evaluate(test_data)

{'max_error': 4143550.8825285938, 'rmse': 255191.02870527358}
{'max_error': 3486584.509381705, 'rmse': 179542.4333126903}
{'max_error': 3556849.413858208, 'rmse': 156831.1168021901}


In [60]:
# Test-set RMSE of the six-feature model minus that of the advanced model.
rmse_my_features = my_features_model.evaluate(test_data)['rmse']
rmse_advanced = advanced_features_model.evaluate(test_data)['rmse']
rmse_my_features - rmse_advanced

22711.316510500183

## Mean Absolute Percentage Error (MAPE)

In [69]:
def mape(model, target="price", data=None):
    """Return the mean absolute percentage error of `model` on `data`.

    The value is ``mean(|prediction - actual| / actual)``, where ``actual`` is
    the `target` column of `data`. It is a fraction, not multiplied by 100.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(data)`` that returns one prediction per row.
    target : str
        Name of the column holding the true values. The same column is used
        both for the error and for the normalisation.
    data : frame-like, optional
        Data to evaluate on. Defaults to the notebook-level ``test_data``.

    Returns
    -------
    The ``.mean()`` of the per-row relative errors.

    Notes
    -----
    Rows whose true value is 0 are not treated specially.
    """
    if data is None:
        # Fall back to the notebook-level hold-out set, as the original did.
        data = test_data
    actual = data[target]
    # Divide by the same column the error is measured against; a hard-coded
    # "price" denominator would be wrong for any other `target`.
    return (abs(model.predict(data) - actual) / actual).mean()

In [70]:
print mape(sqft_model)
print mape(my_features_model)
print mape(advanced_features_model)

0.355860750112
0.207181620509
0.191556062218
