In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

# Selection and summary statistics

In [2]:
# Load the home sales data from 'home_data.csv' (path is relative to the
# notebook's working directory) into the DataFrame used by every later cell.
sales = pd.read_csv('home_data.csv')

In [3]:
# Preview the first rows to check the columns and values that were loaded.
sales.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [24]:
# Narrow the frame to the two columns the per-zipcode summary needs,
# then average the sale price within each zipcode.
sales1 = sales.loc[:, ['price', 'zipcode']]
sales1.groupby('zipcode')['price'].agg('mean')

zipcode
98001    2.808047e+05
98002    2.342840e+05
98003    2.941113e+05
98004    1.355927e+06
98005    8.101649e+05
             ...     
98177    6.761854e+05
98178    3.106128e+05
98188    2.890783e+05
98198    3.028789e+05
98199    7.918208e+05
Name: price, Length: 70, dtype: float64

In [25]:
# Highest average sale price over all zipcodes.
# Series.max() is the vectorised pandas reduction and skips NaN group means;
# the builtin max() walks the Series element by element in Python and gives an
# order-dependent answer if any mean is NaN.
sales1.groupby('zipcode')['price'].mean().max()

2160606.6

# Filtering data

In [10]:
# Keep houses whose living area is in the half-open range (2000, 4000] sq.ft.
living_area = sales['sqft_living']
in_range = living_area.gt(2000) & living_area.le(4000)
sales1 = sales.loc[in_range]

In [11]:
# Show (rows, columns) of the filtered frame, then of the full data set,
# one per line.
print(sales1.shape, sales.shape, sep='\n')

(9118, 21)
(21613, 21)


In [12]:
# Fraction of all houses that fall in the filtered living-area range.
# The row counts are taken from the frames themselves rather than copied by
# hand from the printed shapes, so the value stays correct if the data or the
# filter changes.
fractions = len(sales1) / len(sales)
fractions

0.42187572294452413

# Building a regression model with several more features:

In [13]:
# Basic feature set used by the first regression model.
my_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]

# Advanced feature set: every basic feature, followed by the extra columns.
# Concatenation builds a new list, so my_features itself is left untouched.
advanced_features = my_features + [
    'condition',      # condition of house
    'grade',          # measure of quality of construction
    'waterfront',     # waterfront property
    'view',           # type of view
    'sqft_above',     # square feet above ground
    'sqft_basement',  # square feet in basement
    'yr_built',       # the year built
    'yr_renovated',   # the year renovated
    'lat',            # latitude of the parcel
    'long',           # longitude of the parcel
    'sqft_living15',  # average sq.ft. of 15 nearest neighbors
    'sqft_lot15',     # average lot size of 15 nearest neighbors
]

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [20]:
# Settings shared by both splits: hold out 20% of the rows for testing, with a
# fixed seed so the basic and advanced feature sets are split the same way.
split_kwargs = dict(test_size=0.2, random_state=0)

# Target, then the two candidate design matrices.
y = sales['price']
X1 = sales[my_features]
X2 = sales[advanced_features]

X_train, X_test, y_train, y_test = train_test_split(X1, y, **split_kwargs)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, **split_kwargs)

In [21]:
# Model 1: linear regression on the basic feature set (X1).
# fit() returns the estimator itself, so construction and training are chained.
lin = LinearRegression().fit(X_train, y_train)

# Test-set error for Model 1: squared residuals, their sum (RSS1) and the
# root of their mean (RMSE1).
y1_pred = lin.predict(X_test)
tmp1 = (y_test - y1_pred) ** 2
RSS1 = tmp1.sum()
RMSE1 = np.sqrt(tmp1.mean())

print(RMSE1)

244004.77443104205


In [22]:
# Model 2: refit the same estimator object on the advanced feature set (X2).
# After this cell `lin` holds the advanced model, not Model 1.
y2_pred = lin.fit(X2_train, y2_train).predict(X2_test)

# Test-set error for Model 2: squared residuals, their sum (RSS2) and the
# root of their mean (RMSE2).
tmp2 = (y2_test - y2_pred) ** 2
RSS2 = tmp2.sum()
RMSE2 = np.sqrt(tmp2.mean())

print(RMSE2)

190473.37570967907


In [23]:
# Difference in test RMSE between the basic-feature model and the
# advanced-feature model; a positive value means the advanced model has the
# lower error.
diff = RMSE1 - RMSE2
print(diff)

53531.39872136299
