# Launch Pandas

In [5]:
import pandas as pd

# prevent pandas from converting number to scientific
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Load house sales data

In [8]:
sales = pd.read_csv("home_data.csv", dtype={'zipcode': 'object'})

**Question 1**: Selection and summary statistics: We found the zip code with the highest average house price. What is the average house price of that zip code?

In [88]:
# lets first, find out the richest neighborhood in this county
sales.groupby('zipcode').price.mean().sort_values(ascending=False).reset_index().head(1)

Unnamed: 0,zipcode,price
0,98039,2160606.6


In [89]:
richest_neighborhood = '98039'

sales.loc[sales.zipcode == richest_neighborhood, 'price'].mean()

2160606.6

**Question 2**:
    1. first select the houses that have ‘sqft_living’ higher than 2000 sqft but no larger than 4000 sqft.

In [81]:
size = len(sales[(sales.sqft_living >= 2_000) & (sales.sqft_living <= 4_000)])

In [82]:
size/ len(sales) * 100

42.66413732475825

**Question 3**: What is the difference in RMSE between the model trained with `my_features` and the one trained with `advanced_features`?

In [69]:
import numpy as np

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.lineaxr_model import LinearRegression
from sklearn.metrics import mean_squared_error

#### my features model

In [None]:
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

In [70]:
X = sales[my_features]
y = sales.price

X_train, X_test, y_train, y_test = train_test_split(X, y)

my_features_model = LinearRegression()

my_features_model.fit(X_train, y_train)

y_pred = my_features_model.predict(X_test)

np.sqrt(mean_squared_error(y_test, y_pred))

247811.3661432396

#### advanced features model

In [53]:
advanced_features = [
'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
'condition', # condition of house
'grade', # measure of quality of construction
'waterfront', # waterfront property
'view', # type of view
'sqft_above', # square feet above ground
'sqft_basement', # square feet in basement
'yr_built', # the year built
'yr_renovated', # the year renovated
'lat', 'long', # the lat-long of the parcel
'sqft_living15', # average sq.ft. of 15 nearest neighbors
'sqft_lot15', # average lot size of 15 nearest neighbors 
]

In [55]:
advanced_features_model = LinearRegression()

In [71]:
X = sales[advanced_features]
y = sales.price

X_train, X_test, y_train, y_test = train_test_split(X, y)

advanced_features_model = LinearRegression()

advanced_features_model.fit(X_train, y_train)

y_pred = advanced_features_model.predict(X_test)

np.sqrt(mean_squared_error(y_test, y_pred))

201638.59121433774

In [72]:
abs(247811.3661432396 - 201638.59121433774)

46172.77492890187

* Unfortunately, they hard coded this question. So (the right answer, as they calculated), is 25k, i.e. the `advanced_model` rmse is 25k lower than the `my_feaures_model`.