In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statistics
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt
from plotly.offline import iplot
import warnings
# Ignore every FutureWarning raised for the rest of the session
# (presumably to keep library deprecation notices out of the cell outputs).
warnings.simplefilter(action='ignore', category=FutureWarning)

### Data Loading



In [2]:
# House sales table.
housePrice = pd.read_csv("Data/kc_house_data.csv")

# Zip-code lookup table: keep only rows that have a city, rename the key
# column "zip" -> "zipcode" so it matches the sales table, and cast the key
# to int before joining.
zipcodedf = (
    pd.read_csv("Data/usa_zipcode_of_wa.csv")
    .dropna(subset=["City"])
    .rename(columns={"zip": "zipcode"})
    .astype({"zipcode": int})
)

# Inner join (the merge default) on the shared `zipcode` key; lookup columns
# come first, sales columns after.
merged_zip = zipcodedf.merge(housePrice, on="zipcode")
merged_zip.head()

Unnamed: 0,zipcode,Zipcode name,City,State,County Name,id,date,price,bedrooms,bathrooms,...,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,98001,"AUBURN, WA",AUBURN,WA,KING,7895500070,20150213T000000,240000.0,4,1.0,...,2,7,890,330,1969,0,47.3341,-122.282,1290,7800
1,98001,"AUBURN, WA",AUBURN,WA,KING,3717000160,20141009T000000,287000.0,4,2.5,...,3,7,2240,0,2005,0,47.3378,-122.257,2221,4557
2,98001,"AUBURN, WA",AUBURN,WA,KING,8961960160,20141028T000000,480000.0,4,2.5,...,3,9,2520,710,2001,0,47.3183,-122.253,2640,8517
3,98001,"AUBURN, WA",AUBURN,WA,KING,4014400292,20150114T000000,465000.0,3,2.5,...,3,9,2714,0,2005,0,47.3185,-122.275,2590,18386
4,98001,"AUBURN, WA",AUBURN,WA,KING,1115450240,20141022T000000,360000.0,4,2.5,...,3,9,2160,0,1992,0,47.3341,-122.255,2280,9937


In [3]:
# Print a summary of the merged frame: column names, non-null counts and dtypes.
merged_zip.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21613 entries, 0 to 21612
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   zipcode        21613 non-null  int64  
 1   Zipcode name   21613 non-null  object 
 2   City           21613 non-null  object 
 3   State          21613 non-null  object 
 4   County Name    21613 non-null  object 
 5   id             21613 non-null  int64  
 6   date           21613 non-null  object 
 7   price          21613 non-null  float64
 8   bedrooms       21613 non-null  int64  
 9   bathrooms      21613 non-null  float64
 10  sqft_living    21613 non-null  int64  
 11  sqft_lot       21613 non-null  int64  
 12  floors         21613 non-null  float64
 13  waterfront     21613 non-null  int64  
 14  view           21613 non-null  int64  
 15  condition      21613 non-null  int64  
 16  grade          21613 non-null  int64  
 17  sqft_above     21613 non-null  int64  
 18  sqft_b

### Data Visualization \(at least 5\)



In [4]:
# Correlation heatmap of the numeric columns.
# At this point the merged frame still contains text columns (City, State,
# County Name, date, ...). DataFrame.corr() only silently skipped those in
# older pandas (with a FutureWarning) and raises on them in pandas >= 2.0, so
# the numeric columns are selected explicitly; this works on every version.
numeric_corr = merged_zip.select_dtypes(include="number").corr()

fig1 = px.imshow(numeric_corr)
fig1.update_layout(
    autosize=False,
    width=800,
    height=800)
fig1.show()

In [5]:
# Living area against sale price: one facet column per `view` value,
# points coloured by `condition`. Fixed 1000x600 canvas.
fig1 = px.scatter(
    merged_zip,
    x="price",
    y="sqft_living",
    facet_col="view",
    color="condition",
).update_layout(autosize=False, width=1000, height=600)
fig1.show()

In [6]:
# Above-ground area against sale price, points coloured by city.
# Fixed 1000x600 canvas.
fig1 = px.scatter(
    merged_zip,
    x="price",
    y="sqft_above",
    color="City",
).update_layout(autosize=False, width=1000, height=600)
fig1.show()

In [7]:
# Distribution of sale price per city; the canvas is widened to 1500 px
# because there is one box per city on the x axis.
boxplot = px.box(
    merged_zip,
    x="City",
    y="price",
).update_layout(autosize=False, width=1500, height=600)
boxplot.show()

In [8]:
# Distribution of sale price for each bathroom count (default figure size).
boxplot = px.box(
    data_frame=merged_zip,
    x="bathrooms",
    y="price",
)
boxplot.show()

In [9]:
# Sale price against living area on a log-scaled x axis.
# Marker size encodes `condition` (capped at 30 px), colour encodes city.
fig = px.scatter(
    merged_zip,
    x="sqft_living",
    y="price",
    size="condition",
    color="City",
    log_x=True,
    size_max=30,
)
fig.show()

In [10]:
# Pie chart with one sector per bedroom count; sector sizes are taken from the
# `price` column, drawn with the sequential RdBu colour scale.
fig = px.pie(
    merged_zip,
    names="bedrooms",
    values="price",
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

### Data Cleaning



In [11]:
# Parse the sale date once, before the raw `date` column is dropped.
sale_date = pd.to_datetime(merged_zip["date"])

merged_zip = (
    merged_zip
    # Identifier / location columns that are not used as features, plus the
    # raw date string (replaced by the year and month parts below).
    .drop(columns=["zipcode", "Zipcode name", "id", "State", "lat", "long",
                   "County Name", "date"])
    # Year and month in which the house was sold.
    .assign(year=sale_date.dt.year, month=sale_date.dt.month)
    # House age at sale time, as an absolute difference in years.
    .assign(yr_age=lambda df: (df["year"] - df["yr_built"]).abs())
    # Binary flag: 0 when yr_renovated is 0, otherwise 1.
    .assign(renovation=lambda df: np.where(df["yr_renovated"] == 0, 0, 1))
    .drop(columns=["yr_renovated"])
)
print(merged_zip["renovation"].value_counts())

# One-hot encode the city: the `City` column is replaced by `city_<NAME>`
# indicator columns appended after the existing columns.
merged_zip = pd.get_dummies(merged_zip, columns=["City"], prefix="city")
merged_zip.head()

0    20699
1      914
Name: renovation, dtype: int64


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,city_MEDINA,city_MERCER ISLAND,city_NORTH BEND,city_REDMOND,city_RENTON,city_SAMMAMISH,city_SEATTLE,city_SNOQUALMIE,city_VASHON,city_WOODINVILLE
0,240000.0,4,1.0,1220,8075,1.0,0,0,2,7,...,0,0,0,0,0,0,0,0,0,0
1,287000.0,4,2.5,2240,4648,2.0,0,0,3,7,...,0,0,0,0,0,0,0,0,0,0
2,480000.0,4,2.5,3230,16171,2.0,0,3,3,9,...,0,0,0,0,0,0,0,0,0,0
3,465000.0,3,2.5,2714,17936,2.0,0,0,3,9,...,0,0,0,0,0,0,0,0,0,0
4,360000.0,4,2.5,2160,9528,2.0,0,0,3,9,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Treat sales of 2,000,000 or more as outliers and keep only the rows whose
# price is strictly below that threshold.
merged_zip = merged_zip.loc[merged_zip["price"].lt(2e6)]

### Data Modeling



In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Input
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Response: the sale price. Features: every remaining column.
y = merged_zip["price"]
X = merged_zip.drop(columns="price")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


#### Linear Regression with Polynomial features



In [15]:
# Degree-2 polynomial expansion: fitted on the training features only, then
# applied unchanged to the test features.
poly_reg = PolynomialFeatures(degree=2)
x_poly = poly_reg.fit_transform(x_train)
x_test_poly = poly_reg.transform(x_test)

# Ordinary least squares on the expanded features.
lin_reg2 = LinearRegression().fit(x_poly, y_train)

# Test-set predictions, followed by the R^2 score on the test set.
y_hat2 = lin_reg2.predict(x_test_poly)
print(lin_reg2.score(x_test_poly, y_test))

0.8070369067899824


In [16]:
# Mean absolute error of the polynomial regression on the test set.
# Arguments follow the scikit-learn (y_true, y_pred) order, as in the other
# metric calls of this notebook. MAE is symmetric so the value is unchanged,
# but the order matters as soon as the metric is swapped for an asymmetric
# one such as r2_score.
print(mean_absolute_error(y_test, y_hat2))

84804.08421056102


In [0]:
# Cross-validated search over the polynomial degree.
# make_pipeline names each step after its lower-cased class name, hence the
# "polynomialfeatures__degree" key.
params = {'polynomialfeatures__degree': [1, 2, 3]}
pipe = make_pipeline(PolynomialFeatures(), LinearRegression())

# Candidates are ranked by negated mean squared error (larger is better).
search = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="neg_mean_squared_error",
)
search.fit(x_train, y_train)

# Best degree and its mean cross-validated score.
print(search.best_params_)
print(search.best_score_)

#### Decision Tree



In [0]:
# Baseline decision tree with default hyper-parameters; the seed makes the
# fitted tree reproducible.
regressor = DecisionTreeRegressor(random_state=0)

regressor.fit(X=x_train, y=y_train)

In [0]:
# Exhaustive hyper-parameter search for the decision tree.
regressor = DecisionTreeRegressor(random_state=0)

# "auto" is deliberately not listed under max_features: for
# DecisionTreeRegressor it meant "use all features" (same as None), it was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it makes every
# candidate that uses it fail to fit. None already covers that setting.
parameters = {
    "splitter": ["best", "random"],
    "max_depth": [None, 1, 3, 5, 7],
    "min_samples_leaf": [1, 2, 3],
    "min_weight_fraction_leaf": [0.0, 0.2, 0.3, 0.4, 0.5],
    "max_features": ["log2", "sqrt", None],
    "max_leaf_nodes": [None, 10, 30, 40],
}

# 3-fold cross-validation, candidates ranked by negated mean squared error.
tuning_model = GridSearchCV(regressor, param_grid=parameters, scoring='neg_mean_squared_error', cv=3, verbose=1)

tuning_model.fit(x_train, y_train)
print(tuning_model.best_params_)

# Predict with the refitted best estimator and report R^2 on the test set.
Price = tuning_model.predict(x_test)
print(r2_score(y_test, Price))

In [0]:
# Decision tree refitted with hand-picked hyper-parameters.
# max_features=None means "consider all features", which is what the former
# 'auto' option did for DecisionTreeRegressor; 'auto' itself was removed in
# scikit-learn 1.3 and raises an error there.
# NOTE(review): these values look copied from an earlier grid-search run;
# confirm they still match tuning_model.best_params_.
update_regressor = DecisionTreeRegressor(
    random_state=0,
    max_features=None,
    min_samples_leaf=3,
    min_weight_fraction_leaf=0.0,
    splitter='random',
)
update_regressor.fit(x_train, y_train)

# Test-set predictions of the refitted tree.
Price = update_regressor.predict(x_test)

In [0]:
# Predicted vs. actual price for the decision tree on the test set.
plt.scatter(y_test, Price, color="b", label="predictions")

# Reference line y = x (perfect prediction), drawn once between the extremes
# of y_test instead of through every unsorted test point.
price_range = [y_test.min(), y_test.max()]
plt.plot(price_range, price_range, color="r", label="perfect prediction")

plt.xlabel("actual price")
plt.ylabel("predicted price")
plt.title("Decision tree: predicted vs. actual price")
plt.legend()
plt.show()

In [0]:
# Mean absolute error of the decision-tree predictions, computed by hand from
# the element-wise residuals.
MEA = (y_test - Price).abs().mean()

# Coefficient of determination (R^2) on the test set.
score = r2_score(y_test, Price)

print("mean absolute error is:", MEA)
print("score is:", score)

#### Neural Network



In [0]:
# Learn the per-feature min/max from the training split only, then apply the
# same scaling to both splits so no test-set statistics leak into the scaler.
scaler = MinMaxScaler().fit(x_train)
X_train = scaler.transform(x_train)
X_test = scaler.transform(x_test)

In [0]:
# Fully connected regression network.
# The input width is read from the scaled training matrix instead of being
# hard-coded to 42, so the model keeps working when the set of feature columns
# changes (e.g. a different number of city dummy columns).
n_features = X_train.shape[1]

model = Sequential()
model.add(Input(shape=(n_features,)))
model.add(Dense(100, activation="relu"))   # relu(x) = max(0, x): negatives are zeroed, positives pass through
model.add(Dense(250, activation="relu"))
model.add(Dense(200, activation="relu"))
model.add(Dense(100, activation="relu"))
model.add(Dense(1))                        # single output with no activation: the predicted price
model.compile(optimizer="adam", loss="mae")

# NOTE(review): the test split is also used as validation data here, so the
# reported val_loss is not an independent estimate if it guides model choices.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=200, batch_size=128)

In [0]:
# Per-epoch metrics recorded by the last fit() call, one column per metric,
# plotted as line curves.
losses = pd.DataFrame(data=model.history.history)
losses.plot()

In [0]:
# Network predictions for the scaled test features.
predictions = model.predict(X_test)

# Test-set error: mean absolute error first, then root mean squared error.
nn_mae = mean_absolute_error(y_test, predictions)
nn_rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(nn_mae)
print(nn_rmse)

In [0]:
# R^2 of the network predictions on the test set (shown as the cell output).
r2_score(y_true=y_test, y_pred=predictions)