In [1]:
import requests
import pandas as pd
import json
import time
import plotly.express as px
import statsmodels.api as sm

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

### Pulling Data From API

In [None]:
url = "https://zillow56.p.rapidapi.com/search"

with open('api.txt', 'r') as file:
    api_key = file.read()

headers = {
	"X-RapidAPI-Key": api_key,
	"X-RapidAPI-Host": "zillow56.p.rapidapi.com"
}

lst_1 = []

for page in range(1, 21):
    querystring = {"location":"Utah County, UT", 
                "sortSelection":"days", 
                "page":f"{page}"}

    response = requests.get(url, headers=headers, params=querystring)

    data = response.json()
    df__ = pd.json_normalize(data['results'])
    lst_1.append(df__)
    time.sleep(4)

df = pd.concat(lst_1)

In [None]:
# df.to_csv('zillow_orig.csv')

### Cleaning the Data

In [2]:
## Because I have limited API requests ##
df_orig = pd.read_csv('zillow_orig.csv')

In [5]:
print(df_orig.columns)

Index(['Unnamed: 0', 'bathrooms', 'bedrooms', 'city', 'country', 'currency',
       'daysOnZillow', 'group_type', 'homeStatus', 'homeStatusForHDP',
       'homeType', 'imgSrc', 'isFeatured', 'isNonOwnerOccupied',
       'isPreforeclosureAuction', 'isPremierBuilder', 'isShowcaseListing',
       'isUnmappable', 'isZillowOwned', 'latitude', 'livingArea', 'longitude',
       'newConstructionType', 'price', 'priceForHDP', 'priceSuffix',
       'providerListingID', 'rentZestimate', 'shouldHighlight', 'state',
       'streetAddress', 'unit', 'zipcode', 'zpid',
       'listing_sub_type.is_newHome', 'lotAreaUnit', 'lotAreaValue',
       'zestimate', 'taxAssessedValue', 'listing_sub_type.is_FSBA',
       'openHouse', 'listing_sub_type.is_openHouse',
       'open_house_info.open_house_showing', 'datePriceChanged', 'priceChange',
       'priceReduction'],
      dtype='object')


In [3]:
# Dropping columns of no interest
df = df_orig[['price', 'bathrooms', 'bedrooms', 'city', 'homeType', 'livingArea', 'zipcode', 'priceReduction', 'daysOnZillow']]
df = df[df['homeType'] != 'LOT']

In [7]:
df.columns

Index(['price', 'bathrooms', 'bedrooms', 'city', 'homeType', 'livingArea',
       'zipcode', 'priceReduction', 'daysOnZillow'],
      dtype='object')

### EDA

What is the relationship between bathrooms & price?  
What is the relationship between bedrooms & price?  
What is the relationship between the city & the price?  
What is the average price of a 3 bedroom, 2 bathroom home in Spanish Fork?  

In [4]:
df_avg_price_per_city = df.groupby('city').median(numeric_only = True).reset_index().sort_values('price', ascending = False)
fig = px.bar(df_avg_price_per_city, x = 'city', y = 'price')
fig.show()
fig.write_image('bar_chart_preliminary.png')


In [37]:
one_hot_encoded = pd.get_dummies(df['city'], drop_first = True)
df_prepared = pd.concat([df, one_hot_encoded], axis=1)

one_hot_encoded = pd.get_dummies(df_prepared['homeType'], drop_first = True)
df_prepared = pd.concat([df_prepared, one_hot_encoded], axis=1)

df_prepared = df_prepared.drop(columns = ['city', 'homeType', 'priceReduction', 'zipcode'])
df_prepared = df_prepared[-df_prepared['livingArea'].isna()]

model = sm.OLS(df_prepared['price'], df_prepared.drop(columns = 'price')).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared (uncentered):,0.851
Model:,OLS,Adj. R-squared (uncentered):,0.842
Method:,Least Squares,F-statistic:,92.76
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,2.72e-186
Time:,21:16:23,Log-Likelihood:,-7658.7
No. Observations:,534,AIC:,15380.0
Df Residuals:,503,BIC:,15510.0
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
bathrooms,1.527e+05,2.73e+04,5.585,0.000,9.9e+04,2.06e+05
bedrooms,-1.095e+04,2.26e+04,-0.486,0.627,-5.53e+04,3.34e+04
livingArea,213.0015,15.828,13.457,0.000,181.904,244.099
daysOnZillow,9774.9552,2826.899,3.458,0.001,4220.971,1.53e+04
American Fork,-5.295e+05,9.11e+04,-5.811,0.000,-7.09e+05,-3.51e+05
Cedar Hills,-4.392e+05,2.59e+05,-1.697,0.090,-9.48e+05,6.93e+04
Draper,-3.951e+05,2.28e+05,-1.733,0.084,-8.43e+05,5.27e+04
Eagle Mountain,-7.114e+05,9.46e+04,-7.519,0.000,-8.97e+05,-5.26e+05
Elk Ridge,-7.932e+05,4.33e+05,-1.833,0.067,-1.64e+06,5.68e+04

0,1,2,3
Omnibus:,515.832,Durbin-Watson:,1.918
Prob(Omnibus):,0.0,Jarque-Bera (JB):,40297.338
Skew:,3.945,Prob(JB):,0.0
Kurtosis:,44.819,Cond. No.,92700.0
