# Final Project: House Price Prediction
## Corpus Christi Team
### Step 3c (Prepare the data for ML)

Remove unnecessary columns for ML, replace empty or invalid cells,
assign data types, encode, etc.

#### Import requested libraries

In [33]:
import pandas as pd
import requests
import json
import re
import os
import glob
from datetime import datetime as dt
import warnings
from time import sleep
from tqdm import tqdm

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Import your RapidAPI key
from config import krapid as key

#### Read the input data

In [3]:
# Load the input data (only the most recent file)
data_path = "data/*.csv"
listing = glob.glob(data_path)
latest_file = max(listing, key=os.path.getctime)
print(f'The file selected is: {latest_file}')

The file selected is: data\fix_cleaned_data_Houston_TX_Houses_merged_2023_1_1_12_5.csv


In [4]:
# Create the input DF
df_raw = pd.read_csv(latest_file)

#### Retreive the missing coordinates cells

In [27]:
# Get the list of zid's where lat and lng are invalid
missing_coordinates_list = df[df['Lat'].isnull()].zid.tolist()
length = len(missing_coordinates_list)

# Create an empty DF, only declare colun names
cnames = ['zid', 'LatOK', "LngOK"]
df_coord = pd.DataFrame(columns=cnames)

i = 1

for zid in missing_coordinates_list:
    # API specific (do not modify)
    host = "zillow-com1.p.rapidapi.com"
    url = "https://" + host + "/property" # Query of single property details

    # API specific (do not modify)
    querystring = {"zpid": zid}

    headers = {
        "X-RapidAPI-Key": key,
        "X-RapidAPI-Host": host
    }

    response = requests.request("GET", url, headers=headers, params=querystring)
    json_response = response.json()
    
    # Max 2 calls per second are allowed
    sleep(0.6)

    # If the response is successful (i.e., 200), then get the property details
    if response.status_code == 200:
        
        latOK = json_response['latitude']
        lngOK = json_response['longitude']
        
        print(f'working in item {i} of {length}, zid: {zid}, lat: {latOK}, lng: {lngOK}')
        
        i = i + 1

        # Append to output DF
        df_row = {'zid': zid, 'LatOK': latOK, 'LngOK': lngOK}
        df_coord = df_coord.append(df_row, ignore_index=True)

working in item 1 of 339, zid: 2061573385, lat: 30.091225, lng: -95.18045
working in item 1 of 339, zid: 2061963213, lat: 29.86276, lng: -95.298004
working in item 1 of 339, zid: 2060675685, lat: 29.898052, lng: -95.179955
working in item 1 of 339, zid: 2060310473, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2062368294, lat: 30.091225, lng: -95.18045
working in item 1 of 339, zid: 2063405724, lat: 29.859318, lng: -95.382195
working in item 1 of 339, zid: 2061600037, lat: 30.008938, lng: -95.443504
working in item 1 of 339, zid: 2063176996, lat: 29.859318, lng: -95.382195
working in item 1 of 339, zid: 2063178253, lat: 29.859318, lng: -95.382195
working in item 1 of 339, zid: 2063178036, lat: 29.859318, lng: -95.382195
working in item 1 of 339, zid: 2066644430, lat: 29.602442, lng: -95.44372
working in item 1 of 339, zid: 2060302552, lat: 29.855608, lng: -95.446785
working in item 1 of 339, zid: 2061866165, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2061

working in item 1 of 339, zid: 2061186467, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2063388122, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2060999643, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2060343153, lat: 30.091225, lng: -95.18045
working in item 1 of 339, zid: 2086424388, lat: 30.016987, lng: -95.112785
working in item 1 of 339, zid: 2060362270, lat: 29.73581, lng: -95.52097
working in item 1 of 339, zid: 2069113251, lat: 29.832806, lng: -95.41583
working in item 1 of 339, zid: 2085542934, lat: 29.844372, lng: -95.40341
working in item 1 of 339, zid: 2060940629, lat: 30.091225, lng: -95.18045
working in item 1 of 339, zid: 2060940686, lat: 30.091225, lng: -95.18045
working in item 1 of 339, zid: 2060905909, lat: 29.836266, lng: -95.41629
working in item 1 of 339, zid: 2060914198, lat: 29.836042, lng: -95.41629
working in item 1 of 339, zid: 2061455608, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2060952897, lat

working in item 1 of 339, zid: 2071343412, lat: 29.825167, lng: -95.277336
working in item 1 of 339, zid: 2064076690, lat: 29.898052, lng: -95.179955
working in item 1 of 339, zid: 2063420127, lat: 29.859318, lng: -95.382195
working in item 1 of 339, zid: 2061466855, lat: 29.811197, lng: -95.35167
working in item 1 of 339, zid: 2062182303, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2062182305, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2062182306, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2062183225, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2062183226, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2062183227, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2062183234, lat: 29.85329, lng: -95.4391
working in item 1 of 339, zid: 2064210808, lat: 29.797062, lng: -95.48026
working in item 1 of 339, zid: 2060303269, lat: 30.091225, lng: -95.18045
working in item 1 of 339, zid: 2061909335, lat: 3

working in item 1 of 339, zid: 2061711108, lat: 29.82681, lng: -95.42553
working in item 1 of 339, zid: 2061782100, lat: 29.82681, lng: -95.42553
working in item 1 of 339, zid: 2061782343, lat: 29.82681, lng: -95.42553
working in item 1 of 339, zid: 2061782434, lat: 29.82681, lng: -95.42553


#### Output the Coordinates DF

In [28]:
# Output file name
name = latest_file.split("data")[2]
out_name = 'missing_coordinates_data'+ name

# Create output directory if it does not exist
os.makedirs('./data/', exist_ok=True)

# Save the merged DF to a csv file
df_coord.to_csv(f'./data/{out_name}', index=False)

## I manually filled the missing coordinates cells
### Now the table need to be re-loaded

In [39]:
# Load the input data (only the most recent file)
data_path = "data/*.csv"
listing = glob.glob(data_path)
latest_file = max(listing, key=os.path.getctime)
print(f'The file selected is: {latest_file}')

The file selected is: data\manual_edit_fix_cleaned_data_Houston_TX_Houses_merged_2023_1_1_12_5.csv


In [40]:
# Create the input DF
df_raw = pd.read_csv(latest_file)

In [41]:
df_raw.head(3)

Unnamed: 0,zid,Lat,Lng,Price,Bedrooms,Bathrooms,constructedArea,zipCode,hasGarage,hasPool,hasCooling,hasView,yearBuilt,stories,parkingSpaces,annualHOI,schoolElemRating,schoolElemDist,schoolMidRating,schoolMidDist,schoolHighRating,schoolHighDist,priceIncreased,priceDiff,days,lotAreaSQFT,Address,lotArea,Image,CountyId,taxRate,annualHOA
0,27541924,29.6036,-95.485405,276000,3.0,2,3612.0,77489,1,0,1,0,1971.0,2.0,2,1159,6.0,0.5,3,1.3,2,1.1,1,31000,1213,8764.272,"6602 Indian Lake Dr, Missouri City, TX 77489",8764.272,https://photos.zillowstatic.com/fp/bc5a7196226...,2698,2.41,$400 annually
1,27542111,29.600388,-95.48821,220000,3.0,2,1270.0,77489,1,0,1,0,1972.0,1.0,1,924,6.0,0.3,3,1.3,2,1.0,1,65100,503,7884.36,"6727 Castleview Ln, Missouri City, TX 77489",7884.36,https://photos.zillowstatic.com/fp/1336380a837...,2698,2.41,$144 annually
2,27542455,29.595436,-95.48588,230000,4.0,2,1765.0,77489,1,0,1,0,1974.0,1.0,2,966,6.0,0.3,3,1.0,2,0.6,1,115275,5140,7148.196,"15902 Ridgerock Rd, Missouri City, TX 77489",7148.196,https://photos.zillowstatic.com/fp/5603ce32232...,2698,2.41,$250 annually


#### Do some more conditioning

In [42]:
df_copy = df_raw.copy()

df = df_copy.drop(['Address', 'lotArea', 'Image', 'CountyId', 'taxRate', 'annualHOA'], axis=1)
df.head(3)

Unnamed: 0,zid,Lat,Lng,Price,Bedrooms,Bathrooms,constructedArea,zipCode,hasGarage,hasPool,hasCooling,hasView,yearBuilt,stories,parkingSpaces,annualHOI,schoolElemRating,schoolElemDist,schoolMidRating,schoolMidDist,schoolHighRating,schoolHighDist,priceIncreased,priceDiff,days,lotAreaSQFT
0,27541924,29.6036,-95.485405,276000,3.0,2,3612.0,77489,1,0,1,0,1971.0,2.0,2,1159,6.0,0.5,3,1.3,2,1.1,1,31000,1213,8764.272
1,27542111,29.600388,-95.48821,220000,3.0,2,1270.0,77489,1,0,1,0,1972.0,1.0,1,924,6.0,0.3,3,1.3,2,1.0,1,65100,503,7884.36
2,27542455,29.595436,-95.48588,230000,4.0,2,1765.0,77489,1,0,1,0,1974.0,1.0,2,966,6.0,0.3,3,1.0,2,0.6,1,115275,5140,7148.196


#### Save the table

In [43]:
# Output file name
name = latest_file.split("data")[2]
out_name = 'Pre_ML_data'+ name

# Save the merged DF to a csv file
df.to_csv(f'./data/{out_name}', index=False)

#### QC a specific zid

In [50]:
host = "zillow-com1.p.rapidapi.com"
url = "https://" + host + "/property" # Query of single property details

# API specific (do not modify)
querystring = {"zpid": '2080317452'}

headers = {
    "X-RapidAPI-Key": key,
    "X-RapidAPI-Host": host
}

response = requests.request("GET", url, headers=headers, params=querystring)
json_response = response.json()

json_response

{'listingProvider': None,
 'buildingPermits': None,
 'propertyTaxRate': 2.29,
 'contact_recipients': [{'agent_reason': 1,
   'zpro': None,
   'recent_sales': 0,
   'review_count': 5,
   'display_name': 'Marianne Cowan',
   'zuid': 'X1-ZUwvno69ocbabt_8s1xy',
   'rating_average': 5,
   'badge_type': 'Premier Agent',
   'phone': {'prefix': '626', 'areacode': '281', 'number': '5640'},
   'image_url': 'https://photos.zillowstatic.com/h_n/ISi7defpty2atb0000000000.jpg'}],
 'solarPotential': None,
 'longitude': -95.17974,
 'zpid': 2080317452,
 'address': {'city': 'Porter',
  'neighborhood': None,
  'state': 'TX',
  'streetAddress': '0 Hueni Rd',
  'zipcode': '77365'},
 'cityId': 39051,
 'timeOnZillow': '215 days',
 'url': '/homedetails/0-Hueni-Rd-Porter-TX-77365/2080317452_zpid/',
 'zestimate': None,
 'imgSrc': 'https://photos.zillowstatic.com/fp/db183e1ca44ab7626ce8db6903f48fa6-p_d.jpg',
 'zipcode': '77365',
 'zestimateLowPercent': None,
 'livingAreaValue': 0,
 'dateSold': '',
 'livingArea': 

## I conducted some more manual edit to get the table ready for ML
### Now the table need to be re-loaded

In [55]:
data_path = "data/*.csv"
listing = glob.glob(data_path)
latest_file = max(listing, key=os.path.getctime)
print(f'The file selected is: {latest_file}')

The file selected is: data\Edited_Pre_ML_data_Houston_TX_Houses_merged_2023_1_1_12_5.csv


In [58]:
# Create the input DF
df_ml = pd.read_csv(latest_file)
df_ml = df_ml.drop(['zid'], axis=1)

In [59]:
df_ml.head(3)

Unnamed: 0,Lat,Lng,Price,Bedrooms,Bathrooms,constructedArea,zipCode,hasGarage,hasPool,hasCooling,hasView,yearBuilt,stories,parkingSpaces,annualHOI,schoolElemRating,schoolElemDist,schoolMidRating,schoolMidDist,schoolHighRating,schoolHighDist,priceIncreased,priceDiff,days,lotAreaSQFT
0,29.6036,-95.485405,276000,3,2,3612,77489,1,0,1,0,1971,2,2,1159,6,0.5,3,1.3,2,1.1,1,31000,1213,8764.272
1,29.600388,-95.48821,220000,3,2,1270,77489,1,0,1,0,1972,1,1,924,6,0.3,3,1.3,2,1.0,1,65100,503,7884.36
2,29.595436,-95.48588,230000,4,2,1765,77489,1,0,1,0,1974,1,2,966,6,0.3,3,1.0,2,0.6,1,115275,5140,7148.196


#### QC the df to be input for the ML model

In [60]:
df_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4499 entries, 0 to 4498
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Lat               4499 non-null   float64
 1   Lng               4499 non-null   float64
 2   Price             4499 non-null   int64  
 3   Bedrooms          4499 non-null   int64  
 4   Bathrooms         4499 non-null   int64  
 5   constructedArea   4499 non-null   int64  
 6   zipCode           4499 non-null   int64  
 7   hasGarage         4499 non-null   int64  
 8   hasPool           4499 non-null   int64  
 9   hasCooling        4499 non-null   int64  
 10  hasView           4499 non-null   int64  
 11  yearBuilt         4499 non-null   int64  
 12  stories           4499 non-null   int64  
 13  parkingSpaces     4499 non-null   int64  
 14  annualHOI         4499 non-null   int64  
 15  schoolElemRating  4499 non-null   int64  
 16  schoolElemDist    4499 non-null   float64


In [62]:
# Output file name
name = latest_file.split("data")[2]
out_name = 'ML_data'+ name

# Save the merged DF to a csv file
df_ml.to_csv(f'./data/{out_name}', index=False)