In [None]:
# Final Project: House Price Prediction
## Corpus Christi Team
### Step 3b (Prepare the data for visualization and for machine learning)

Remove unnecessary columns for ML, replace empty or invalid cells,
assign data types, encode, etc.

#### Import required libraries

In [1]:
import pandas as pd
import requests
import json
import re
import os
import glob
from datetime import datetime as dt
import warnings
from time import sleep
from tqdm import tqdm

# Silence every warning category for the rest of the session
warnings.simplefilter('ignore')
# Let pandas display all the columns of a wide DataFrame
pd.options.display.max_columns = None

# Import your RapidAPI key
from config import krapid as key

#### Read the input data

In [2]:
# Load the input data (only the most recent file).
# Only the step-3a cleaned files are matched: this notebook saves its own
# 03b_* outputs into the same ./data/ folder, so a bare "*.csv" pattern
# would pick one of those outputs as the "latest" file on a re-run.
in_path = "./data/03a_cleaned_data*.csv"
listing = glob.glob(in_path)

# Fail with an explicit message instead of max() complaining about an empty list
if not listing:
    raise FileNotFoundError(f'No input file matching {in_path!r} was found')

# Most recent file according to os.path.getctime
latest_file = max(listing, key=os.path.getctime)
print(f'The file selected is: {latest_file}')

The file selected is: ./data\03a_cleaned_data_Houston_TX_Houses_merged_2023_1_8_12_2.csv


In [3]:
# Read the selected CSV file into the raw input DataFrame
df_raw = pd.read_csv(filepath_or_buffer=latest_file)

#### Do some data pre-processing

In [4]:
# Work on a copy so that df_raw keeps the data exactly as it was loaded
df = df_raw.copy()

# lotArea entries of at most 100 are handled as acres and converted with
# sqft = acres x 43560; larger entries are kept unchanged.
SQFT_PER_ACRE = 43560
ACRES_CUTOFF = 100
lot_area = df['lotArea']
df['lotAreaSQFT'] = lot_area.mask(lot_area <= ACRES_CUTOFF, lot_area * SQFT_PER_ACRE)

In [5]:
# Replace all null values with the mean of their column.
# numeric_only=True restricts the means to the numeric columns: without it,
# pandas >= 2.0 raises a TypeError on the text columns (Address, Image),
# while older versions only emit a FutureWarning that the global warnings
# filter hides. Nulls in non-numeric columns are left as they are.
df = df.fillna(df.mean(numeric_only=True))

In [6]:
# Remove outlier prices: keep only the rows with a Price of at least 200000.
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the previous frame
# (the SettingWithCopyWarning case).
df = df[df['Price'] >= 200000].copy()

# Average the school ratings and distances row-wise over the
# elementary, middle and high school columns
df['avgSchoolRating'] = df[['schoolElemRating', 'schoolMidRating', 'schoolHighRating']].mean(axis=1)
df['avgSchoolDist'] = df[['schoolElemDist', 'schoolMidDist', 'schoolHighDist']].mean(axis=1)

# Check for null values (non-null count and dtype of every column)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4500 entries, 0 to 4500
Data columns (total 34 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   zid               4500 non-null   int64  
 1   Address           4500 non-null   object 
 2   Lat               4500 non-null   float64
 3   Lng               4500 non-null   float64
 4   Price             4500 non-null   int64  
 5   Image             4500 non-null   object 
 6   Bedrooms          4500 non-null   float64
 7   Bathrooms         4500 non-null   int64  
 8   lotArea           4500 non-null   float64
 9   constructedArea   4500 non-null   float64
 10  zipCode           4500 non-null   int64  
 11  CountyId          4500 non-null   int64  
 12  taxRate           4500 non-null   float64
 13  hasGarage         4500 non-null   int64  
 14  hasPool           4500 non-null   int64  
 15  hasCooling        4500 non-null   int64  
 16  hasView           4500 non-null   int64  


#### Save the pre-processed data (this can be used for data viz)

In [7]:
# Build the output name by swapping the input prefix for the 03b one.
# The bare file name is used: splitting the whole path on "data" only works
# when that word appears exactly twice in it (folder name + file prefix).
in_prefix = '03a_cleaned_data'
base_name = os.path.basename(latest_file)
if base_name.startswith(in_prefix):
    name = base_name[len(in_prefix):]
else:
    # Unexpected input name: keep it whole so the output is still traceable
    name = f'_{base_name}'
ou_name = '03b_preprocessed_data' + name

# Create output directory if it does not exist
ou_path = './data/'
os.makedirs(ou_path, exist_ok=True)

# Save to a csv file (the DataFrame index is not written)
df.to_csv(f'{ou_path}{ou_name}', index=False)

#### Do some basic pre-conditioning for ML

In [8]:
# Columns that are unnecessary for ML and are removed from the frame
ml_unused_cols = ['zid', 'Address', 'lotArea', 'Image', 'annualHOA', 'annualHOI']
df = df.drop(columns=ml_unused_cols)

# Check for null values (non-null count and dtype of the remaining columns)
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 4500 entries, 0 to 4500
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Lat               4500 non-null   float64
 1   Lng               4500 non-null   float64
 2   Price             4500 non-null   int64  
 3   Bedrooms          4500 non-null   float64
 4   Bathrooms         4500 non-null   int64  
 5   constructedArea   4500 non-null   float64
 6   zipCode           4500 non-null   int64  
 7   CountyId          4500 non-null   int64  
 8   taxRate           4500 non-null   float64
 9   hasGarage         4500 non-null   int64  
 10  hasPool           4500 non-null   int64  
 11  hasCooling        4500 non-null   int64  
 12  hasView           4500 non-null   int64  
 13  yearBuilt         4500 non-null   float64
 14  stories           4500 non-null   float64
 15  parkingSpaces     4500 non-null   int64  
 16  schoolElemRating  4500 non-null   float64


#### Save the data conditioned for ML

In [9]:
# Build the output name by swapping the input prefix for the 03b ML one.
# The bare file name is used: splitting the whole path on "data" only works
# when that word appears exactly twice in it (folder name + file prefix).
in_prefix = '03a_cleaned_data'
base_name = os.path.basename(latest_file)
if base_name.startswith(in_prefix):
    name = base_name[len(in_prefix):]
else:
    # Unexpected input name: keep it whole so the output is still traceable
    name = f'_{base_name}'
ou_name = '03b_preprocessed4ML_data' + name

# Create output directory if it does not exist
ou_path = './data/'
os.makedirs(ou_path, exist_ok=True)

# Save to a csv file (the DataFrame index is not written)
df.to_csv(f'{ou_path}{ou_name}', index=False)