# Final Project: House Price Prediction
## Corpus Christi Team
### Step 3a (Clean the Data)

#### Import required libraries

In [1]:
import pandas as pd
import warnings
import glob
import os
import time
from time import sleep
from tqdm import tqdm
import re

# Notebook-wide settings: show every column when a DataFrame is displayed,
# and suppress all warnings for the rest of the session.
pd.set_option('display.max_columns', None)
warnings.simplefilter('ignore')

#### Input Data

In [2]:
# Load the input data (only the most recent file)
in_path = "../02_Merge_Data/data/*.csv"
listing = glob.glob(in_path)

# Fail with an actionable message when nothing matches the pattern; otherwise
# max() below raises the opaque "max() arg is an empty sequence".
if not listing:
    raise FileNotFoundError(f'No input CSV files found matching: {in_path}')

# NOTE(review): os.path.getctime is the creation time on Windows but the time
# of the last metadata change on Unix, so "most recent" is platform-dependent.
latest_file = max(listing, key=os.path.getctime)
print(f'The file selected is: {latest_file}')

The file selected is: ../02_Merge_Data/data\data_Houston_TX_Houses_merged_2023_1_8_12_2.csv


#### Load the csv file as a DF

In [3]:
# Read the selected CSV into a DataFrame and report its dimensions.
df_raw = pd.read_csv(latest_file)
numRows, numCols = df_raw.shape
print(f'rows x cols: {numRows} x {numCols} (raw input data)')

rows x cols: 4501 x 36 (raw input data)


#### Clean the DF

In [4]:
# Work on a copy so the raw DataFrame stays untouched, then count the rows
# whose 'zid' repeats an earlier one (expected to be 0: duplicates should
# already have been removed in the previous step).
df = df_raw.copy()
zid_is_repeated = df['zid'].duplicated()
zid_is_repeated.sum()

0

In [5]:
# Drop every column that contains nothing but NaN, then report the new shape.
df = df.dropna(axis='columns', how='all')
numRows, numCols = df.shape
print(f'rows x cols: {numRows} x {numCols} (removed empty columns)')

rows x cols: 4501 x 35 (removed empty columns)


In [6]:
# Drop the columns that hold exactly one distinct value
# (nunique() does not count NaN by default), then report the new shape.
single_valued = df.columns[df.nunique() == 1]
df = df.drop(columns=single_valued)
numRows, numCols = df.shape
print(f'rows x cols: {numRows} x {numCols} (removed single value columns)')

rows x cols: 4501 x 33 (removed single value columns)


In [7]:
# Drop the 'Page' and 'Item' columns, which are not needed downstream,
# then report the new shape.
df = df.drop(columns=['Page', 'Item'])
numRows, numCols = df.shape
print(f'rows x cols: {numRows} x {numCols} (removed unnecessary columns)')

rows x cols: 4501 x 31 (removed unnecessary columns)


#### Save the cleaned raw data

In [8]:
# At this point, the data is considered as cleaned raw.

# Create output directory if it does not exist
ou_path = './data/'
os.makedirs(ou_path, exist_ok=True)

# For the output file name, just prepend a prefix to the input file name.
# Only the base name is used, with its leading "data" removed so the new
# prefix takes its place. Splitting the full path on "data" is fragile: any
# other "data" in a directory or in the file name shifts or truncates the
# result.
in_base = os.path.basename(latest_file)
in_name = in_base[len('data'):] if in_base.startswith('data') else in_base
ou_name = '03a_raw_data' + in_name

# Save the cleaned raw DF to a csv file
df.to_csv(f'{ou_path}{ou_name}', index=False)

#### Continue with the data pre-processing

In [9]:
# Build 'lotAreaSQFT' from 'lotArea': values of at most 100 are treated as
# acres and converted with sqft = acres x 43560; larger values (presumably
# already in sqft) and NaN are carried over unchanged.
lot_area = df['lotArea']
df['lotAreaSQFT'] = lot_area.mask(lot_area <= 100, lot_area * 43560)

#### Save the clean data

In [10]:
# Derive the output name from the input file's base name, dropping its
# leading "data" so the new prefix takes its place. Splitting the full path
# on "data" is fragile: any other "data" in a directory or in the file name
# shifts or truncates the result.
in_base = os.path.basename(latest_file)
name = in_base[len('data'):] if in_base.startswith('data') else in_base
out_name = '03a_cleaned_data' + name

# Create output directory if it does not exist
os.makedirs('./data/', exist_ok=True)

# Save the cleaned DF to a csv file
df.to_csv(f'./data/{out_name}', index=False)

In [11]:
# Reload the file that was just written and preview its first three rows
# as a quick sanity check.
check_path = './data/' + out_name
df_4check = pd.read_csv(check_path)
df_4check.head(3)

Unnamed: 0,zid,Address,Lat,Lng,Price,Image,Bedrooms,Bathrooms,lotArea,constructedArea,zipCode,CountyId,taxRate,hasGarage,hasPool,hasCooling,hasView,yearBuilt,stories,parkingSpaces,annualHOI,annualHOA,schoolElemRating,schoolElemDist,schoolMidRating,schoolMidDist,schoolHighRating,schoolHighDist,priceIncreased,priceDiff,days,lotAreaSQFT
0,27541924,"6602 Indian Lake Dr, Missouri City, TX 77489",29.6036,-95.485405,276000,https://photos.zillowstatic.com/fp/bc5a7196226...,3.0,2,8764.272,3612.0,77489,2698,2.41,1,0,1,0,1971.0,2.0,2,1159,$400 annually,6.0,0.5,3,1.3,2,1.1,1,31000,1213,8764.272
1,27542111,"6727 Castleview Ln, Missouri City, TX 77489",29.600388,-95.48821,220000,https://photos.zillowstatic.com/fp/1336380a837...,3.0,2,7884.36,1270.0,77489,2698,2.41,1,0,1,0,1972.0,1.0,1,924,$144 annually,6.0,0.3,3,1.3,2,1.0,1,65100,503,7884.36
2,27542455,"15902 Ridgerock Rd, Missouri City, TX 77489",29.595436,-95.48588,230000,https://photos.zillowstatic.com/fp/5603ce32232...,4.0,2,7148.196,1765.0,77489,2698,2.41,1,0,1,0,1974.0,1.0,2,966,$250 annually,6.0,0.3,3,1.0,2,0.6,1,115275,5140,7148.196
