# Final Project: House Price Prediction
## Corpus Christi Team
### Step 3a (Clean the Data)

#### Import required libraries

In [1]:
import pandas as pd
import warnings
import glob
import os
import time
from time import sleep
from tqdm import tqdm
import re

# Silence every warning category for the rest of the notebook session
warnings.simplefilter('ignore')
# Show all columns when a DataFrame is displayed (no truncation)
pd.options.display.max_columns = None

#### Input Data

In [2]:
# Locate the CSV files written by the merge step and select the most recent one
data_path = "../02_Merge_Data/data/*.csv"
listing = glob.glob(data_path)

# Fail with an explicit message; max() on an empty list would otherwise raise
# an opaque ValueError that does not mention the missing input files.
if not listing:
    raise FileNotFoundError(f'No CSV files found matching: {data_path}')

# NOTE: os.path.getctime is the creation time on Windows but the time of the
# last metadata change on Unix, so "most recent" is platform dependent.
latest_file = max(listing, key=os.path.getctime)
print(f'The file selected is: {latest_file}')

The file selected is: ../02_Merge_Data/data\data_Houston_TX_Houses_merged_2023_1_1_12_5.csv


#### Load the csv file as a DF

In [3]:
# Read the selected CSV file into a DataFrame and report its dimensions
df_raw = pd.read_csv(latest_file)
numRows, numCols = df_raw.shape
print(f'rows x cols: {numRows} x {numCols} (raw input data)')

rows x cols: 7575 x 36 (raw input data)


#### Clean the DF

In [4]:
# Keep only the first row encountered for each 'zid' value
df = df_raw.drop_duplicates(subset=['zid'], keep='first')
numRows, numCols = df.shape
print(f'rows x cols: {numRows} x {numCols} (removed duplicated rows)')

rows x cols: 4501 x 36 (removed duplicated rows)


In [5]:
# Discard every column that contains nothing but NaN
df = df.dropna(axis='columns', how='all')
numRows, numCols = df.shape
print(f'rows x cols: {numRows} x {numCols} (removed empty columns)')

rows x cols: 4501 x 35 (removed empty columns)


In [6]:
# Discard columns whose count of distinct values is exactly one.
# nunique() is called with its defaults, so NaN is not counted as a value.
single_valued = df.columns[df.nunique() == 1]
df = df.drop(columns=single_valued)
numRows, numCols = df.shape
print(f'rows x cols: {numRows} x {numCols} (removed single value columns)')

rows x cols: 4501 x 33 (removed single value columns)


In [7]:
# Drop the 'Page' and 'Item' columns, which are not needed for ML
df = df.drop(columns=['Page', 'Item'])
numRows, numCols = df.shape
print(f'rows x cols: {numRows} x {numCols} (removed unnecessary columns)')

rows x cols: 4501 x 31 (removed unnecessary columns)


In [8]:
# Columns heavily imbalanced. Drop before input to ML
#df['CountyId'].value_counts()
#df['taxRate'].value_counts()

In [9]:
# Build lotAreaSQFT from lotArea: values of 100 or less are treated as acres
# and converted with sqft = acres x 43560; larger values are copied unchanged.
lot_area = df['lotArea']
df['lotAreaSQFT'] = lot_area.mask(lot_area <= 100, lot_area * 43560)

In [10]:
# Display the first two rows of the DataFrame as a quick visual check
df.head(2)

Unnamed: 0,zid,Address,Lat,Lng,Price,Image,Bedrooms,Bathrooms,lotArea,constructedArea,zipCode,CountyId,taxRate,hasGarage,hasPool,hasCooling,hasView,yearBuilt,stories,parkingSpaces,annualHOI,annualHOA,schoolElemRating,schoolElemDist,schoolMidRating,schoolMidDist,schoolHighRating,schoolHighDist,priceIncreased,priceDiff,days,lotAreaSQFT
0,28375002,"8315 Enchanted Forest Dr, Houston, TX 77088",29.88962,-95.475,129900.0,https://photos.zillowstatic.com/fp/e6e94b579a3...,3.0,2,0.2617,1969.0,77088,1090,2.29,1,0,1,0,1982.0,1.0,2,995,$300 annually,4.0,0.8,4,0.6,2,5.0,1,107100,2432,11399.652
1,84026428,"415 Remington Bend Ct, Houston, TX 77073",29.971046,-95.41435,250000.0,https://photos.zillowstatic.com/fp/776702db898...,3.0,3,4159.98,1862.0,77073,1090,2.29,1,0,1,0,2007.0,2.0,2,995,$570 annually,5.0,0.7,2,6.0,1,3.0,0,-13000,49,4159.98


In [None]:
# Filter a column by its values
#df.loc[df['constructedArea'] == 0] 

#### Save the clean data

In [11]:
# Build the output file name from the input file name only. Splitting the whole
# path on "data" depends on how many times that substring occurs in the folder
# and file names, and can truncate the name (e.g. a file ending in "_metadata.csv").
in_name = os.path.basename(latest_file)
# Strip the leading 'data' prefix used by the merge step, if present
name = in_name[len('data'):] if in_name.startswith('data') else '_' + in_name
out_name = 'cleaned_data' + name

# Create output directory if it does not exist
os.makedirs('./data/', exist_ok=True)

# Save the cleaned DF to a csv file (without the index column)
df.to_csv(f'./data/{out_name}', index=False)

In [None]:
# Condition the annual HOA column (NEEDS REVIEW)
#df['annualHOA']
#df['annualHOACleaned'] = df['annualHOA'].str.replace('annually|\$|\,', '')

#df['match'] = df["annualHOACleaned"].str.count('monthly', re.I)
#df['annualHOACleaned'] = df['annualHOACleaned'].str.replace('monthly', '')
#df['annualHOACleaned'] = df['annualHOACleaned'].fillna(0)
#df['annualHOACleaned'] = df['annualHOACleaned'].astype('int')
#df['annualHOACleaned']

#df['match']

#df['annualHOACleaned'].apply(lambda x: x/12 if df['match'] == 1 else x)
#df['annualHOACleaned'] = df['annualHOACleaned'].fillna(0)


#df['annualHOACleaned'].apply(lambda x: x/12)
#df['annualHOACleaned'].apply(lambda x: x/12 if df['match'] == 1.0 else x)
#df['annualHOACleaned'] = df['annualHOACleaned'].fillna(0)