# Final Project: House Price Prediction
## Corpus Christi Team
### Step 2 (Merge the Data)

#### Import required libraries

In [2]:
import pandas as pd
import warnings
import glob
import os
import time
from time import sleep
from tqdm import tqdm

# Notebook-wide settings: silence every warning and let pandas display
# all columns of a frame instead of truncating wide tables.
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

#### Input Data

In [3]:
# Glob pattern matching every per-page CSV produced by the previous step.
data_path = "../01_Get_Data_w_Property_Details/data/*.csv"

# glob.glob() returns matches in arbitrary, OS-dependent order. Sort them so
# the merge order (and therefore which duplicate row is kept by a later
# keep="first" de-duplication) is the same on every run and every machine.
listing = sorted(glob.glob(data_path))

#### Define Search Criteria Variables

In [4]:
# Search criteria; city, state and homeType are embedded in the output file name
city, state = "Houston", "TX"
homeType = "Houses"
location = f"{city}, {state}"

#### Output Data

In [5]:
# Create output directory if it does not exist
os.makedirs('./data/', exist_ok=True)

# Output file name: data_<city>_<state>_<homeType>_merged_<timestamp>.csv
# The timestamp parts are converted to int, so they are NOT zero-padded
# (e.g. 2021_3_7_9_5). The fifth component is named `minute` rather than
# `min` so the builtin min() is not shadowed for the rest of the session.
year, month, day, hour, minute = map(int, time.strftime("%Y %m %d %H %M").split())
time_stamp = f'{year}_{month}_{day}_{hour}_{minute}'
out_name = f"data_{city}_{state}_{homeType}_merged_{time_stamp}.csv"

#### Merge the Data Frames

In [6]:
# Expected column layout of the merged table; used to seed the empty frame so
# these columns come first even when no input file is found.
cnames = ['Page', 'Item', 'zid', 'State', 'City', 'Address', 'Lat', 'Lng', 'Price', 'Image', 'Bedrooms', 'Bathrooms', 'lotArea', 'constructedArea', 'zipCode', 'CountyId', 'taxRate', 'hasGarage', 'hasPool', 'hasCooling', 'hasView', 'yearBuilt', 'stories', 'parkingSpaces', 'annualHOI', 'annualHOA', 'Zone', 'schoolElemRating', 'schoolElemDist', 'schoolMidRating', 'schoolMidDist', 'schoolHighRating', 'schoolHighDist', 'priceIncreased', 'priceDiff', 'days']
df_merged = pd.DataFrame(columns=cnames)


def merge_data(csv):
    """Read one CSV file and append its rows to the global ``df_merged``.

    Kept for appending a single file by hand. Uses ``pd.concat`` because
    ``DataFrame.append`` was removed in pandas 2.0.

    Parameters
    ----------
    csv : str
        Path of the CSV file to read.
    """
    global df_merged
    df_merged = pd.concat([df_merged, pd.read_csv(csv)], ignore_index=True)


# Read every file first and concatenate once: growing the frame inside the
# loop would re-copy all previously merged rows on each iteration.
pbar = tqdm(listing)
frames = []
for csv in pbar:
    pbar.set_description(f'Processing file {csv}')
    frames.append(pd.read_csv(csv))

# The empty seed frame goes first so the cnames columns lead the result.
df_merged = pd.concat([df_merged, *frames], ignore_index=True)

Processing file ../01_Get_Data_w_Property_Details/data\data_Houston_TX_Houses_p9_price_700001_900000.csv: 100%


#### Remove duplicated rows

In [7]:
# Report the size of the merged table before de-duplication
numRows, numCols = df_merged.shape
print(f'rows x cols: {numRows} x {numCols} (raw input data)')

# Keep only the first row encountered for each zid, then report the new size
df_merged = df_merged.drop_duplicates(subset='zid', keep="first")
numRows, numCols = df_merged.shape
print(f'rows x cols: {numRows} x {numCols} (removed duplicated rows)')

rows x cols: 7575 x 36 (raw input data)
rows x cols: 4501 x 36 (removed duplicated rows)


#### Save the data

In [10]:
# Order the rows by ascending zid
df_merged = df_merged.sort_values('zid')

# Write the merged table to ./data/ without the index column
out_path = f'./data/{out_name}'
df_merged.to_csv(out_path, index=False)