In [1209]:
# This notebook will take the housing data scraped earlier and clean/format it for further investigation.
import pandas as pd
import numpy as np
import csv
from matplotlib import pyplot as plt

pd.options.display.max_columns = None

In [1210]:
priceRange = [
    [1, 250000], [250001, 300000], [300001, 350000], [350001, 400000],
    [400001, 450000], [450001, 500000], [500001, 750000], [750001, 1000000],
    [1000001, 2000000], [2000001, 5000000], [5000001, 50000000]
    ]

In [1211]:
# Create function to take in column of dataframe and return modified column
# Function will take in resoFacts.associationFee and determine if its monthly, quarterly, etc and div value by appropriate amount
# Returned dataframe series will just a number representing monthly HOA fee due.

def hoaFees(series: pd.DataFrame):
    paymentFreq = series.apply(lambda x: str(x).strip("$,1234567890 "))
    payment = series.apply(lambda x: str(x).strip("$,-abcdefghijklmnopqrstuvwxyz ").replace(',', ''))
    payment = payment.apply(lambda x: float(x))
    paymentFreq.replace('', 0, inplace=True)
    for i, freq in enumerate(paymentFreq):
        if freq == 'monthly':
            payment[i] = payment[i]
        elif freq == 'quarterly':
            payment[i] = payment[i]/4
        elif freq == 'semi-annually':
            payment[i] = payment[i]/6
        elif freq == 'annually':
            payment[i] = payment[i]/12
        else:
            payment[i] = 0    
    payment = round(payment)
    return payment


# Concat partial URL with website prefix
def fullURL(hdpURL: str):
    hdpURL = 'www.zillow.com' + str(hdpURL)
    return hdpURL


# Set area units to a consistent value of 'Square Feet'
def areaUnitConversion(units: pd.DataFrame):
    # Input validation
    if 'lotAreaUnits' not in units.columns or 'lotAreaValue' not in units.columns:
        raise ValueError("Input DataFrame must have 'lotAreaUnits' and 'lotAreaValue' columns.")
    # Create a copy of the DataFrame to avoid in-place changes
    units_copy = units.copy(deep=True)
    for idx in units_copy.index:
        if units_copy.loc[idx, 'lotAreaUnits'] == 'Acres':
            units_copy.loc[idx, 'lotAreaValue'] *= 43560
            units_copy.loc[idx, 'lotAreaUnits'] = 'Square Feet'
        elif units_copy.loc[idx, 'lotAreaUnits'] == 'sqft':
            units_copy.loc[idx, 'lotAreaUnits'] = 'Square Feet'
    return units_copy


def timeOnZillowConversion(toz: str):
    units = toz.strip(' 1234567890')
    if units == 'day':
        units = toz
    elif units == 'days':
        units = toz
    else:
        units = '1 day'
    return units


'''
Function to calculate outliers and return list of outliers and non-outliers.
'''
def outliersAll(sortedList: pd.DataFrame):
    outlierBoolean = []
    Q1 = np.percentile(sortedList, 25)
    Q3 = np.percentile(sortedList, 75)
    IQR = Q3 - Q1
    upperBound = Q3 + 1.5*IQR
    lowerBound = Q1 - 1.5*IQR
    for value in sortedList:
        if (value < lowerBound or value > upperBound):
            outlierBoolean.append(1)
        else:
            outlierBoolean.append(0)
    return outlierBoolean


def outliersSpecific(sortedList: pd.DataFrame, column: str):
    outlierBoolean = [0] * sortedList.shape[0]
    for zipcode in sortedList['zipcode'].unique():
        count = 0
        zipcodeIndex = sortedList.index[sortedList['zipcode']==zipcode]
        Q1 = np.percentile(sortedList[column].loc[sortedList['zipcode']==zipcode], 25)
        Q3 = np.percentile(sortedList[column].loc[sortedList['zipcode']==zipcode], 75)
        IQR = Q3 - Q1
        upperBound = Q3 + 1.5*IQR
        lowerBound = Q1 - 1.5*IQR
        for TF in zipcodeIndex:
            if (sortedList[column][TF] < lowerBound or sortedList[column][TF] > upperBound):
                outlierBoolean[TF] = 1
                count+=1
    return outlierBoolean

'''
Function to take any boolean feature and deal with blanks
'''
def cleanBoolean(feature: pd.Series, replaceBlank=False):
    feature = feature.replace(r'\s+', replaceBlank, regex=True)
    feature = feature.replace(np.nan, replaceBlank)
    return feature

def cleanValue(feature: pd.Series, replaceBlank: dict):
    feature = feature.fillna(value=replaceBlank.get('replace_value'))
    feature.replace(r'\s+', replaceBlank.get('replace_value'), regex=True, inplace=True)
    return feature

'''
Parse JSON format for school rankings
'''
def schoolScores(schools: pd.Series):
    finalRating = []
    finalDistance = []
    for i in range(0, schools.shape[0]):
        distance = []
        grades = []
        rating = []
        level = []
        item = eval(schools.iloc[i])
        for j in range(0, len(item)):
            distance.append(item[j].get('distance'))
            grades.append(item[j].get('grades'))
            rating.append(item[j].get('rating'))
            level.append(item[j].get('level'))
        schoolDict = {'distance':distance, 'grades':grades, 'rating':rating, 'level':level}
        df_school = pd.DataFrame(schoolDict)
        df_school = df_school[['distance', 'rating']].groupby(df_school['level']).max().reset_index()
        finalRating.append(df_school['rating'].max())
        finalDistance.append(df_school['distance'].min())
    output = pd.DataFrame({'schoolMaxRating':finalRating, 'schoolMinDistance':finalDistance})
    return output

In [1212]:
for i, price in enumerate(priceRange):
    if i == 0:
        df_forSale = pd.read_csv(f'Raw_Housing_Data/housingData_{price[0]}_{price[1]}.csv',
                                 index_col=0, dtype={'zipcode':str})
        df_recentlySold = pd.read_csv(f'Raw_Housing_Data/housingData_recentlySold_{price[0]}_{price[1]}.csv',
                                      index_col=0, dtype={'zipcode':str})
    else:
        df_forSale = pd.concat([df_forSale, pd.read_csv(f'Raw_Housing_Data/housingData_{price[0]}_{price[1]}.csv',
                                                        index_col=0)], axis=0, ignore_index=True)
        df_recentlySold = pd.concat([df_recentlySold, pd.read_csv(f'Raw_Housing_Data/housingData_recentlySold_{price[0]}_{price[1]}.csv',
                                                                  index_col=0)], axis=0, ignore_index=True)

In [1213]:
columnNames_forSale = list(map(lambda x: x.split(', '), df_forSale.columns))
columnNames_recentlySold = list(map(lambda x: x.split(','), df_recentlySold))

with open('Raw_Housing_Data/dataframeColumnNames_forSale.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(columnNames_forSale)

with open('Raw_Housing_Data/dataframeColumnNames_recentlySold.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(columnNames_recentlySold)

In [1214]:
# Manually select column names I want to keep in my downsized data frame
columnNames_forSale = ['zpid','homeType','homeStatus','timeOnZillow.1','price','streetAddress','zipcode','yearBuilt','bedrooms','bathrooms','livingAreaValue','livingAreaUnits',
                       'lotAreaValue','lotAreaUnits','zestimate','latitude','longitude','favoriteCount','pageViewCount','newConstructionType','resoFacts.associationFee',
                       'listingSubType.isForeclosure','resoFacts.isSeniorCommunity','resoFacts.propertySubType','hdpUrl']

columnNames_recentlySold = ['zpid','homeType','homeStatus','dateSoldString','timeOnZillow.1','price','streetAddress','zipcode','yearBuilt','bathrooms','bedrooms','livingAreaValue','livingAreaUnits',
                            'lotAreaValue','lotAreaUnits','zestimate','latitude','longitude','favoriteCount','pageViewCount','newConstructionType','resoFacts.associationFee',
                            'listingSubType.isForeclosure','resoFacts.isSeniorCommunity', 'resoFacts.propertySubType','hdpUrl']

addtl = ['taxAssessedValue','cityId','rentZestimate','restimateHighPercent','restimateLowPercent','schools','mortgageRates.thirtyYearFixedRate','mortgageRates.fifteenYearFixedRate',
         'mortgageRates.arm5Rate','resoFacts.garageSpaces','resoFacts.hasAdditionalParcels','resoFacts.hasAssociation','resoFacts.hasAttachedGarage',
         'resoFacts.hasAttachedProperty','resoFacts.hasGarage','resoFacts.hasPrivatePool','resoFacts.hasSpa','resoFacts.hasView','resoFacts.hasWaterfrontView','resoFacts.taxAnnualAmount',
         'resoFacts.waterViewYN']

columnNames_forSale = columnNames_forSale + addtl
columnNames_recentlySold = columnNames_recentlySold + addtl

df_forSale_trunc = df_forSale.loc[:,columnNames_forSale]
df_recentlySold_trunc = df_recentlySold.loc[:,columnNames_recentlySold]
del df_forSale, df_recentlySold

In [1215]:
# Do some initial data cleanup
# Set NaN values to 0. Compute quivalent monthly HOA fees to keep data consistent.
df_forSale_trunc['resoFacts.associationFee'] = df_forSale_trunc['resoFacts.associationFee'].fillna(0)
df_recentlySold_trunc['resoFacts.associationFee'] = df_recentlySold_trunc['resoFacts.associationFee'].fillna(0)
df_forSale_trunc['resoFacts.associationFee'] = hoaFees(df_forSale_trunc['resoFacts.associationFee'])
df_recentlySold_trunc['resoFacts.associationFee'] = hoaFees(df_recentlySold_trunc['resoFacts.associationFee'])

# Set NaN values to 'False'. True is already designated by default, where applicable.
df_forSale_trunc['resoFacts.isSeniorCommunity'] = df_forSale_trunc['resoFacts.isSeniorCommunity'].fillna(False)
df_recentlySold_trunc['resoFacts.isSeniorCommunity'] = df_recentlySold_trunc['resoFacts.isSeniorCommunity'].fillna(False)

# Reformat the string representing the property subtype so it can be easily worked on.
# Removing any RV Community data as it is not a valid 'single family home' sub type for this scope.
df_forSale_trunc['resoFacts.propertySubType'] = df_forSale_trunc['resoFacts.propertySubType'].apply(lambda x: str(x).strip("['']"))
df_recentlySold_trunc['resoFacts.propertySubType'] = df_recentlySold_trunc['resoFacts.propertySubType'].apply(lambda x: str(x).strip("['']"))
df_forSale_trunc = df_forSale_trunc.loc[df_forSale_trunc['resoFacts.propertySubType']!='RV Community']
df_recentlySold_trunc = df_recentlySold_trunc.loc[df_recentlySold_trunc['resoFacts.propertySubType']!='RV Community']

# Add prefix to website
df_forSale_trunc['hdpUrl']=df_forSale_trunc['hdpUrl'].apply(fullURL)
df_recentlySold_trunc['hdpUrl']=df_recentlySold_trunc['hdpUrl'].apply(fullURL)

# Convert yearBuilt to integer and drop any rows that have yearBuilt as NaN
df_forSale_trunc.dropna(subset=['yearBuilt'], inplace=True)
df_recentlySold_trunc.dropna(subset=['yearBuilt'], inplace=True)
df_forSale_trunc['yearBuilt']=df_forSale_trunc['yearBuilt'].astype(int)
df_recentlySold_trunc['yearBuilt']=df_recentlySold_trunc['yearBuilt'].astype(int)

# Drop all labels with lotAreaValue = NaN
df_forSale_trunc.dropna(subset=['lotAreaValue'], inplace=True)
df_recentlySold_trunc.dropna(subset=['lotAreaValue'], inplace=True)

# Drop all labels with lat/long = NaN
df_forSale_trunc.dropna(subset=['latitude', 'longitude'], inplace=True)
df_recentlySold_trunc.dropna(subset=['latitude', 'longitude'], inplace=True)

'''
Switch zestimate value from 0 to list price
Shown in EDA section, list price and zestimate are almost 1:1 correlated.
'''
indexList = df_forSale_trunc.index[df_forSale_trunc['zestimate'].isna()].tolist()
df_forSale_trunc.loc[indexList, 'zestimate'] = df_forSale_trunc.loc[indexList, 'price']
indexList = df_recentlySold_trunc.index[df_recentlySold_trunc['zestimate'].isna()].tolist()
df_recentlySold_trunc.loc[indexList, 'zestimate'] = df_recentlySold_trunc.loc[indexList, 'price']
# df_forSale_trunc.fillna(value=value, inplace=True)
# df_recentlySold_trunc.fillna(value=value, inplace=True)
'''End of zestimate update section'''

# Drop all labels with livingAreaValue = NaN -- This is the primary feature for price prediction.
# Could try to do more searches on $/sqft but for now this will suffice.
df_forSale_trunc.dropna(subset=['livingAreaValue'], inplace=True)
df_recentlySold_trunc.dropna(subset=['livingAreaValue'], inplace=True)

# Drop all livingAreaValue rows == 0. Zero here will cause inf. on price/sqft calc. <- breaks feature scaling and model building.
df_forSale_trunc = df_forSale_trunc.loc[df_forSale_trunc['livingAreaValue']>=500]
df_recentlySold_trunc = df_recentlySold_trunc.loc[df_recentlySold_trunc['livingAreaValue']>=500]

# Drop all bathroom and bedroom values of NaN
df_forSale_trunc.dropna(subset=['bathrooms', 'bedrooms'], inplace=True)
df_recentlySold_trunc.dropna(subset=['bathrooms', 'bedrooms'], inplace=True)

# Fill favoriteCount and pageViewCount with 0
value = {'favoriteCount':0, 'pageViewCount':0}
df_forSale_trunc.fillna(value=value, inplace=True)
df_recentlySold_trunc.fillna(value=value, inplace=True)

# Need to set all timeOnZillow.1 values of less than one day to one day.
df_recentlySold_trunc['timeOnZillow.1'].mask(df_recentlySold_trunc['dateSoldString']=='2024-02-15', '1 day', inplace=True)
df_recentlySold_trunc['timeOnZillow.1'] = df_recentlySold_trunc['timeOnZillow.1'].apply(timeOnZillowConversion)
df_forSale_trunc['timeOnZillow.1'] = df_forSale_trunc['timeOnZillow.1'].apply(timeOnZillowConversion)
df_forSale_trunc['timeOnZillow.1'] = df_forSale_trunc['timeOnZillow.1'].apply(lambda x: x.strip(' days')).astype(int)
df_recentlySold_trunc['timeOnZillow.1'] = df_recentlySold_trunc['timeOnZillow.1'].apply(lambda x: x.strip(' days')).astype(int)
df_recentlySold_trunc.rename(columns={'timeOnZillow.1':'daysOnZillow'}, inplace=True)
df_forSale_trunc.rename(columns={'timeOnZillow.1':'daysOnZillow'}, inplace=True)

# Need to set newConstructionType to either True of False
df_recentlySold_trunc['newConstructionType'] = cleanBoolean(df_recentlySold_trunc['newConstructionType'].copy())
df_forSale_trunc['newConstructionType'] = cleanBoolean(df_forSale_trunc['newConstructionType'].copy())

# Need to keep lotAreaUnits and livingAreaUnits consistently set to Square Feet.
# Will need to create a function to deal with these.
df_forSale_trunc[['lotAreaValue','lotAreaUnits']] = areaUnitConversion(df_forSale_trunc[['lotAreaValue','lotAreaUnits']])
df_recentlySold_trunc[['lotAreaValue','lotAreaUnits']] = areaUnitConversion(df_recentlySold_trunc[['lotAreaValue','lotAreaUnits']])

# Convert dateSoldString to datetime object
df_recentlySold_trunc['dateSoldString'] = pd.to_datetime(df_recentlySold_trunc['dateSoldString'])

'''Creating new feature -- price/sqft'''
df_forSale_trunc['price/sqft'] = df_forSale_trunc['price'].astype(float)/df_forSale_trunc['livingAreaValue'].astype(float)
df_recentlySold_trunc['price/sqft'] = df_recentlySold_trunc['price'].astype(float)/df_recentlySold_trunc['livingAreaValue'].astype(float)

df_forSale_trunc.rename(columns={'price':'listPrice'}, inplace=True)

# Set zipcode to integer value
df_forSale_trunc['zipcode'] = df_forSale_trunc['zipcode'].astype(int)
df_recentlySold_trunc['zipcode'] = df_recentlySold_trunc['zipcode'].astype(int)

df_forSale_trunc['resoFacts.hasAdditionalParcels'] = cleanBoolean(df_forSale_trunc['resoFacts.hasAdditionalParcels'].copy())
df_recentlySold_trunc['resoFacts.hasAdditionalParcels'] = cleanBoolean(df_recentlySold_trunc['resoFacts.hasAdditionalParcels'].copy())
df_forSale_trunc['resoFacts.hasAssociation'] = cleanBoolean(df_forSale_trunc['resoFacts.hasAssociation'].copy())
df_recentlySold_trunc['resoFacts.hasAssociation'] = cleanBoolean(df_recentlySold_trunc['resoFacts.hasAssociation'].copy())
df_forSale_trunc['resoFacts.hasAttachedGarage'] = cleanBoolean(df_forSale_trunc['resoFacts.hasAttachedGarage'].copy())
df_recentlySold_trunc['resoFacts.hasAttachedGarage'] = cleanBoolean(df_recentlySold_trunc['resoFacts.hasAttachedGarage'].copy())
df_forSale_trunc['resoFacts.hasAttachedProperty'] = cleanBoolean(df_forSale_trunc['resoFacts.hasAttachedProperty'].copy())
df_recentlySold_trunc['resoFacts.hasAttachedProperty'] = cleanBoolean(df_recentlySold_trunc['resoFacts.hasAttachedProperty'].copy())
df_forSale_trunc['resoFacts.hasGarage'] = cleanBoolean(df_forSale_trunc['resoFacts.hasGarage'].copy())
df_recentlySold_trunc['resoFacts.hasGarage'] = cleanBoolean(df_recentlySold_trunc['resoFacts.hasGarage'].copy())
df_forSale_trunc['resoFacts.hasPrivatePool'] = cleanBoolean(df_forSale_trunc['resoFacts.hasPrivatePool'].copy())
df_recentlySold_trunc['resoFacts.hasPrivatePool'] = cleanBoolean(df_recentlySold_trunc['resoFacts.hasPrivatePool'].copy())
df_forSale_trunc['resoFacts.hasSpa'] = cleanBoolean(df_forSale_trunc['resoFacts.hasSpa'].copy())
df_recentlySold_trunc['resoFacts.hasSpa'] = cleanBoolean(df_recentlySold_trunc['resoFacts.hasSpa'].copy())
df_forSale_trunc['resoFacts.hasView'] = cleanBoolean(df_forSale_trunc['resoFacts.hasView'].copy())
df_recentlySold_trunc['resoFacts.hasView'] = cleanBoolean(df_recentlySold_trunc['resoFacts.hasView'].copy())
df_forSale_trunc['resoFacts.hasWaterfrontView'] = cleanBoolean(df_forSale_trunc['resoFacts.hasWaterfrontView'].copy())
df_recentlySold_trunc['resoFacts.hasWaterfrontView'] = cleanBoolean(df_recentlySold_trunc['resoFacts.hasWaterfrontView'].copy())
df_forSale_trunc['resoFacts.waterViewYN'] = cleanBoolean(df_forSale_trunc['resoFacts.waterViewYN'].copy())
df_recentlySold_trunc['resoFacts.waterViewYN'] = cleanBoolean(df_recentlySold_trunc['resoFacts.waterViewYN'].copy())

value = {'replace_value':0}
df_forSale_trunc['resoFacts.taxAnnualAmount'] = cleanValue(df_forSale_trunc['resoFacts.taxAnnualAmount'].copy(), value)
df_recentlySold_trunc['resoFacts.taxAnnualAmount'] = cleanValue(df_recentlySold_trunc['resoFacts.taxAnnualAmount'].copy(), value)
df_forSale_trunc['resoFacts.garageSpaces'] = cleanValue(df_forSale_trunc['resoFacts.garageSpaces'].copy(), value)
df_recentlySold_trunc['resoFacts.garageSpaces'] = cleanValue(df_recentlySold_trunc['resoFacts.garageSpaces'].copy(), value)

valueMax = {'replace_value':df_forSale_trunc['restimateHighPercent'].max()}
valueMin = {'replace_value':df_forSale_trunc['restimateHighPercent'].min()}
df_forSale_trunc['restimateHighPercent'] = cleanValue(df_forSale_trunc['restimateHighPercent'].copy(), valueMax)
df_forSale_trunc['restimateLowPercent'] = cleanValue(df_forSale_trunc['restimateLowPercent'].copy(), valueMin)

valueMax = {'replace_value':df_recentlySold_trunc['restimateHighPercent'].max()}
valueMin = {'replace_value':df_recentlySold_trunc['restimateHighPercent'].min()}
df_recentlySold_trunc['restimateHighPercent'] = cleanValue(df_recentlySold_trunc['restimateHighPercent'].copy(), valueMax)
df_recentlySold_trunc['restimateLowPercent'] = cleanValue(df_recentlySold_trunc['restimateLowPercent'].copy(), valueMin)

valueRecentlySold = {'replace_value':df_recentlySold_trunc['taxAssessedValue'].median()}
valueForSale = {'replace_value':df_forSale_trunc['taxAssessedValue'].median()}
df_recentlySold_trunc['taxAssessedValue'] = cleanValue(df_recentlySold_trunc['taxAssessedValue'].copy(), valueRecentlySold)
df_forSale_trunc['taxAssessedValue'] = cleanValue(df_forSale_trunc['taxAssessedValue'].copy(), valueForSale)

valueRecentlySold = {'replace_value':list(df_forSale_trunc['cityId'].mode())[-1]}
valueForSale = {'replace_value':list(df_forSale_trunc['cityId'].mode())[-1]}
df_recentlySold_trunc['cityId'] = cleanValue(df_recentlySold_trunc['cityId'].copy(), valueRecentlySold)
df_forSale_trunc['cityId'] = cleanValue(df_forSale_trunc['cityId'].copy(), valueForSale)

valueRecentlySold = {'replace_value':df_recentlySold_trunc['rentZestimate'].mean()}
valueForSale = {'replace_value':df_forSale_trunc['rentZestimate'].mean()}
df_recentlySold_trunc['rentZestimate'] = cleanValue(df_recentlySold_trunc['rentZestimate'].copy(), valueRecentlySold)
df_forSale_trunc['rentZestimate'] = cleanValue(df_forSale_trunc['rentZestimate'].copy(), valueForSale)

valueRecentlySold = {'replace_value':df_recentlySold_trunc['mortgageRates.thirtyYearFixedRate'].mean()}
valueForSale = {'replace_value':df_forSale_trunc['mortgageRates.thirtyYearFixedRate'].mean()}
df_recentlySold_trunc['mortgageRates.thirtyYearFixedRate'] = cleanValue(df_recentlySold_trunc['mortgageRates.thirtyYearFixedRate'].copy(), valueRecentlySold)
df_forSale_trunc['mortgageRates.thirtyYearFixedRate'] = cleanValue(df_forSale_trunc['mortgageRates.thirtyYearFixedRate'].copy(), valueForSale)

valueRecentlySold = {'replace_value':df_recentlySold_trunc['mortgageRates.fifteenYearFixedRate'].mean()}
valueForSale = {'replace_value':df_forSale_trunc['mortgageRates.fifteenYearFixedRate'].mean()}
df_recentlySold_trunc['mortgageRates.fifteenYearFixedRate'] = cleanValue(df_recentlySold_trunc['mortgageRates.fifteenYearFixedRate'].copy(), valueRecentlySold)
df_forSale_trunc['mortgageRates.fifteenYearFixedRate'] = cleanValue(df_forSale_trunc['mortgageRates.fifteenYearFixedRate'].copy(), valueForSale)

valueRecentlySold = {'replace_value':df_recentlySold_trunc['mortgageRates.arm5Rate'].mean()}
valueForSale = {'replace_value':df_forSale_trunc['mortgageRates.arm5Rate'].mean()}
df_recentlySold_trunc['mortgageRates.arm5Rate'] = cleanValue(df_recentlySold_trunc['mortgageRates.arm5Rate'].copy(), valueRecentlySold)
df_forSale_trunc['mortgageRates.arm5Rate'] = cleanValue(df_forSale_trunc['mortgageRates.arm5Rate'].copy(), valueForSale)

In [1216]:
# Reset Index
df_forSale_trunc.reset_index(inplace=True)
df_forSale_trunc.drop(columns=['index'], inplace=True)

df_recentlySold_trunc.reset_index(inplace=True)
df_recentlySold_trunc.drop(columns=['index'], inplace=True)

In [1217]:
df_forSale_trunc = pd.concat([df_forSale_trunc.copy(), schoolScores(df_forSale_trunc['schools'])], axis=1, ignore_index=False)
df_forSale_trunc.drop(columns=['schools'], inplace=True)
df_recentlySold_trunc = pd.concat([df_recentlySold_trunc, schoolScores(df_recentlySold_trunc['schools'])], axis=1, ignore_index=False)
df_recentlySold_trunc.drop(columns=['schools'], inplace=True)

In [1218]:
'''
Check for outliers in list price and sold price.
'''
df_recentlySold_trunc.sort_values(by='price', inplace=True)
soldPrice = df_recentlySold_trunc[['zipcode','price']]
outlierBoolean = outliersSpecific(soldPrice, column='price')
df_recentlySold_trunc.sort_index(inplace=True)
df_recentlySold_trunc.insert(5, "priceOutlier", outlierBoolean, True)

df_forSale_trunc.sort_values(by='listPrice', inplace=True)
salePrice = df_forSale_trunc[['zipcode','listPrice']]
outlierBoolean = outliersSpecific(salePrice, column='listPrice')
df_forSale_trunc.sort_index(inplace=True)
df_forSale_trunc.insert(4, "priceOutlier", outlierBoolean, True)

In [1219]:
df_forSale_trunc.describe(include='all')

Unnamed: 0,zpid,homeType,homeStatus,daysOnZillow,priceOutlier,listPrice,streetAddress,zipcode,yearBuilt,bedrooms,bathrooms,livingAreaValue,livingAreaUnits,lotAreaValue,lotAreaUnits,zestimate,latitude,longitude,favoriteCount,pageViewCount,newConstructionType,resoFacts.associationFee,listingSubType.isForeclosure,resoFacts.isSeniorCommunity,resoFacts.propertySubType,hdpUrl,taxAssessedValue,cityId,rentZestimate,restimateHighPercent,restimateLowPercent,mortgageRates.thirtyYearFixedRate,mortgageRates.fifteenYearFixedRate,mortgageRates.arm5Rate,resoFacts.garageSpaces,resoFacts.hasAdditionalParcels,resoFacts.hasAssociation,resoFacts.hasAttachedGarage,resoFacts.hasAttachedProperty,resoFacts.hasGarage,resoFacts.hasPrivatePool,resoFacts.hasSpa,resoFacts.hasView,resoFacts.hasWaterfrontView,resoFacts.taxAnnualAmount,resoFacts.waterViewYN,price/sqft,schoolMaxRating,schoolMinDistance
count,2244.0,2244,2244,2244.0,2244.0,2244.0,2244,2244.0,2244.0,2244.0,2244.0,2244.0,2244,2244.0,2244,2244.0,2244.0,2244.0,2244.0,2244.0,2244,2244.0,2244,2244,2244,2244,2244.0,2244.0,2244.0,2244.0,2244.0,2244.0,2244.0,2244.0,2244.0,2244,2244,2244,2244,2244,2244,2244,2244,2244,2244.0,2244,2244.0,2244.0,2244.0
unique,,1,1,,,,2241,,,,,,1,,1,,,,,,2,,2,2,7,2244,,,,,,,,,,2,2,2,2,2,2,2,2,2,,2,,,
top,,SINGLE_FAMILY,FOR_SALE,,,,2831 Quentin Ave SE,,,,,,Square Feet,,Square Feet,,,,,,False,,False,False,Single Family Residence,www.zillow.com/homedetails/5650-Stamford-St-Mi...,,,,,,,,,,True,False,False,False,True,False,False,False,False,,False,,,
freq,,2244,2244,,,,2,,,,,,2244,,2244,,,,,,1868,,2229,2184,2233,1,,,,,,,,,,2071,1445,2132,2238,1974,1738,2016,1334,1890,,1793,,,
mean,238264300.0,,,70.474599,0.069964,567721.7,,32909.590909,1995.668895,3.51738,2.438503,2010.41934,,796105.0,,547810.6,28.173084,-80.686519,47.241533,865.347148,,34.016488,,,,,317423.8,42408.464349,3053.454119,29.496435,11.360071,6.51019,5.88319,6.884716,3.08066,,,,,,,,,,3412.310606,,268.002886,5.996881,1.963993
std,521229400.0,,,83.608752,0.255144,539508.9,,48.620087,23.920054,0.785604,0.880895,806.566324,,16131900.0,,508136.1,0.222387,0.081578,55.101532,891.230078,,84.464093,,,,,291805.3,61033.056616,2281.311847,30.908022,6.079026,0.177605,0.312042,0.260713,25.713816,,,,,,,,,,10760.882324,,140.245006,2.040056,1.685194
min,43367830.0,,,1.0,0.0,75000.0,,32754.0,1884.0,0.0,1.0,540.0,,0.0,,71600.0,27.833658,-80.930824,0.0,8.0,,0.0,,,,,1980.0,7433.0,1200.0,5.0,5.0,5.838,5.438,5.792,0.0,,,,,,,,,,0.0,,63.004032,2.0,0.0
25%,43453660.0,,,16.0,0.0,334999.0,,32907.0,1979.0,3.0,2.0,1536.0,,7840.8,,325500.0,27.989336,-80.72946,14.0,286.5,,0.0,,,,,171740.0,14563.0,2040.5,10.0,6.0,6.51019,5.855,6.884716,2.0,,,,,,,,,,814.0,,198.367009,4.0,0.9
50%,54612330.0,,,41.0,0.0,399900.0,,32909.0,1999.0,3.0,2.0,1865.0,,10018.8,,390000.0,28.121854,-80.679285,32.0,653.0,,0.0,,,,,270250.0,32805.0,2454.0,19.0,10.0,6.54,5.855,6.978,2.0,,,,,,,,,,2474.5,,228.089349,6.0,1.5
75%,104131900.0,,,104.0,0.0,599900.0,,32940.0,2021.0,4.0,3.0,2271.5,,12196.8,,585050.0,28.339631,-80.638823,61.0,1146.75,,33.0,,,,,385860.0,53787.0,3053.454119,25.0,17.0,6.54,5.855,6.978,2.0,,,,,,,,,,4409.25,,283.695983,8.0,2.4


In [1220]:
df_recentlySold_trunc.describe(include='all')

Unnamed: 0,zpid,homeType,homeStatus,dateSoldString,daysOnZillow,priceOutlier,price,streetAddress,zipcode,yearBuilt,bathrooms,bedrooms,livingAreaValue,livingAreaUnits,lotAreaValue,lotAreaUnits,zestimate,latitude,longitude,favoriteCount,pageViewCount,newConstructionType,resoFacts.associationFee,listingSubType.isForeclosure,resoFacts.isSeniorCommunity,resoFacts.propertySubType,hdpUrl,taxAssessedValue,cityId,rentZestimate,restimateHighPercent,restimateLowPercent,mortgageRates.thirtyYearFixedRate,mortgageRates.fifteenYearFixedRate,mortgageRates.arm5Rate,resoFacts.garageSpaces,resoFacts.hasAdditionalParcels,resoFacts.hasAssociation,resoFacts.hasAttachedGarage,resoFacts.hasAttachedProperty,resoFacts.hasGarage,resoFacts.hasPrivatePool,resoFacts.hasSpa,resoFacts.hasView,resoFacts.hasWaterfrontView,resoFacts.taxAnnualAmount,resoFacts.waterViewYN,price/sqft,schoolMaxRating,schoolMinDistance
count,5676.0,5676,5676,5676,5676.0,5676.0,5676.0,5676,5676.0,5676.0,5676.0,5676.0,5676.0,5676,5676.0,5676,5676.0,5676.0,5676.0,5676.0,5676.0,5676,5676.0,5676,5676,5676,5676,5676.0,5676.0,5676.0,5676.0,5676.0,5676.0,5676.0,5676.0,5676.0,5676,5676,5676,5676,5676,5676,5676,5676,5676,5676.0,5676,5676.0,5676.0,5676.0
unique,,1,2,,,,,5661,,,,,,1,,1,,,,,,2,,2,2,7,5676,,,,,,,,,,2,2,2,2,2,2,2,2,2,,2,,,
top,,SINGLE_FAMILY,RECENTLY_SOLD,,,,,657 Amy Ct SW,,,,,,Square Feet,,Square Feet,,,,,,False,,False,False,Single Family Residence,www.zillow.com/homedetails/1475-Leeward-Ave-SE...,,,,,,,,,,True,False,False,False,True,False,False,False,False,,False,,,
freq,,5676,5661,,,,,2,,,,,,5676,,5676,,,,,,5671,,5650,5488,5157,1,,,,,,,,,,4864,3285,5484,5584,4718,4505,5002,3296,4606,,4414,,,
mean,227995900.0,,,2023-10-06 10:40:35.517970432,128.312368,0.060782,490383.1,,32913.25,1995.156096,2.368851,3.457012,1977.30673,,332876.1,,500883.6,28.18705,-80.689202,2.291755,58.466702,,40.000352,,,,,321479.9,39287.533474,2979.819273,17.629845,12.612579,6.602521,5.99196,6.735064,3.414024,,,,,,,,,,2886.273608,,240.441933,6.088266,1.976286
min,43367950.0,,,2023-02-15 00:00:00,1.0,0.0,220.0,,32754.0,1901.0,0.0,0.0,504.0,,0.0,,14000.0,27.833752,-80.92859,0.0,0.0,,0.0,,,,,2500.0,7433.0,1150.0,5.0,5.0,5.99,5.779,5.783,0.0,,,,,,,,,,0.0,,0.15873,2.0,0.1
25%,43446090.0,,,2023-08-18 00:00:00,62.0,0.0,303000.0,,32907.0,1979.0,2.0,3.0,1492.0,,7840.0,,311100.0,28.012474,-80.732945,0.0,7.0,,0.0,,,,,178982.5,13560.0,2025.0,9.0,8.0,6.509,5.958,6.731,1.0,,,,,,,,,,821.0,,186.94493,4.0,0.9
50%,48188740.0,,,2023-10-18 00:00:00,117.0,0.0,399427.5,,32926.0,1997.0,2.0,3.0,1828.0,,10018.8,,403100.0,28.162102,-80.68591,0.0,14.0,,0.0,,,,,278225.0,19307.0,2394.5,15.0,11.0,6.683,5.969,6.83,2.0,,,,,,,,,,2305.0,,218.657384,6.0,1.5
75%,104131600.0,,,2023-12-08 00:00:00,177.0,0.0,525000.0,,32940.0,2020.0,3.0,4.0,2256.0,,12196.8,,531875.0,28.342488,-80.63976,0.0,28.0,,38.25,,,,,384835.0,53787.0,2996.0,24.0,18.0,6.683,5.974,6.834,2.0,,,,,,,,,,3936.25,,265.563793,8.0,2.6
max,2134275000.0,,,2024-02-15 00:00:00,364.0,1.0,6000000.0,,32976.0,2024.0,13.0,11.0,20909.0,,455393700.0,,6236500.0,28.78083,-80.456375,442.0,8740.0,,1961.0,,,,,4647100.0,398464.0,34947.0,100.0,44.0,6.894,6.734,7.075,4040.0,,,,,,,,,,54405.0,,1396.807298,10.0,13.7


In [1221]:
df_forSale_trunc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2244 entries, 0 to 2243
Data columns (total 49 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   zpid                                2244 non-null   int64  
 1   homeType                            2244 non-null   object 
 2   homeStatus                          2244 non-null   object 
 3   daysOnZillow                        2244 non-null   int64  
 4   priceOutlier                        2244 non-null   int64  
 5   listPrice                           2244 non-null   float64
 6   streetAddress                       2244 non-null   object 
 7   zipcode                             2244 non-null   int64  
 8   yearBuilt                           2244 non-null   int64  
 9   bedrooms                            2244 non-null   float64
 10  bathrooms                           2244 non-null   float64
 11  livingAreaValue                     2244 non-nul

In [1222]:
df_recentlySold_trunc.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5676 entries, 0 to 5675
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   zpid                                5676 non-null   int64         
 1   homeType                            5676 non-null   object        
 2   homeStatus                          5676 non-null   object        
 3   dateSoldString                      5676 non-null   datetime64[ns]
 4   daysOnZillow                        5676 non-null   int64         
 5   priceOutlier                        5676 non-null   int64         
 6   price                               5676 non-null   float64       
 7   streetAddress                       5676 non-null   object        
 8   zipcode                             5676 non-null   int64         
 9   yearBuilt                           5676 non-null   int64         
 10  bathrooms                    

In [1223]:
for column_ in df_forSale_trunc.columns:
    nullValueCount = df_forSale_trunc[column_].isnull().sum()
    print(f'Number of NaN values in {column_}: {nullValueCount}')

Number of NaN values in zpid: 0
Number of NaN values in homeType: 0
Number of NaN values in homeStatus: 0
Number of NaN values in daysOnZillow: 0
Number of NaN values in priceOutlier: 0
Number of NaN values in listPrice: 0
Number of NaN values in streetAddress: 0
Number of NaN values in zipcode: 0
Number of NaN values in yearBuilt: 0
Number of NaN values in bedrooms: 0
Number of NaN values in bathrooms: 0
Number of NaN values in livingAreaValue: 0
Number of NaN values in livingAreaUnits: 0
Number of NaN values in lotAreaValue: 0
Number of NaN values in lotAreaUnits: 0
Number of NaN values in zestimate: 0
Number of NaN values in latitude: 0
Number of NaN values in longitude: 0
Number of NaN values in favoriteCount: 0
Number of NaN values in pageViewCount: 0
Number of NaN values in newConstructionType: 0
Number of NaN values in resoFacts.associationFee: 0
Number of NaN values in listingSubType.isForeclosure: 0
Number of NaN values in resoFacts.isSeniorCommunity: 0
Number of NaN values in

In [1224]:
for column_ in df_recentlySold_trunc.columns:
    nullValueCount = df_recentlySold_trunc[column_].isnull().sum()
    print(f'Number of NaN values in {column_}: {nullValueCount}')

Number of NaN values in zpid: 0
Number of NaN values in homeType: 0
Number of NaN values in homeStatus: 0
Number of NaN values in dateSoldString: 0
Number of NaN values in daysOnZillow: 0
Number of NaN values in priceOutlier: 0
Number of NaN values in price: 0
Number of NaN values in streetAddress: 0
Number of NaN values in zipcode: 0
Number of NaN values in yearBuilt: 0
Number of NaN values in bathrooms: 0
Number of NaN values in bedrooms: 0
Number of NaN values in livingAreaValue: 0
Number of NaN values in livingAreaUnits: 0
Number of NaN values in lotAreaValue: 0
Number of NaN values in lotAreaUnits: 0
Number of NaN values in zestimate: 0
Number of NaN values in latitude: 0
Number of NaN values in longitude: 0
Number of NaN values in favoriteCount: 0
Number of NaN values in pageViewCount: 0
Number of NaN values in newConstructionType: 0
Number of NaN values in resoFacts.associationFee: 0
Number of NaN values in listingSubType.isForeclosure: 0
Number of NaN values in resoFacts.isSeni

In [1225]:
# Check for duplicates
print(f'There are {df_forSale_trunc.listPrice.loc[df_forSale_trunc.duplicated()].count()} duplicates in df_forSale_trunc.')
print(f'There are {df_recentlySold_trunc.price.loc[df_recentlySold_trunc.duplicated()].count()} duplicates in df_recentlySold_trunc.')

There are 0 duplicates in df_forSale_trunc.
There are 0 duplicates in df_recentlySold_trunc.


In [1226]:
# Export dataframes to csv
df_forSale_trunc.to_csv('cleaned_forSale_data.csv', sep=',', index=True, encoding='utf-8')
df_recentlySold_trunc.to_csv('cleaned_recentlySold_data.csv', sep=',', index=True, encoding='utf-8')

In [1227]:
del df_forSale_trunc, df_recentlySold_trunc