In [1108]:
# This notebook will take the housing data scraped earlier and clean/format it for further investigation.
import pandas as pd
import numpy as np
import csv

pd.options.display.max_columns = None

In [1109]:
priceRange = [
    [1, 250000], [250001, 300000], [300001, 350000], [350001, 400000],
    [400001, 450000], [450001, 500000], [500001, 750000], [750001, 1000000],
    [1000001, 2000000], [2000001, 5000000], [5000001, 50000000]
    ]

In [1110]:
# Create function to take in column of dataframe and return modified column
# Function will take in resoFacts.associationFee and determine if its monthly, quarterly, etc and div value by appropriate amount
# Returned dataframe series will just a number representing monthly HOA fee due.

def hoaFees(series: pd.DataFrame):
    paymentFreq = series.apply(lambda x: str(x).strip("$,1234567890 "))
    payment = series.apply(lambda x: str(x).strip("$,-abcdefghijklmnopqrstuvwxyz ").replace(',', ''))
    payment = payment.apply(lambda x: float(x))

    paymentFreq.replace('', 0, inplace=True)

    for i, freq in enumerate(paymentFreq):
        if freq == 'monthly':
            payment[i] = payment[i]

        elif freq == 'quarterly':
            payment[i] = payment[i]/4

        elif freq == 'semi-annually':
            payment[i] = payment[i]/6

        elif freq == 'annually':
            payment[i] = payment[i]/12

        else:
            payment[i] = 0
    
    payment = round(payment)
    return payment

# Concat partial URL with website prefix
def fullURL(hdpURL: str):
    hdpURL = 'www.zillow.com' + str(hdpURL)
    return hdpURL

# Set area units to a consistent value of 'Square Feet'
def areaUnitConversion(units: pd.Series):
    for idx in units.index:
        if units.lotAreaUnits[idx] == 'Acres':
            units.lotAreaValue[idx] = units.lotAreaValue[idx] * 43560
            units.lotAreaUnits[idx] = 'Square Feet'

        elif units.lotAreaUnits[idx] == 'sqft':
            units.lotAreaUnits[idx] = 'Square Feet'

    return units

In [1111]:
for i, price in enumerate(priceRange):
    if i == 0:
        df_forSale = pd.read_csv(f'Raw_Housing_Data/housingData_{price[0]}_{price[1]}.csv',
                                 index_col=0, dtype={'zipcode':str})
        df_recentlySold = pd.read_csv(f'Raw_Housing_Data/housingData_recentlySold_{price[0]}_{price[1]}.csv',
                                      index_col=0, dtype={'zipcode':str})
    else:
        df_forSale = pd.concat([df_forSale, pd.read_csv(f'Raw_Housing_Data/housingData_{price[0]}_{price[1]}.csv',
                                                        index_col=0)], axis=0, ignore_index=True)
        df_recentlySold = pd.concat([df_recentlySold, pd.read_csv(f'Raw_Housing_Data/housingData_recentlySold_{price[0]}_{price[1]}.csv',
                                                                  index_col=0)], axis=0, ignore_index=True)

In [1112]:
columnNames_forSale = list(map(lambda x: x.split(', '), df_forSale.columns))
columnNames_recentlySold = list(map(lambda x: x.split(','), df_recentlySold))

with open('Raw_Housing_Data/dataframeColumnNames_forSale.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(columnNames_forSale)

with open('Raw_Housing_Data/dataframeColumnNames_recentlySold.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(columnNames_recentlySold)

In [1113]:
# Manually select column names I want to keep in my downsized data frame
columnNames_forSale = ['zpid','homeType','homeStatus','timeOnZillow.1','price','streetAddress','zipcode','yearBuilt','bedrooms','bathrooms',
                       'livingAreaValue','livingAreaUnits','lotAreaValue','lotAreaUnits','zestimate','latitude','longitude',
                       'favoriteCount','pageViewCount',
                       'newConstructionType','resoFacts.associationFee','listingSubType.isForeclosure','resoFacts.isSeniorCommunity','resoFacts.propertySubType','hdpUrl']

columnNames_recentlySold = ['zpid','homeType','homeStatus','dateSoldString','timeOnZillow.1','price','streetAddress','zipcode','yearBuilt','bathrooms','bedrooms',
                            'livingAreaValue','livingAreaUnits','lotAreaValue','lotAreaUnits','zestimate','latitude','longitude',
                            'favoriteCount','pageViewCount',
                            'newConstructionType','resoFacts.associationFee','listingSubType.isForeclosure','resoFacts.isSeniorCommunity', 'resoFacts.propertySubType','hdpUrl']

df_forSale_trunc = df_forSale.loc[:,columnNames_forSale]
df_recentlySold_trunc = df_recentlySold.loc[:,columnNames_recentlySold]
# del df_forSale, df_recentlySold

In [1114]:
# Do some initial data cleanup
# Set NaN values to 0. Compute quivalent monthly HOA fees to keep data consistent.
df_forSale_trunc['resoFacts.associationFee'] = df_forSale_trunc['resoFacts.associationFee'].fillna(0)
df_recentlySold_trunc['resoFacts.associationFee'] = df_recentlySold_trunc['resoFacts.associationFee'].fillna(0)
df_forSale_trunc['resoFacts.associationFee'] = hoaFees(df_forSale_trunc['resoFacts.associationFee'])
df_recentlySold_trunc['resoFacts.associationFee'] = hoaFees(df_recentlySold_trunc['resoFacts.associationFee'])

# Set NaN values to 'False'. True is already designated by default, where applicable.
df_forSale_trunc['resoFacts.isSeniorCommunity'] = df_forSale_trunc['resoFacts.isSeniorCommunity'].fillna(False)
df_recentlySold_trunc['resoFacts.isSeniorCommunity'] = df_recentlySold_trunc['resoFacts.isSeniorCommunity'].fillna(False)

# Reformat the string representing the property subtype so it can be easily worked on.
# Removing any RV Community data as it is not a valid 'single family home' sub type for this scope.
df_forSale_trunc['resoFacts.propertySubType'] = df_forSale_trunc['resoFacts.propertySubType'].apply(lambda x: str(x).strip("['']"))
df_recentlySold_trunc['resoFacts.propertySubType'] = df_recentlySold_trunc['resoFacts.propertySubType'].apply(lambda x: str(x).strip("['']"))
df_forSale_trunc = df_forSale_trunc.loc[df_forSale_trunc['resoFacts.propertySubType']!='RV Community']
df_recentlySold_trunc = df_recentlySold_trunc.loc[df_recentlySold_trunc['resoFacts.propertySubType']!='RV Community']

# Add prefix to website
df_forSale_trunc['hdpUrl']=df_forSale_trunc['hdpUrl'].apply(fullURL)
df_recentlySold_trunc['hdpUrl']=df_recentlySold_trunc['hdpUrl'].apply(fullURL)

# Convert yearBuilt to integer and drop any rows that have yearBuilt as NaN
df_forSale_trunc.dropna(subset=['yearBuilt'], inplace=True)
df_recentlySold_trunc.dropna(subset=['yearBuilt'], inplace=True)
df_forSale_trunc['yearBuilt']=df_forSale_trunc['yearBuilt'].astype(int)
df_recentlySold_trunc['yearBuilt']=df_recentlySold_trunc['yearBuilt'].astype(int)

# Drop all labels with lotAreaValue = NaN
df_forSale_trunc.dropna(subset=['lotAreaValue'], inplace=True)
df_recentlySold_trunc.dropna(subset=['lotAreaValue'], inplace=True)

# Drop all labels with lat/long = NaN
df_forSale_trunc.dropna(subset=['latitude', 'longitude'], inplace=True)
df_recentlySold_trunc.dropna(subset=['latitude', 'longitude'], inplace=True)

# Fill all zestimate = NaN values with 0
value = {'zestimate':0}
df_forSale_trunc.fillna(value=value, inplace=True)
df_recentlySold_trunc.fillna(value=value, inplace=True)

# Drop all labels with livingAreaValue = NaN -- This is the primary feature for price prediction.
# Could try to do more searches on $/sqft but for now this will suffice.
df_forSale_trunc.dropna(subset=['livingAreaValue'], inplace=True)
df_recentlySold_trunc.dropna(subset=['livingAreaValue'], inplace=True)

# Drop all bathroom and bedroom values of NaN
df_forSale_trunc.dropna(subset=['bathrooms', 'bedrooms'], inplace=True)
df_recentlySold_trunc.dropna(subset=['bathrooms', 'bedrooms'], inplace=True)

# Fill favoriteCount and pageViewCount with 0
value = {'favoriteCount':0, 'pageViewCount':0}
df_forSale_trunc.fillna(value=value, inplace=True)
df_recentlySold_trunc.fillna(value=value, inplace=True)

# Need to set all timeOnZillow.1 values of less than one day to one day.
df_recentlySold_trunc['timeOnZillow.1'].mask(df_recentlySold_trunc['dateSoldString']=='2024-02-15', '1 day', inplace=True)

# Need to set newConstructionType to either True of False
df_recentlySold_trunc['newConstructionType'] = df_recentlySold_trunc['newConstructionType'].apply(lambda x: str(x))
df_recentlySold_trunc['newConstructionType'] = df_recentlySold_trunc['newConstructionType']!='nan'

# Need to keep lotAreaUnits and livingAreaUnits consistently set to Square Feet.
# Will need to create a function to deal with these.
temp = df_forSale_trunc[['lotAreaValue','lotAreaUnits']]
temp = areaUnitConversion(temp)
# df_forSale_trunc[['lotAreaValue','lotAreaUnits']] = areaUnitConversion(temp)
# temp = df_recentlySold_trunc[['lotAreaValue','lotAreaUnits']]
# df_recentlySold_trunc[['lotAreaValue','lotAreaUnits']] = areaUnitConversion(temp)
del temp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  units.lotAreaValue[idx] = units.lotAreaValue[idx] * 43560
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  units.lotAreaUnits[idx] = 'Square Feet'


In [1115]:
# Reset Index
df_forSale_trunc.reset_index(inplace=True)
df_forSale_trunc.drop(columns=['index'], inplace=True)

df_recentlySold_trunc.reset_index(inplace=True)
df_recentlySold_trunc.drop(columns=['index'], inplace=True)

In [1116]:
df_forSale_trunc.describe(include='all')

Unnamed: 0,zpid,homeType,homeStatus,timeOnZillow.1,price,streetAddress,zipcode,yearBuilt,bedrooms,bathrooms,livingAreaValue,livingAreaUnits,lotAreaValue,lotAreaUnits,zestimate,latitude,longitude,favoriteCount,pageViewCount,newConstructionType,resoFacts.associationFee,listingSubType.isForeclosure,resoFacts.isSeniorCommunity,resoFacts.propertySubType,hdpUrl
count,2245.0,2245,2245,2245,2245.0,2245,2245.0,2245.0,2245.0,2245.0,2245.0,2245,2245.0,2245,2245.0,2245.0,2245.0,2245.0,2245.0,376,2245.0,2245,2245,2245,2245
unique,,1,1,283,,2242,42.0,,,,,1,,2,,,,,,1,,2,2,7,2245
top,,SINGLE_FAMILY,FOR_SALE,3 days,,990 Quaker St,32909.0,,,,,Square Feet,,Square Feet,,,,,,BUILDER_SPEC,,False,False,Single Family Residence,www.zillow.com/homedetails/5650-Stamford-St-Mi...
freq,,2245,2245,66,,2,325.0,,,,,2245,,1498,,,,,,376,,2230,2185,2234,1
mean,238308900.0,,,,567535.2,,,1995.680624,3.516258,2.437862,2009.5951,,5619.762218,,363731.8,28.173165,-80.686566,47.288196,866.01559,,34.018263,,,,
std,521117600.0,,,,539461.1,,,23.921179,0.787224,0.881222,807.331728,,4250.178588,,467328.1,0.22237,0.081591,55.133603,891.594184,,84.445313,,,,
min,43367830.0,,,,75000.0,,,1884.0,0.0,1.0,160.0,,0.0,,0.0,27.833658,-80.930824,0.0,8.0,,0.0,,,,
25%,43453680.0,,,,334999.0,,,1979.0,3.0,2.0,1536.0,,0.73,,0.0,27.989408,-80.72988,14.0,287.0,,0.0,,,,
50%,54612390.0,,,,399900.0,,,1999.0,3.0,2.0,1865.0,,7405.2,,325600.0,28.121946,-80.67933,32.0,653.0,,0.0,,,,
75%,104132200.0,,,,599900.0,,,2021.0,4.0,3.0,2271.0,,10018.8,,442900.0,28.340923,-80.63883,61.0,1149.0,,33.0,,,,


In [1117]:
df_recentlySold_trunc.describe(include='all')

Unnamed: 0,zpid,homeType,homeStatus,dateSoldString,timeOnZillow.1,price,streetAddress,zipcode,yearBuilt,bathrooms,bedrooms,livingAreaValue,livingAreaUnits,lotAreaValue,lotAreaUnits,zestimate,latitude,longitude,favoriteCount,pageViewCount,newConstructionType,resoFacts.associationFee,listingSubType.isForeclosure,resoFacts.isSeniorCommunity,resoFacts.propertySubType,hdpUrl
count,5682.0,5682,5682,5682,5682,5682.0,5682,5682.0,5682.0,5682.0,5682.0,5682.0,5682,5682.0,5682,5682.0,5682.0,5682.0,5682.0,5682.0,5682,5682.0,5682,5682,5682,5682
unique,,1,2,286,327,,5667,51.0,,,,,1,,3,,,,,,2,,2,2,7,5682
top,,SINGLE_FAMILY,RECENTLY_SOLD,2023-12-29,76 days,,2922 Tidepool Pl,32940.0,,,,,Square Feet,,Square Feet,,,,,,False,,False,False,Single Family Residence,www.zillow.com/homedetails/1475-Leeward-Ave-SE...
freq,,5682,5667,101,89,,2,660.0,,,,,5682,,3740,,,,,,5677,,5656,5493,5160,1
mean,228211000.0,,,,,490048.0,,,1995.138684,2.367582,3.455121,1975.512496,,5465.043544,,484897.7,28.187325,-80.689312,2.289335,58.419395,,40.037311,,,,
std,500067900.0,,,,,388132.6,,,22.871024,0.811598,0.764815,858.913839,,4170.162272,,385498.9,0.207348,0.079472,14.691526,253.115132,,98.398632,,,,
min,43367950.0,,,,,220.0,,,1901.0,0.0,0.0,0.0,,0.0,,0.0,27.833752,-80.92859,0.0,0.0,,0.0,,,,
25%,43446060.0,,,,,302990.0,,,1979.0,2.0,3.0,1492.0,,0.57,,302600.0,28.012663,-80.73313,0.0,7.0,,0.0,,,,
50%,48188740.0,,,,,399150.0,,,1997.0,2.0,3.0,1828.0,,6969.6,,395500.0,28.162615,-80.685962,0.0,14.0,,0.0,,,,
75%,104131700.0,,,,,525000.0,,,2020.0,3.0,4.0,2256.0,,9147.6,,523000.0,28.342618,-80.639813,0.0,28.0,,38.0,,,,


---
Follow the **five** steps found in the **Data Detox** Towards Data Science article.
1) Check data types and nulls - DONE
2) Check for duplicate entries - DONE
3) Check for unnecessary white space
4) Check min and max values
5) Check for outliers

In [1118]:
df_forSale_trunc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2245 entries, 0 to 2244
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   zpid                          2245 non-null   int64  
 1   homeType                      2245 non-null   object 
 2   homeStatus                    2245 non-null   object 
 3   timeOnZillow.1                2245 non-null   object 
 4   price                         2245 non-null   float64
 5   streetAddress                 2245 non-null   object 
 6   zipcode                       2245 non-null   object 
 7   yearBuilt                     2245 non-null   int64  
 8   bedrooms                      2245 non-null   float64
 9   bathrooms                     2245 non-null   float64
 10  livingAreaValue               2245 non-null   float64
 11  livingAreaUnits               2245 non-null   object 
 12  lotAreaValue                  2245 non-null   float64
 13  lot

In [1119]:
df_recentlySold_trunc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5682 entries, 0 to 5681
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   zpid                          5682 non-null   int64  
 1   homeType                      5682 non-null   object 
 2   homeStatus                    5682 non-null   object 
 3   dateSoldString                5682 non-null   object 
 4   timeOnZillow.1                5682 non-null   object 
 5   price                         5682 non-null   float64
 6   streetAddress                 5682 non-null   object 
 7   zipcode                       5682 non-null   object 
 8   yearBuilt                     5682 non-null   int64  
 9   bathrooms                     5682 non-null   float64
 10  bedrooms                      5682 non-null   float64
 11  livingAreaValue               5682 non-null   float64
 12  livingAreaUnits               5682 non-null   object 
 13  lot

In [1120]:
for column_ in columnNames_forSale:
    nullValueCount = df_forSale_trunc[column_].isnull().sum()
    print(f'Number of NaN values in {column_}: {nullValueCount}')

Number of NaN values in zpid: 0
Number of NaN values in homeType: 0
Number of NaN values in homeStatus: 0
Number of NaN values in timeOnZillow.1: 0
Number of NaN values in price: 0
Number of NaN values in streetAddress: 0
Number of NaN values in zipcode: 0
Number of NaN values in yearBuilt: 0
Number of NaN values in bedrooms: 0
Number of NaN values in bathrooms: 0
Number of NaN values in livingAreaValue: 0
Number of NaN values in livingAreaUnits: 0
Number of NaN values in lotAreaValue: 0
Number of NaN values in lotAreaUnits: 0
Number of NaN values in zestimate: 0
Number of NaN values in latitude: 0
Number of NaN values in longitude: 0
Number of NaN values in favoriteCount: 0
Number of NaN values in pageViewCount: 0
Number of NaN values in newConstructionType: 1869
Number of NaN values in resoFacts.associationFee: 0
Number of NaN values in listingSubType.isForeclosure: 0
Number of NaN values in resoFacts.isSeniorCommunity: 0
Number of NaN values in resoFacts.propertySubType: 0
Number of

In [1121]:
for column_ in columnNames_recentlySold:
    nullValueCount = df_recentlySold_trunc[column_].isnull().sum()
    print(f'Number of NaN values in {column_}: {nullValueCount}')

Number of NaN values in zpid: 0
Number of NaN values in homeType: 0
Number of NaN values in homeStatus: 0
Number of NaN values in dateSoldString: 0
Number of NaN values in timeOnZillow.1: 0
Number of NaN values in price: 0
Number of NaN values in streetAddress: 0
Number of NaN values in zipcode: 0
Number of NaN values in yearBuilt: 0
Number of NaN values in bathrooms: 0
Number of NaN values in bedrooms: 0
Number of NaN values in livingAreaValue: 0
Number of NaN values in livingAreaUnits: 0
Number of NaN values in lotAreaValue: 0
Number of NaN values in lotAreaUnits: 0
Number of NaN values in zestimate: 0
Number of NaN values in latitude: 0
Number of NaN values in longitude: 0
Number of NaN values in favoriteCount: 0
Number of NaN values in pageViewCount: 0
Number of NaN values in newConstructionType: 0
Number of NaN values in resoFacts.associationFee: 0
Number of NaN values in listingSubType.isForeclosure: 0
Number of NaN values in resoFacts.isSeniorCommunity: 0
Number of NaN values in

In [1122]:
# Check for duplicates
print(f'There are {df_forSale_trunc.price.loc[df_forSale_trunc.duplicated()].count()} duplicates in df_forSale_trunc.')
print(f'There are {df_recentlySold_trunc.price.loc[df_recentlySold_trunc.duplicated()].count()} duplicates in df_recentlySold_trunc.')

There are 0 duplicates in df_forSale_trunc.
There are 0 duplicates in df_recentlySold_trunc.


In [1123]:
# Check for outliers
# Currently there is one house built in 1884 in df_forSale_trunc. This would count as an outlier I think. Probably need to remove.

In [1124]:
# del df_forSale_trunc, df_recentlySold_trunc