In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import pprint

In [2]:
# Read the data into a Pandas DataFrame
realEstateCa_df = pd.read_csv('RealEstate_California.csv')
realEstateCa_df.head()

Unnamed: 0,index_id,id,stateId,countyId,cityId,country,datePostedString,is_bankOwned,is_forAuction,event,...,parking,garageSpaces,hasGarage,levels,pool,spa,isNewConstruction,hasPetsAllowed,homeType,county
0,0,95717-2087851113,9,77,24895,USA,1/13/2021,0,0,Listed for sale,...,0,0,0,0,0,0,0,0,LOT,Placer County
1,1,94564-18496265,9,189,36958,USA,7/12/2021,0,0,Listed for sale,...,1,2,1,One Story,0,0,0,0,SINGLE_FAMILY,Contra Costa County
2,2,94564-18484475,9,190,36958,USA,7/8/2021,0,0,Listed for sale,...,1,2,1,One Story,0,0,0,0,SINGLE_FAMILY,Contra Costa County
3,3,94564-18494835,9,191,36958,USA,7/7/2021,0,0,Listed for sale,...,1,1,1,Two Story,0,1,0,0,SINGLE_FAMILY,Contra Costa County
4,4,94564-2069722747,9,192,36958,USA,7/7/2021,0,0,Listed for sale,...,0,0,0,0,0,0,0,0,LOT,Contra Costa County


In [3]:
# Brief summary of the realEstateCa_df DataFrame before cleaning.
realEstateCa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35389 entries, 0 to 35388
Data columns (total 39 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index_id            35389 non-null  int64  
 1   id                  35389 non-null  object 
 2   stateId             35389 non-null  int64  
 3   countyId            35389 non-null  int64  
 4   cityId              35389 non-null  int64  
 5   country             35389 non-null  object 
 6   datePostedString    35386 non-null  object 
 7   is_bankOwned        35389 non-null  int64  
 8   is_forAuction       35389 non-null  int64  
 9   event               35389 non-null  object 
 10  time                35100 non-null  float64
 11  price               35389 non-null  int64  
 12  pricePerSquareFoot  35389 non-null  int64  
 13  city                35389 non-null  object 
 14  state               35389 non-null  object 
 15  yearBuilt           35389 non-null  int64  
 16  stre

In [4]:
# Get the realEstateCa_df columns.
print(realEstateCa_df.columns)

Index(['index_id', 'id', 'stateId', 'countyId', 'cityId', 'country',
       'datePostedString', 'is_bankOwned', 'is_forAuction', 'event', 'time',
       'price', 'pricePerSquareFoot', 'city', 'state', 'yearBuilt',
       'streetAddress', 'zipcode', 'longitude', 'latitude', 'hasBadGeocode',
       'description', 'currency', 'livingArea', 'livingAreaValue',
       'lotAreaUnits', 'bathrooms', 'bedrooms', 'buildingArea', 'parking',
       'garageSpaces', 'hasGarage', 'levels', 'pool', 'spa',
       'isNewConstruction', 'hasPetsAllowed', 'homeType', 'county'],
      dtype='object')


In [5]:
# Get count of unique ids
unique_ids = realEstateCa_df["id"].nunique()
unique_ids

31238

In [6]:
realEstateCa_df.dropna(inplace = True)
realEstateCa_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34815 entries, 0 to 35388
Data columns (total 39 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index_id            34815 non-null  int64  
 1   id                  34815 non-null  object 
 2   stateId             34815 non-null  int64  
 3   countyId            34815 non-null  int64  
 4   cityId              34815 non-null  int64  
 5   country             34815 non-null  object 
 6   datePostedString    34815 non-null  object 
 7   is_bankOwned        34815 non-null  int64  
 8   is_forAuction       34815 non-null  int64  
 9   event               34815 non-null  object 
 10  time                34815 non-null  float64
 11  price               34815 non-null  int64  
 12  pricePerSquareFoot  34815 non-null  int64  
 13  city                34815 non-null  object 
 14  state               34815 non-null  object 
 15  yearBuilt           34815 non-null  int64  
 16  stre

In [7]:
print(realEstateCa_df.duplicated())

0        False
1        False
2        False
3        False
4        False
         ...  
35384    False
35385    False
35386    False
35387    False
35388    False
Length: 34815, dtype: bool


In [8]:
# Remove homes listed for less than $1000
filter1 = realEstateCa_df[realEstateCa_df['price'] >= 1000]

# Remove homes with less than 120 living area
filter2 = filter1[filter1['livingArea'] >= 120]

# Remove homes with no bathrooms
filter3 = filter2[filter2['bathrooms'] != 0]

# Remove homes with no homeType that have MULTI_FAMILY, LOT, MILY
filter4 = filter3.loc[filter3['homeType'] == 'MULTI_FAMILY', :] = filter3.loc[filter3['homeType'] != 'LOT', :] = filter3.loc[filter3['homeType'] != 'MILY', :]

# New data name
realestate_data = filter4.drop('time',axis=1)
realestate_data.head()

Unnamed: 0,index_id,id,stateId,countyId,cityId,country,datePostedString,is_bankOwned,is_forAuction,event,...,parking,garageSpaces,hasGarage,levels,pool,spa,isNewConstruction,hasPetsAllowed,homeType,county
1,1,94564-18496265,9,189,36958,USA,7/12/2021,0,0,Listed for sale,...,1,2,1,One Story,0,0,0,0,SINGLE_FAMILY,Contra Costa County
2,2,94564-18484475,9,190,36958,USA,7/8/2021,0,0,Listed for sale,...,1,2,1,One Story,0,0,0,0,SINGLE_FAMILY,Contra Costa County
3,3,94564-18494835,9,191,36958,USA,7/7/2021,0,0,Listed for sale,...,1,1,1,Two Story,0,1,0,0,SINGLE_FAMILY,Contra Costa County
5,5,94564-18484390,9,193,36958,USA,7/6/2021,0,0,Listed for sale,...,1,2,1,One Story,0,0,0,0,SINGLE_FAMILY,Contra Costa County
6,6,94564-50919342,9,194,36958,USA,7/5/2021,0,0,Listed for sale,...,1,2,1,Two Story,0,0,0,0,SINGLE_FAMILY,Contra Costa County


In [9]:
unique_ids = realestate_data["id"].nunique()
unique_ids

22953

In [11]:
# Export categories_df and subcategories_df as CSV files.
realestate_data.to_csv("realestate_data.csv", index=False)

