In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick

In [None]:
# Locations of the two raw CSV inputs (relative paths)
area_csv_path = '../raw_data/HouseTS.csv'
listings_csv_path = '../raw_data/realtor-data.csv'

# area_df holds the HouseTS table, house_df holds the realtor listings
area_df = pd.read_csv(area_csv_path)
house_df = pd.read_csv(listings_csv_path)

In [None]:
# Distinct zipcodes present in area_df, as a plain Python list
unique_zipcodes_area_df = area_df['zipcode'].unique().tolist()
print(len(unique_zipcodes_area_df))

# Keep only the listings whose zip_code appears in unique_zipcodes_area_df
zip_in_area_mask = house_df['zip_code'].isin(unique_zipcodes_area_df)
filtered_house_df = house_df.loc[zip_in_area_mask]
filtered_house_df.info()

6226


In [None]:
# Clean-up pipeline, applied in this order:
#   1. drop 'brokered_by' and 'status'
#   2. drop duplicate rows (done while the address columns are still present,
#      so they take part in the duplicate comparison)
#   3. drop 'street', 'city', 'state' and 'prev_sold_date'
#   4. drop rows whose 'price' is NaN
filtered_house_df = (
    filtered_house_df
    .drop(columns=['brokered_by', 'status'])
    .drop_duplicates()
    .drop(columns=['street', 'city', 'state', 'prev_sold_date'])
    .dropna(subset=['price'])
)

In [None]:
# Select the rows (as a DataFrame) where 'bed', 'bath' and 'house_size'
# are all NaN at the same time
all_three_missing = (
    filtered_house_df[['bed', 'bath', 'house_size']].isna().all(axis=1)
)
nan_values = filtered_house_df[all_three_missing]

nan_values

In [None]:
# Remove the rows captured in nan_values (listings with no bed, bath or
# house_size, treated here as potential land sales) by excluding their
# index labels.
# .copy() makes cleaned_house_df an independent frame rather than a slice of
# filtered_house_df, so the columns added to it later do not trigger pandas'
# SettingWithCopyWarning.
is_in_nan_values = filtered_house_df.index.isin(nan_values.index)
cleaned_house_df = filtered_house_df[~is_in_nan_values].copy()

# Verify the result: the first count should equal the sum of the other two
print(f"Original rows in filtered_house_df: {len(filtered_house_df)}")
print(f"Rows in nan_values (potential land sales): {len(nan_values)}")
print(f"Rows in cleaned_house_df: {len(cleaned_house_df)}")
print("\nFirst few rows of cleaned_house_df:")
print(cleaned_house_df.head())

In [None]:
# Derived price ratios, each rounded to 2 decimals:
#   ppsf_house - price divided by house_size (price per square foot,
#                presumably; confirm the unit of house_size)
#   ppa_lot    - price divided by acre_lot (price per acre; acre_lot is used
#                as-is, no square-foot conversion is applied)
#
# assign() returns a new frame instead of writing columns into
# cleaned_house_df in place; the frame was produced by boolean filtering, and
# in-place column assignment on such a slice raises SettingWithCopyWarning.
#
# NOTE(review): rows with house_size == 0 or acre_lot == 0 would yield inf
# here - confirm whether such rows exist and how they should be treated.
cleaned_house_df = cleaned_house_df.assign(
    ppsf_house=(cleaned_house_df['price'] / cleaned_house_df['house_size']).round(2),
    ppa_lot=(cleaned_house_df['price'] / cleaned_house_df['acre_lot']).round(2),
)

In [244]:
# Print the column dtypes and non-null counts of cleaned_house_df, including
# the derived 'ppsf_house' and 'ppa_lot' columns
cleaned_house_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 780780 entries, 5199 to 2220331
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   price       780780 non-null  float64
 1   bed         770107 non-null  float64
 2   bath        758935 non-null  float64
 3   acre_lot    618782 non-null  float64
 4   zip_code    780780 non-null  float64
 5   house_size  723948 non-null  float64
 6   ppsf_house  723948 non-null  float64
 7   ppa_lot     618782 non-null  float64
dtypes: float64(8)
memory usage: 53.6 MB


In [245]:
# Percentile cutoffs for 'ppsf_house' and 'ppa_lot'.
# Despite the Q1/Q3 names these are the 10th and 90th percentiles
# (quantile 0.1 and 0.9), not the quartiles.
ppsf_values = cleaned_house_df['ppsf_house']
ppa_values = cleaned_house_df['ppa_lot']

# Lower (0.1) and upper (0.9) cutoff for price per square foot of the house
Q1_house, Q3_house = ppsf_values.quantile(0.1), ppsf_values.quantile(0.9)

# Lower (0.1) and upper (0.9) cutoff for price per acre of the lot
Q1_lot, Q3_lot = ppa_values.quantile(0.1), ppa_values.quantile(0.9)

In [None]:
# Keep rows whose price ratios fall strictly inside the chosen bounds.
# Rows with NaN in either ratio fail the comparisons and are dropped.
ppsf_col = cleaned_house_df['ppsf_house']
ppa_col = cleaned_house_df['ppa_lot']

# House ratio: strictly between the Q1_house and Q3_house cutoffs
house_ratio_ok = (ppsf_col > Q1_house) & (ppsf_col < Q3_house)

# Lot ratio: strictly positive and below the Q3_lot cutoff
# NOTE(review): the lower bound is 0 rather than Q1_lot - confirm intentional
lot_ratio_ok = (ppa_col > 0) & (ppa_col < Q3_lot)

filtered_house_df = cleaned_house_df[house_ratio_ok & lot_ratio_ok]

# Summarize the filtered DataFrame
filtered_house_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 436746 entries, 5199 to 2220331
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   price       436746 non-null  float64
 1   bed         433660 non-null  float64
 2   bath        433175 non-null  float64
 3   acre_lot    436746 non-null  float64
 4   zip_code    436746 non-null  float64
 5   house_size  436746 non-null  float64
 6   ppsf_house  436746 non-null  float64
 7   ppa_lot     436746 non-null  float64
dtypes: float64(8)
memory usage: 30.0 MB


In [None]:
"""
# Drop rows with any NaN values
cleaned_df = filtered_house_df.dropna()

# Display the number of rows after dropping NaNs
print(f"Number of rows after dropping NaN values: {len(cleaned_df)}")

# Optional: Display the updated DataFrame info to confirm
print(cleaned_df.info())
"""

In [250]:
# Median of 'bed' and 'bath', computed on the filtered data
median_by_column = {
    column: filtered_house_df[column].median() for column in ('bed', 'bath')
}

# Build imputed_df as a new frame in which missing 'bed' / 'bath' values are
# replaced by their column median; filtered_house_df itself is left untouched
imputed_df = filtered_house_df.fillna(value=median_by_column)

# Report how many NaN values remain in each column
print("Number of NaN values after imputation:")
print(imputed_df.isna().sum())

# Show summary statistics of the imputed frame
imputed_df.describe()

Number of NaN values after imputation:
price         0
bed           0
bath          0
acre_lot      0
zip_code      0
house_size    0
ppsf_house    0
ppa_lot       0
dtype: int64


Unnamed: 0,price,bed,bath,acre_lot,zip_code,house_size,ppsf_house,ppa_lot
count,436746.0,436746.0,436746.0,436746.0,436746.0,436746.0,436746.0,436746.0
mean,578447.1,3.461621,2.664679,9.129796,59031.15917,2161.956803,271.313944,3010703.0
std,450228.2,1.188102,1.238211,723.294854,30626.41762,1238.217495,113.604414,2373609.0
min,20000.0,1.0,1.0,0.01,1431.0,140.0,135.43,1.6
25%,335000.0,3.0,2.0,0.14,30248.0,1420.0,184.19,1310000.0
50%,471000.0,3.0,2.0,0.19,63385.0,1882.0,237.74,2416667.0
75%,680000.0,4.0,3.0,0.33,85739.0,2569.0,328.1075,4055556.0
max,20500000.0,108.0,175.0,100000.0,98686.0,114000.0,634.34,11046780.0
