In [625]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize

df = pd.read_csv("data/listings_data.csv")

In [626]:
print("Initial Data Sample:\n")
print(df.head())

Initial Data Sample:

   Square Feet   Price  Price/Sqft      Zip    City State      Street Address  \
0       1140.0  475000  416.666667  98001.0  Algona    WA      707 Celery Ave   
1       1310.0  470000  358.778626  98001.0  Algona    WA         512 Main St   
2       1680.0  500000  297.619048  98001.0  Algona    WA       221 2nd Ave N   
3       1240.0  525000  423.387097  98001.0  Algona    WA  1036 Algona Blvd N   
4          NaN  660000         NaN  98001.0  Algona    WA       515 4th Ave N   

   Bedrooms  Bathrooms                                                URL  \
0       3.0        3.0  https://www.redfin.com/WA/Algona/707-Celery-Av...   
1       4.0        4.0  https://www.redfin.com/WA/Algona/512-Main-St-9...   
2       4.0        4.0  https://www.redfin.com/WA/Algona/221-2nd-Ave-N...   
3       4.0        4.0  https://www.redfin.com/WA/Algona/1036-Algona-B...   
4       NaN        NaN  https://www.redfin.com/WA/Algona/515-4th-Ave-N...   

                            

In [627]:
print("\nDataset Overview:\n")
print(df.info())


Dataset Overview:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3151 entries, 0 to 3150
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Square Feet     2813 non-null   float64
 1   Price           3151 non-null   object 
 2   Price/Sqft      2813 non-null   float64
 3   Zip             3149 non-null   float64
 4   City            3149 non-null   object 
 5   State           3149 non-null   object 
 6   Street Address  3151 non-null   object 
 7   Bedrooms        2805 non-null   float64
 8   Bathrooms       2805 non-null   float64
 9   URL             3151 non-null   object 
 10  Image           3133 non-null   object 
dtypes: float64(5), object(6)
memory usage: 270.9+ KB
None


In [628]:
print("\nDescriptive Statistics:\n")
print(df.describe())


Descriptive Statistics:

        Square Feet   Price/Sqft           Zip     Bedrooms    Bathrooms
count   2813.000000  2813.000000   3149.000000  2805.000000  2805.000000
mean    2201.772840   526.485800  98057.575421     3.308734     3.308734
std     1532.618476   358.979561     71.981342     1.548161     1.548161
min      317.000000    18.172727  98001.000000     0.000000     0.000000
25%     1248.000000   350.025038  98019.000000     2.000000     2.000000
50%     1860.000000   449.640288  98034.000000     3.000000     3.000000
75%     2705.000000   592.391304  98072.000000     4.000000     4.000000
max    30036.000000  6352.087114  99809.000000    36.000000    36.000000


In [629]:
print("\nChecking for Missing Values:\n")
print(df.isnull().sum())



Checking for Missing Values:

Square Feet       338
Price               0
Price/Sqft        338
Zip                 2
City                2
State               2
Street Address      0
Bedrooms          346
Bathrooms         346
URL                 0
Image              18
dtype: int64


In [630]:
string_columns = ["Zip", "Street Address", "State", "URL", "Image", "City"]
for cols in string_columns:
    df[cols]= df[cols].fillna("")

In [631]:
numeric_columns = ["Price", "Square Feet", "Bathrooms", "Bedrooms"]
for cols in numeric_columns:
    df[cols] = df[cols].replace({"Unknown": np.nan, 0: np.nan})

In [632]:
df = df.dropna(subset=["Price", "Square Feet"], how="all")
df = df.drop_duplicates()


In [633]:
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors="coerce")

In [634]:
for col in string_columns:
    df[col] = df[col].astype(str)

In [635]:
df["City"] = df["City"].str.title().str.strip()
df["State"] = df["State"].str.upper().str.strip()
df["Zip"] = df["Zip"].str.replace(".0","", regex=False)

In [636]:
grouped = df.groupby("City")
Q1 = grouped["Price"].transform(lambda x: x.quantile(0.25))
Q3 = grouped["Price"].transform(lambda x: x.quantile(0.75))
IQR = Q3 - Q1

lower_lim = Q1 - 1.5 * IQR
upper_lim = Q3 + 1.5 * IQR

df["Is_Outlier"] = (df["Price"] < lower_lim) | (df["Price"] > upper_lim)
print(df[df["Is_Outlier"]])

      Square Feet    Price   Price/Sqft    Zip         City State  \
4             NaN   660000          NaN  98001       Algona    WA   
5             NaN   110000          NaN  98001       Algona    WA   
20         3050.0  1598750   524.180328  98001       Auburn    WA   
120        5092.0  1780000   349.567950  98001       Auburn    WA   
167        5880.0  2299950   391.147959  98092       Auburn    WA   
...           ...      ...          ...    ...          ...   ...   
3071       3975.0  5990000  1506.918239  98072  Woodinville    WA   
3079       8185.0  5995000   732.437385  98072  Woodinville    WA   
3088       4222.0  5399999  1279.014448  98072  Woodinville    WA   
3092       4140.0  6900000  1666.666667  98077  Woodinville    WA   
3095      11935.0  6350000   532.048597  98077  Woodinville    WA   

             Street Address  Bedrooms  Bathrooms  \
4             515 4th Ave N       NaN        NaN   
5             318 3rd Ave N       NaN        NaN   
20         3724

In [637]:
def winsorize_column(group):
    return winsorize(group, limits=(0.05, 0.05))

df["Price"] = grouped["Price"].transform(winsorize_column)
df["Square Feet"] = grouped["Square Feet"].transform(winsorize_column)


In [638]:
df["Price/Sqft"] = df["Price"] / df["Square Feet"]

In [639]:
bins = [0, 200000, 500000, 1000000, float("inf")]
labels = ["Low", "Medium", "High", "Luxury"]

df["Price Category"] = pd.cut(df['Price'], bins=bins, labels=labels)


In [640]:
df = df.drop(columns=["Is_Outlier"])

In [641]:
print("\nFinal Dataset Overview:")
print(df.info())


Final Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
Index: 2944 entries, 0 to 3150
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Square Feet     2662 non-null   float64 
 1   Price           2944 non-null   int64   
 2   Price/Sqft      2662 non-null   float64 
 3   Zip             2944 non-null   object  
 4   City            2944 non-null   object  
 5   State           2944 non-null   object  
 6   Street Address  2944 non-null   object  
 7   Bedrooms        2622 non-null   float64 
 8   Bathrooms       2622 non-null   float64 
 9   URL             2944 non-null   object  
 10  Image           2944 non-null   object  
 11  Price Category  2944 non-null   category
dtypes: category(1), float64(4), int64(1), object(6)
memory usage: 279.1+ KB
None


In [642]:
print("\nFinal Descriptive Statistics:")
print(df.describe())


Final Descriptive Statistics:
        Square Feet         Price   Price/Sqft     Bedrooms    Bathrooms
count   2662.000000  2.944000e+03  2662.000000  2622.000000  2622.000000
mean    2233.056349  1.207817e+06   511.794024     3.318078     3.318078
std     1472.616958  2.010390e+06   326.123455     1.558074     1.558074
min      317.000000  5.000000e+04    45.454545     1.000000     1.000000
25%     1250.000000  4.998000e+05   338.500501     2.000000     2.000000
50%     1870.000000  7.750000e+05   445.459462     3.000000     3.000000
75%     2759.750000  1.274960e+06   577.849779     4.000000     4.000000
max    17599.000000  5.800000e+07  5290.322581    36.000000    36.000000


In [643]:
print("\nUnique Values in Categorical Columns:")
for col in ['City', 'State', 'Price Category']:
    print(f"\n{col} unique values:")
    print(df[col].value_counts())


Unique Values in Categorical Columns:

City unique values:
City
Kent                257
Auburn              251
Renton              226
Bellevue            212
Kirkland            201
Federal Way         193
Bothell             187
Redmond             115
Woodinville         100
Shoreline            98
Issaquah             94
Burien               77
Des Moines           70
Black Diamond        66
North Bend           64
Carnation            62
Duvall               61
Maple Valley         54
Enumclaw             54
Sammamish            54
Seattle              51
Seatac               49
Kenmore              47
Covington            39
Tukwila              39
Mercer Island        38
Newcastle            31
Snoqualmie Pass      29
Snoqualmie           18
Normandy Park        17
Lake Forest Park     16
Medina               15
Milton                8
Tacoma                7
Algona                6
Clyde Hill            6
Greenwater            6
Yarrow Point          5
Pacific               5

In [644]:
print("\nRemaining Missing Values:")
print(df.isnull().sum())


Remaining Missing Values:
Square Feet       282
Price               0
Price/Sqft        282
Zip                 0
City                0
State               0
Street Address      0
Bedrooms          322
Bathrooms         322
URL                 0
Image               0
Price Category      0
dtype: int64


In [645]:
df.to_csv("./Data/cleaned_listings_data.csv", index=False)
print("\nCleaned dataset saved successfully!")


Cleaned dataset saved successfully!
