In [92]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize
from dotenv import load_dotenv
from pymongo import MongoClient
import os

load_dotenv()
db_password = os.getenv("MONGODB_PWD")
connection_string = f"mongodb+srv://jayg8868:{db_password}@king-county-housing.mnhm7.mongodb.net/?retryWrites=true&w=majority&appName=king-county-housing"
try:
    client = MongoClient(connection_string)
    
    housing_data = client.housing_data
    raw_king_co_listings_data = housing_data.raw_king_co_listings_data
    
    raw_data = list(raw_king_co_listings_data.find())
    df = pd.DataFrame(raw_data)

except Exception as e:
    print(f"An Error Occured: {e}")



In [93]:
print("Initial Data Sample:\n")
print(df.head())

Initial Data Sample:

                        _id Square Feet   Price    Zip    City State  \
0  675a56b254bbed8997915b27        1680  500000  98001  Algona    WA   
1  675a56b254bbed8997915b28        1240  525000  98001  Algona    WA   
2  675a56b254bbed8997915b29              660000  98001  Algona    WA   
3  675a56b254bbed8997915b2a              110000  98001  Algona    WA   
4  675a56b254bbed8997915b2b        1727  515000  98092  Auburn    WA   

        Street Address Bedrooms Bathrooms  \
0        221 2nd Ave N        4         4   
1   1036 Algona Blvd N        4         4   
2        515 4th Ave N                      
3        318 3rd Ave N                      
4  1110 63rd St SE 102        2         2   

                                                 URL  \
0  https://www.redfin.com/WA/Algona/221-2nd-Ave-N...   
1  https://www.redfin.com/WA/Algona/1036-Algona-B...   
2  https://www.redfin.com/WA/Algona/515-4th-Ave-N...   
3  https://www.redfin.com/WA/Algona/318-3rd-Ave-N.

In [94]:
print("\nDataset Overview:\n")
print(df.info())


Dataset Overview:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2740 entries, 0 to 2739
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   _id             2740 non-null   object
 1   Square Feet     2740 non-null   object
 2   Price           2740 non-null   object
 3   Zip             2738 non-null   object
 4   City            2738 non-null   object
 5   State           2738 non-null   object
 6   Street Address  2740 non-null   object
 7   Bedrooms        2740 non-null   object
 8   Bathrooms       2740 non-null   object
 9   URL             2740 non-null   object
 10  Image           2721 non-null   object
dtypes: object(11)
memory usage: 235.6+ KB
None


In [95]:
print("\nDescriptive Statistics:\n")
print(df.describe())


Descriptive Statistics:

                             _id Square Feet   Price    Zip  City State  \
count                       2740        2740    2740   2738  2738  2738   
unique                      2740        1185    1347     65    45     1   
top     675a56b254bbed8997915b27              550000  98092  Kent    WA   
freq                           1         335      19    104   232  2738   

                   Street Address Bedrooms Bathrooms  \
count                        2740     2740      2740   
unique                       2527       18        18   
top     Homes Available Soon Plan        2         2   
freq                           11      630       630   

                                                      URL  \
count                                                2740   
unique                                               2556   
top     https://www.redfin.com/WA/Redmond/6051-137th-A...   
freq                                                    3   

           

In [96]:
print("\nChecking for Missing Values:\n")
print(df.isnull().sum())



Checking for Missing Values:

_id                0
Square Feet        0
Price              0
Zip                2
City               2
State              2
Street Address     0
Bedrooms           0
Bathrooms          0
URL                0
Image             19
dtype: int64


In [97]:
string_columns = ["Zip", "Street Address", "State", "URL", "Image", "City"]
for cols in string_columns:
    df[cols]= df[cols].fillna("")

In [98]:
numeric_columns = ["Price", "Square Feet", "Bathrooms", "Bedrooms"]
for cols in numeric_columns:
    df[cols] = df[cols].replace({"Unknown": np.nan, 0: np.nan})

In [99]:
df = df.dropna(subset=["Price", "Square Feet"], how="any")
df = df.drop_duplicates()
print(df)

                           _id Square Feet     Price    Zip          City  \
0     675a56b254bbed8997915b27        1680    500000  98001        Algona   
1     675a56b254bbed8997915b28        1240    525000  98001        Algona   
2     675a56b254bbed8997915b29                660000  98001        Algona   
3     675a56b254bbed8997915b2a                110000  98001        Algona   
4     675a56b254bbed8997915b2b        1727    515000  98092        Auburn   
...                        ...         ...       ...    ...           ...   
2735  675a56b254bbed89979165d6                499000  98072   Woodinville   
2736  675a56b254bbed89979165d7                575000  98077   Woodinville   
2737  675a56b254bbed89979165d8        5200   5795000  98004  Yarrow Point   
2738  675a56b254bbed89979165d9        7465  15750000  98004  Yarrow Point   
2739  675a56b254bbed89979165da        6240  17288000  98004  Yarrow Point   

     State       Street Address Bedrooms Bathrooms  \
0       WA        221

In [100]:
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors="coerce")

In [101]:
for col in string_columns:
    df[col] = df[col].astype(str)

In [102]:
df["City"] = df["City"].str.title().str.strip()
df["State"] = df["State"].str.upper().str.strip()
df["Zip"] = df["Zip"].str.replace(".0","", regex=False)

In [103]:
grouped = df.groupby("City")
Q1 = grouped["Price"].transform(lambda x: x.quantile(0.25))
Q3 = grouped["Price"].transform(lambda x: x.quantile(0.75))
IQR = Q3 - Q1

lower_lim = Q1 - 1.5 * IQR
upper_lim = Q3 + 1.5 * IQR

df["Is_Outlier"] = (df["Price"] < lower_lim) | (df["Price"] > upper_lim)
print(df[df["Is_Outlier"]])

                           _id  Square Feet    Price    Zip         City  \
3     675a56b254bbed8997915b2a          NaN   110000  98001       Algona   
7     675a56b254bbed8997915b2e       3990.0  1725000  98092       Auburn   
29    675a56b254bbed8997915b44       3050.0  1598750  98001       Auburn   
100   675a56b254bbed8997915b8b       5092.0  1780000  98001       Auburn   
139   675a56b254bbed8997915bb2       5880.0  2299950  98092       Auburn   
...                        ...          ...      ...    ...          ...   
2681  675a56b254bbed89979165a0       8185.0  5995000  98072  Woodinville   
2684  675a56b254bbed89979165a3       5198.0  4598990  98072  Woodinville   
2690  675a56b254bbed89979165a9       4140.0  6900000  98077  Woodinville   
2695  675a56b254bbed89979165ae      11935.0  6350000  98077  Woodinville   
2728  675a56b254bbed89979165cf       5373.0  4698990  98072  Woodinville   

     State         Street Address  Bedrooms  Bathrooms  \
3       WA          318 3rd A

In [104]:
def winsorize_column(group):
    return winsorize(group, limits=(0.05, 0.05))

df["Price"] = grouped["Price"].transform(winsorize_column)
df["Square Feet"] = grouped["Square Feet"].transform(winsorize_column)


In [105]:
df["Price/Sqft"] = df["Price"] / df["Square Feet"]

In [106]:
bins = df['Price'].quantile([0, 0.25, 0.5, 0.75, 1.0])
labels = ["Low", "Medium-low", "Medium-High", "High"]

df["Price Category"] = pd.cut(df["Price"], bins=bins, labels=labels, include_lowest=True)

In [107]:
df = df.drop(columns=["Is_Outlier"])

In [108]:
print("\nFinal Dataset Overview:")
print(df.info())


Final Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
Index: 2729 entries, 0 to 2739
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   _id             2729 non-null   object  
 1   Square Feet     2422 non-null   float64 
 2   Price           2729 non-null   int64   
 3   Zip             2729 non-null   object  
 4   City            2729 non-null   object  
 5   State           2729 non-null   object  
 6   Street Address  2729 non-null   object  
 7   Bedrooms        2396 non-null   float64 
 8   Bathrooms       2396 non-null   float64 
 9   URL             2729 non-null   object  
 10  Image           2729 non-null   object  
 11  Price/Sqft      2422 non-null   float64 
 12  Price Category  2729 non-null   category
dtypes: category(1), float64(4), int64(1), object(7)
memory usage: 280.0+ KB
None


In [109]:
print("\nFinal Descriptive Statistics:")
print(df.describe())


Final Descriptive Statistics:
        Square Feet         Price     Bedrooms    Bathrooms   Price/Sqft
count   2422.000000  2.729000e+03  2396.000000  2396.000000  2422.000000
mean    2233.047894  1.182538e+06     3.305509     3.305509   509.102834
std     1622.901446  2.028134e+06     1.598586     1.598586   322.260421
min      317.000000  3.000000e+04     0.000000     0.000000    41.809803
25%     1250.000000  4.850000e+05     2.000000     2.000000   340.909091
50%     1870.000000  7.740000e+05     3.000000     3.000000   443.987012
75%     2726.000000  1.258075e+06     4.000000     4.000000   575.921782
max    30036.000000  5.800000e+07    36.000000    36.000000  5290.322581


In [110]:
print("\nUnique Values in Categorical Columns:")
for col in ['City', 'State', 'Price Category']:
    print(f"\n{col} unique values:")
    print(df[col].value_counts())


Unique Values in Categorical Columns:

City unique values:
City
Kent                232
Auburn              227
Renton              198
Bothell             193
Bellevue            186
Federal Way         183
Kirkland            179
Redmond             124
Woodinville         109
Shoreline            88
Issaquah             80
Burien               79
Des Moines           64
Black Diamond        61
Seatac               59
Duvall               59
Carnation            59
North Bend           54
Enumclaw             52
Seattle              51
Sammamish            51
Maple Valley         46
Tukwila              43
Kenmore              39
Covington            32
Newcastle            30
Mercer Island        26
Snoqualmie Pass      24
Normandy Park        16
Snoqualmie           16
Lake Forest Park     15
Medina                9
Pacific               6
Milton                5
Greenwater            5
Tacoma                5
Yarrow Point          4
Hunts Point           4
Algona                4

In [111]:
print("\nRemaining Missing Values:")
print(df.isnull().sum())


Remaining Missing Values:
_id                 0
Square Feet       307
Price               0
Zip                 0
City                0
State               0
Street Address      0
Bedrooms          333
Bathrooms         333
URL                 0
Image               0
Price/Sqft        307
Price Category      0
dtype: int64


In [112]:
df.to_csv(os.path.join('..', 'data', 'cleaned_listings_data.csv'), index=False)
print(os.path.join('..', 'data', 'cleaned_listings_data.csv'))
print("Cleaned dataset saved to CSV successfully!")

..\data\cleaned_listings_data.csv
Cleaned dataset saved to CSV successfully!


In [None]:
df_dict = df.to_dict(orient="records")
try:
    cleaned_king_co_listings_data = housing_data.cleaned_king_co_listings_data
    cleaned_king_co_listings_data.insert_many(df_dict)

    print("Cleaned dataset saved to DB successfully!")
    
except Exception as e:
    print(f"Data could not be saved to DB: {e}")


Cleaned dataset saved to DB successfully!
