In [105]:
import pandas as pd
import numpy as np

# Load the listings CSV into the working frame that the later cells operate on.
df = pd.read_csv(filepath_or_buffer="data/listings_data.csv")

In [106]:
# First-look report on the loaded frame: sample rows, schema summary,
# descriptive statistics and per-column missing-value counts.
print("Initial Data Sample:\n")
print(df.head())

print("\nDataset Overview:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()

print("\nDescriptive Statistics:\n")
print(df.describe())

print("\nChecking for Missing Values:\n")
print(df.isnull().sum())


Initial Data Sample:

   Square Feet   Price  Price/Sqft      Zip    City State      Street Address  \
0       1140.0  475000  416.666667  98001.0  Algona    WA      707 Celery Ave   
1       1310.0  470000  358.778626  98001.0  Algona    WA         512 Main St   
2       1680.0  500000  297.619048  98001.0  Algona    WA       221 2nd Ave N   
3       1240.0  525000  423.387097  98001.0  Algona    WA  1036 Algona Blvd N   
4          NaN  660000         NaN  98001.0  Algona    WA       515 4th Ave N   

   Bedrooms  Bathrooms                                                URL  \
0       3.0        3.0  https://www.redfin.com/WA/Algona/707-Celery-Av...   
1       4.0        4.0  https://www.redfin.com/WA/Algona/512-Main-St-9...   
2       4.0        4.0  https://www.redfin.com/WA/Algona/221-2nd-Ave-N...   
3       4.0        4.0  https://www.redfin.com/WA/Algona/1036-Algona-B...   
4       NaN        NaN  https://www.redfin.com/WA/Algona/515-4th-Ave-N...   

                            

In [107]:
# Treat the literal placeholder "Unknown" in Price / Square Feet as missing.
df[["Price", "Square Feet"]] = df[["Price", "Square Feet"]].replace("Unknown", np.nan)

# fillna() returns a new Series rather than modifying the column, so the
# result has to be assigned back; a bare call leaves the frame unchanged.
df["Zip"] = df["Zip"].fillna("")

# how="all": a row is dropped only when BOTH Price and Square Feet are
# missing; rows lacking just one of the two are kept.
df = df.dropna(subset=["Price", "Square Feet"], how="all")

# Discard rows that are exact duplicates across every column.
df = df.drop_duplicates()

In [108]:
# Normalise the casing of the location columns, then trim surrounding whitespace.
city_title_cased = df["City"].str.title()
df["City"] = city_title_cased.str.strip()

state_upper_cased = df["State"].str.upper()
df["State"] = state_upper_cased.str.strip()

In [109]:
# Coerce the measurement columns to numeric dtypes; anything that cannot be
# parsed becomes NaN (errors="coerce") instead of raising.
numeric_columns = ["Price", "Square Feet", "Price/Sqft", "Bathrooms", "Bedrooms"]
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Render the identifier/text columns as strings.
string_columns = ["Zip", "Street Address", "State", "URL", "Image"]
for col in string_columns:
    # astype(str) on its own turns missing entries into the literal text
    # "nan"; mask those positions back to NaN so they still count as missing.
    df[col] = df[col].astype(str).where(df[col].notna())

# A Zip that was read as a float renders as e.g. "98001.0". Strip only that
# trailing ".0" suffix: the pattern is anchored to the end of the string so a
# ".0" occurring anywhere else in a value is left untouched.
df["Zip"] = df["Zip"].str.replace(r"\.0$", "", regex=True)


In [None]:
# Price tiers. pd.cut's defaults are spelled out below: intervals are
# right-closed, so a price of exactly 500000 lands in "Medium", and the lowest
# edge (0) is excluded from the first bin.
labels = ["Low", "Medium", "High", "Luxury"]
bins = [0, 200_000, 500_000, 1_000_000, float("inf")]

listing_prices = df["Price"]
df["Price Category"] = pd.cut(
    listing_prices,
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=False,
)


Unnamed: 0,Square Feet,Price,Price/Sqft,Zip,City,State,Street Address,Bedrooms,Bathrooms,URL,Image,Price Category
0,1140.0,475000,416.666667,98001,Algona,WA,707 Celery Ave,3.0,3.0,https://www.redfin.com/WA/Algona/707-Celery-Av...,https://ssl.cdn-redfin.com/photo/1/islphoto/29...,Medium
1,1310.0,470000,358.778626,98001,Algona,WA,512 Main St,4.0,4.0,https://www.redfin.com/WA/Algona/512-Main-St-9...,https://ssl.cdn-redfin.com/photo/1/islphoto/21...,Medium
2,1680.0,500000,297.619048,98001,Algona,WA,221 2nd Ave N,4.0,4.0,https://www.redfin.com/WA/Algona/221-2nd-Ave-N...,https://ssl.cdn-redfin.com/photo/1/islphoto/16...,Medium
3,1240.0,525000,423.387097,98001,Algona,WA,1036 Algona Blvd N,4.0,4.0,https://www.redfin.com/WA/Algona/1036-Algona-B...,https://ssl.cdn-redfin.com/photo/1/islphoto/65...,High
4,,660000,,98001,Algona,WA,515 4th Ave N,,,https://www.redfin.com/WA/Algona/515-4th-Ave-N...,https://ssl.cdn-redfin.com/photo/1/islphoto/20...,High
...,...,...,...,...,...,...,...,...,...,...,...,...
3146,,575000,,98077,Woodinville,WA,22123 NE 170th Pl,,,https://www.redfin.com/WA/Woodinville/22123-NE...,https://ssl.cdn-redfin.com/photo/1/islphoto/64...,High
3147,5200.0,5795000,1114.423077,98004,Yarrow Point,WA,4229 95th Ave NE,5.0,5.0,https://www.redfin.com/WA/Bellevue/4229-95th-A...,https://ssl.cdn-redfin.com/photo/1/islphoto/27...,Luxury
3148,7465.0,15750000,2109.845948,98004,Yarrow Point,WA,9015 NE 47th St,5.0,5.0,https://www.redfin.com/WA/Yarrow-Point/9015-NE...,https://ssl.cdn-redfin.com/photo/1/islphoto/38...,Luxury
3149,7830.0,8650000,1104.725415,98004,Yarrow Point,WA,4626 95th Ave NE,5.0,5.0,https://www.redfin.com/WA/Bellevue/4626-95th-A...,https://ssl.cdn-redfin.com/photo/1/islphoto/70...,Luxury
