In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [9]:
# Read the raw listings CSV into the working DataFrame `df`.
raw_path = "../data/raw/listings_raw.csv"
df = pd.read_csv(filepath_or_buffer=raw_path)

In [10]:
# Summarise the raw frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36111 entries, 0 to 36110
Data columns (total 79 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            36111 non-null  int64  
 1   listing_url                                   36111 non-null  object 
 2   scrape_id                                     36111 non-null  int64  
 3   last_scraped                                  36111 non-null  object 
 4   source                                        36111 non-null  object 
 5   name                                          36109 non-null  object 
 6   description                                   35153 non-null  object 
 7   neighborhood_overview                         18704 non-null  object 
 8   picture_url                                   36111 non-null  object 
 9   host_id                                       36111 non-null 

In [None]:
# Report the size of the frame before it is narrowed down.
print("Original shape:", df.shape)

# Raw column name -> column name used from here on.
# The dict order is also the column order of the resulting frame.
cols_map = {
    "price": "price",
    "room_type": "room_type",
    "neighbourhood_cleansed": "neighbourhood",
    "minimum_nights": "minimum_nights",
    "availability_365": "availability_365",
    "number_of_reviews": "number_of_reviews",
    "host_id": "host_id",
    "calculated_host_listings_count": "host_listings_count",
    "latitude": "latitude",
    "longitude": "longitude",
}

# Select the wanted columns first, then rename them.
wanted_cols = list(cols_map)
df = df[wanted_cols]
df = df.rename(columns=cols_map)


Original shape: (36111, 79)


In [12]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,price,room_type,neighbourhood,minimum_nights,availability_365,number_of_reviews,host_id,host_listings_count,latitude,longitude
0,$66.00,Private room,Sunnyside,30,77,16,317540555,3,40.74698,-73.91763
1,,Private room,Nolita,30,0,0,68718914,1,40.72314,-73.99323
2,,Private room,Sunset Park,30,0,1,317770098,1,40.64607,-74.00552
3,,Entire home/apt,Clinton Hill,90,0,1,17211451,1,40.6837,-73.96115
4,$76.00,Private room,East Village,30,168,20,4765305,8,40.72147,-73.9827


In [16]:
# Look at the first five raw price values before parsing them.
df["price"].iloc[:5]

0    $66.00
1       NaN
2       NaN
3       NaN
4    $76.00
Name: price, dtype: object

In [18]:
# Turn price strings such as "$66.00" or "$1,200.00" into floats.
# astype(str) makes the .str accessor usable on every row; missing prices become
# the text "nan" along the way and come out as NaN again after to_numeric.
price_text = df["price"].astype(str)
price_text = price_text.str.replace(r"[$,]", "", regex=True)

# errors="coerce": any value that still does not parse as a number becomes NaN.
df["price"] = pd.to_numeric(price_text, errors="coerce")

df["price"].head(), df["price"].dtype


(0    66.0
 1     NaN
 2     NaN
 3     NaN
 4    76.0
 Name: price, dtype: float64,
 dtype('float64'))

In [19]:
# Keep only the rows in which every critical field is present.
critical_cols = ["price", "room_type", "neighbourhood"]
has_all_critical = df[critical_cols].notna().all(axis=1)
df = df.loc[has_all_critical].copy()


In [21]:
# Count-like columns: a missing value is replaced by 0.
numeric_cols = ["minimum_nights", "availability_365", "number_of_reviews", "host_listings_count"]

# One multi-column assignment instead of a one-line `for` loop; each listed
# column is filled with 0 where it is missing, all other columns are untouched.
df[numeric_cols] = df[numeric_cols].fillna(0)

# Sanity check: frame size and remaining missing values per column.
print(df.shape)
df.isna().sum()


(21328, 10)


price                  0
room_type              0
neighbourhood          0
minimum_nights         0
availability_365       0
number_of_reviews      0
host_id                0
host_listings_count    0
latitude               0
longitude              0
dtype: int64

In [22]:
# Summary statistics of the parsed prices (count, mean, std, min, quartiles, max).
df.loc[:, "price"].describe()

count    21328.000000
mean       680.526819
std       4480.453282
min         10.000000
25%         89.000000
50%        154.000000
75%        279.000000
max      50104.000000
Name: price, dtype: float64

In [26]:
# Trim extreme prices: keep only listings priced at or below the 99th
# percentile of the price column.
PRICE_CAP_QUANTILE = 0.99

# `df` is overwritten below, so recomputing the quantile on a second run of this
# cell would cut off a further 1% of the rows every time it is executed.
# The cutoff is therefore remembered on the frame itself (DataFrame.attrs) and
# reused on re-runs. A frame rebuilt from the load cell has no cached cutoff, so
# it is computed from scratch in that case.
# To use a different PRICE_CAP_QUANTILE, re-run the notebook from the load cell.
upper_limit = df.attrs.get("price_upper_limit")
if upper_limit is None:
    upper_limit = df["price"].quantile(PRICE_CAP_QUANTILE)

print("99th percentile price:", upper_limit)

df = df[df["price"] <= upper_limit].copy()
df.attrs["price_upper_limit"] = upper_limit  # marks this frame as already trimmed
print(df.shape)
df["price"].describe()


99th percentile price: 1336.0
(20905, 10)


count    20905.000000
mean       210.379000
std        184.039583
min         10.000000
25%         88.000000
50%        150.000000
75%        266.000000
max       1336.000000
Name: price, dtype: float64

In [27]:
# Store the two label columns with pandas' categorical dtype.
for label_col in ("room_type", "neighbourhood"):
    df[label_col] = df[label_col].astype("category")

# Show the resulting dtype of every column.
df.dtypes

price                   float64
room_type              category
neighbourhood          category
minimum_nights            int64
availability_365          int64
number_of_reviews         int64
host_id                   int64
host_listings_count       int64
latitude                float64
longitude               float64
dtype: object

In [28]:
# Write the cleaned frame to CSV, leaving out the index.
path = "../data/clean/listings_clean.csv"

# to_csv does not create missing folders and raises OSError when the target
# directory is absent, so make sure it exists before writing.
Path(path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(path, index=False)