In [29]:
import pandas as pd

In [30]:
# Load the raw listings table and keep its dimensions around for later cells.
listings = pd.read_csv("data/listings.csv")
rows = listings.shape[0]
cols = listings.shape[1]

In [31]:
# Share of missing values per column, as a percentage of all rows.
# Note: despite its name, `no_nan_values` holds the *count* of NaN entries per column.
no_nan_values = listings.isna().sum()
missing_rate_per_col = no_nan_values.div(rows).mul(100)
# Show the emptiest columns first.
missing_rate_per_col.sort_values(ascending=False)

license                           100.000000
neighbourhood_group               100.000000
price                              35.143972
last_review                        26.035944
reviews_per_month                  26.035944
host_name                           0.041386
host_id                             0.000000
neighbourhood                       0.000000
latitude                            0.000000
longitude                           0.000000
room_type                           0.000000
name                                0.000000
minimum_nights                      0.000000
number_of_reviews                   0.000000
calculated_host_listings_count      0.000000
availability_365                    0.000000
number_of_reviews_ltm               0.000000
id                                  0.000000
dtype: float64

In [32]:
# Cleaning pass:
#   1. drop the two columns that are entirely empty,
#   2. keep only the rows that have a price,
#   3. treat a missing reviews_per_month as 0.
filtered_listings = (
    listings
    .drop(["license", "neighbourhood_group"], axis="columns")
    .loc[lambda frame: frame["price"].notna()]
    .assign(reviews_per_month=lambda frame: frame["reviews_per_month"].fillna(0))
)

In [36]:
# Parse last_review into datetimes, then derive two review features from it.
# assign() evaluates its keyword arguments in order, so the later lambdas see the
# already-parsed last_review column.
# NOTE(review): days_since_review is measured from the date the cell runs, so its values
# change between runs; listings without a review get NaN.
filtered_listings = filtered_listings.assign(
    last_review=lambda frame: pd.to_datetime(frame["last_review"]),
    has_review=lambda frame: frame["last_review"].notna(),
    days_since_review=lambda frame: (pd.to_datetime("today") - frame["last_review"]).dt.days,
)

In [39]:
# Replace missing host names with an explicit "Unknown" placeholder.
filtered_listings = filtered_listings.assign(
    host_name=filtered_listings["host_name"].fillna("Unknown")
)

In [44]:
# Inspect the column dtypes of the cleaned frame (last_review was converted with pd.to_datetime above).
filtered_listings.dtypes

id                                         int64
name                                      object
host_id                                    int64
host_name                                 object
neighbourhood                             object
latitude                                 float64
longitude                                float64
room_type                                 object
price                                    float64
minimum_nights                             int64
number_of_reviews                          int64
last_review                       datetime64[ns]
reviews_per_month                        float64
calculated_host_listings_count             int64
availability_365                           int64
number_of_reviews_ltm                      int64
has_review                                  bool
days_since_review                        float64
dtype: object

In [45]:
# Persist the cleaned table. index=False keeps the row labels out of the file: they are
# leftovers from the raw frame (non-contiguous after rows without a price were dropped)
# and would otherwise reappear as a spurious "Unnamed: 0" column when the CSV is re-read.
filtered_listings.to_csv("data/listings_clean.csv", index=False)