In [9]:
import pandas as pd
from datetime import datetime
from ast import literal_eval

## Load Data

In [10]:
df = pd.read_csv('data/listings.csv.gz', index_col=0, compression='gzip', )

## Drop Irrelevant Features

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95144 entries, 56229 to 1307795865634995863
Data columns (total 74 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   listing_url                                   95144 non-null  object 
 1   scrape_id                                     95144 non-null  int64  
 2   last_scraped                                  95144 non-null  object 
 3   source                                        95144 non-null  object 
 4   name                                          95144 non-null  object 
 5   description                                   92196 non-null  object 
 6   neighborhood_overview                         43463 non-null  object 
 7   picture_url                                   95134 non-null  object 
 8   host_id                                       95144 non-null  int64  
 9   host_url                                      95

Furthermore, remove features with more than 50% missing values.

In [12]:
print(f"Features with more than 50% missing values:\n{df.columns[df.isnull().mean() > .5]}")

df = df.dropna(thresh=.5*len(df), axis=1)

Features with more than 50% missing values:
Index(['neighborhood_overview', 'host_neighbourhood', 'neighbourhood',
       'neighbourhood_group_cleansed', 'calendar_updated', 'license'],
      dtype='object')


Remove columns which hold only one unique value, making it redundant to include for training machine learning models.

In [13]:
print(f"Features with only one unique value:\n{df.columns[df.nunique() == 1]}")

df = df.loc[:,df.apply(pd.Series.nunique) != 1]

Features with only one unique value:
Index(['scrape_id', 'has_availability'], dtype='object')


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95144 entries, 56229 to 1307795865634995863
Data columns (total 66 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   listing_url                                   95144 non-null  object 
 1   last_scraped                                  95144 non-null  object 
 2   source                                        95144 non-null  object 
 3   name                                          95144 non-null  object 
 4   description                                   92196 non-null  object 
 5   picture_url                                   95134 non-null  object 
 6   host_id                                       95144 non-null  int64  
 7   host_url                                      95144 non-null  object 
 8   host_name                                     95137 non-null  object 
 9   host_since                                    95

In [15]:
feats = [
    'listing_url',
    'last_scraped',
    'source',
    'name',
    'description',
    'picture_url',
    'host_id',
    'host_url',
    'host_name',
    'host_location',
    'host_about',
    'host_response_rate',
    'host_acceptance_rate',
    'host_thumbnail_url',
    'host_picture_url',
    'host_listings_count',
    'host_verifications',
    'latitude',
    'longitude',
    'property_type',
    'bathrooms_text',
    'bedrooms',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
    'calendar_last_scraped',
    'number_of_reviews_ltm',
    'number_of_reviews_l30d',
    'last_review',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    ]
df = df.drop(feats, axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95144 entries, 56229 to 1307795865634995863
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   host_since                 95137 non-null  object 
 1   host_response_time         62709 non-null  object 
 2   host_is_superhost          93396 non-null  object 
 3   host_total_listings_count  95137 non-null  float64
 4   host_has_profile_pic       95137 non-null  object 
 5   host_identity_verified     95137 non-null  object 
 6   neighbourhood_cleansed     95144 non-null  object 
 7   room_type                  95144 non-null  object 
 8   accommodates               95144 non-null  int64  
 9   bathrooms                  62744 non-null  float64
 10  beds                       62690 non-null  float64
 11  amenities                  95144 non-null  object 
 12  price                      62777 non-null  object 
 13  minimum_nights             95144 

## Stardardizing and Formatting Data

Convert the price from object to float

In [16]:
df['price'] = df['price'].str.replace('$', '').str.replace(',', '').astype(float)

Convert f/t entries to boolean [0, 1] entries

In [17]:
boolean_cols = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'instant_bookable'
]

m = {'t': True, 'f': False}

for col in boolean_cols:
    df[col] = df[col].map(m).astype(bool).astype(int)

Convert date strings to DateTime objects

In [18]:
baseline_date = datetime(2025, 3, 1)

date_cols = [
    'host_since',
    'first_review',
]

for col in date_cols:
    df[col] = pd.to_datetime(df[col])
    df[col] = (baseline_date - df[col]).dt.days

Convert host_response_time into a numerical value

In [19]:
m = {'within an hour': 1, 'within a few hours': 2, 'within a day': 3, 'a few days or more': 4}

df['host_response_time'] = df['host_response_time'].map(m).astype(float)

Convert neighbourhood_cleansed and room_type to a numeric value through one-hot encoding

In [20]:
# Neighbourhood
dummies = pd.get_dummies(df['neighbourhood_cleansed']).astype(int)
df = df.join(dummies)
df.drop('neighbourhood_cleansed', axis=1, inplace=True)

# Room type
dummies = pd.get_dummies(df['room_type']).astype(int)
df = df.join(dummies)
df.drop('room_type', axis=1, inplace=True)

Convert the list of amenities to a integer count of the list

In [21]:
df['amenities'] = df['amenities'].apply(literal_eval).apply(len)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 95144 entries, 56229 to 1307795865634995863
Data columns (total 55 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   host_since                 95137 non-null  float64
 1   host_response_time         62709 non-null  float64
 2   host_is_superhost          95144 non-null  int64  
 3   host_total_listings_count  95137 non-null  float64
 4   host_has_profile_pic       95144 non-null  int64  
 5   host_identity_verified     95144 non-null  int64  
 6   accommodates               95144 non-null  int64  
 7   bathrooms                  62744 non-null  float64
 8   beds                       62690 non-null  float64
 9   amenities                  95144 non-null  int64  
 10  price                      62777 non-null  float64
 11  minimum_nights             95144 non-null  int64  
 12  maximum_nights             95144 non-null  int64  
 13  number_of_reviews          95144 

## Missing Data

In [23]:
df.isnull().sum()

host_since                       7
host_response_time           32435
host_is_superhost                0
host_total_listings_count        7
host_has_profile_pic             0
host_identity_verified           0
accommodates                     0
bathrooms                    32400
beds                         32454
amenities                        0
price                        32367
minimum_nights                   0
maximum_nights                   0
number_of_reviews                0
first_review                 24584
review_scores_rating         24584
instant_bookable                 0
reviews_per_month            24584
Barking and Dagenham             0
Barnet                           0
Bexley                           0
Brent                            0
Bromley                          0
Camden                           0
City of London                   0
Croydon                          0
Ealing                           0
Enfield                          0
Greenwich           

Remove all rows of data where the price is unknown, as these cannot be used to train or test a model to predict the price.

In [24]:
print(f"Number of rows containing no price: {df['price'].isnull().sum()}")

df = df.dropna(subset=['price'])

Number of rows containing no price: 32367


Remove duplicate entries from the DataFrame

In [25]:
print(f"Number of duplicate entries: {df.duplicated().sum()}")

df = df.drop_duplicates()

Number of duplicate entries: 258


Remove rows which have more than half of the features missing.

In [26]:
print(f"Number of rows missing more than 50% of features: {df.isnull().mean(axis=1).gt(.5).sum()}")

df = df[df.isnull().mean(axis=1) < .5]

Number of rows missing more than 50% of features: 0


In [27]:
print(df.isnull().sum().to_string())

host_since                       2
host_response_time            7432
host_is_superhost                0
host_total_listings_count        2
host_has_profile_pic             0
host_identity_verified           0
accommodates                     0
bathrooms                       83
beds                           130
amenities                        0
price                            0
minimum_nights                   0
maximum_nights                   0
number_of_reviews                0
first_review                 14267
review_scores_rating         14267
instant_bookable                 0
reviews_per_month            14267
Barking and Dagenham             0
Barnet                           0
Bexley                           0
Brent                            0
Bromley                          0
Camden                           0
City of London                   0
Croydon                          0
Ealing                           0
Enfield                          0
Greenwich           

## Outliers

Remove all rows where the price is higher than the 99th percentile. This is to remove faulty listings, which can negatively impact the model.

In [28]:
print(f"Number of outlier above 99th percentile: {len(df[df['price'] > df['price'].quantile(0.99)])}")

df = df[df['price'] < df['price'].quantile(0.99)]

Number of outlier above 99th percentile: 625


---

In [29]:
df.to_csv('data/listings_cleaned.csv.gz', index=True, compression='gzip')