In [None]:
import pandas as pd
from datetime import datetime
from ast import literal_eval

## Load Data

In [None]:
# Read the gzip-compressed listings export; the first CSV column is used as the index.
df = pd.read_csv(
    'data/listings.csv.gz',
    compression='gzip',
    index_col=0,
)

## Drop Irrelevant Features

In [None]:
# Print pandas' summary of the frame as loaded, before any columns are removed.
df.info()

Remove features with more than 50% missing values.

In [None]:
# Share of missing values in every column.
missing_share = df.isnull().mean()
print(f"Features with more than 50% missing values:\n{df.columns[missing_share > .5]}")

# Keep only the columns in which at least half of the rows hold a value.
df = df.dropna(axis=1, thresh=.5*len(df))

Remove columns that hold only one unique value: a constant column carries no information, so it is redundant for training machine learning models.

In [None]:
# Number of distinct non-null values per column.
unique_counts = df.nunique()
print(f"Features with only one unique value:\n{df.columns[unique_counts == 1]}")

# Keep every column whose distinct-value count is not exactly one.
df = df.loc[:, unique_counts != 1]

In [None]:
# Print the frame summary again to see which columns are left after the two filters above.
df.info()

In [None]:
# Hand-picked features to discard: URLs, free text, host metadata, fine-grained
# availability / night limits, review sub-scores and host listing counts.
feats = [
    'listing_url',
    'last_scraped',
    'source',
    'name',
    'description',
    'picture_url',
    'host_id',
    'host_url',
    'host_name',
    'host_location',
    'host_about',
    'host_response_rate',
    'host_acceptance_rate',
    'host_thumbnail_url',
    'host_picture_url',
    'host_listings_count',
    'host_verifications',
    'latitude',
    'longitude',
    'property_type',
    'bathrooms_text',
    'bedrooms',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
    'availability_30',
    'availability_60',
    'availability_90',
    'availability_365',
    'calendar_last_scraped',
    'number_of_reviews_ltm',
    'number_of_reviews_l30d',
    'last_review',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    ]

# errors='ignore': the missing-value and single-value filters that run earlier are
# data-dependent and may already have removed some of these columns; without it
# drop() raises KeyError for every label that is no longer present.
df = df.drop(columns=feats, errors='ignore')
df.info()

## Standardizing and Formatting Data

Convert the price from object to float

In [None]:
# Strip the currency symbol and thousands separators, then cast: "$1,250.00" -> 1250.0.
# `regex` is passed explicitly because its default changed between pandas versions,
# and a bare '$' is an end-of-string anchor when interpreted as a regular expression.
# Inside a character class both '$' and ',' are plain literals.
df['price'] = df['price'].str.replace(r'[$,]', '', regex=True).astype(float)

Convert f/t entries to boolean [0, 1] entries

In [None]:
# Columns stored as the strings 't' / 'f'.
boolean_cols = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'instant_bookable'
]

# Map straight to 1/0; any value that is neither 't' nor 'f' (including missing) maps to NaN.
m = {'t': 1, 'f': 0}

for col in boolean_cols:
    # Do not cast through bool: NaN is truthy, so .astype(bool) would silently turn
    # every missing flag into 1 and hide it from later missing-value checks.
    # float keeps NaN intact, at the cost of the column being 0.0/1.0 instead of int.
    df[col] = df[col].map(m).astype(float)

Convert date strings to the number of days between each date and a fixed baseline date (2025-03-01)

In [None]:
# Fixed reference point from which the day counts are measured.
baseline_date = datetime(2025, 3, 1)

date_cols = ['host_since', 'first_review']

for col in date_cols:
    # Parse the column, then replace it with whole days between each date and the baseline.
    parsed = pd.to_datetime(df[col])
    df[col] = (baseline_date - parsed).dt.days

Convert host_response_time into a numerical value

In [None]:
# Response-time labels in the order they are numbered: first label -> 1, last label -> 4.
response_order = ['within an hour', 'within a few hours', 'within a day', 'a few days or more']
m = {label: rank for rank, label in enumerate(response_order, start=1)}

# Values without an entry in the mapping become NaN, so the result is stored as float.
encoded = df['host_response_time'].map(m)
df['host_response_time'] = encoded.astype(float)

Convert neighbourhood_cleansed and room_type to a numeric value through one-hot encoding

In [None]:
# Neighbourhood: one 0/1 indicator column per distinct value, replacing the source column.
dummies = pd.get_dummies(df['neighbourhood_cleansed']).astype(int)
df = df.join(dummies).drop(columns='neighbourhood_cleansed')

# Room type: same treatment.
dummies = pd.get_dummies(df['room_type']).astype(int)
df = df.join(dummies).drop(columns='room_type')

Convert the list of amenities to an integer count of its items

In [None]:
# Each cell is parsed with literal_eval and replaced by the length of the parsed object.
df['amenities'] = df['amenities'].map(lambda raw: len(literal_eval(raw)))

In [None]:
# Print the frame summary to check the dtypes produced by the conversions above.
df.info()

## Missing Data

In [None]:
# Number of missing values per column (displayed as the cell output).
df.isnull().sum()

Remove all rows of data where the price is unknown, as these cannot be used to train or test a model to predict the price.

In [None]:
# Flag the rows whose price is missing, report how many there are, then keep the rest.
price_missing = df['price'].isnull()
print(f"Number of rows containing no price: {price_missing.sum()}")

df = df[~price_missing]

Remove duplicate entries from the DataFrame

In [None]:
# A row is flagged when all of its column values repeat an earlier row (the index is not compared).
is_duplicate = df.duplicated()
print(f"Number of duplicate entries: {is_duplicate.sum()}")

# Keep the first occurrence of each row.
df = df[~is_duplicate]

Remove rows which have more than half of the features missing.

In [None]:
# Share of missing features in every row, computed once so the report and the filter agree.
row_missing_share = df.isnull().mean(axis=1)
print(f"Number of rows missing more than 50% of features: {row_missing_share.gt(.5).sum()}")

# Keep rows missing at most half of their features. Using `<= .5` (not `< .5`) removes
# exactly the rows counted above; rows at precisely 50% are kept.
df = df[row_missing_share.le(.5)]

In [None]:
# Print the per-column missing-value counts as one plain-text string.
print(df.isnull().sum().to_string())

## Outliers

Remove all rows where the price is higher than the 99th percentile. This is to remove faulty listings, which can negatively impact the model.

In [None]:
# 99th percentile of the price, computed once so the report and the filter use the same cut-off.
price_cap = df['price'].quantile(0.99)
print(f"Number of outlier above 99th percentile: {(df['price'] > price_cap).sum()}")

# Keep prices up to and including the cut-off. Using `<=` (not `<`) removes exactly the
# rows counted above; listings priced exactly at the 99th percentile are kept.
df = df[df['price'] <= price_cap]

---