In [165]:
import pandas as pd
import ast
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

In [166]:
# Load the raw listings export into `df`; every later cell depends on it.
try:
    df = pd.read_csv('listings.csv', encoding='utf-8')
except FileNotFoundError:
    print("Please provide the correct path to your CSV file")
    # Re-raise so execution stops here: continuing would leave `df` undefined
    # (or stale from an earlier run) and fail confusingly in a later cell.
    raise
else:
    print("Dataset loaded successfully!")
    print(f"Initial shape: {df.shape}")

Dataset loaded successfully!
Initial shape: (13790, 79)


In [155]:
# Summarise only the columns that contain at least one missing value
has_missing = df.isnull().any()
cols_with_nulls = df.columns[has_missing]
df.loc[:, cols_with_nulls].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13790 entries, 0 to 13789
Data columns (total 39 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   description                   13389 non-null  object 
 1   neighborhood_overview         6887 non-null   object 
 2   host_name                     13784 non-null  object 
 3   host_since                    13784 non-null  object 
 4   host_location                 10171 non-null  object 
 5   host_about                    6467 non-null   object 
 6   host_response_time            9961 non-null   object 
 7   host_response_rate            9961 non-null   object 
 8   host_acceptance_rate          10637 non-null  object 
 9   host_is_superhost             13532 non-null  object 
 10  host_thumbnail_url            13784 non-null  object 
 11  host_picture_url              13784 non-null  object 
 12  host_neighbourhood            5367 non-null   object 
 13  h

In [141]:
# Remove the columns that are 100% NaN
empty_columns = ['neighbourhood_group_cleansed', 'calendar_updated', 'license']
df.drop(empty_columns, axis=1, inplace=True)

In [54]:
# Remove the 'listing_url' column
df.drop(columns=['listing_url'], inplace=True)

In [55]:
# Remove the 'host_url' column
df.drop(columns=['host_url'], inplace=True)

In [56]:
# Remove the word 'scrape' from the values in column 'source', then trim the
# whitespace the removal leaves behind (a value such as 'city scrape' would
# otherwise become 'city ' with a trailing space).
df['source'] = (
    df['source']
    .str.replace('scrape', '', regex=False)
    .str.strip()
)

In [57]:
# Strip HTML tags from column 'description': everything from a '<' up to the
# next '>' on the same line is removed
html_tag_pattern = r'<.*?>'
df['description'] = df['description'].str.replace(pat=html_tag_pattern, repl='', regex=True)

In [58]:
# Strip HTML tags from column 'neighborhood_overview': everything from a '<'
# up to the next '>' on the same line is removed
html_tag_pattern = r'<.*?>'
df['neighborhood_overview'] = df['neighborhood_overview'].str.replace(
    pat=html_tag_pattern, repl='', regex=True
)

In [59]:
# Rename 'neighborhood_overview' to 'neighbourhood_overview' so the spelling
# matches the other 'neighbourhood...' column names
df.rename({'neighborhood_overview': 'neighbourhood_overview'}, axis='columns', inplace=True)

In [142]:
# Remove the 'neighbourhood' column
df.drop(columns=['neighbourhood'], inplace=True)

In [167]:
import warnings
# Suppress FutureWarning regarding pandas 3.0.
# NOTE(review): the imputation below no longer uses chained in-place fillna, so
# this cell does not need the filter any more; it is kept only so that other
# cells behave as before. Consider removing it.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Ordinal encoding of the response time: smaller value = faster response
host_response_time_mapping = {
    'within an hour': 0,
    'within a few hours': 1,
    'within a day': 2,
    'a few days or more': 3}

# Impute 'a few days or more' for NaNs, then apply the mapping.
# The filled Series is used directly instead of calling
# `df[col].fillna(..., inplace=True)`: that chained form operates on a
# temporary object and stops updating `df` under pandas copy-on-write.
# Values that are not keys of the mapping become NaN.
df['host_response_time_mapped'] = (
    df['host_response_time']
    .fillna('a few days or more')
    .map(host_response_time_mapping)
)

# Drop original column
df.drop('host_response_time', axis=1, inplace=True)

In [134]:
# Parse strings into actual lists ---
def parse_to_list(entry):
    """Convert one raw cell value into a Python list.

    Parameters
    ----------
    entry : object
        A list (returned unchanged), a string holding a Python list literal
        such as "['email', 'phone']", or anything else (NaN, None, numbers).

    Returns
    -------
    list
        The parsed list. An empty list is returned for missing values, for the
        strings 'None' and '[]', for strings that cannot be parsed, and for
        strings that parse to something other than a list.
    """
    # Check for a list first: calling pd.isna() on a list returns an array,
    # whose truth value is ambiguous and raises ValueError for 2+ elements.
    if isinstance(entry, list):
        return entry
    # NaN, None and every other non-string value carry no list to parse.
    if not isinstance(entry, str):
        return []
    if entry in ('None', '[]'):
        return []
    try:
        # Safely evaluate string to Python literal (no arbitrary code execution)
        evaluated = ast.literal_eval(entry)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers malformed literals such as a dict with a list key
        print(f"Warning: Could not parse entry: {entry}")
        return []
    # Ensure the evaluated result is a list
    return evaluated if isinstance(evaluated, list) else []

# Turn every 'host_verifications' entry into a real Python list (temporary column)
df['parsed_verifications'] = df['host_verifications'].apply(parse_to_list)

# Binarize the lists with MultiLabelBinarizer: fit on the parsed lists and
# transform them into a NumPy array of 0s and 1s, one column per distinct label
mlb = MultiLabelBinarizer()
encoded_data = mlb.fit_transform(df['parsed_verifications'])

# Build the indicator frame; column names are 'verified_by_<label>' with
# spaces in the label replaced by underscores
verification_columns = [f'verified_by_{label.replace(" ", "_")}' for label in mlb.classes_]
encoded_df = pd.DataFrame(encoded_data, index=df.index, columns=verification_columns)

# Append the indicator columns to the original DataFrame, then discard the
# original and the temporary column
df = pd.concat([df, encoded_df], axis=1)
df.drop(columns=['host_verifications', 'parsed_verifications'], inplace=True)

In [157]:
# Convert 'price' from text to float: remove every '$' and ',' character
# first, then cast the remaining text to a number
price_text = df['price'].str.replace('$', '', regex=False)
price_text = price_text.str.replace(',', '', regex=False)
df['price'] = price_text.astype(float)

In [159]:
# Upper cut-off for 'price': the 99.89th percentile of the observed prices
upper_threshold_percentile = df['price'].quantile(0.9989)
# Keep rows at or below the threshold AND rows with a missing price
# (NaN <= x is False, so missing prices need the explicit isna() term).
keep_mask = (df['price'] <= upper_threshold_percentile) | df['price'].isna()
# .copy() makes the filtered frame independent of the original, so adding
# columns to it later does not raise SettingWithCopyWarning.
df = df[keep_mask].copy()

In [162]:
grouping_columns = ['neighbourhood_cleansed', 'bedrooms', 'bathrooms']

# Median observed price of each row's group, aligned to df's index.
# groupby() excludes rows whose grouping key contains NaN, so those rows get
# NaN here, as do rows in groups where no price was observed at all.
group_median = df.groupby(grouping_columns)['price'].transform('median')

# Fill ONLY the missing prices with the group median. Filling from the
# original 'price' column guarantees observed prices are never overwritten,
# even for rows that groupby() excluded because of a NaN grouping key.
df['price_imputed_grouped_median'] = df['price'].fillna(group_median)

# If no group median is available for a row, the global median is used.
# fillna() is a no-op when nothing is missing, so no branching is needed.
global_median = df['price'].median()
df['price_imputed_final'] = df['price_imputed_grouped_median'].fillna(global_median)
