## Imports

In [123]:
import numpy as np 
import pandas as pd 
import re
import os

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression 
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import CountVectorizer

### Effects of Policy on the Housing Market
[WiDS Winter 2024 Case]

## Loading Data & Overview

In [124]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Display the first few rows of each dataset
train_data.head()
test_data.head()

# For transformation use to prevent overwriting the original dataset
train_data_transformed = train_data
test_data_transformed = test_data

In [125]:
train_data_transformed['name'][0: 5]

0    Home in Vancouver · ★4.75 · 1 bedroom · 1 bed ...
1    Guest suite in Vancouver · ★New · 2 bedrooms ·...
2    Guest suite in Vancouver · ★4.85 · 2 bedrooms ...
3    Home in Vancouver · ★5.0 · 1 bedroom · 1 bed ·...
4    Guest suite in Vancouver · ★4.93 · 1 bedroom ·...
Name: name, dtype: object

As seen from the print above, column `Name` includes information such as  Home/Rental/Guest Suite,  Review, # Bedroom, # bed, and # bath, we will now preprocess that information and create new columns for each information



In [126]:
# Separating information into columns
train_data_transformed['type'] = train_data_transformed['name'].str.extract(r'^(Guest suite|Home|Rental)')
train_data_transformed['location'] = train_data_transformed['name'].str.extract(r'in ([^·]+)')
train_data_transformed['review'] = train_data_transformed['name'].str.extract(r'★([\d.]+)')
train_data_transformed['bedrooms'] = train_data_transformed['name'].str.extract(r'(\d+) bedrooms?')
train_data_transformed['beds'] = train_data_transformed['name'].str.extract(r'(\d+) beds?')
train_data_transformed['baths'] = train_data_transformed['name'].str.extract(r'(\d+) baths?')

# Converting numerical columns
train_data_transformed['review'] = train_data_transformed['review'].astype(float)
train_data_transformed['bedrooms'] = pd.to_numeric(train_data_transformed['bedrooms'])
train_data_transformed['beds'] = pd.to_numeric(train_data_transformed['beds'])
train_data_transformed['baths'] = pd.to_numeric(train_data_transformed['baths'])

train_data_transformed.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,name,neighborhood_overview,host_id,host_name,host_response_time,host_response_rate,host_acceptance_rate,...,review_scores_value,instant_bookable,calculated_host_listings_count,reviews_per_month,monthly_revenue,type,location,review,bedrooms,baths
0,0,879,19792418,Home in Vancouver · ★4.75 · 1 bedroom · 1 bed ...,Everything you need is nearby. <br /><br />Hig...,57488206,Jessi,,,,...,4.81,f,3,0.77,2108,Home,Vancouver,4.75,1.0,1.0
1,1,6416,1015650685503221866,Guest suite in Vancouver · ★New · 2 bedrooms ·...,,139792573,Daniel,within a few hours,100%,100%,...,,f,1,,2730,Guest suite,Vancouver,,2.0,1.0
2,2,1820,35265562,Guest suite in Vancouver · ★4.85 · 2 bedrooms ...,Beautiful neighbourhood close to prosperous Ma...,265504225,Alex,within an hour,100%,98%,...,4.75,f,1,3.22,2254,Guest suite,Vancouver,4.85,2.0,1.0
3,3,5346,911948980885194155,Home in Vancouver · ★5.0 · 1 bedroom · 1 bed ·...,We are located in a quiet residential neighbor...,22595056,Raymond,,,92%,...,5.0,f,1,1.28,3187,Home,Vancouver,5.0,1.0,1.0
4,4,2484,46069251,Guest suite in Vancouver · ★4.93 · 1 bedroom ·...,Kitsilano at it's best! Short walk to all the ...,65683877,Yendi,within an hour,100%,95%,...,4.85,f,1,2.01,3479,Guest suite,Vancouver,4.93,1.0,1.0


In [127]:
# Separating information into columns
test_data_transformed['type'] = test_data_transformed['name'].str.extract(r'^(Guest suite|Home|Rental)')
test_data_transformed['location'] = test_data_transformed['name'].str.extract(r'in ([^·]+)')
test_data_transformed['review'] = test_data_transformed['name'].str.extract(r'★([\d.]+)')
test_data_transformed['bedrooms'] = test_data_transformed['name'].str.extract(r'(\d+) bedrooms?')
test_data_transformed['beds'] = test_data_transformed['name'].str.extract(r'(\d+) beds?')
test_data_transformed['baths'] = test_data_transformed['name'].str.extract(r'(\d+) baths?')

# Converting numerical columns
test_data_transformed['review'] = test_data_transformed['review'].astype(float)
test_data_transformed['bedrooms'] = pd.to_numeric(test_data_transformed['bedrooms'])
test_data_transformed['beds'] = pd.to_numeric(test_data_transformed['beds'])
test_data_transformed['baths'] = pd.to_numeric(test_data_transformed['baths'])

test_data.head()

Unnamed: 0.1,Unnamed: 0,id,name,neighborhood_overview,host_id,host_name,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,...,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,reviews_per_month,type,location,review,bedrooms,baths
0,0,4465400,Home in Vancouver · ★4.98 · 4 bedrooms · 4 bed...,We are less than a block to Kits pool - the la...,23168796,Oliver & Ashleigh,within a few hours,100%,67%,f,...,4.98,4.88,f,1,0.38,Home,Vancouver,4.98,4.0,5.0
1,1,608629349064811166,Rental unit in Vancouver · 1 bedroom · 1 bed ·...,One of the most eclectic areas in the Lower Ma...,16926150,Jesse,,,,f,...,,,f,1,,Rental,Vancouver,,1.0,1.0
2,2,981894508939312892,Rental unit in Vancouver · ★5.0 · 1 bedroom · ...,Le Soleil Hotel and Suites is situated in the ...,536871978,Ivan,within an hour,100%,95%,f,...,5.0,4.88,f,4,2.96,Rental,Vancouver,5.0,1.0,1.0
3,3,52014961,Guest suite in Vancouver · ★4.79 · 2 bedrooms ...,"Very safe and quite area,also very convenient ...",421206568,Xuerong,within an hour,100%,98%,t,...,4.77,4.69,f,4,2.53,Guest suite,Vancouver,4.79,2.0,2.0
4,4,836285112356489303,Rental unit in Vancouver · ★5.0 · 1 bedroom · ...,Mount Pleasant is a largely residential area w...,227662329,Jordan,within an hour,100%,100%,f,...,5.0,5.0,t,134,0.47,Rental,Vancouver,5.0,1.0,1.0


In [128]:
print(train_data_transformed.loc[train_data_transformed['location'] != 'Vancouver', 'location'].unique())
print(test_data_transformed.loc[test_data_transformed['location'] != 'Vancouver', 'location'].unique())

['Vancouver ' 'Delta ' 'Burnaby ' 'in Vancouver ' 'Vancouver bc '
 'Vancouver  ' 'vancouver ' 'West Vancouver ']
['Vancouver ' 'Vancouver  ']


Next, we will be merging all the similar unique values of locations together:

In [129]:
train_data_transformed['location'] = (
    train_data['location']
    .str.strip()  
    .str.lower()  
    .str.replace(r'\b(vancouver|in vancouver)\b', 'vancouver', regex=True)
    .str.replace('vancouver bc', 'vancouver')
)
print(train_data_transformed['location'].unique())


test_data_transformed['location'] = (
    test_data_transformed['location']
    .str.strip()  
    .str.lower()  
    .str.replace('Vancouver  ', 'vancouver')
)
print(test_data_transformed['location'].unique())

['vancouver' 'delta' 'burnaby' 'west vancouver']
['vancouver']


Column `Neighbourhood Overview` is correlated with the review, where a higher review will generally have a good review about the location and stay. As they will have high correlation, we will omit that column as we can use review.

Furthermore, we see that price is in a format $ value, and we want to adjust it to a numerical column

In [131]:
def clean_price(price):
    price_str = str(price)
    return float(price_str.replace('$', '').replace(',', ''))

train_data_transformed['price'] = train_data_transformed['price'].apply(clean_price)
test_data_transformed['price'] = test_data_transformed['price'].apply(clean_price)

In [132]:
train_data_transformed['price'].head()

0      NaN
1    132.0
2    119.0
3    263.0
4    112.0
Name: price, dtype: float64

### Data Preprocessing and Transformation

In [133]:

numeric_features = ["host_listings_count", "host_total_listings_count", "latitude", "longitude", "accommodates", 
                    "minimum_nights", "maximum_nights", "minimum_nights_avg_ntm", "maximum_nights_avg_ntm", 
                    "availability_30", "availability_60", "availability_90", "availability_365", 
                    "number_of_reviews", "number_of_reviews_ltm", "price", "review_scores_rating", 
                    "review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin", 
                    "review_scores_communication", "review_scores_location", "review_scores_value", 
                    "calculated_host_listings_count", "reviews_per_month", "review", "bedrooms", "baths"]

percentage_features = ["host_response_rate", "host_acceptance_rate"]

categorical_features = ["host_response_time", "neighbourhood_cleansed", "property_type", "room_type", "type", "location", "host_is_superhost", "instant_bookable"]

drop_features = ['Unnamed: 0.1', 'Unnamed: 0', 'id', 'name', 'neighborhood_overview', 'host_id', 'host_id', 'host_name', 'amenities']

# Transformation

def percentage_to_float(value):
    if isinstance(value, str) and '%' in value:
        return float(value.replace('%', ''))  
    return value 


numeric_transformer = make_pipeline(
    SimpleImputer(strategy="mean"),  
    StandardScaler()
)

percentage_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),  
    FunctionTransformer(lambda x: pd.DataFrame(x).applymap(percentage_to_float), validate=False),  
    StandardScaler() 
)

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"), 
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)  
)


# Preprocessing
preprocessor = make_column_transformer(
    ("drop", drop_features),  
    (numeric_transformer, numeric_features), 
    (percentage_transformer, percentage_features),  
    (categorical_transformer, categorical_features) 
)

preprocessor


In [134]:
preprocessor.fit(train_data_transformed)

  FunctionTransformer(lambda x: pd.DataFrame(x).applymap(percentage_to_float), validate=False),
