# Data Cleaning and Processing

Loading in the InsideAirbnb data and cleaning it up for our later model.

This includes removing unnecessary columns, removing outliers, and accounting for multicollinearity.

In [1]:
# Load libraries
import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import scipy.stats
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from src import drop_column_using_vif_, show_vif_values

## Loading in the Data

Here we also removed unnecessary columns.

In [4]:
# Open yvr_listing_data.csv in the data folder
listings_df = pd.read_csv(os.path.join('data', 'yvr_listing_data.csv'))

# Exclude columns manually that are completly textual description or apparently non-related to legality(including coordinates).
# Also exclude some redundant variables like 'neighbourhood' and 'neighbourhood_cleansed'

excluded_columns = ['listing_url','scrape_id', 'last_scraped', 'source', 
                       'name','description', 'neighborhood_overview', 'picture_url', 
                       'host_id', 'host_url', 'host_name', 'host_since', 
                       'host_location', 'host_about', 'host_thumbnail_url', 
                       'host_picture_url', 'latitude', 'longitude', 'calendar_updated', 
                       'calendar_last_scraped', 'amenities', 'bathrooms_text',
                       'first_review','last_review','neighbourhood','property_type','host_neighbourhood',
                       'maximum_minimum_nights','maximum_nights','minimum_minimum_nights',
                       'maximum_maximum_nights','minimum_maximum_nights','minimum_nights_avg_ntm','maximum_nights_avg_ntm']

remained_columns = [col for col in listings_df if col not in excluded_columns]
remained_columns = list(set(remained_columns))

# Delete all textual description columns 

listings_df = listings_df[remained_columns]

# Dropped completely empty columns
listings_df= listings_df.dropna(axis=1, how='all')

# Drop listings with 'minimum_nights > 30' based on the regulation in Vancouver
listings_df = listings_df[listings_df['minimum_nights']<=30]

In [6]:
listings_df.columns

Index(['license', 'host_response_time', 'host_total_listings_count',
       'availability_90', 'host_is_superhost', 'number_of_reviews_l30d',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_rating', 'host_listings_count', 'reviews_per_month',
       'room_type', 'availability_30', 'number_of_reviews',
       'host_identity_verified', 'host_verifications', 'id', 'beds',
       'number_of_reviews_ltm', 'instant_bookable',
       'calculated_host_listings_count_entire_homes', 'availability_60',
       'minimum_nights', 'calculated_host_listings_count_private_rooms',
       'has_availability', 'bedrooms', 'review_scores_location',
       'review_scores_communication', 'host_acceptance_rate', 'accommodates',
       'review_scores_accuracy', 'host_has_profile_pic', 'price',
       'calculated_host_listings_count_shared_rooms', 'host_response_rate',
       'calculated_host_listings_count', 'review_scores_value',
       'availability_365', 'neighbourhood_clea

## Finding "Legal" Listings

Using regex, we scan through the listings licenses and determine which ones are valid.

In [7]:
%%capture --no-stdout
"""
Create a new column titled "legal_listing" that contains the boolean describing whether or not the listing has a valid license.
The column is True if the listing has a valid license or does not require one and False if the listing does not have a valid license.
To compute the value of the column, we use the following logic:

If the listing has a number in the "license" column with the regex pattern of r'.*?(\d{2}[-\s]?\d{3}[-\s]?\d{3}).*?' 
OR the listing has a number in the "minimum_nights" column with a value equal to or greater than 30,
THEN the "legal_listing" is True. ELSE the "valid_license" is False.

Note:
The regex pattern '.*?(\d{2}[-\s]?\d{3}[-\s]?\d{3}).*?' is used to find a numbers with the pattern ##-###### or ##-###-### with 
spaces/dashes/nothing in between the numbers. The number can be surrounded by any number of characters. 
TODO: Verify this is the correct pattern for the license numbers and find any other ways of verifying legitimate license numbers.
"""

###Just found there are some values like 'dd-ddd-ddd', so I changed regex pattern for better compatibility
#regex_pattern = re.compile(r'.*?(\d{2}[-\s]?\d{6}).*?')
regex_pattern = re.compile(r'.*?(\d{2}[-\s]?\d{3}[-\s]?\d{3}).*?')

# Create the valid_license column using the logic described above
listings_df['legal_listing'] = listings_df['license'].str.contains(regex_pattern) | (listings_df['minimum_nights'] >= 30)

# Create new dataframe storing values after normalization or preprocessing
listings_df_cleaned = pd.DataFrame()
listings_df_cleaned['id'] = listings_df['id']
listings_df_cleaned['legal_listing'] = listings_df['legal_listing']

# Drop the 'license' column for better processing
listings_df.drop('license',axis=1, inplace=True)

# Print count of valid and invalid licenses
print(listings_df['legal_listing'].value_counts())

legal_listing
True     4533
False    1820
Name: count, dtype: int64


## Dealing with Data Types

- Converting variables to the correct data types while also cleaning unnecessary characters.
- Accounting for categorical data with one-hot encoding.

In [8]:
#print(listings_df.columns)
listings_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6353 entries, 0 to 6694
Data columns (total 39 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   host_response_time                            5411 non-null   object 
 1   host_total_listings_count                     6353 non-null   int64  
 2   availability_90                               6353 non-null   int64  
 3   host_is_superhost                             6200 non-null   object 
 4   number_of_reviews_l30d                        6353 non-null   int64  
 5   review_scores_cleanliness                     5419 non-null   float64
 6   review_scores_checkin                         5419 non-null   float64
 7   review_scores_rating                          5429 non-null   float64
 8   host_listings_count                           6353 non-null   int64  
 9   reviews_per_month                             5429 non-null   float6

### Dealing with Object Columns

In [9]:
# Print names of object columns
print(listings_df.select_dtypes(include=['object']).columns)

Index(['host_response_time', 'host_is_superhost', 'room_type',
       'host_identity_verified', 'host_verifications', 'instant_bookable',
       'has_availability', 'host_acceptance_rate', 'host_has_profile_pic',
       'price', 'host_response_rate', 'neighbourhood_cleansed'],
      dtype='object')


In [10]:
#converting 'price' column
# Convert price to a float variable
if listings_df['price'].dtype == 'object':
    listings_df['price'] = listings_df['price'].str.replace('$', '').str.replace(',', '').astype(float)

# Convert 'host_acceptance_rate' to a float variable
if listings_df['host_acceptance_rate'].dtype == 'object':
    listings_df['host_acceptance_rate'] = listings_df['host_acceptance_rate'].str.replace('%', '').astype(float)

# Convert 'host_response_time' to a float variable
# The reason is a bit far-fetched for range(0,0.25,0.5,0.75,1), just make it easier for regression model operating. 
# Moreover it does make sense, to some extent
if listings_df['host_response_time'].dtype == 'object':
    listings_df['host_response_time'] = listings_df['host_response_time'].map({
        'within an hour': 1, 'within a few hours': 0.75, 'within a day': 0.5, 'a few days or more': 0.25}).fillna(0)

# Convert 'host_response_rate' to a float variable
if listings_df['host_response_rate'].dtype == 'object':
    listings_df['host_response_rate'] = listings_df['host_response_rate'].str.replace('%', '').astype(float)

# Convert 'host_verifications' to a float variable
if listings_df['host_verifications'].dtype == 'object':
    listings_df['host_verifications'] = listings_df['host_verifications'].map({
        "['email', 'phone', 'photographer', 'work_email']": 1, "['email', 'phone', 'work_email']": 0.75, 
        "['email', 'phone']": 0.5, "['phone', 'work_email']":0.5, 
        "['phone']": 0.25, "['email']": 0.25}).fillna(0)


# Convert 'host_is_superhost' to a bool variable
if listings_df['host_is_superhost'].dtype == 'object':
    listings_df['host_is_superhost'] = listings_df['host_is_superhost'].map({'t': 1, 'f': 0})

# Convert 'host_has_profile_pic' to a bool variable
if listings_df['host_has_profile_pic'].dtype == 'object':
    listings_df['host_has_profile_pic'] = listings_df['host_has_profile_pic'].map({'t': 1, 'f': 0})

# Convert 'has_availability' to a bool variable
if listings_df['has_availability'].dtype == 'object':
    listings_df['has_availability'] = listings_df['has_availability'].map({'t': 1, 'f': 0})

# Convert 'instant_bookable' to a bool variable
if listings_df['instant_bookable'].dtype == 'object':
    listings_df['instant_bookable'] = listings_df['instant_bookable'].map({'t': 1, 'f': 0})

# Convert 'host_identity_verified' to a bool variable
if listings_df['host_identity_verified'].dtype == 'object':
    listings_df['host_identity_verified'] = listings_df['host_identity_verified'].map({'t': 1, 'f': 0})

In [11]:
# Check the object columns again
object_columns = listings_df.select_dtypes(include='object')

object_columns_name = list(object_columns.columns)
object_columns_name


['room_type', 'neighbourhood_cleansed']

In [12]:
listings_df['host_verifications'].unique()

array([0.5 , 0.25, 0.75, 1.  ])

### One-hot to code categorical columns

In [13]:
print("Dropped categories:")
for colname in object_columns_name:
    # convert room_type column to 'category' dtype
    listings_df[colname] = listings_df[colname].astype('category')

    # Since we will be dropping the first category of each column, 
    # lets print out the first category of each column so we know what we are dropping
    print(colname, ':', listings_df[colname].cat.categories[0])

    # applying one-hot coding (drop_first means eliminate one freedom degree to prevent multicollinearity)
    one_hot_encoded = pd.get_dummies(listings_df[colname], prefix=colname, drop_first=True)
    # join new columns back to DataFrame
    listings_df = listings_df.join(one_hot_encoded)

Dropped categories:
room_type : Entire home/apt
neighbourhood_cleansed : Arbutus Ridge


In [14]:
# Print types of all columns
listings_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6353 entries, 0 to 6694
Data columns (total 64 columns):
 #   Column                                           Non-Null Count  Dtype   
---  ------                                           --------------  -----   
 0   host_response_time                               6353 non-null   float64 
 1   host_total_listings_count                        6353 non-null   int64   
 2   availability_90                                  6353 non-null   int64   
 3   host_is_superhost                                6200 non-null   float64 
 4   number_of_reviews_l30d                           6353 non-null   int64   
 5   review_scores_cleanliness                        5419 non-null   float64 
 6   review_scores_checkin                            5419 non-null   float64 
 7   review_scores_rating                             5429 non-null   float64 
 8   host_listings_count                              6353 non-null   int64   
 9   reviews_per_month       

## Preparing for VIF Analysis

In [15]:
listings_df_VIF = listings_df.select_dtypes(include=['bool','float64','int64'])
listings_df_VIF = listings_df_VIF.astype('float64')

**Using VIF to filter relating variables**

In [16]:
# calculating VIF

# Drop all rows containing NAs or infs in listings_df_VIF
listings_df_VIF.replace([np.inf, -np.inf], np.nan, inplace=True)
listings_df_VIF.dropna(inplace=True)

## VIF Filtering

In [19]:
#%%capture --no-stdout

listings_df_VIF_new = drop_column_using_vif_(listings_df_VIF.drop('legal_listing', axis=1), thresh=2)

  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


In [18]:
# After VIF now we have the 'listings_df_VIF_new'
print(f"There are {listings_df_VIF_new.shape[1]} variables after VIF operation.")

# Add legal_listing back to csv
listings_df_VIF_new['legal_listing'] = listings_df_VIF['legal_listing']


# And save the new dataframe to csv.file
listings_df_VIF_new.to_csv(os.path.join('data','yvr_listing_data_cleaned.csv'),index=False)

There are 61 variables after VIF operation.
