# Data Cleaning and Processing

Loading in the InsideAirbnb data and cleaning it up for our later model.

This includes removing unnecessary columns, removing outliers, and accounting for multicollinearity.

In [1]:
# Load libraries
import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import scipy.stats
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from src import drop_column_using_vif_, show_vif_values

## Loading in the Data

Here we also removed unnecessary columns.

In [2]:
# Open yvr_listing_data.csv in the data folder
listings_df = pd.read_csv(os.path.join('data', 'yvr_listing_data.csv'))

# Exclude columns manually that are completly textual description or apparently non-related to legality(including coordinates).

excluded_columns = ['listing_url','scrape_id', 'last_scraped', 'source', 
                       'name','description', 'neighborhood_overview', 'picture_url', 
                       'host_id', 'host_url', 'host_name', 'host_since', 
                       'host_location', 'host_about', 'host_thumbnail_url', 
                       'host_picture_url', 'latitude', 'longitude', 'calendar_updated', 
                       'calendar_last_scraped', 'amenities', 'bathrooms_text',
                       'first_review','last_review']

remained_columns = [col for col in listings_df if col not in excluded_columns]
remained_columns = list(set(remained_columns))

# Delete all textual description columns 

listings_df = listings_df[remained_columns]

# Dropped completely empty columns
listings_df= listings_df.dropna(axis=1, how='all')



## Finding "Legal" Listings

Using regex, we scan through the listings licenses and determine which ones are valid.

In [3]:
%%capture --no-stdout
"""
Create a new column titled "legal_listing" that contains the boolean describing whether or not the listing has a valid license.
The column is True if the listing has a valid license or does not require one and False if the listing does not have a valid license.
To compute the value of the column, we use the following logic:

If the listing has a number in the "license" column with the regex pattern of r'.*?(\d{2}[-\s]?\d{3}[-\s]?\d{3}).*?' 
OR the listing has a number in the "minimum_nights" column with a value equal to or greater than 30,
THEN the "legal_listing" is True. ELSE the "valid_license" is False.

Note:
The regex pattern '.*?(\d{2}[-\s]?\d{3}[-\s]?\d{3}).*?' is used to find a numbers with the pattern ##-###### or ##-###-### with 
spaces/dashes/nothing in between the numbers. The number can be surrounded by any number of characters. 
TODO: Verify this is the correct pattern for the license numbers and find any other ways of verifying legitimate license numbers.
"""

###Just found there are some values like 'dd-ddd-ddd', so I changed regex pattern for better compatibility
#regex_pattern = re.compile(r'.*?(\d{2}[-\s]?\d{6}).*?')
regex_pattern = re.compile(r'.*?(\d{2}[-\s]?\d{3}[-\s]?\d{3}).*?')

# Create the valid_license column using the logic described above
listings_df['legal_listing'] = listings_df['license'].str.contains(regex_pattern) | (listings_df['minimum_nights'] >= 30)

# Create new dataframe storing values after normalization or preprocessing
listings_df_cleaned = pd.DataFrame()
listings_df_cleaned['id'] = listings_df['id']
listings_df_cleaned['legal_listing'] = listings_df['legal_listing']

# Drop the 'license' column for better processing
listings_df.drop('license',axis=1, inplace=True)

# Print count of valid and invalid licenses
print(listings_df['legal_listing'].value_counts())

legal_listing
True     4604
False    2091
Name: count, dtype: int64


## Dealing with Data Types

- Converting variables to the correct data types while also cleaning unnecessary characters.
- Accounting for categorical data with one-hot encoding.

In [4]:
#print(listings_df.columns)
listings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 49 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   review_scores_rating                          5641 non-null   float64
 1   maximum_maximum_nights                        6695 non-null   int64  
 2   price                                         6695 non-null   object 
 3   instant_bookable                              6695 non-null   object 
 4   availability_60                               6695 non-null   int64  
 5   host_is_superhost                             6536 non-null   object 
 6   minimum_nights_avg_ntm                        6695 non-null   float64
 7   number_of_reviews_ltm                         6695 non-null   int64  
 8   review_scores_location                        5627 non-null   float64
 9   host_neighbourhood                            6346 non-null   o

### Dealing with Object Columns

In [5]:
# Print names of object columns
print(listings_df.select_dtypes(include=['object']).columns)

Index(['price', 'instant_bookable', 'host_is_superhost', 'host_neighbourhood',
       'host_response_time', 'property_type', 'host_acceptance_rate',
       'host_verifications', 'neighbourhood_cleansed', 'host_response_rate',
       'host_identity_verified', 'has_availability', 'host_has_profile_pic',
       'neighbourhood', 'room_type'],
      dtype='object')


In [6]:
#converting 'price' column
# Convert price to a float variable
if listings_df['price'].dtype == 'object':
    listings_df['price'] = listings_df['price'].str.replace('$', '').str.replace(',', '').astype(float)

# Convert 'host_acceptance_rate' to a float variable
if listings_df['host_acceptance_rate'].dtype == 'object':
    listings_df['host_acceptance_rate'] = listings_df['host_acceptance_rate'].str.replace('%', '').astype(float)

# Convert 'host_response_rate' to a float variable
if listings_df['host_response_rate'].dtype == 'object':
    listings_df['host_response_rate'] = listings_df['host_response_rate'].str.replace('%', '').astype(float)

# Convert 'host_is_superhost' to a bool variable
if listings_df['host_is_superhost'].dtype == 'object':
    listings_df['host_is_superhost'] = listings_df['host_is_superhost'].map({'t': 1, 'f': 0})

# Convert 'host_has_profile_pic' to a bool variable
if listings_df['host_has_profile_pic'].dtype == 'object':
    listings_df['host_has_profile_pic'] = listings_df['host_has_profile_pic'].map({'t': 1, 'f': 0})

# Convert 'has_availability' to a bool variable
if listings_df['has_availability'].dtype == 'object':
    listings_df['has_availability'] = listings_df['has_availability'].map({'t': 1, 'f': 0})

# Convert 'instant_bookable' to a bool variable
if listings_df['instant_bookable'].dtype == 'object':
    listings_df['instant_bookable'] = listings_df['instant_bookable'].map({'t': 1, 'f': 0})

# Convert 'host_identity_verified' to a bool variable
if listings_df['host_identity_verified'].dtype == 'object':
    listings_df['host_identity_verified'] = listings_df['host_identity_verified'].map({'t': 1, 'f': 0})

In [7]:
# Check the object columns again
object_columns = listings_df.select_dtypes(include='object')

object_columns_name = list(object_columns.columns)
object_columns_name

['host_neighbourhood',
 'host_response_time',
 'property_type',
 'host_verifications',
 'neighbourhood_cleansed',
 'neighbourhood',
 'room_type']

### One-hot to code categorical columns

In [8]:
print("Dropped categories:")
for colname in object_columns_name:
    # convert room_type column to 'category' dtype
    listings_df[colname] = listings_df[colname].astype('category')

    # Since we will be dropping the first category of each column, 
    # lets print out the first category of each column so we know what we are dropping
    print(colname, ':', listings_df[colname].cat.categories[0])

    # applying one-hot coding (drop_first means eliminate one freedom degree to prevent multicollinearity)
    one_hot_encoded = pd.get_dummies(listings_df[colname], prefix=colname, drop_first=True)
    # join new columns back to DataFrame
    listings_df = listings_df.join(one_hot_encoded)

Dropped categories:
host_neighbourhood : Arbutus Ridge
host_response_time : a few days or more
property_type : Boat
host_verifications : ['email', 'phone', 'photographer', 'work_email']
neighbourhood_cleansed : Arbutus Ridge
neighbourhood : Delta, British Columbia, Canada
room_type : Entire home/apt


In [9]:
# Print types of all columns
listings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Columns: 218 entries, review_scores_rating to room_type_Shared room
dtypes: bool(170), category(7), float64(16), int64(25)
memory usage: 3.2 MB


## Preparing for VIF Analysis

In [10]:
listings_df_VIF = listings_df.select_dtypes(include=['bool','float64','int64'])
listings_df_VIF = listings_df_VIF.astype('float64')

**Using VIF to filter relating variables**

In [11]:
# calculating VIF

# Drop all rows containing NAs or infs in listings_df_VIF
listings_df_VIF.replace([np.inf, -np.inf], np.nan, inplace=True)
listings_df_VIF.dropna(inplace=True)

## VIF Filtering

In [13]:
%%capture --no-stdout

listings_df_VIF_new = drop_column_using_vif_(listings_df_VIF.drop('legal_listing', axis=1), thresh=2)

Dropping: maximum_nights_avg_ntm (VIF: 13.351057622231982)
Dropping: minimum_maximum_nights (VIF: 4.631169615734571)
Dropping: id (VIF: 2.359721974895883)
Dropping: calculated_host_listings_count_entire_homes (VIF: inf)
Dropping: property_type_Camper/RV (VIF: inf)
Dropping: property_type_Private room in boat (VIF: inf)
Dropping: property_type_Room in bed and breakfast (VIF: inf)
Dropping: host_verifications_['email', 'phone'] (VIF: 673.0588124720937)
Dropping: property_type_Entire condo (VIF: 358.96550564554565)
Dropping: host_listings_count (VIF: 165.56401036703772)
Dropping: room_type_Private room (VIF: 91.55733351260155)
Dropping: host_response_time_within an hour (VIF: 82.59420750093747)
Dropping: minimum_nights_avg_ntm (VIF: 56.91497107912634)
Dropping: calculated_host_listings_count_shared_rooms (VIF: 27.48260928998158)
Dropping: neighbourhood_cleansed_Downtown (VIF: 24.564212352305173)
Dropping: host_total_listings_count (VIF: 17.143836587551522)
Dropping: availability_60 (VIF: 

In [14]:
# After VIF now we have the 'listings_df_VIF_new'
print(f"There are {listings_df_VIF_new.shape[1]} variables after VIF operation.")

# Add legal_listing back to csv
listings_df_VIF_new['legal_listing'] = listings_df_VIF['legal_listing']


# And save the new dataframe to csv.file
listings_df_VIF_new.to_csv(os.path.join('data','yvr_listing_data_cleaned.csv'),index=False)

There are 161 variables after VIF operation.
