In [1]:
# Load libraries
import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import scipy.stats
import statsmodels.api as sm

In [2]:
# Open yvr_listing_data.csv in the data folder
listings_df = pd.read_csv(os.path.join('data', 'yvr_listing_data.csv'))

# Print columns
# print(listings_df.columns)

# Exclude columns manually that are completly textual description or apparently non-related to legality(including coordinates).

excluded_columns = ['listing_url','scrape_id', 'last_scraped', 'source', 
                       'name','description', 'neighborhood_overview', 'picture_url', 
                       'host_id', 'host_url', 'host_name', 'host_since', 
                       'host_location', 'host_about', 'host_thumbnail_url', 
                       'host_picture_url', 'latitude', 'longitude', 'calendar_updated', 
                       'calendar_last_scraped', 'amenities', 'bathrooms_text',
                       'first_review','last_review']

remained_columns = [col for col in listings_df if col not in excluded_columns]
remained_columns = list(set(remained_columns))

# Delete all textual description columns 

listings_df = listings_df[remained_columns]



In [3]:
#%%capture --no-stdout
"""
Create a new column titled "legal_listing" that contains the boolean describing whether or not the listing has a valid license.
The column is True if the listing has a valid license or does not require one and False if the listing does not have a valid license.
To compute the value of the column, we use the following logic:

If the listing has a number in the "license" column with the regex pattern of r'.*?(\d{2}[-\s]?\d{6}).*?' 
OR the listing has a number in the "minimum_nights" column with a value equal to or greater than 30,
THEN the "legal_listing" is True. ELSE the "valid_license" is False.

Note:
The regex pattern r'.*?(\d{2}[-\s]?\d{6}).*?' is used to find a number with 2 digits, followed by a dash or space, 
followed by 6 digits. The number can be surrounded by any number of characters. 
TODO: Verify this is the correct pattern for the license numbers and find any other ways of verifying legitimate license numbers.
"""

###Just found there are some values like 'dd-ddd-ddd', so I changed regex pattern for better compatibility
#regex_pattern = re.compile(r'.*?(\d{2}[-\s]?\d{6}).*?')
regex_pattern = re.compile(r'.*?(\d{2}[-\s]?\d{3}[-\s]?\d{3}).*?')

# Create the valid_license column using the logic described above
listings_df['legal_listing'] = listings_df['license'].str.contains(regex_pattern) | (listings_df['minimum_nights'] >= 30)

# Create new dataframe storing values after normalization or preprocessing
listings_df_cleaned = pd.DataFrame()
listings_df_cleaned['id'] = listings_df['id']
listings_df_cleaned['legal_listing'] = listings_df['legal_listing']

# Drop the 'license' column for better processing
listings_df.drop('license',axis=1, inplace=True)


# Print only the columns we are interested in
#print(listings_df[['id', 'license', 'minimum_nights', 'legal_listing']])

# Print count of valid and invalid licenses
print(listings_df['legal_listing'].value_counts())

legal_listing
True     4604
False    2091
Name: count, dtype: int64


  listings_df['legal_listing'] = listings_df['license'].str.contains(regex_pattern) | (listings_df['minimum_nights'] >= 30)


Hey guys, just came up with a new assumption: As our main goal is to construct a regression model and predict a new-coming property's legality, should we neglect all of those 'post-occupancy evaluation'  fields just like 'review_score' or 'host_response_time'? Because all of those information was collected after some renter's experience, while what we are trying to do is to provide more useful information for renters before the renting contract?

And I have done some preliminary filtering and division among all columns.

In [4]:
#print(listings_df.columns)
listings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 51 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   host_response_rate                            5644 non-null   object 
 1   calculated_host_listings_count_shared_rooms   6695 non-null   int64  
 2   review_scores_cleanliness                     5628 non-null   float64
 3   calculated_host_listings_count_private_rooms  6695 non-null   int64  
 4   host_neighbourhood                            6346 non-null   object 
 5   number_of_reviews_l30d                        6695 non-null   int64  
 6   review_scores_communication                   5628 non-null   float64
 7   calculated_host_listings_count                6695 non-null   int64  
 8   has_availability                              6695 non-null   object 
 9   host_is_superhost                             6536 non-null   o

**Dealing with some object column**

In [4]:
object_columns = listings_df.select_dtypes(include='object')

object_columns_name = list(object_columns.columns)
object_columns_name

['instant_bookable',
 'has_availability',
 'room_type',
 'neighbourhood_cleansed',
 'host_acceptance_rate',
 'host_identity_verified',
 'host_is_superhost',
 'price',
 'property_type',
 'neighbourhood',
 'host_neighbourhood',
 'host_verifications',
 'host_response_rate',
 'host_response_time',
 'host_has_profile_pic']

In [6]:
#converting 'price' column
# Convert price to a float variable
if listings_df['price'].dtype == 'object':
    listings_df['price'] = listings_df['price'].str.replace('$', '').str.replace(',', '').astype(float)

# Convert 'host_acceptance_rate' to a float variable
if listings_df['host_acceptance_rate'].dtype == 'object':
    listings_df['host_acceptance_rate'] = listings_df['host_acceptance_rate'].str.replace('%', '').astype(float)

# Convert 'host_response_rate' to a float variable
if listings_df['host_response_rate'].dtype == 'object':
    listings_df['host_response_rate'] = listings_df['host_response_rate'].str.replace('%', '').astype(float)
    

# Convert 'host_is_superhost' to a bool variable
if listings_df['host_is_superhost'].dtype == 'object':
    listings_df['host_is_superhost'] = listings_df['host_is_superhost'].map({'t': 1, 'f': 0})

# Convert 'host_has_profile_pic' to a bool variable
if listings_df['host_has_profile_pic'].dtype == 'object':
    listings_df['host_has_profile_pic'] = listings_df['host_has_profile_pic'].map({'t': 1, 'f': 0})

# Convert 'has_availability' to a bool variable
if listings_df['has_availability'].dtype == 'object':
    listings_df['has_availability'] = listings_df['has_availability'].map({'t': 1, 'f': 0})

# Convert 'instant_bookable' to a bool variable
if listings_df['instant_bookable'].dtype == 'object':
    listings_df['instant_bookable'] = listings_df['instant_bookable'].map({'t': 1, 'f': 0})

In [7]:
# Check the object columns again
object_columns = listings_df.select_dtypes(include='object')

object_columns_name = list(object_columns.columns)
object_columns_name

['room_type',
 'neighbourhood_cleansed',
 'host_identity_verified',
 'property_type',
 'neighbourhood',
 'host_neighbourhood',
 'host_verifications',
 'host_response_time']

**One-hot to code categorical columns**

In [8]:
for colname in object_columns_name:
    # convert room_type column to 'category' dtype
    listings_df[colname] = listings_df[colname].astype('category')
    # applying one-hot coding (drop_first means eliminate one freedom degree to prevent multicollinearity)
    one_hot_encoded = pd.get_dummies(listings_df[colname], prefix=colname, drop_first=True)
    # join new columns back to DataFrame
    listings_df = listings_df.join(one_hot_encoded)

**listing_df for VIF**

In [9]:
listings_df_VIF = listings_df.select_dtypes(include=['bool','float64','int64'])
listings_df_VIF.drop(['bathrooms', 'neighbourhood_group_cleansed'], axis=1, inplace=True)

**Using VIF to filter relating variables**

In [10]:
int64_columns = listings_df_VIF.select_dtypes(include='Int64').columns
listings_df_VIF.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Columns: 211 entries, id to host_response_time_within an hour
dtypes: bool(171), float64(16), int64(24)
memory usage: 3.1 MB


In [11]:
# 假设 listings_df_VIF 是你的DataFrame
columns_Int64 = [col for col in listings_df_VIF.columns if pd.api.types.is_integer_dtype(listings_df_VIF[col]) and listings_df_VIF[col].dtype.name == 'Int64']

print(columns_Int64)


[]


In [12]:
listings_df_VIF['host_response_rate']=listings_df_VIF['host_response_rate'].astype(float)
listings_df_VIF['host_acceptance_rate']=listings_df_VIF['host_acceptance_rate'].astype(float)

In [13]:

listings_df_VIF = listings_df_VIF.astype('float64')

In [14]:
# calculating VIF
# This function is adjusted from: https://stackoverflow.com/a/51329496/4667568
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from statsmodels.tools.tools import add_constant

# Drop all rows containing NAs or infs in listings_df_VIF


listings_df_VIF.replace([np.inf, -np.inf], np.nan, inplace=True)
listings_df_VIF.dropna(inplace=True)
df_with_const = add_constant(listings_df_VIF,has_constant='add')

In [15]:
help(add_constant)

Help on function add_constant in module statsmodels.tools.tools:

add_constant(data, prepend=True, has_constant='skip')
    Add a column of ones to an array.
    
    Parameters
    ----------
    data : array_like
        A column-ordered design matrix.
    prepend : bool
        If true, the constant is in the first column.  Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant. The default will return
        data without adding another constant. If 'raise', will raise an
        error if any column has a constant value. Using 'add' will add a
        column of 1s if a constant column is present.
    
    Returns
    -------
    array_like
        The original values with a constant (column of ones) as the first or
        last column. Returned value type depends on input type.
    
    Notes
    -----
    When the input is a pandas Series or DataFrame, the added column's name
    is '

In [15]:


def drop_column_using_vif_(df, thresh=2):
    '''
    Calculates VIF each feature in a pandas dataframe, and repeatedly drop the columns with the highest VIF
    A constant must be added to variance_inflation_factor or the results will be incorrect

    :param df: the pandas dataframe containing only the predictor features, not the response variable
    :param thresh: (default 5) the threshould VIF value. If the VIF of a variable is greater than thresh, it should be removed from the dataframe
    :return: dataframe with multicollinear features removed
    '''
    while True:
        
        # adding a constatnt item to the data. add_constant is a function from statsmodels (see the import above)
        df_with_const = add_constant(df,has_constant='add')

        if 'const' in df_with_const.columns:
            vif_df = pd.Series([variance_inflation_factor(df_with_const.values, i) for i in range(df_with_const.shape[1])], name= "VIF", 
                                index=df_with_const.columns).to_frame()

            # drop the const
            vif_df = vif_df.drop('const')
        else:
            raise ValueError("constant column 'const' not successfully added")
        
        # if the largest VIF is above the thresh, remove a variable with the largest VIF
        # If there are multiple variabels with VIF>thresh, only one of them is removed. This is because we want to keep as many variables as possible
        if vif_df.VIF.max() > thresh:
            # If there are multiple variables with the maximum VIF, choose the first one
            index_to_drop = vif_df.index[vif_df.VIF == vif_df.VIF.max()].tolist()[0]
            print('Dropping: {}'.format(index_to_drop))
            df = df.drop(columns = index_to_drop)
        else:
            # No VIF is above threshold. Exit the loop
            break

    return df

***VIF***

In [16]:
listings_df_VIF_new = drop_column_using_vif_(listings_df_VIF.drop('legal_listing', axis=1))

  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


Dropping: maximum_nights_avg_ntm


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


Dropping: minimum_maximum_nights


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


Dropping: id


  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


Dropping: calculated_host_listings_count_entire_homes


  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


Dropping: room_type_Hotel room


  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


Dropping: room_type_Private room


  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


Dropping: property_type_Camper/RV


  return 1 - self.ssr/self.centered_tss


Dropping: host_verifications_['email', 'phone']


  return 1 - self.ssr/self.centered_tss


Dropping: property_type_Entire condo


  return 1 - self.ssr/self.centered_tss


Dropping: host_listings_count


  return 1 - self.ssr/self.centered_tss


Dropping: host_response_time_within an hour


  return 1 - self.ssr/self.centered_tss


Dropping: minimum_nights_avg_ntm


  return 1 - self.ssr/self.centered_tss


Dropping: calculated_host_listings_count_shared_rooms


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Downtown


  return 1 - self.ssr/self.centered_tss


Dropping: host_total_listings_count


  return 1 - self.ssr/self.centered_tss


Dropping: availability_60


  return 1 - self.ssr/self.centered_tss


Dropping: minimum_nights


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Grandview-Woodland


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Kensington-Cedar Cottage


  return 1 - self.ssr/self.centered_tss


Dropping: host_neighbourhood_Central Vancouver


  return 1 - self.ssr/self.centered_tss


Dropping: review_scores_rating


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Renfrew-Collingwood


  return 1 - self.ssr/self.centered_tss


Dropping: accommodates


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Kitsilano


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Killarney


  return 1 - self.ssr/self.centered_tss


Dropping: host_neighbourhood_Riley Park–Little Mountain


  return 1 - self.ssr/self.centered_tss


Dropping: minimum_minimum_nights


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Fairview


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Dunbar Southlands


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_West Point Grey


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Sunset


  return 1 - self.ssr/self.centered_tss


Dropping: number_of_reviews_ltm


  return 1 - self.ssr/self.centered_tss


Dropping: host_neighbourhood_Shaughnessy


  return 1 - self.ssr/self.centered_tss


Dropping: bedrooms


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Hastings-Sunrise


  return 1 - self.ssr/self.centered_tss


Dropping: reviews_per_month


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_South Cambie


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Victoria-Fraserview


  return 1 - self.ssr/self.centered_tss


Dropping: review_scores_accuracy


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Kerrisdale


  return 1 - self.ssr/self.centered_tss


Dropping: host_neighbourhood_Oakridge


  return 1 - self.ssr/self.centered_tss


Dropping: property_type_Entire home


  return 1 - self.ssr/self.centered_tss


Dropping: review_scores_value


  return 1 - self.ssr/self.centered_tss


Dropping: availability_90


  return 1 - self.ssr/self.centered_tss


Dropping: host_neighbourhood_Marpole


  return 1 - self.ssr/self.centered_tss


Dropping: review_scores_communication


  return 1 - self.ssr/self.centered_tss


Dropping: neighbourhood_cleansed_Mount Pleasant


  return 1 - self.ssr/self.centered_tss


Dropping: host_neighbourhood_North Vancouver


  return 1 - self.ssr/self.centered_tss


In [None]:
"""
# Create a new dataframe to store the VIF values
VIF_df = pd.DataFrame()

# Drop all rows containing NAs or infs in listings_df_VIF

listings_df_VIF.replace([np.inf, -np.inf], np.nan, inplace=True)
listings_df_VIF.dropna(inplace=True)

# Exclude the 'legal_listing' column from the analysis
columns_to_check = [col for col in listings_df_VIF.columns if col != 'legal_listing']

# Calculate the VIF for each column
VIF_df['Variable'] = columns_to_check
VIF_df['VIF'] = [variance_inflation_factor(listings_df_VIF[columns_to_check].values, i) for i in range(len(columns_to_check))]

# Sort the dataframe by VIF values in descending order
vif_df = VIF_df.sort_values(by='VIF', ascending=False)

# Print the dataframe
print(vif_df)
"""

In [17]:
# After VIF now we have the 'listings_df_VIF_new'

print(f"There are {listings_df_VIF_new.shape[1]} variables after VIF operation.")

# Add legal_listing back to csv
listings_df_VIF_new['legal_listing'] = listings_df_VIF['legal_listing']


# And save the new dataframe to csv.file
listings_df_VIF_new.to_csv(os.path.join('data','yvr_listing_data_cleaned.csv'),index=False)

There are 162 variables after VIF operation.
