In [1]:
# Load libraries
import pandas as pd
import os
import statsmodels.api as sm
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [2]:
# Load cleaned data
listings_df = pd.read_csv(os.path.join('data', 'yvr_listing_data_cleaned.csv'))

In [3]:
# Prepare the data
X = listings_df.drop('legal_listing', axis=1)
y = listings_df['legal_listing']

# Add constant to the independent variables
X = sm.add_constant(X)

In [4]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the independent variables
X_scaled = scaler.fit_transform(X)

# Convert the scaled variables back to a DataFrame
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [5]:
# Check for zero variance variables
zero_variance_variables = X_scaled.var()[X_scaled.var() == 0].index

# Print the zero variance variables
print(zero_variance_variables)

Index(['const'], dtype='object')


In [6]:
# Drop the zero variance variables
X_scaled = X_scaled.drop(zero_variance_variables, axis=1)

In [7]:
# Fit the model using a regularized maximum likelihood
model = sm.Logit(y, X_scaled)
result = model.fit_regularized(method='l1')

# Print the summary of the model
print(result.summary())

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.4942156081839962
            Iterations: 42
            Function evaluations: 42
            Gradient evaluations: 42
                           Logit Regression Results                           
Dep. Variable:          legal_listing   No. Observations:                 3583
Model:                          Logit   Df Residuals:                     3541
Method:                           MLE   Df Model:                           41
Date:                Thu, 30 Nov 2023   Pseudo R-squ.:                 0.06188
Time:                        21:53:02   Log-Likelihood:                -1770.8
converged:                       True   LL-Null:                       -1887.6
Covariance Type:            nonrobust   LLR p-value:                 8.558e-29
                                                      coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------

In [8]:
# Get all coefficients of the model with a p-value less than 0.05
significant_variables = result.params[result.pvalues < 0.05]
print(significant_variables)

# Print count of significant variables
print(len(significant_variables))

review_scores_checkin                             -0.148158
beds                                               0.098452
minimum_nights                                    -1.269931
number_of_reviews                                  0.237274
price                                              0.106414
review_scores_cleanliness                          0.188315
availability_365                                  -0.125399
host_response_rate                                -0.104558
host_total_listings_count                         -0.116524
neighbourhood_cleansed_Grandview-Woodland          0.091393
neighbourhood_cleansed_Hastings-Sunrise            0.126015
neighbourhood_cleansed_Kensington-Cedar Cottage    0.143774
neighbourhood_cleansed_Riley Park                  0.131452
neighbourhood_cleansed_Victoria-Fraserview         0.090790
neighbourhood_cleansed_West End                   -0.116607
room_type_Private room                            -0.109804
dtype: float64
16


In [9]:
# Create a formula for the model using the coefficients
# Get the coefficients from the logistic regression model
coefficients = result.params

# Create the formula for the model
formula = '1 / (1 + exp(-(' + ' + '.join(['{} * {}'.format(coefficient, variable) for variable, coefficient in coefficients.items()]) + ')))'

# Print the formula
print(formula)


1 / (1 + exp(-(-0.14815793922316609 * review_scores_checkin + -0.06678649895772525 * host_verifications + 0.09845201587855172 * beds + -0.014375358476600783 * host_has_profile_pic + -1.2699307076487896 * minimum_nights + 0.23727433591711067 * number_of_reviews + 0.10641352148249862 * price + 0.07333384386954081 * host_response_time + 0.0631479059567123 * host_is_superhost + -0.04066663086890847 * host_identity_verified + 0.10797884474111752 * host_acceptance_rate + 0.0034039267868072273 * availability_30 + 0.18831455175351275 * review_scores_cleanliness + -0.12539915960590534 * availability_365 + -0.10455807701882973 * host_response_rate + -0.020162273170747286 * calculated_host_listings_count_private_rooms + -0.017285878377827007 * instant_bookable + -0.11652387833282933 * host_total_listings_count + -0.06424029449010352 * has_availability + -0.036514206566805006 * number_of_reviews_l30d + 0.05583014202989972 * neighbourhood_cleansed_Downtown Eastside + 0.06593502107198633 * neighbour