In [17]:
# Load libraries
import pandas as pd
import os
import statsmodels.api as sm
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [14]:
# Load cleaned data
listings_df = pd.read_csv(os.path.join('data', 'yvr_listing_data_cleaned.csv'))

In [15]:
# Prepare the data
X = listings_df.drop('legal_listing', axis=1)
y = listings_df['legal_listing']

# Add constant to the independent variables
X = sm.add_constant(X)

In [18]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the independent variables
X_scaled = scaler.fit_transform(X)

# Convert the scaled variables back to a DataFrame
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [21]:
# Check for zero variance variables
zero_variance_variables = X_scaled.var()[X_scaled.var() == 0].index

# Print the zero variance variables
print(zero_variance_variables)

Index(['const', 'host_neighbourhood_Bergen-Lafayette',
       'host_neighbourhood_Burke Mountain', 'host_neighbourhood_Kauaʻi',
       'host_neighbourhood_Ladner', 'host_neighbourhood_Landsdale',
       'host_neighbourhood_North Philadelphia', 'host_neighbourhood_Richmond',
       'host_neighbourhood_South Lake Tahoe',
       'host_neighbourhood_Terwillegar Towne', 'host_neighbourhood_UBC',
       'host_neighbourhood_Victoria West', 'host_neighbourhood_Vila Mariana',
       'host_neighbourhood_Westmount', 'host_neighbourhood_Willoughby',
       'property_type_Casa particular', 'property_type_Cave',
       'property_type_Entire cabin', 'property_type_Entire timeshare',
       'property_type_Private room in bed and breakfast',
       'property_type_Private room in camper/rv',
       'property_type_Private room in casa particular',
       'property_type_Private room in castle',
       'property_type_Private room in kezhan',
       'property_type_Private room in tiny home',
       'propert

In [22]:
# Drop the zero variance variables
X_scaled = X_scaled.drop(zero_variance_variables, axis=1)

In [23]:
# Fit the model using a regularized maximum likelihood
model = sm.Logit(y, X_scaled)
result = model.fit_regularized(method='l1')

# Print the summary of the model
print(result.summary())

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.2758580828317034
            Iterations: 214
            Function evaluations: 215
            Gradient evaluations: 214
                           Logit Regression Results                           
Dep. Variable:          legal_listing   No. Observations:                 3683
Model:                          Logit   Df Residuals:                     3556
Method:                           MLE   Df Model:                          126
Date:                Wed, 29 Nov 2023   Pseudo R-squ.:                  0.4927
Time:                        17:29:35   Log-Likelihood:                -1016.0
converged:                       True   LL-Null:                       -2002.8
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------

In [29]:
# Get all coefficients of the model with a p-value less than 0.05
significant_variables = result.params[result.pvalues < 0.05]
print(significant_variables)

# Print count of significant variables
print(len(significant_variables))

price                                      0.693939
beds                                       0.227224
maximum_minimum_nights                    -1.251198
review_scores_checkin                     -0.142940
review_scores_cleanliness                  0.137664
number_of_reviews_l30d                     0.726075
number_of_reviews                          1.725343
availability_365                          -0.149028
host_neighbourhood_Gastown                 0.265616
host_neighbourhood_Hastings-Sunrise        0.238906
host_neighbourhood_Knight                  0.253932
host_neighbourhood_Metrotown               0.148533
host_response_time_within a few hours     -0.145521
property_type_Entire rental unit          -0.147761
property_type_Entire serviced apartment   -0.150965
property_type_Room in boutique hotel      -0.476688
neighbourhood_cleansed_Strathcona          0.282365
neighbourhood_cleansed_West End           -0.127397
dtype: float64
18


In [28]:
# Create a formula for the model using the coefficients
# Get the coefficients from the logistic regression model
coefficients = result.params

# Create the formula for the model
formula = '1 / (1 + exp(-(' + ' + '.join(['{} * {}'.format(coefficient, variable) for variable, coefficient in coefficients.items()]) + ')))'

# Print the formula
print(formula)


1 / (1 + exp(-(0.020590619354067703 * maximum_maximum_nights + 0.6939386628662507 * price + -0.05407280970695617 * instant_bookable + 0.030354628622901905 * host_is_superhost + 0.010719677412173651 * review_scores_location + 0.2272235690774166 * beds + -0.02029936215752927 * host_acceptance_rate + -1.2511982832329893 * maximum_minimum_nights + -0.14293966892896656 * review_scores_checkin + 0.13766379285452593 * review_scores_cleanliness + 0.7260745916151178 * number_of_reviews_l30d + -0.055204861516105146 * host_response_rate + 0.0407447102138601 * availability_30 + -0.12012994358564606 * host_identity_verified + -0.004634831327077161 * calculated_host_listings_count + -0.10892682200990413 * maximum_nights + 1.7253426613888685 * number_of_reviews + -0.14902779327720572 * availability_365 + 0.12424699017173921 * calculated_host_listings_count_private_rooms + 0.003397021797924817 * has_availability + -0.05031298327296703 * host_has_profile_pic + 0.23065635807136597 * host_neighbourhood_B