# Logit Model using Statsmodels

In [1]:
# Load libraries
import pandas as pd
import os
import statsmodels.api as sm

In [2]:
# Read the cleaned listings CSV from the local data directory
cleaned_csv_path = os.path.join('data', 'yvr_listing_data_cleaned.csv')
listings_df = pd.read_csv(cleaned_csv_path)

In [3]:
# Split the listings into the dependent variable (y) and the independent variables (X)
target_column = 'legal_listing'
y = listings_df[target_column]
X = listings_df.drop(columns=[target_column])

In [4]:
# Collect the name of every column in X whose variance is below 0.001
low_variance = [column_name for column_name in X if X[column_name].var() < 0.001]

print(low_variance)

['room_type_Hotel room', 'room_type_Shared room']


In [5]:
# Remove the columns listed in low_variance from X
X = X.drop(columns=low_variance)

In [6]:
# Prepend a constant column to the independent variables
X = sm.add_constant(X, prepend=True)

# Build the logit model of y on X, then fit it with the optimizer
# capped at 10000 iterations
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit(maxiter=10000)

Optimization terminated successfully.
         Current function value: 0.297474
         Iterations 8


In [7]:
# Build the fitted model's summary table and print it
model_summary = result.summary()
print(model_summary)

                           Logit Regression Results                           
Dep. Variable:          legal_listing   No. Observations:                 3683
Model:                          Logit   Df Residuals:                     3636
Method:                           MLE   Df Model:                           46
Date:                Wed, 29 Nov 2023   Pseudo R-squ.:                  0.4530
Time:                        22:52:40   Log-Likelihood:                -1095.6
converged:                       True   LL-Null:                       -2002.8
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                      coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------------
const                                               2.1713      1.470      1.477      0.140      -0.709       5.052
number_of_reviews   

In [8]:
# Print the name and coefficient of every model term whose p-value is below 0.05.
#
# result.pvalues and result.params are indexed by term name, so an integer key
# such as result.pvalues[i] is not a real label; pandas only resolves it through
# a deprecated positional fallback that emits a FutureWarning. Iterate over the
# (name, p-value) pairs and look each coefficient up by its label instead.
for term_name, p_value in result.pvalues.items():
    if p_value < 0.05:
        print(term_name, result.params.loc[term_name])

number_of_reviews 0.02243913688596531
price 0.0028103874880441785
host_response_time 1.0946300585590332
availability_365 -0.0015648741859619697
review_scores_checkin -0.553952469367292
maximum_minimum_nights -0.04952969820529914
host_response_rate -0.015599111980460316
host_verifications -1.2749948329499947
number_of_reviews_l30d 0.2962920360730335
beds 0.1783313725042358
neighbourhood_cleansed_Downtown Eastside 0.6609256951710886
neighbourhood_cleansed_Dunbar Southlands 0.8516290838567399
neighbourhood_cleansed_Grandview-Woodland 1.0174713389283472
neighbourhood_cleansed_Hastings-Sunrise 1.5926354682671144
neighbourhood_cleansed_Kensington-Cedar Cottage 1.3479567662698566
neighbourhood_cleansed_Mount Pleasant 0.5407037695409844
neighbourhood_cleansed_Renfrew-Collingwood 0.8018427111966568
neighbourhood_cleansed_Riley Park 0.9261618695070281
neighbourhood_cleansed_Shaughnessy 1.362007134801811
neighbourhood_cleansed_Strathcona 3.670136830786422
neighbourhood_cleansed_Sunset 1.016556867

  if result.pvalues[i] < 0.05:
  print(result.params.index[i], result.params[i])
