# Logit Model using Statsmodels

In [1]:
# Load libraries
import pandas as pd
import os
import statsmodels.api as sm

In [2]:
# Load cleaned data
listings_df = pd.read_csv(os.path.join('data', 'yvr_listing_data_cleaned.csv'))

In [3]:
# Define the independent and dependent variables
X = listings_df.drop('legal_listing', axis=1)
y = listings_df['legal_listing']

In [4]:
# Find all columns in X with a low variance
low_variance = []
for col in X:
    if X[col].var() < 0.001:
        low_variance.append(col)

print(low_variance)

['room_type_Hotel room', 'room_type_Shared room']


In [5]:
# Drop columns with low variance
X = X.drop(low_variance, axis=1)

In [6]:
# Add a constant column to the independent variables
X = sm.add_constant(X)

# Create the logit model
logit_model = sm.Logit(y, X)

# Fit the model
result = logit_model.fit(maxiter=100000)

         Current function value: 0.259176
         Iterations: 100000




In [7]:
# Print the summary of the model
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:          legal_listing   No. Observations:                 3583
Model:                          Logit   Df Residuals:                     3540
Method:                           MLE   Df Model:                           42
Date:                Fri, 01 Dec 2023   Pseudo R-squ.:                  0.5080
Time:                        00:06:55   Log-Likelihood:                -928.63
converged:                      False   LL-Null:                       -1887.6
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                      coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------------
const                                               2.9203      1.472      1.984      0.047       0.036       5.805
host_response_rate  

In [8]:
# Print all the coefficients of the model, with their names, only if their p-value is less than 0.05
for i in range(len(result.pvalues)):
    if result.pvalues[i] < 0.05:
        print(result.params.index[i], result.params[i])

const 2.920323907660298
host_response_rate -0.01618857594696151
number_of_reviews 0.018513695808793505
availability_365 -0.001282230085218165
host_response_time 1.1096244486934135
beds 0.2406699208810988
review_scores_cleanliness 0.5727439766736888
minimum_nights -0.11641738226495012
room_type_Private room -1.7703059517544348
neighbourhood_cleansed_Grandview-Woodland 0.7461602481198714
neighbourhood_cleansed_Hastings-Sunrise 1.32052767746657
neighbourhood_cleansed_Kensington-Cedar Cottage 1.294840085847496
neighbourhood_cleansed_Mount Pleasant 0.5320516193209789
neighbourhood_cleansed_Riley Park 0.8866875950552021
neighbourhood_cleansed_Strathcona 2.556516231255847
neighbourhood_cleansed_Victoria-Fraserview 1.3693085740191095
neighbourhood_cleansed_West Point Grey 1.1977155396641443


  if result.pvalues[i] < 0.05:
  print(result.params.index[i], result.params[i])
