In [30]:
# Load libraries
import math
import os

import pandas as pd
import statsmodels.api as sm

In [31]:
# Read the cleaned listings CSV from the relative `data` directory
cleaned_csv_path = os.path.join('data', 'yvr_listing_data_cleaned.csv')
listings_df = pd.read_csv(cleaned_csv_path)

In [32]:
"""
Using the logistic regression Wald test, see if there is a connection between review_scores_rating (independent variable)
and legal_listing (dependent variable). The null hypothesis is that there is no connection between the two variables.
review_scores_rating is a continuous variable from 0 to 5, while legal_listing is a categorical boolean variable.
"""
# Keep only the listings that have a review score; the filtered frame replaces
# listings_df, so later cells work on the rated listings only.
has_rating = listings_df['review_scores_rating'].notna()
listings_df = listings_df.loc[has_rating]

# Outcome (y) and the single predictor (X); an intercept column is added for the fit
y = listings_df['legal_listing']
X = listings_df['review_scores_rating']
X_with_const = sm.add_constant(X)

# Fit the logistic regression and show the coefficient table
model = sm.Logit(y, X_with_const)
result = model.fit()
print(result.summary())

# Wald test of the single restriction "coefficient on review_scores_rating is zero"
wald_test = result.wald_test("review_scores_rating = 0", scalar=True)

# Report statistic, p-value and degrees of freedom, one per line
print("\n       Wald Test Results")
for label, value in (
    ("Test Statistic:", wald_test.statistic),
    ("P-value:", wald_test.pvalue),
    ("Degrees of Freedom:", wald_test.df_denom),
):
    print(label, value)


Optimization terminated successfully.
         Current function value: 0.563565
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:          legal_listing   No. Observations:                 5641
Model:                          Logit   Df Residuals:                     5639
Method:                           MLE   Df Model:                            1
Date:                Thu, 23 Nov 2023   Pseudo R-squ.:                0.003890
Time:                        09:14:05   Log-Likelihood:                -3179.1
converged:                       True   LL-Null:                       -3191.5
Covariance Type:            nonrobust   LLR p-value:                 6.257e-07
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -0.4340      0.302     -1.437      0.151      -1.026       0.158

In [33]:
"""
Create a multiple logistic regression model to predict legal_listing using the following independent variables:
review_scores_rating, price, instant_bookable
"""
# Encode instant_bookable ('t'/'f') as 1/0.
# Guarded so that re-running this cell does not remap the already-numeric
# column (1/0 are not keys of the mapping and would all become NaN).
if not pd.api.types.is_numeric_dtype(listings_df['instant_bookable']):
    listings_df['instant_bookable'] = listings_df['instant_bookable'].map({'t': 1, 'f': 0})

# Strip '$' and ',' from the price strings and cast to float.
# regex=False makes '$' a literal character (as a regex it is the end-of-string
# anchor) regardless of the pandas version's default; the guard keeps a re-run
# from calling .str on a column that is already float.
if not pd.api.types.is_numeric_dtype(listings_df['price']):
    listings_df['price'] = (
        listings_df['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

# Prepare the data: three predictors plus an intercept column
X = listings_df[['review_scores_rating', 'price', 'instant_bookable']]
y = listings_df['legal_listing']
X = sm.add_constant(X)

# Fit the logistic regression model
model = sm.Logit(y, X)
result = model.fit()

# Print the summary of the model
print(result.summary())

# Example listing to score:
# review_scores_rating = 4.5, price = $256, instant_bookable = False
# ('const' is the intercept term and is always 1.)
example = {'const': 1, 'review_scores_rating': 4.5, 'price': 256, 'instant_bookable': 0}

# Coefficients are looked up by label; integer keys on this label-indexed
# Series are deprecated as positional lookups.
coef = result.params

# Build the row in the model's own column order so values and coefficients line up
example_row = [example[name] for name in coef.index]
print("\n       Prediction Results")
print("Probability of being legal:", result.predict(example_row)[0])

# Show formula for the model
print("\n       Model Formula")
print("logit(p) = ", coef['const'], "+", coef['review_scores_rating'], "* review_scores_rating +", coef['price'], "* price +", coef['instant_bookable'], "* instant_bookable")

# Apply the formula by hand: sigmoid of the linear predictor.
# math.exp is used instead of a truncated constant for e so that this value
# agrees with result.predict above.
linear_predictor = sum(coef[name] * example[name] for name in coef.index)
print("\n       Prediction Results")
print("Probability of being legal:", 1 / (1 + math.exp(-linear_predictor)))

Optimization terminated successfully.
         Current function value: 0.538722
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:          legal_listing   No. Observations:                 5641
Model:                          Logit   Df Residuals:                     5637
Method:                           MLE   Df Model:                            3
Date:                Thu, 23 Nov 2023   Pseudo R-squ.:                 0.04780
Time:                        09:14:05   Log-Likelihood:                -3038.9
converged:                       True   LL-Null:                       -3191.5
Covariance Type:            nonrobust   LLR p-value:                 7.792e-66
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -1.0386      0.315     -3.293      0.001      -1.657      -0.420

  print("logit(p) = ", result.params[0], "+", result.params[1], "* review_scores_rating +", result.params[2], "* price +", result.params[3], "* instant_bookable")
  print("Probability of being legal:", 1 / (1 + 2.71828 ** -(result.params[0] + result.params[1] * 4.5 + result.params[2] * 256 + result.params[3] * 0)))
