In [147]:
#imports
import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as stats

Build a regression model.

In [16]:
#load the businesses/bike stations dataframe that the models below are fitted on
businesses = pd.read_json('business.json')

#add a column total_bikes = empty_bike_slots + available_bikes (both columns describe the business's nearest station)
businesses['total_bikes'] = businesses['empty_bike_slots']+businesses['available_bikes']

#review the characteristics of the dataframe
#info() writes its report straight to stdout and returns None, so it is called bare;
#wrapping it in print() appends a stray "None" line to the output
businesses.info()
print(businesses.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 541 entries, 0 to 540
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   nearest_station   541 non-null    int64  
 1   empty_bike_slots  541 non-null    int64  
 2   available_bikes   541 non-null    int64  
 3   business_name     541 non-null    object 
 4   rating            541 non-null    float64
 5   review_count      541 non-null    int64  
 6   dist_to_station   541 non-null    float64
 7   price             267 non-null    object 
 8   numeric_price     267 non-null    float64
 9   business_type     541 non-null    object 
 10  total_bikes       541 non-null    int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 50.7+ KB
None
       nearest_station  empty_bike_slots  available_bikes      rating  \
count       541.000000        541.000000       541.000000  541.000000   
mean         17.770795         15.609982         1.181146    3.927911   
st

In [17]:

#numeric_price column contains many null values that need to be handled. Since there are no outliers in this column, null values are replaced with the column average
#the fill is restricted to numeric_price: a DataFrame-wide fillna would write the numeric mean into every null cell of every column (including the symbolic 'price' column)
businesses['numeric_price'] = businesses['numeric_price'].fillna(businesses['numeric_price'].mean())

#column 'price' is symbolic and also contains many null values, however, replacing them with the "mean" or the corresponding number of "$" symbols would introduce significant systematic bias, thus it is better to simply drop the column
#errors='ignore' keeps this cell safe to re-run after 'price' has already been removed
businesses = businesses.drop(columns=['price'], errors='ignore')

businesses.head()

Unnamed: 0,nearest_station,empty_bike_slots,available_bikes,business_name,rating,review_count,dist_to_station,numeric_price,business_type,total_bikes
0,1,11,1,Popeyes Louisiana Kitchen,1.5,54,237.035506,1.0,Fried Chicken Joint,12
1,1,11,1,Captain D's,4.0,8,249.010884,1.0,Restaurant,12
2,1,11,1,Red Harbor Seafood N Chicken,4.0,13,336.555225,1.726592,Restaurant,12
3,1,11,1,Sugar's Crab Shack,4.5,161,607.440572,2.0,Restaurant,12
4,1,11,1,Sugars crab shack,5.0,3,607.440572,1.726592,Restaurant,12


In [135]:
#regression setup: total_bikes is the response, the business characteristics are the predictors
predictor_columns = ['rating','review_count','dist_to_station','numeric_price']
y = businesses['total_bikes']

#prepend the intercept column to the predictors, then fit ordinary least squares
x = stats.add_constant(businesses[predictor_columns])
regmodel = stats.OLS(y, x).fit()

#show the fitted model
print(regmodel.summary())

                            OLS Regression Results                            
Dep. Variable:            total_bikes   R-squared:                       0.012
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     1.634
Date:                Mon, 23 Oct 2023   Prob (F-statistic):              0.164
Time:                        12:23:00   Log-Likelihood:                -1275.2
No. Observations:                 541   AIC:                             2560.
Df Residuals:                     536   BIC:                             2582.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              16.7473      0.699     

In [136]:
#since most independent variables have a p_value larger than the significance cutoff, they should be removed from the model to try to improve it

#largest p_value a predictor may have and still stay in the model
SIGNIFICANCE_LEVEL = 0.05

#backward elimination: repeatedly drop the predictor with the highest p_value and refit,
#until every remaining predictor has a p_value at or below SIGNIFICANCE_LEVEL
while True:
    #only the predictors are candidates for removal; the intercept ('const') stays in the
    #model whatever its p_value, otherwise the fit would be forced through the origin
    predictor_pvalues = regmodel.pvalues.drop('const', errors='ignore')

    #stop when no predictors are left or none exceeds the cutoff
    #(written as "not >" so a NaN maximum also ends the loop instead of spinning)
    if predictor_pvalues.empty or not predictor_pvalues.max() > SIGNIFICANCE_LEVEL:
        break

    #idxmax returns the column label directly, so no positional lookup into x.columns is needed
    drop_col = predictor_pvalues.idxmax()
    print(f'dropped column {drop_col}, with p_value = {round(predictor_pvalues[drop_col],3)}')

    #rebind x instead of mutating it in place, then refit on the reduced design matrix
    x = x.drop(columns=drop_col)
    regmodel = stats.OLS(y, x).fit()

dropped column numeric_price, with p_value = 0.816
dropped column rating, with p_value = 0.762
dropped column dist_to_station, with p_value = 0.37


Provide model output and an interpretation of the results. 

In [134]:
#show the summary table of the model left after the elimination step
final_summary = regmodel.summary()
print(final_summary)

                            OLS Regression Results                            
Dep. Variable:            total_bikes   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     5.606
Date:                Mon, 23 Oct 2023   Prob (F-statistic):             0.0183
Time:                        12:22:40   Log-Likelihood:                -1275.7
No. Observations:                 541   AIC:                             2555.
Df Residuals:                     539   BIC:                             2564.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           16.6457      0.126    131.975   

In [None]:
"""
In the end, only the number of Yelp reviews of a business has a statistically significant relationship (p < 0.05) with the total number of bikes at the nearest bike station.
However, with R-squared = 0.010 and adjusted R-squared = 0.008, the model explains only about 1% of the variance, so its predictive power is negligible.

The following is a possible interpretation of the results:
The number of Yelp reviews of a business is likely correlated with its popularity, i.e. the number of regular patrons it serves.
Popular businesses would then attract more foot traffic, so nearby stations may need more bikes to meet patron demand
than stations in areas with less popular businesses.
"""

# Stretch

How can you turn the regression model into a classification model?

In [165]:
"""
A possible approach to turn the regression model into a classification model is to use total_bikes as the independent variable
to predict whether a business is popular, with 'popular' defined as a business with more than 100 Yelp reviews.
"""
#a business counts as popular when it has more than this many Yelp reviews
POPULAR_REVIEW_THRESHOLD = 100

#binary target: 1.0 for popular businesses, 0.0 otherwise
#the comparison is vectorized over the whole column, so it does not depend on the
#dataframe index being exactly 0..n-1 (a row loop with businesses['review_count'][i] does)
is_popular = (businesses['review_count'] > POPULAR_REVIEW_THRESHOLD).to_numpy(dtype=float)

#single predictor (total_bikes) plus an intercept column
indep_var = stats.add_constant(businesses['total_bikes'])

#fit the logistic regression and show the results
classification_model = stats.Logit(is_popular, indep_var).fit()
print(classification_model.summary())

Optimization terminated successfully.
         Current function value: 0.603893
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  541
Model:                          Logit   Df Residuals:                      539
Method:                           MLE   Df Model:                            1
Date:                Mon, 23 Oct 2023   Pseudo R-squ.:                 0.01811
Time:                        13:35:26   Log-Likelihood:                -326.71
converged:                       True   LL-Null:                       -332.73
Covariance Type:            nonrobust   LLR p-value:                 0.0005166
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const          -2.9808      0.642     -4.640      0.000      -4.240      -1.722
total_bikes     0.1273    

In [None]:
#Pseudo R-squared is still very low (0.01811), so the predictive power of the classification model is just as poor as that of the regression model.