Build a regression model.

In [1]:
# libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [16]:
# Load the dataset for combined stations and yelp point of interest (POI)
data = pd.read_csv('../data/combined_station_yelp.csv')
data

Unnamed: 0,station_name,station_latitude,station_longitude,free_bikes,number_of_bikes,poi_name,poi_categories,poi_address,poi_distance,poi_latitude,poi_longitude,poi_ratings,poi_review_count
0,Hess at king,43.259126,-79.877212,10,12,La Luna,Middle Eastern,"306 King Street W,Hamilton, ON L8P 1B1,Canada",108.424550,43.259422,-79.878488,4.0,63
1,Hess at king,43.259126,-79.877212,10,12,Hambrgr,Burgers,"49 King William Street,Hamilton, ON L8R 1A2,Ca...",858.672096,43.257210,-79.866900,4.5,202
2,Hess at king,43.259126,-79.877212,10,12,Earth To Table : Bread Bar,Pizza,"258 Locke Street S,Hamilton, ON L8P 4B9,Canada",1052.141521,43.252840,-79.887020,4.0,293
3,Hess at king,43.259126,-79.877212,10,12,The Ship,Seafood,"23 Augusta Street,Hamilton, ON L8N 1P6,Canada",970.855528,43.252150,-79.870000,4.0,208
4,Hess at king,43.259126,-79.877212,10,12,Berkeley North,Bars,"31 King William Street,Hamilton, ON L8R 1A1,Ca...",792.544458,43.257405,-79.867715,4.5,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2801,Cannon at Ottawa,43.247565,-79.818050,6,7,Mr Beast Burger,Burgers,"224 Ottawa Street N,Hamilton, ON L8H 3Z6,Canada",149.392720,43.248686,-79.817039,4.0,1
2802,Cannon at Ottawa,43.247565,-79.818050,6,7,Bernie’s Tavern,Modern European,"1101-1103 Cannon St E,Hamilton, ON L8L 2J5,Canada",293.018582,43.248570,-79.821395,3.5,3
2803,Cannon at Ottawa,43.247565,-79.818050,6,7,The Hearty Hooligan,Cafes,"292 Ottawa Street N,Hamilton, ON L8H 3Z9,Canada",324.046798,43.250241,-79.816466,4.5,7
2804,Cannon at Ottawa,43.247565,-79.818050,6,7,Simply Italian Bakery,Bakeries,"212 Ottawa Street N,Hamilton, ON L8H 3Z6,Canada",116.158933,43.248448,-79.817282,4.0,1


In [17]:
# Define the columns for which you want to calculate summary statistics
columns_to_agg = {
    'poi_ratings': 'mean',
    'poi_review_count': 'mean',
    'poi_distance': 'mean'
}

# Group by 'station_name' and calculate summary statistics
poi_stats = data.groupby('station_name').agg(columns_to_agg).reset_index()

# Select relevant columns for the model
selected_columns = ['number_of_bikes', 'poi_ratings', 'poi_review_count', 'poi_distance']
data = data[selected_columns].dropna()  # Remove rows with missing values

# Step 2: Model Building
# Define the dependent variable (number_of_bikes) and independent variables (POI characteristics)
y = data['number_of_bikes']
X = data[['poi_ratings', 'poi_review_count', 'poi_distance']]

# Add a constant (intercept) to the independent variables
X = sm.add_constant(X)

# Fit a linear regression model
model = sm.OLS(y, X).fit()

# Step 3: Model Evaluation
# Print a summary of the regression results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:        number_of_bikes   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.020
Method:                 Least Squares   F-statistic:                     20.25
Date:                Tue, 05 Sep 2023   Prob (F-statistic):           5.53e-13
Time:                        02:12:33   Log-Likelihood:                -8821.7
No. Observations:                2806   AIC:                         1.765e+04
Df Residuals:                    2802   BIC:                         1.768e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               12.6352      0.610  

  x = pd.concat(x[::order], 1)


Provide model output and an interpretation of the results. 

# Stretch

How can you turn the regression model into a classification model?

In [26]:
import pandas as pd

# Assuming you have 'predicted_values' as a NumPy array
predicted_values = model.predict()

def classify_usage(prediction):
    if prediction < threshold_low:
        return "Low Usage"
    elif prediction < threshold_high:
        return "Moderate Usage"
    else:
        return "High Usage"

# Apply the classification function to predictions
data['usage_category'] = data['predicted_values'].apply(classify_usage)

# Convert the NumPy array to a Pandas DataFrame
#predicted_df = pd.DataFrame({'predicted_values': predicted_values})

# Save the DataFrame to a CSV file
#predicted_df.to_csv('../data/predicted_values.csv', index=False)

KeyError: 'predicted_values'