Build a regression model.

In [2]:
import sqlite3  # Importing sqlite3 to work with SQLite databases
import pandas as pd  # Importing pandas to work with dataframes
import statsmodels.api as sm  # Importing the statsmodels library for the regression model

# Location of the SQLite database that holds the bike-station data.
# NOTE(review): this is an absolute, machine-specific path; it assumes the file
# exists on the machine running the notebook.
db_path = '/Users/jorgen/Documents/LHL/project/Statistical-Modeling-with-Python/data/bike_stations.db'

# Columns needed for the model, read from the 'bike_stations' table.
sql_query = '''
SELECT free_bikes, latitude, longitude
FROM bike_stations
'''

# Open the connection, load the query result into a DataFrame, and always close
# the connection afterwards. try/finally is used because a failing query would
# otherwise skip conn.close(); sqlite3's own `with conn:` block only manages the
# transaction and does not close the connection.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(sql_query, conn)
finally:
    conn.close()

# 'free_bikes' is the response variable; 'latitude' and 'longitude' are the
# predictors whose relationship with it is being explored.
X = df[['latitude', 'longitude']]  # Predictor variables
y = df['free_bikes']  # Response variable

# sm.OLS does not add an intercept on its own, so a constant column is added
# to the predictors explicitly.
X = sm.add_constant(X)

# Fit the regression model with OLS (Ordinary Least Squares).
model = sm.OLS(y, X).fit()

# Print the text summary of the fit (coefficients and diagnostic statistics).
print(model.summary())




                            OLS Regression Results                            
Dep. Variable:             free_bikes   R-squared:                       0.581
Model:                            OLS   Adj. R-squared:                  0.162
Method:                 Least Squares   F-statistic:                     1.386
Date:                Sun, 07 Apr 2024   Prob (F-statistic):              0.419
Time:                        13:09:21   Log-Likelihood:                -10.961
No. Observations:                   5   AIC:                             27.92
Df Residuals:                       2   BIC:                             26.75
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -4825.6556   3558.281     -1.356      0.3

  warn("omni_normtest is not valid with less than 8 observations; %i "


Provide model output and an interpretation of the results. 

In [None]:
# Model Summary Interpretation:
# The string literal below holds the written interpretation of the OLS summary
# printed by the regression cell; it is plain text and performs no computation.
# NOTE(review): the latitude/longitude coefficients and the condition number
# quoted below are not visible in the truncated summary output captured above;
# confirm them against a fresh run of the regression cell.

"""
- R-squared (0.581) indicates our model explains 58.1% of the variance in free bikes, showing a moderate fit.
- Adjusted R-squared (0.162) suggests limited predictive power when adjusting for predictors.
- Coefficients for latitude (120.8305) and longitude (-79.0889) aren't statistically significant (p-values > 0.05), indicating uncertain effects on free bikes.
- The model's overall F-statistic (1.386) with a p-value (0.419) implies the model isn't significant, questioning its predictive capability.
- Given the small sample size and a large condition number, results should be interpreted with caution due to potential overfitting and multicollinearity.
"""


# Stretch

How can you turn the regression model into a classification model?