In [2]:
%pip install statsmodels


Collecting statsmodels
  Downloading statsmodels-0.14.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-1.0.2-py2.py3-none-any.whl.metadata (3.6 kB)
Downloading statsmodels-0.14.6-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (10.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading patsy-1.0.2-py2.py3-none-any.whl (233 kB)
Installing collected packages: patsy, statsmodels
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [statsmodels][0m [statsmodels]
[1A[2KSuccessfully installed patsy-1.0.2 statsmodels-0.14.6

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error

%matplotlib inline

# Read the processed hourly hire counts; the path is relative to the notebook's
# working directory.
hourly = pd.read_csv(
    filepath_or_buffer="../data_processed/hourly_counts_2023_sample.csv"
)

# Quick sanity check: first five rows alongside the (rows, columns) shape.
(hourly.head(n=5), hourly.shape)


(         date  hour  hire_count
 0  2023-01-08     0         171
 1  2023-01-08     1         108
 2  2023-01-08     2          83
 3  2023-01-08     3          37
 4  2023-01-08     4          47,
 (288, 3))

In [10]:
# Coerce column types and derive the calendar features in one ordered pass.
# Each callable receives the frame as built so far, so `day_of_week` sees the
# parsed dates and `is_weekend` sees the freshly derived day names.
hourly = hourly.assign(
    # Parse the date column into datetimes
    date=lambda frame: pd.to_datetime(frame["date"]),
    # Weekday name for every row (e.g. "Sunday")
    day_of_week=lambda frame: frame["date"].dt.day_name(),
    # 1 for Saturday/Sunday rows, 0 otherwise
    is_weekend=lambda frame: frame["day_of_week"]
    .isin(["Saturday", "Sunday"])
    .astype(int),
    # Integer hour and floating-point target for modelling
    hour=lambda frame: frame["hour"].astype(int),
    hire_count=lambda frame: frame["hire_count"].astype(float),
)

# Preview the modelling columns
hourly.loc[:, ["date", "hour", "hire_count", "day_of_week", "is_weekend"]].head()


Unnamed: 0,date,hour,hire_count,day_of_week,is_weekend
0,2023-01-08,0,171.0,Sunday,1
1,2023-01-08,1,108.0,Sunday,1
2,2023-01-08,2,83.0,Sunday,1
3,2023-01-08,3,37.0,Sunday,1
4,2023-01-08,4,47.0,Sunday,1


In [12]:
# Two nested OLS specifications for the hourly hire counts:
#   model1 -> hour of day as a categorical predictor
#   model2 -> the same hour categories plus the weekend indicator
model1 = smf.ols(formula='hire_count ~ C(hour)', data=hourly).fit()
model2 = smf.ols(formula='hire_count ~ C(hour) + is_weekend', data=hourly).fit()

# ----- Goodness of fit -----
print("MODEL 1: hour only")
print("R-squared:", model1.rsquared)

print("\nMODEL 2: hour + weekend")
print("R-squared:", model2.rsquared)

# ----- In-sample error (RMSE) -----
# Predictions are made on the same rows the models were fitted on, so these
# figures describe fit to the sample rather than out-of-sample accuracy.
y = hourly["hire_count"]
y_pred1, y_pred2 = (fit.predict(hourly) for fit in (model1, model2))

# mean_squared_error returns the MSE; its square root is the RMSE, which is in
# the same units as hire_count and can be read against the mean printed below.
mse1, mse2 = (mean_squared_error(y, fitted) for fitted in (y_pred1, y_pred2))
rmse1, rmse2 = np.sqrt(mse1), np.sqrt(mse2)

print("\nModel 1 RMSE:", rmse1)
print("Model 2 RMSE:", rmse2)
print("Mean hire_count:", y.mean())


MODEL 1: hour only
R-squared: 0.7457692873232608

MODEL 2: hour + weekend
R-squared: 0.7599876925003307

Model 1 RMSE: 428.7766536229901
Model 2 RMSE: 416.6140184330555
Mean hire_count: 1029.2118055555557


In [13]:
# Full regression table for the hour + weekend model, rendered as plain text
print(model2.summary().as_text())


                            OLS Regression Results                            
Dep. Variable:             hire_count   R-squared:                       0.760
Model:                            OLS   Adj. R-squared:                  0.738
Method:                 Least Squares   F-statistic:                     34.70
Date:                Fri, 12 Dec 2025   Prob (F-statistic):           1.32e-67
Time:                        22:21:20   Log-Likelihood:                -2145.9
No. Observations:                 288   AIC:                             4342.
Df Residuals:                     263   BIC:                             4433.
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept       176.2153    127.157      1.386