In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [2]:
# Label: The hourly average gb per plan per scan at city-level
# Feature Set:
#     X1: driver_cancel_prob
#     X2: rider_cancel_prob
#     X3: spinner_cancel_prob
#     X4: eta
#     X5: surge_mul
#     X6: gb

#     X7: eta_square = (1 - X4/1500)**2
#     X8: eta_cube = (1 - X4/1500)**3
#     X9: network_contention = 1 / (1 + X5**5)
#     X10: cr_ratio = (1 - X1) * (1 - X2) * (1 - X3)

#     X11: eta_square_cr_ratio = X7 * X10
#     X12: eta_cube_cr_ratio = X8 * X10
#     X13: network_eta_cr_ratio = X7 * X9 * X10
#     X14: network_eta_gb_cr_ratio = X7 * (1 - X9) * X10 * X6

In [3]:
# Data
# Link: https://drive.google.com/drive/folders/1bYf6zAhH0OG1gFgtWtdjodxw8Gm2odN0

In [4]:
# Load the ten feature files and ten label files (features_1.csv .. features_10.csv,
# labels_1.csv .. labels_10.csv), keyed by their 1-based file number.
df_features_dict, df_labels_dict = {}, {}

for i in range(1, 11):
    features_path, labels_path = f'features_{i}.csv', f'labels_{i}.csv'
    df_features_dict[i] = pd.read_csv(features_path)
    df_labels_dict[i] = pd.read_csv(labels_path)

In [5]:
# Gather the per-file frames (keys 1..10, in order) into lists ready for concatenation.
df_features_frames = [df_features_dict[i] for i in range(1, 11)]

df_labels_frames = [df_labels_dict[i] for i in range(1, 11)]

In [6]:
# Stack the per-file frames row-wise; ignore_index=True renumbers the rows 0..n-1.
df_features, df_labels = (
    pd.concat(frames, ignore_index=True)
    for frames in (df_features_frames, df_labels_frames)
)

In [7]:
# Both frames carry a 'num_plans' column; remove it from each (in place) before merging.
# NOTE: in-place drop — re-running this cell on the same frames raises KeyError.
df_features.drop(columns=['num_plans'], inplace=True)
df_labels.drop(columns=['num_plans'], inplace=True)

In [8]:
# Left-join the labels onto the features: every feature row is kept and matched on
# (datestr, city_id, hour_of_day).
merge_keys = ['datestr', 'city_id', 'hour_of_day']
df = df_features.merge(
    df_labels,
    how='left',
    left_on=merge_keys,
    right_on=merge_keys,
)

In [9]:
# Inspect the merged feature/label frame (the notebook shows only the first and last rows).
df

Unnamed: 0,datestr,city_id,hour_of_day,market_drive_cancel_prob,market_rider_cancel_prob,market_spinner_cancel_prob,market_eta,market_surge_mul,market_gb,market_eta_square,market_eta_cube,market_network_contention,market_cr_ratio,market_eta_square_cr_ratio,market_eta_cube_cr_ratio,market_network_eta_cr_ratio,market_network_eta_gb_cr_ratio,hour_gb
0,2022-07-04,36,0,0.739296,0.099772,0.002159,207.363636,1.363636,21.144315,0.749260,0.657055,0.197540,0.233962,0.169522,0.146206,0.035209,2.828381,26.444516
1,2022-07-04,36,1,0.652287,0.127233,0.003484,711.181818,1.381818,33.227579,0.380371,0.295023,0.258382,0.307002,0.139338,0.112686,0.011593,3.353401,29.010592
2,2022-07-04,36,2,0.682259,0.131838,0.002629,531.000000,1.283333,31.267481,0.494161,0.396850,0.299754,0.276759,0.164017,0.137850,0.031121,3.470962,30.429411
3,2022-07-04,36,3,0.723331,0.141891,0.003238,989.600000,1.000000,33.784318,0.230431,0.178405,0.500000,0.247423,0.090078,0.077915,0.045039,0.975321,31.235293
4,2022-07-04,36,4,0.618826,0.157117,0.002977,498.000000,1.166667,39.016954,0.513928,0.421898,0.346617,0.323176,0.186640,0.155479,0.079729,3.155870,29.010644
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10609,2022-08-02,1269,19,0.597781,0.056243,0.008374,176.536913,1.004027,2.894239,0.784557,0.702350,0.495567,0.378684,0.305066,0.276299,0.151809,0.397718,2.682849
10610,2022-08-02,1269,20,0.615688,0.057135,0.008104,189.801242,1.002484,2.631284,0.769234,0.682206,0.497095,0.360429,0.281631,0.251652,0.140471,0.349518,2.653636
10611,2022-08-02,1269,21,0.687489,0.065214,0.009356,211.834356,1.013497,2.790558,0.743521,0.648528,0.484848,0.290653,0.219170,0.192560,0.106140,0.290362,2.740683
10612,2022-08-02,1269,22,0.714922,0.073527,0.009494,242.914286,1.014857,2.834546,0.710440,0.608461,0.484285,0.263962,0.189602,0.163225,0.092902,0.240974,2.648862


In [10]:
# (rows, columns) of the merged frame.
df.shape

(10614, 18)

In [11]:
# Features: everything except the label and the three join-key columns.
X = df.drop(columns=['hour_gb', 'datestr', 'city_id', 'hour_of_day'])
# Target: the 'hour_gb' label column.
y = df['hour_gb']

In [12]:
# Constant column of ones so the OLS fit below includes an intercept term
# (sm.OLS does not add a constant on its own).
# A scalar is broadcast to every row regardless of X's index. The previous
# pd.Series([...]) carried its own 0..n-1 index and was index-aligned on assignment,
# which would silently write NaN for any row of X whose label is not in that range.
X['intercept'] = 1

In [13]:
# Inspect the feature matrix after adding the constant 'intercept' column.
X

Unnamed: 0,market_drive_cancel_prob,market_rider_cancel_prob,market_spinner_cancel_prob,market_eta,market_surge_mul,market_gb,market_eta_square,market_eta_cube,market_network_contention,market_cr_ratio,market_eta_square_cr_ratio,market_eta_cube_cr_ratio,market_network_eta_cr_ratio,market_network_eta_gb_cr_ratio,intercept
0,0.739296,0.099772,0.002159,207.363636,1.363636,21.144315,0.749260,0.657055,0.197540,0.233962,0.169522,0.146206,0.035209,2.828381,1
1,0.652287,0.127233,0.003484,711.181818,1.381818,33.227579,0.380371,0.295023,0.258382,0.307002,0.139338,0.112686,0.011593,3.353401,1
2,0.682259,0.131838,0.002629,531.000000,1.283333,31.267481,0.494161,0.396850,0.299754,0.276759,0.164017,0.137850,0.031121,3.470962,1
3,0.723331,0.141891,0.003238,989.600000,1.000000,33.784318,0.230431,0.178405,0.500000,0.247423,0.090078,0.077915,0.045039,0.975321,1
4,0.618826,0.157117,0.002977,498.000000,1.166667,39.016954,0.513928,0.421898,0.346617,0.323176,0.186640,0.155479,0.079729,3.155870,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10609,0.597781,0.056243,0.008374,176.536913,1.004027,2.894239,0.784557,0.702350,0.495567,0.378684,0.305066,0.276299,0.151809,0.397718,1
10610,0.615688,0.057135,0.008104,189.801242,1.002484,2.631284,0.769234,0.682206,0.497095,0.360429,0.281631,0.251652,0.140471,0.349518,1
10611,0.687489,0.065214,0.009356,211.834356,1.013497,2.790558,0.743521,0.648528,0.484848,0.290653,0.219170,0.192560,0.106140,0.290362,1
10612,0.714922,0.073527,0.009494,242.914286,1.014857,2.834546,0.710440,0.608461,0.484285,0.263962,0.189602,0.163225,0.092902,0.240974,1


In [14]:
# Remove the raw 'market_eta' and 'market_gb' columns from the feature matrix (in place);
# the derived eta/gb terms listed in the feature-set notes stay in X.
# NOTE: in-place drop — re-running this cell on the same X raises KeyError.
X.drop(columns=['market_eta', 'market_gb'], inplace=True)

In [15]:
# Inspect the feature matrix after dropping 'market_eta' and 'market_gb'.
X

Unnamed: 0,market_drive_cancel_prob,market_rider_cancel_prob,market_spinner_cancel_prob,market_surge_mul,market_eta_square,market_eta_cube,market_network_contention,market_cr_ratio,market_eta_square_cr_ratio,market_eta_cube_cr_ratio,market_network_eta_cr_ratio,market_network_eta_gb_cr_ratio,intercept
0,0.739296,0.099772,0.002159,1.363636,0.749260,0.657055,0.197540,0.233962,0.169522,0.146206,0.035209,2.828381,1
1,0.652287,0.127233,0.003484,1.381818,0.380371,0.295023,0.258382,0.307002,0.139338,0.112686,0.011593,3.353401,1
2,0.682259,0.131838,0.002629,1.283333,0.494161,0.396850,0.299754,0.276759,0.164017,0.137850,0.031121,3.470962,1
3,0.723331,0.141891,0.003238,1.000000,0.230431,0.178405,0.500000,0.247423,0.090078,0.077915,0.045039,0.975321,1
4,0.618826,0.157117,0.002977,1.166667,0.513928,0.421898,0.346617,0.323176,0.186640,0.155479,0.079729,3.155870,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10609,0.597781,0.056243,0.008374,1.004027,0.784557,0.702350,0.495567,0.378684,0.305066,0.276299,0.151809,0.397718,1
10610,0.615688,0.057135,0.008104,1.002484,0.769234,0.682206,0.497095,0.360429,0.281631,0.251652,0.140471,0.349518,1
10611,0.687489,0.065214,0.009356,1.013497,0.743521,0.648528,0.484848,0.290653,0.219170,0.192560,0.106140,0.290362,1
10612,0.714922,0.073527,0.009494,1.014857,0.710440,0.608461,0.484285,0.263962,0.189602,0.163225,0.092902,0.240974,1


In [16]:
# Multiply three feature columns by fixed powers of ten.
# NOTE: this overwrites the columns of X, so re-running the cell compounds the factors.
for column, factor in (
    ('market_rider_cancel_prob', 10),
    ('market_spinner_cancel_prob', 100),
    ('market_network_eta_cr_ratio', 10),
):
    X[column] = X[column] * factor

In [17]:
# Inspect the target series before it is rescaled.
y

0        26.444516
1        29.010592
2        30.429411
3        31.235293
4        29.010644
           ...    
10609     2.682849
10610     2.653636
10611     2.740683
10612     2.648862
10613     2.583077
Name: hour_gb, Length: 10614, dtype: float64

In [18]:
# Divide the target by 10. Everything fitted or measured from y afterwards
# (coefficients, residuals, RMSE) is therefore in units of hour_gb / 10.
# NOTE: rebinding y to a scaled copy means re-running this cell divides by 10 again.
y = y.div(10)
y

0        2.644452
1        2.901059
2        3.042941
3        3.123529
4        2.901064
           ...   
10609    0.268285
10610    0.265364
10611    0.274068
10612    0.264886
10613    0.258308
Name: hour_gb, Length: 10614, dtype: float64

In [19]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=96)

In [20]:
# (rows, feature columns) of the training features.
X_train.shape

(8491, 13)

In [21]:
# Number of training targets; should match the row count of X_train.
y_train.shape

(8491,)

In [22]:
# (rows, feature columns) of the held-out test features.
X_test.shape

(2123, 13)

In [23]:
# Number of held-out test targets; should match the row count of X_test.
y_test.shape

(2123,)

In [24]:
# Ordinary least squares of the scaled target on the training features.
# The intercept comes from the explicit 'intercept' column in X_train.
model = sm.OLS(endog=y_train, exog=X_train)
trained_model = model.fit()

In [25]:
# Print the fitted model's regression summary (fit statistics and coefficient table).
print(trained_model.summary())

                            OLS Regression Results                            
Dep. Variable:                hour_gb   R-squared:                       0.794
Model:                            OLS   Adj. R-squared:                  0.794
Method:                 Least Squares   F-statistic:                     2725.
Date:                Mon, 08 Aug 2022   Prob (F-statistic):               0.00
Time:                        21:16:45   Log-Likelihood:                -661.07
No. Observations:                8491   AIC:                             1348.
Df Residuals:                    8478   BIC:                             1440.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
market_drive

In [26]:
# Training-set predictions and their signed errors (prediction minus actual).
# This cell only prints the per-row errors; the RMSE itself is computed separately.
y_train_pred = trained_model.predict(X_train)
print(y_train_pred - y_train)

4752     0.038907
106     -0.038093
3184     0.002115
10561   -0.096598
1007    -0.067284
           ...   
2114     0.100302
8801    -0.148683
5748     0.044466
2412     0.100488
9638     0.086247
Length: 8491, dtype: float64


In [27]:
# Root-mean-squared error on the training set, in the scaled target units (hour_gb / 10).
train_rmse = np.sqrt(((y_train - y_train_pred) ** 2).mean())
train_rmse

0.26156220895126764

In [28]:
# Held-out test-set predictions and their signed errors (prediction minus actual).
# This cell only prints the per-row errors; the RMSE itself is computed separately.
y_test_pred = trained_model.predict(X_test)
print(y_test_pred - y_test)

4822   -0.087136
5594    0.034750
2062   -0.054081
5454    0.087323
7312    0.100028
          ...   
3527    0.014798
5948   -0.164156
4754   -0.371872
8296   -0.002736
7397   -0.031655
Length: 2123, dtype: float64


In [29]:
# Root-mean-squared error on the held-out test set, in the scaled target units (hour_gb / 10).
test_rmse = np.sqrt(((y_test - y_test_pred) ** 2).mean())
test_rmse

0.25194958830406017

In [30]:
# Summary statistics of the scaled training target, for context when reading the RMSE values.
y_train.describe()

count    8491.000000
mean        0.678712
std         0.576472
min         0.213940
25%         0.313027
50%         0.423438
75%         0.622321
max         3.797802
Name: hour_gb, dtype: float64

In [31]:
# Summary statistics of the scaled test target, for comparison with the training target.
y_test.describe()

count    2123.000000
mean        0.654854
std         0.563477
min         0.220372
25%         0.309151
50%         0.407085
75%         0.592107
max         3.181440
Name: hour_gb, dtype: float64