In [10]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
np.set_printoptions(precision = 3)

# Preparation for statistics modelling

In [11]:
pu_2019 = pd.read_csv('../data/curated/pu_2019.csv')
do_2019 = pd.read_csv('../data/curated/pu_2020.csv')
pu_2020 = pd.read_csv('../data/curated/pu_2020.csv')
do_2020 = pd.read_csv('../data/curated/do_2020.csv')
all_data = [pu_2019, do_2019, pu_2020, do_2020]

# Drop duplicate rows
weather_2019 = pu_2019.drop_duplicates(subset = ['month'])
weather_2020 = pu_2020.drop_duplicates(subset = ['month'])

for data in all_data:
    data.drop_duplicates(subset = ['LocationID'], inplace = True)

In [12]:
def print_model_performance(true, pred, model):
    """
    print the summary, anova table and rmse of a given linear model
    """
    print(model.summary())
    print("=" * 78)
    anova_table = sm.stats.anova_lm(model, robust = "hc2")
    print("ANOVA TABLE")
    print(anova_table)
    print("=" * 78)
    print('RMSE = {:<.4}'.format(np.sqrt(mean_squared_error(true, pred))))
    
    
def stepwise_selection(data, response, all_covariates, __covariates=None, __min_aic=np.inf, counter=1):
    """
    print the model attributes with lowest aic
    """
    print(f"Stage {counter}")
    before_aic = __min_aic
    min_aic = __min_aic
    if __min_aic == np.inf:
        __covariates = all_covariates
    # Remove one covariate and check the aic 
    for var in __covariates:
        new_covariates = [x for x in __covariates if x != var]
        formula = response + '~' + '+'.join(new_covariates)
        curr_model = ols(formula = formula, data = data).fit()
        print(f"{new_covariates}: {curr_model.aic}")
        if curr_model.aic < min_aic:
            min_aic = curr_model.aic
            best_model = curr_model
    # Add one covariate and check the aic
    for var in all_covariates:
        if var in __covariates:
            continue
        new_covariates = list(__covariates) + [var, ]
        formula = response + '~' + '+'.join(new_covariates)
        curr_model = ols(formula = formula, data = data).fit()
        print(f"{new_covariates}: {curr_model.aic}")
        if curr_model.aic < min_aic:
            min_aic = curr_model.aic
            best_model = curr_model
    if before_aic == min_aic:
        print(f"Best of Final: {list(__covariates)} with aic = {before_aic}\n")
    else:
        __covariates = best_model.params.index[1:]
        print(f"Best of Stage {counter}: {list(__covariates)} with aic = {best_model.aic}")
        return stepwise_selection(data, response, all_covariates, __covariates, min_aic, counter=counter+1)

# Modelling

## How does weather impact the number of trips?

In [19]:
model0 = ols(formula = "trip_count_in_month ~ 1",
             data = weather_2019
            ).fit(cov_type = 'HC2')

trip_count_pred = model0.predict(weather_2020)
print_model_performance(weather_2020['trip_count_in_month'], trip_count_pred, model0)

                             OLS Regression Results                            
Dep. Variable:     trip_count_in_month   R-squared:                      -0.000
Model:                             OLS   Adj. R-squared:                 -0.000
Method:                  Least Squares   F-statistic:                       nan
Date:                 Sat, 15 Oct 2022   Prob (F-statistic):                nan
Time:                         15:24:15   Log-Likelihood:                -176.60
No. Observations:                   12   AIC:                             355.2
Df Residuals:                       11   BIC:                             355.7
Df Model:                            0                                         
Covariance Type:                   HC2                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   6.453e+06    1.8e+05     35.90



In [14]:
stepwise_selection(weather_2019, 
                   'trip_count_in_month', 
                   ['feelslike', 'precip', 'windspeed', 'visibility', 'snow', 'snowdepth']
                   )
model1 = ols(formula = "trip_count_in_month ~ windspeed + visibility + snow + snowdepth",
             data = weather_2019
            ).fit(cov_type = 'HC2')

trip_count_pred = model1.predict(weather_2020)
print_model_performance(weather_2020['trip_count_in_month'], trip_count_pred, model1)

Stage 1
['precip', 'windspeed', 'visibility', 'snow', 'snowdepth']: 329.7807298377206
['feelslike', 'windspeed', 'visibility', 'snow', 'snowdepth']: 331.5688226895276
['feelslike', 'precip', 'visibility', 'snow', 'snowdepth']: 342.4714215089548
['feelslike', 'precip', 'windspeed', 'snow', 'snowdepth']: 337.48268110804077
['feelslike', 'precip', 'windspeed', 'visibility', 'snowdepth']: 350.3016083464621
['feelslike', 'precip', 'windspeed', 'visibility', 'snow']: 353.3592973962748
Best of Stage 1: ['precip', 'windspeed', 'visibility', 'snow', 'snowdepth'] with aic = 329.7807298377206
Stage 2
['windspeed', 'visibility', 'snow', 'snowdepth']: 329.6553835187358
['precip', 'visibility', 'snow', 'snowdepth']: 354.5543807788965
['precip', 'windspeed', 'snow', 'snowdepth']: 335.48922132414583
['precip', 'windspeed', 'visibility', 'snowdepth']: 349.22541587845507
['precip', 'windspeed', 'visibility', 'snow']: 351.4536714484183
['precip', 'windspeed', 'visibility', 'snow', 'snowdepth', 'feelslike



In [15]:
# Lasso regression
y = ['trip_count_in_month']
X = ['feelslike', 'precip', 'windspeed', 'visibility', 'snow', 'snowdepth']
reg = LassoCV(cv = 5, 
              normalize = True, 
              random_state = 0).fit(weather_2019[X], np.array(weather_2019[y]).ravel())


r2 = reg.score(weather_2019[X], weather_2019[y])
trip_count_pred = reg.predict(weather_2020[X])
rmse = np.sqrt(mean_squared_error(weather_2020[y], trip_count_pred))


formula = 'trip_count_in_month = ' + str(reg.intercept_)[:10] + ' + ' \
            + ' + '.join([str(coef)[:10] + ' * ' + attr for coef, attr in zip(reg.coef_, X)])


print(formula)
print('a = {:<.4}'.format(reg.alpha_))
print('R2 = {:<.4}'.format(r2))
print('RMSE = {:<.4}'.format(rmse))

trip_count_in_month = 10316969.7 + -777.91157 * feelslike + -76968.009 * precip + 120234.948 * windspeed + -424543.97 * visibility + -3081987.8 * snow + 1632808.86 * snowdepth
a = 133.6
R2 = 0.9478
RMSE = 1.622e+06


## How do populations and property prices impact the pick-up number in a zone?

In [16]:
stepwise_selection(pu_2019, 
                   'ln_trip_count_total', 
                   ['ln_Price_per_square_feet', 'ln_Population_By_LocationID', 'ln_Density_per_hectare']
                   )

model2 = ols(formula = "ln_trip_count_total ~ ln_Price_per_square_feet + ln_Population_By_LocationID \
                        + ln_Density_per_hectare",
             data = pu_2019
            ).fit(cov_type = 'HC2')

trip_count_pred = model2.predict(pu_2020)
print_model_performance(pu_2020['ln_trip_count_total'], trip_count_pred, model2)

Stage 1
['ln_Population_By_LocationID', 'ln_Density_per_hectare']: 1131.2481034052016
['ln_Price_per_square_feet', 'ln_Density_per_hectare']: 1181.4354870887678
['ln_Price_per_square_feet', 'ln_Population_By_LocationID']: 1241.8639633165944
Best of Stage 1: ['ln_Population_By_LocationID', 'ln_Density_per_hectare'] with aic = 1131.2481034052016
Stage 2
['ln_Density_per_hectare']: 1231.5819876014998
['ln_Population_By_LocationID']: 1292.441791847786
['ln_Population_By_LocationID', 'ln_Density_per_hectare', 'ln_Price_per_square_feet']: 1090.9448214612623
Best of Stage 2: ['ln_Population_By_LocationID', 'ln_Density_per_hectare', 'ln_Price_per_square_feet'] with aic = 1090.9448214612623
Stage 3
['ln_Density_per_hectare', 'ln_Price_per_square_feet']: 1181.4354870887676
['ln_Population_By_LocationID', 'ln_Price_per_square_feet']: 1241.8639633165944
['ln_Population_By_LocationID', 'ln_Density_per_hectare']: 1131.2481034052016
Best of Final: ['ln_Population_By_LocationID', 'ln_Density_per_hecta

## How do populations and property prices impact the drop-off number in a zone?

In [17]:
stepwise_selection(do_2019, 
                   'ln_trip_count_total', 
                   [ 'ln_Price_per_square_feet', 'ln_Population_By_LocationID', 'ln_Density_per_hectare']
                   )

model3 = ols(formula = "ln_trip_count_total ~ ln_Price_per_square_feet + ln_Population_By_LocationID \
                        + ln_Density_per_hectare",
             data = do_2019
            ).fit(cov_type = 'HC2')

trip_count_pred = model3.predict(do_2020)
print_model_performance(do_2020['ln_trip_count_total'], trip_count_pred, model3)

Stage 1
['ln_Population_By_LocationID', 'ln_Density_per_hectare']: 1155.086837592395
['ln_Price_per_square_feet', 'ln_Density_per_hectare']: 1253.7083523591189
['ln_Price_per_square_feet', 'ln_Population_By_LocationID']: 1317.1395456856155
Best of Stage 1: ['ln_Population_By_LocationID', 'ln_Density_per_hectare'] with aic = 1155.086837592395
Stage 2
['ln_Density_per_hectare']: 1259.2767191079802
['ln_Population_By_LocationID']: 1322.0120246349077
['ln_Population_By_LocationID', 'ln_Density_per_hectare', 'ln_Price_per_square_feet']: 1146.8236349824142
Best of Stage 2: ['ln_Population_By_LocationID', 'ln_Density_per_hectare', 'ln_Price_per_square_feet'] with aic = 1146.8236349824142
Stage 3
['ln_Density_per_hectare', 'ln_Price_per_square_feet']: 1253.7083523591189
['ln_Population_By_LocationID', 'ln_Price_per_square_feet']: 1317.1395456856155
['ln_Population_By_LocationID', 'ln_Density_per_hectare']: 1155.086837592395
Best of Final: ['ln_Population_By_LocationID', 'ln_Density_per_hectare