In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import seaborn as sns

In [2]:
# Read in data: features & labels
df_features_dict = {}
df_labels_dict = {}

num_feature_files = 15
num_label_files = 15

for i in range(1, num_feature_files):
    df_features_dict[i] = pd.read_csv(f'lcof_hourly_features_{i}.csv')

for i in range(1, num_label_files):
    df_labels_dict[i] = pd.read_csv(f'lcof_hourly_labels_{i}.csv')

In [3]:
df_features_frames = [df_features_dict[1], df_features_dict[2], df_features_dict[3],
                      df_features_dict[4], df_features_dict[5], df_features_dict[6],
                      df_features_dict[7], df_features_dict[8], df_features_dict[9],
                      df_features_dict[10], df_features_dict[11], df_features_dict[12],
                      df_features_dict[13], df_features_dict[14]
                     ]

df_labels_frames = [df_labels_dict[1], df_labels_dict[2], df_labels_dict[3],
                    df_labels_dict[4], df_labels_dict[5], df_labels_dict[6],
                    df_labels_dict[7], df_labels_dict[8], df_labels_dict[9],
                    df_labels_dict[10], df_labels_dict[11], df_labels_dict[12],
                    df_labels_dict[13], df_labels_dict[14]
                   ]

df_features = pd.concat(df_features_frames, ignore_index=True)
df_labels = pd.concat(df_labels_frames, ignore_index=True)

df_features.drop('num_plans', axis=1, inplace=True)
df_labels.drop('num_plans', axis=1, inplace=True)

df = pd.merge(df_features, df_labels,
              how='inner',
              left_on=['datestr', 'city_id', 'hour_of_day'],
              right_on=['datestr', 'city_id', 'hour_of_day']
             )

In [4]:
# Filter out outliers
df_clean = df[df['hour_gb_gamma_95'] < 3].reset_index(drop=True)
print(f"The original dataset size = {len(df)}; the new dataset size = {len(df_clean)}")

The original dataset size = 20933; the new dataset size = 20509


In [5]:
# Labels
gb_100 = df_clean['hour_gb_gamma_100']
gb_99 = df_clean['hour_gb_gamma_99']
gb_95 = df_clean['hour_gb_gamma_95']
gb_90 = df_clean['hour_gb_gamma_90']

fare_100 = df_clean['hour_fare_gamma_100']
fare_99 = df_clean['hour_fare_gamma_99']
fare_95 = df_clean['hour_fare_gamma_95']
fare_90 = df_clean['hour_fare_gamma_90']

y = gb_95

In [6]:
# Features
X = df_clean.loc[:, ['market_log_cr',
                     'market_log_eta',
                     'market_log_fare_p50_scale',
                     'market_surge'
                    ]
                ]

In [15]:
# Training & Test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=116)

In [16]:
def calculate_loss(x1, x2, x3, x, y, b0, b1, b2, b3):
    alpha = 1 - 1/np.power(np.maximum(x, 1), b3)
    return np.mean(np.power(b0 + b1*x1 + b2*x2 + alpha*x3 - y, 2) / 2)

In [20]:
def grad(x1, x2, x3, x, y, b0, b1, b2, b3):
    alpha = 1 - 1/np.power(np.maximum(x, 1), b3)
    re = b0 + b1*x1 + b2*x2 + alpha*x3 - y
    
    db0 = np.mean(re)
    db1 = np.mean(re * x1 - 0.2 * b1)
    db2 = np.mean(re * x2 - 0.2 * b2)
    db3 = np.mean(re * x3 * b3 * np.power(np.maximum(x, 1), -b3-1) - 0.2 * b3)
    
    return db0, db1, db2, db3

In [21]:
def main(n_iter=20, step_size=400, lr=0.01, tol=1e-03):
    N = len(X_train)
    num_iters = N // step_size
    
    b0 = 0.0
    b1 = 1.0
    b2 = 2.0
    b3 = 1.0
    
    train_losses = []
    test_losses = []
    
    best_b0 = b0
    best_b1 = b1
    best_b2 = b2
    best_b3 = b3
    
    best_test_loss = np.inf
    best_iter = 0
    
    for it in range(n_iter):
        print(f"We are in iteration: {it+1} ...")
        
        # Reshuffle
        ids = list(range(N))
        np.random.shuffle(ids)
        
        X_train_ids = X_train.reset_index(drop=True)
        y_train_ids = y_train.reset_index(drop=True)
        
        train_loss = 0
        for idx in range(num_iters):
            y = y_train_ids[ids][idx*step_size:(idx+1)*step_size]
            x1 = X_train_ids['market_log_cr'][ids][idx*step_size:(idx+1)*step_size]
            x2 = X_train_ids['market_log_eta'][ids][idx*step_size:(idx+1)*step_size]
            x3 = X_train_ids['market_log_fare_p50_scale'][ids][idx*step_size:(idx+1)*step_size]
            x = X_train_ids['market_surge'][ids][idx*step_size:(idx+1)*step_size]
            
            cur_loss = calculate_loss(x1, x2, x3, x, y, b0, b1, b2, b3)
            train_loss += cur_loss
            
            db0, db1, db2, db3 = grad(x1, x2, x3, x, y, b0, b1, b2, b3)
            
            if np.all(np.abs([lr*db0, lr*db1, lr*db2, lr*db3]) <= tol):
                break
            
            b0 -= lr * db0
            b1 -= lr * db1
            b2 -= lr * db2
            b3 -= lr * db3
        
        train_loss /= num_iters
        print(f"Train Loss = {train_loss}")
        
        train_losses.append(train_loss)
        
        # Test loss
        y = y_test
        x1 = X_test['market_log_cr']
        x2 = X_test['market_log_eta']
        x3 = X_test['market_log_fare_p50_scale']
        x = X_test['market_surge']
        test_loss = calculate_loss(x1, x2, x3, x, y, b0, b1, b2, b3)
        
        if test_loss < best_test_loss:
            best_test_loss = test_loss
            best_iter = it

            best_b0 = b0
            best_b1 = b1
            best_b2 = b2
            best_b3 = b3
        
        print(f"Test Loss = {test_loss}")
        
        test_losses.append(test_loss)
    
    return best_b0, best_b1, best_b2, best_b3, best_test_loss, best_iter

In [22]:
best_b0, best_b1, best_b2, best_b3, best_test_loss, best_iter = main(n_iter=100, lr=0.01, step_size=400)

We are in iteration: 1 ...
Train Loss = 2.3323057919906893
Test Loss = 1.4134222698040684
We are in iteration: 2 ...
Train Loss = 1.0018397748030892
Test Loss = 0.7146950493105182
We are in iteration: 3 ...
Train Loss = 0.598591309855348
Test Loss = 0.5014664424124013
We are in iteration: 4 ...
Train Loss = 0.4785619228325616
Test Loss = 0.43723636656985754
We are in iteration: 5 ...
Train Loss = 0.38155194955592586
Test Loss = 0.4207939762998643
We are in iteration: 6 ...
Train Loss = 0.03848624987776011
Test Loss = 0.4204112503748586
We are in iteration: 7 ...
Train Loss = 0.2082439042819725
Test Loss = 0.4192252588804633
We are in iteration: 8 ...
Train Loss = 0.10467701799529712
Test Loss = 0.418500053918642
We are in iteration: 9 ...
Train Loss = 0.02126249666823967
Test Loss = 0.4186447054033014
We are in iteration: 10 ...
Train Loss = 0.08081877424061491
Test Loss = 0.4173528656492568
We are in iteration: 11 ...
Train Loss = 0.2140919834426107
Test Loss = 0.4164968643411749
We a

In [23]:
print(best_b0, best_b1, best_b2, best_b3, best_test_loss, best_iter)
# gamma = 0.95, fare_scaling = p50, with regularization 0.1
# --- 1.517388764794251 0.09627509778325313 0.7899406274789974 2.0840185343833264 0.22026837308085773 99

# gamma = 0.95, fare_scaling = p50, with regularization 0.2
# --- 1.6925027509382682 0.2905504174655244 1.352525522576194 3.5240774606967213 0.4164968643411749 10


1.6925027509382682 0.2905504174655244 1.352525522576194 3.5240774606967213 0.4164968643411749 10
