In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import seaborn as sns

In [2]:
# Read in data: features & labels
df_features_dict = {}
df_labels_dict = {}

num_feature_files = 15
num_label_files = 15

for i in range(1, num_feature_files):
    df_features_dict[i] = pd.read_csv(f'lcof_hourly_features_{i}.csv')

for i in range(1, num_label_files):
    df_labels_dict[i] = pd.read_csv(f'lcof_hourly_labels_{i}.csv')

In [3]:
df_features_frames = [df_features_dict[1], df_features_dict[2], df_features_dict[3],
                      df_features_dict[4], df_features_dict[5], df_features_dict[6],
                      df_features_dict[7], df_features_dict[8], df_features_dict[9],
                      df_features_dict[10], df_features_dict[11], df_features_dict[12],
                      df_features_dict[13], df_features_dict[14]
                     ]

df_labels_frames = [df_labels_dict[1], df_labels_dict[2], df_labels_dict[3],
                    df_labels_dict[4], df_labels_dict[5], df_labels_dict[6],
                    df_labels_dict[7], df_labels_dict[8], df_labels_dict[9],
                    df_labels_dict[10], df_labels_dict[11], df_labels_dict[12],
                    df_labels_dict[13], df_labels_dict[14]
                   ]

df_features = pd.concat(df_features_frames, ignore_index=True)
df_labels = pd.concat(df_labels_frames, ignore_index=True)

df_features.drop('num_plans', axis=1, inplace=True)
df_labels.drop('num_plans', axis=1, inplace=True)

df = pd.merge(df_features, df_labels,
              how='inner',
              left_on=['datestr', 'city_id', 'hour_of_day'],
              right_on=['datestr', 'city_id', 'hour_of_day']
             )

In [4]:
# Filter out outliers
df_clean = df[df['hour_gb_gamma_95'] < 3].reset_index(drop=True)
print(f"The original dataset size = {len(df)}; the new dataset size = {len(df_clean)}")

The original dataset size = 20933; the new dataset size = 20509


In [5]:
# Labels
gb_100 = df_clean['hour_gb_gamma_100']
gb_99 = df_clean['hour_gb_gamma_99']
gb_95 = df_clean['hour_gb_gamma_95']
gb_90 = df_clean['hour_gb_gamma_90']

fare_100 = df_clean['hour_fare_gamma_100']
fare_99 = df_clean['hour_fare_gamma_99']
fare_95 = df_clean['hour_fare_gamma_95']
fare_90 = df_clean['hour_fare_gamma_90']

y = gb_95

In [6]:
# Features
X = df_clean.loc[:, ['market_log_cr',
                     'market_log_eta',
                     'market_log_fare_p50_scale',
                     'market_surge'
                    ]
                ]

In [7]:
# Training & Test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=116)

In [8]:
def calculate_loss(x1, x2, x3, x, y, b0, b11, b12, b2, b31, b32):
    alpha = b31*(1 - 1/np.power(np.maximum(x, 1), b32))
    beta = b11/np.power(np.maximum(x, 1), b12)
    
    return np.mean(np.power(b0 + beta*x1 + b2*x2 + alpha*x3 - y, 2) / 2)

In [13]:
def grad(x1, x2, x3, x, y, b0, b11, b12, b2, b31, b32):
    alpha = b31*(1 - 1/np.power(np.maximum(x, 1), b32))
    beta = b11/np.power(np.maximum(x, 1), b12)
    re = b0 + beta*x1 + b2*x2 + alpha*x3 - y
    
    db0 = np.mean(re)
    db11 = np.mean(re * x1 * 1/np.power(np.maximum(x, 1), b12))
    db12 = np.mean(re * x1 * b11 * (-b12) * np.power(np.maximum(x, 1), -b12-1))
    db2 = np.mean(re * x2)
    db31 = np.mean(re * x3 * (1 - 1/np.power(np.maximum(x, 1), b32)))
    db32 = np.mean(re * x3 * b31 * b32 * np.power(np.maximum(x, 1), -b32-1))
    
    return db0, db11, db12, db2, db31, db32

In [14]:
def main(n_iter=20, step_size=400, lr=0.01, tol=1e-03):
    N = len(X_train)
    num_iters = N // step_size
    
    b0 = 0.0
    b11 = 1.0
    b12 = 1.0
    b2 = 2.0
    b31 = 1.0
    b32 = 1.0
    
    train_losses = []
    test_losses = []
    
    best_b0 = b0
    best_b11 = b11
    best_b12 = b12
    best_b2 = b2
    best_b31 = b31
    best_b32 = b32
    
    best_test_loss = np.inf
    best_iter = 0
    
    for it in range(n_iter):
        print(f"We are in iteration: {it+1} ...")
        
        # Reshuffle
        ids = list(range(N))
        np.random.shuffle(ids)
        
        X_train_ids = X_train.reset_index(drop=True)
        y_train_ids = y_train.reset_index(drop=True)
        
        train_loss = 0
        for idx in range(num_iters):
            y = y_train_ids[ids][idx*step_size:(idx+1)*step_size]
            x1 = X_train_ids['market_log_cr'][ids][idx*step_size:(idx+1)*step_size]
            x2 = X_train_ids['market_log_eta'][ids][idx*step_size:(idx+1)*step_size]
            x3 = X_train_ids['market_log_fare_p50_scale'][ids][idx*step_size:(idx+1)*step_size]
            x = X_train_ids['market_surge'][ids][idx*step_size:(idx+1)*step_size]
            
            cur_loss = calculate_loss(x1, x2, x3, x, y, b0, b11, b12, b2, b31, b32)
            train_loss += cur_loss
            
            db0, db11, db12, db2, db31, db32 = grad(x1, x2, x3, x, y, b0, b11, b12, b2, b31, b32)
            
            if np.all(np.abs([lr*db0, lr*db11, lr*db12, lr*db2, lr*db31, lr*db32]) <= tol):
                break
            
            b0 -= lr * db0
            b11 -= lr * db11
            b12 -= lr * db12
            b2 -= lr * db2
            b31 -= lr * db31
            b32 -= lr * db32
        
        train_loss /= num_iters
        print(f"Train Loss = {train_loss}")
        
        train_losses.append(train_loss)
        
        # Test loss
        y = y_test
        x1 = X_test['market_log_cr']
        x2 = X_test['market_log_eta']
        x3 = X_test['market_log_fare_p50_scale']
        x = X_test['market_surge']
        test_loss = calculate_loss(x1, x2, x3, x, y, b0, b11, b12, b2, b31, b32)
        
        if test_loss < best_test_loss:
            best_test_loss = test_loss
            best_iter = it

            best_b0 = b0
            best_b11 = b11
            best_b12 = b12
            best_b2 = b2
            best_b31 = b31
            best_b32 = b32
        
        print(f"Test Loss = {test_loss}")
        
        test_losses.append(test_loss)
    
    return best_b0, best_b11, best_b12, best_b2, best_b31, best_b32, best_test_loss, best_iter

In [23]:
best_b0, best_b11, best_b12, best_b2, best_b31, best_b32, best_test_loss, best_iter = main(n_iter=2000, lr=0.01, step_size=400)

We are in iteration: 1 ...
Train Loss = 2.196211788920333
Test Loss = 1.2632934434001482
We are in iteration: 2 ...
Train Loss = 0.8605808501630761
Test Loss = 0.5793443860269197
We are in iteration: 3 ...
Train Loss = 0.45891854957540495
Test Loss = 0.35936112502767736
We are in iteration: 4 ...
Train Loss = 0.3203958756400182
Test Loss = 0.27339590698626925
We are in iteration: 5 ...
Train Loss = 0.2602647690927996
Test Loss = 0.23066359177513537
We are in iteration: 6 ...
Train Loss = 0.22636910207335578
Test Loss = 0.20435945744090153
We are in iteration: 7 ...
Train Loss = 0.10246801065709642
Test Loss = 0.19497857926506157
We are in iteration: 8 ...
Train Loss = 0.09465493277912754
Test Loss = 0.18713073510880537
We are in iteration: 9 ...
Train Loss = 0.05248590336525448
Test Loss = 0.1832196160794714
We are in iteration: 10 ...
Train Loss = 0.029315432387780758
Test Loss = 0.18100912294847893
We are in iteration: 11 ...
Train Loss = 0.11650997652315032
Test Loss = 0.17240821718

In [24]:
print(best_b0, best_b11, best_b12, best_b2, best_b31, best_b32, best_test_loss, best_iter)
# gamma = 0.95, fare_scaling = p50
# --- 1.458824434692779 0.1378199346506436 1.5828221397460458 0.45629890757317865 0.48547254835845294 2.282356236896841 0.12949462273083212 99
# --- 1.4135417321207238 0.11480805785644783 1.5866990696016756 0.36829481361693556 0.4244928389797222 2.26138289753076 0.11643209238158753 199
# --- 1.3735669567444788 0.09587156904088037 1.5898245566037503 0.2745566847033418 0.36795355632064053 2.330766194468109 0.10588613217723643 498
# --- 1.2830555402855435 0.05683060753942558 1.5939593373090202 0.10499589032795699 0.26664410499641805 2.4780156757025043 0.09002443062171045 1997

1.2830555402855435 0.05683060753942558 1.5939593373090202 0.10499589032795699 0.26664410499641805 2.4780156757025043 0.09002443062171045 1997
