In [1]:
%%bash
source $VIRTUAL_ENV_DIR/python3/bin/activate
 
# Install the latest dataclasses and matching-ds-tools releases into the activated virtualenv
# (to pin a version, use the syntax: pip install <package>==<version>)
pip install dataclasses
pip install matching-ds-tools
 
deactivate

Looking in indexes: https://yoober11:****@pypi.uberinternal.com/index
Looking in indexes: https://yoober11:****@pypi.uberinternal.com/index


In [2]:
import json
import datetime
import re

# Root logger at INFO so library log records are shown in cell output.
import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Query-runner client plus the identity constants that are also passed to the data fetcher below.
from queryrunner_client import Client
USER_EMAIL = 'thai@uber.com'
qclient = Client(user_email=USER_EMAIL)
CONSUMER_NAME = 'intelligentdispatch'

import os
import warnings
# NOTE(review): this silences ALL warnings for the whole notebook, including
# pandas' SettingWithCopyWarning - consider narrowing the filter.
warnings.filterwarnings('ignore')
import multiprocessing
from joblib import Parallel, delayed
#num_cores = multiprocessing.cpu_count()  -- 48
# Worker count intended for joblib; not referenced by the cells visible in this notebook.
n_cores = 4

In [3]:
from dataclasses import dataclass
import itertools
from typing import *
import numpy as np
import pandas as pd
from queryrunner_client import Client as QRClient
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.optimize import linear_sum_assignment

In [4]:
import mdstk
from mdstk.data_fetcher.data_fetcher import DataFetcher
from mdstk.data_fetcher.cached_data_fetcher import CachedDataFetcher

In [5]:
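# Data collection: Presto SQL template. The {datestr}, {city_id}, {vvid} and {digits}
# placeholders are filled in with str.format by Query.__post_init__.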

QUERY = """
with dispatch as (
select 
    datestr,
    msg.cityid,
    msg.ctplangenrequestuuid as plangen_uuid,
    msg.ctrequestuuid as scan_uuid,
    j as job_uuid,
    msg.supplyuuid,
    msg.planactiontype
from 
    rawdata_user.kafka_hp_multileg_dispatched_plan_nodedup
cross join 
    unnest(msg.jobuuid) jobs(j)
where 
    datestr = '{datestr}'
    and msg.cityid = {city_id}
    and msg.vehicleviewid = {vvid} 
    and msg.tenancy = 'uber/production'
    and CARDINALITY(msg.jobuuid) > 0
    and substr(msg.ctrequestuuid, 1, length('{digits}')) = '{digits}'
),
plangen as (
select 
    msg.scanuuid as plangen_uuid, 
    p.uuid as job_uuid,
    j.supplyuuid
from 
    rawdata_user.kafka_hp_multileg_matching_observability_proposals_v2_nodedup
cross join 
    unnest(msg.proposals) as job(j)
cross join 
    unnest(j.jobs) as plan(p)
where 
    datestr = '{datestr}'
    and msg.cityid = {city_id}
    and msg.flowtype = 'solo_batch'
    and msg.tenancy = 'uber/production'
    and j.status = 'eligible'
),
mgv as (
select datestr,
    msg.city_id,
    msg.job_uuid,
    msg.client_uuid,
    msg.ct_request_uuid as plangen_uuid,
    msg.supply_uuid,
    msg.supply_plan_uuid as plan_uuid,
    msg.unadjusted_eta as eta,
    (CASE
      WHEN msg.adjustedeta > 1500 THEN 1500.0
      WHEN msg.adjustedeta < 0 THEN 0.0
      ELSE msg.adjustedeta
    END) as adjustedeta,
    round(msg.job_surge, 4) as surge_mul,
    round(msg.eventual_completion_probability, 4) as eventual_comp_prob,
    msg.ranking_metric,
    round(1 - msg.solo_cancel_model_driver_accept_prob, 4) as d_proba,
    round(1 - msg.solo_cancel_model_rider_accept_prob, 4) as r_proba,
    round(1 - msg.spinner_survive_prob_before_next_scan, 4) as s_proba,
    msg.preferred_destination_adjustment,
    msg.objective_value as of_value,
    msg.inconvenience_etd - msg.ranking_metric as trip_length
from   
    rawdata.kafka_hp_multileg_mgv_log_nodedup
where  
    datestr = '{datestr}'
    and msg.city_id = {city_id}
    and msg.tenancy = 'uber/production'
    and msg.vehicle_view_id = {vvid} 
    and msg.flow_type = 'solo_batch'
    and msg.job_uuid <> msg.client_uuid
    and msg.calculator_type = 'markov_eta_v2'
),
test as (
select 
    mgv.datestr,
    mgv.city_id,
    dispatch.scan_uuid,
    mgv.plangen_uuid,
    mgv.job_uuid,
    dispatch.planactiontype,
    mgv.supply_uuid,
    case when dispatch.supplyuuid = mgv.supply_uuid then 1 else 0 end as is_selected,
    mgv.eta,
    mgv.adjustedeta,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1), 4) as eta_one,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.05), 4) as eta_one_five,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.10), 4) as eta_one_ten,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.15), 4) as eta_one_fifteen,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.20), 4) as eta_one_twenty,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.25), 4) as eta_one_quarter,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.30), 4) as eta_one_thirty,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.35), 4) as eta_one_thirty_five,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.40), 4) as eta_one_forty,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.45), 4) as eta_one_forty_five,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.50), 4) as eta_one_fifty,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.55), 4) as eta_one_fifty_five,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.60), 4) as eta_one_sixty,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.65), 4) as eta_one_sixty_five,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.70), 4) as eta_one_seventy,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.75), 4) as eta_one_seventy_five,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.80), 4) as eta_one_eighty,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.85), 4) as eta_one_eighty_five,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.90), 4) as eta_one_ninety,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.95), 4) as eta_one_ninety_five,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 2), 4) as eta_square,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 3), 4) as eta_cube,
    mgv.surge_mul,
    mgv.eventual_comp_prob,
    round(1.0 / (1.0 + POWER(mgv.surge_mul, 2)), 4) as network_contention_2,
    round(1.0 / (1.0 + POWER(mgv.surge_mul, 3)), 4) as network_contention_3,
    round(1.0 / (1.0 + POWER(mgv.surge_mul, 5)), 4) as network_contention_5,
    mgv.ranking_metric,
    mgv.d_proba,
    mgv.r_proba,
    mgv.s_proba,
    round((1.0 - mgv.d_proba) * (1.0 - mgv.r_proba) * (1.0 - mgv.s_proba) + mgv.eventual_comp_prob * mgv.d_proba, 4) as cr_ratio,
    round((1.0 - mgv.d_proba) * (1.0 - mgv.r_proba) + mgv.eventual_comp_prob * mgv.d_proba, 4) as crof_ratio,
    mgv.preferred_destination_adjustment,
    mgv.of_value,
    mgv.trip_length,
    fare.est_rider_quoted_final_fare as fare,
    fare.est_rider_quoted_final_fare * 1.0 / fare.usd_fx_rate as fare_usd
from
    mgv
join
    plangen
on 
    mgv.plangen_uuid = plangen.plangen_uuid
    and mgv.job_uuid = plangen.job_uuid
    and mgv.supply_uuid = plangen.supplyuuid
join
    dispatch
on
    mgv.plangen_uuid = dispatch.plangen_uuid
    and mgv.job_uuid = dispatch.job_uuid
join
    dwh.fact_trip_fare fare 
on
    mgv.job_uuid = fare.trip_uuid
    and fare.datestr = '{datestr}'
    and fare.city_id = {city_id}
)
select * from test
"""

In [6]:
@dataclass
class Query:
    """One parameterised instance of the QUERY template.

    After construction the instance additionally exposes:
      * ``name`` - an identifier built from every field (used as the cache key
        by the notebook).
      * ``qry``  - the QUERY text with its placeholders substituted.
    """
    prefix: str
    hex_digits: str
    city_id: int
    vvid: str
    datestr: str

    def __post_init__(self):
        # Map each template placeholder to the field that supplies it.
        substitutions = {
            'city_id': self.city_id,
            'vvid': self.vvid,
            'digits': self.hex_digits,
            'datestr': self.datestr,
        }
        self.qry = QUERY.format(**substitutions)
        self.name = f'{self.prefix}_city{self.city_id}_{self.vvid}_{self.datestr}_segment{self.hex_digits}'

In [7]:
class MyDataFetcher(DataFetcher):
    """DataFetcher subclass whose only override forwards straight to the parent.

    It adds no behaviour of its own in this notebook; it exists as a hook point
    where query_many_presto could be customised.
    """
    def query_many_presto(self, *args, **kwargs):
        # Pass-through: every positional and keyword argument goes unchanged to
        # DataFetcher.query_many_presto and its result is returned as-is.
        return super().query_many_presto(*args, **kwargs)

In [8]:
# Calculate new objective function
def clean_df(df):
    """Return a cleaned copy of the raw replay frame; the input is not modified.

    Steps, in order:
      1. keep only rows whose 'fare' is not null;
      2. raise every 'trip_length' that is <= 100 up to 100;
      3. keep the first row for each (job_uuid, supply_uuid) pair;
      4. drop any row that still contains a null in any column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'fare', 'trip_length', 'job_uuid' and
        'supply_uuid'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the original index labels of the surviving rows are kept.
    """
    # Explicit copy so the assignment below writes to a frame we own. The previous
    # chained form `df['trip_length'][mask] = 100` wrote through an intermediate
    # object, which pandas may silently discard (and does under Copy-on-Write).
    cleaned = df[df['fare'].notnull()].copy()
    cleaned.loc[cleaned['trip_length'] <= 100, 'trip_length'] = 100
    cleaned = cleaned.drop_duplicates(subset=['job_uuid', 'supply_uuid'])
    return cleaned.dropna()

def compute_new_of(df):
    """Add the candidate objective-function column 'new_of' to ``df`` and return it.

    The column is assigned directly on the frame that is passed in (no copy is
    made), so the caller's frame gains the column as well.

    The active formula is the "gamma = 1.00 - with Intercept - MAIN II" variant
    that uses the local-currency 'fare' column. It reads the columns 'd_proba',
    'eventual_comp_prob', 'eta_one', 'eta_one_quarter', 'cr_ratio',
    'network_contention_2', 'network_contention_5', 'fare' and 'surge_mul'.
    The whole expression is negated; solve_dict feeds the column to
    linear_sum_assignment as a cost to minimise.

    Everything that is commented out below is an experiment log: alternative
    formulas that were tried, each followed by a metrics dict with the same keys
    that solve_all_dict returns (presumably the result recorded for that
    variant - the dict after the active formula matches the SCA output shown
    further down in this notebook). To re-run a variant, comment out the active
    assignment and uncomment the variant.
    """
    
#     # Baseline (Markov)

# {'total_jobs': 6076,
#  'match_rate': 0.945,
#  'overwrite': 0.0,
#  'Average Matched ETA': 487.91,
#  'P90 Matched ETA': 1122.0,
#  'Driver AR': 0.496,
#  'Rider cancel': 0.154,
#  'Average trip length': 829.1,
#  'Average Matched Fare': 16.0,
#  'Total GB': 38381}

#     # EFOF
#     df['new_of'] = - df['eta_square'] * df['cr_ratio'] * df['fare']

# {'total_jobs': 6076,
#  'match_rate': 0.985,
#  'overwrite': 0.164,
#  'Average Matched ETA': 531.9,
#  'P90 Matched ETA': 1219.8,
#  'Driver AR': 0.493,
#  'Rider cancel': 0.171,
#  'Average trip length': 835.03,
#  'Average Matched Fare': 16.1,
#  'Total GB': 39531}

#     # CROF
#     df['new_of'] = - df['eta_square'] * df['crof_ratio']

# {'total_jobs': 6076,
#  'match_rate': 0.984,
#  'overwrite': 0.134,
#  'Average Matched ETA': 530.09,
#  'P90 Matched ETA': 1217.0,
#  'Driver AR': 0.493,
#  'Rider cancel': 0.169,
#  'Average trip length': 833.92,
#  'Average Matched Fare': 16.08,
#  'Total GB': 39573}

#     # WCOF
#     df['new_of'] = - (df['network_contention_5'] * df['eta_square'] * df['crof_ratio'] \
#                       + (1 - df['network_contention_5']) * df['eta_square'] * df['crof_ratio'] * df['fare'] / 10.0
#                      )
    
# {'total_jobs': 6076,
#  'match_rate': 0.984,
#  'overwrite': 0.155,
#  'Average Matched ETA': 531.1,
#  'P90 Matched ETA': 1217.1,
#  'Driver AR': 0.493,
#  'Rider cancel': 0.17,
#  'Average trip length': 834.36,
#  'Average Matched Fare': 16.09,
#  'Total GB': 39525}

#     # WCOF - Latest Version
#     df['new_of'] = - (df['eta_square'] * df['crof_ratio'] / df['surge_mul']**36 \
#                       + (1 - 1/df['surge_mul']**36) * df['eta_square'] * df['crof_ratio'] * df['fare'] / 18.0
#                      )
    
# {'total_jobs': 6076,
#  'match_rate': 0.985,
#  'overwrite': 0.157,
#  'Average Matched ETA': 531.62,
#  'P90 Matched ETA': 1221.6,
#  'Driver AR': 0.493,
#  'Rider cancel': 0.17,
#  'Average trip length': 834.03,
#  'Average Matched Fare': 16.08,
#  'Total GB': 39508}


############################################
#             GUB as label                 #
#              Unit: USD                   #
############################################
#     # gamma = 1.00 - MAIN I
#     df['new_of'] = - (0.4019 * df['d_proba'] \
#                       - 0.9627 * df['eventual_comp_prob'] \
#                       - 1.3453 * df['eta_one'] * df['cr_ratio'] \
#                       + 0.6210 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare_usd'] \
#                       - 0.6435 * df['eta_one_quarter'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare_usd'] \
#                       - 1.1098 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare_usd'] / df['surge_mul'] \
#                       + 4.1085 * df['eta_one'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare_usd'] / df['surge_mul']
#                      )

# {'total_jobs': 6076,
#  'match_rate': 0.978,
#  'overwrite': 0.187,
#  'Average Matched ETA': 528.79,
#  'P90 Matched ETA': 1203.6,
#  'Driver AR': 0.5,
#  'Rider cancel': 0.169,
#  'Average trip length': 834.69,
#  'Average Matched Fare': 16.12,
#  'Total GB': 40149}


#     # gamma = 1.00 - with Intercept - MAIN I
#     df['new_of'] = - (0.4019 * df['d_proba'] \
#                       - 0.9627 * df['eventual_comp_prob'] \
#                       - 1.3453 * df['eta_one'] * df['cr_ratio'] \
#                       + 0.6210 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare_usd'] \
#                       - 0.6435 * df['eta_one_quarter'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare_usd'] \
#                       - 1.1098 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare_usd'] / df['surge_mul'] \
#                       + 4.1085 * df['eta_one'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare_usd'] / df['surge_mul'] \
#                       + 1.3591
#                      )

# {'total_jobs': 6076,
#  'match_rate': 0.988,
#  'overwrite': 0.197,
#  'Average Matched ETA': 539.94,
#  'P90 Matched ETA': 1231.0,
#  'Driver AR': 0.501,
#  'Rider cancel': 0.172,
#  'Average trip length': 835.1,
#  'Average Matched Fare': 16.1,
#  'Total GB': 40449}

#     # gamma = 1.00 - with Intercept - MAIN II
#     df['new_of'] = - (0.4019 * df['d_proba'] \
#                       - 0.9627 * df['eventual_comp_prob'] \
#                       - 1.3453 * df['eta_one'] * df['cr_ratio'] \
#                       + 0.6210 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare_usd'] \
#                       - 0.6435 * df['eta_one_quarter'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare_usd'] \
#                       - 1.1098 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare_usd'] / df['surge_mul'] \
#                       + 4.1085 * df['eta_one'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare_usd'] / df['surge_mul'] \
#                       + 0.15
#                      )

# {'total_jobs': 6076,
#  'match_rate': 0.981,
#  'overwrite': 0.189,
#  'Average Matched ETA': 531.46,
#  'P90 Matched ETA': 1212.0,
#  'Driver AR': 0.5,
#  'Rider cancel': 0.17,
#  'Average trip length': 834.25,
#  'Average Matched Fare': 16.1,
#  'Total GB': 40195}

    # ACTIVE VARIANT
    # gamma = 1.00 - with Intercept - MAIN II - Use local currency with a fixed exchange rate (which is ~ 1 at the median value)
    # Same coefficients as the MAIN II variant above, but with 'fare' in place of 'fare_usd'.
    df['new_of'] = - (0.4019 * df['d_proba'] \
                      - 0.9627 * df['eventual_comp_prob'] \
                      - 1.3453 * df['eta_one'] * df['cr_ratio'] \
                      + 0.6210 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] \
                      - 0.6435 * df['eta_one_quarter'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] \
                      - 1.1098 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] / df['surge_mul'] \
                      + 4.1085 * df['eta_one'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] / df['surge_mul'] \
                      + 0.15
                     )

# {'total_jobs': 6076,
#  'match_rate': 0.981,
#  'overwrite': 0.19,
#  'Average Matched ETA': 531.33,
#  'P90 Matched ETA': 1212.0,
#  'Driver AR': 0.501,
#  'Rider cancel': 0.17,
#  'Average trip length': 834.63,
#  'Average Matched Fare': 16.12,
#  'Total GB': 40213}


#     df['new_of'] = - (0.4019 * df['d_proba'] \
#                       - 0.9627 * df['eventual_comp_prob'] \
#                       - 1.3453 * df['eta_one'] * df['cr_ratio'] \
#                       + 0.6210 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] / 10.0 \
#                       - 0.6435 * df['eta_one_quarter'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] \
#                       - 1.1098 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] / 10.0 / df['surge_mul'] \
#                       + 4.1085 * df['eta_one'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] / 10.0 / df['surge_mul'] \
#                       + 1.3591
#                      )

#     # gamma = 1.00 - with Intercept - MAIN III - Use local currency with a fixed exchange rate (which is ~ 1 at the median value)
#     df['new_of'] = - (0.3933 * df['d_proba'] \
#                       - 0.9410 * df['eventual_comp_prob'] \
#                       - 1.4449 * df['eta_one'] * df['cr_ratio'] \
#                       + 0.5347 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] \
#                       - 0.5527 * df['eta_one_quarter'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] \
#                       + 3.0118 * df['eta_one'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] / df['surge_mul'] \
#                       + 0.15
#                      )
# {'total_jobs': 6076,
#  'match_rate': 0.981,
#  'overwrite': 0.191,
#  'Average Matched ETA': 531.99,
#  'P90 Matched ETA': 1212.9,
#  'Driver AR': 0.501,
#  'Rider cancel': 0.17,
#  'Average trip length': 834.37,
#  'Average Matched Fare': 16.1,
#  'Total GB': 40218}


# #     # gamma = 1.00 - with Intercept - MAIN IV - Use local currency with a fixed exchange rate (which is ~ 1 at the median value)
# #     # Scale features and labels by mean(labels)
#     df['new_of'] = - (0.3754 * df['d_proba'] \
#                       - 1.9908 * df['eta_one'] * df['cr_ratio'] \
#                       + 0.0481 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] \
#                       - 0.0983 * df['eta_cube'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] \
#                       + 3.1308 * df['eta_one_fifty'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] / df['surge_mul']
#                      )

#     # gamma = 0.99 - MAIN I
#     df['new_of'] = - (0.4044 * df['d_proba'] \
#                       - 0.8661 * df['eventual_comp_prob'] \
#                       - 0.7857 * df['eta_one'] * df['cr_ratio'] \
#                       + 0.3434 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] \
#                       - 0.7348 * df['eta_square'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] \
#                       + 0.3817 * df['eta_cube'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] \
#                       + 2.2700 * df['eta_one'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] / df['surge_mul'] \
#                       + 0.12
#                      )

#     # gamma = 0.99 - MAIN I - Use local currency with a fixed exchange rate (which is ~ 1 at the median value)
#     df['new_of'] = - (0.4044 * df['d_proba'] \
#                       - 0.8661 * df['eventual_comp_prob'] \
#                       - 0.7857 * df['eta_one'] * df['cr_ratio'] \
#                       + 0.3434 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] \
#                       - 0.7348 * df['eta_square'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] \
#                       + 0.3817 * df['eta_cube'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] \
#                       + 2.2700 * df['eta_one'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] / df['surge_mul'] \
#                       + 0.12
#                      )

# {'total_jobs': 6076,
#  'match_rate': 0.981,
#  'overwrite': 0.19,
#  'Average Matched ETA': 531.63,
#  'P90 Matched ETA': 1213.0,
#  'Driver AR': 0.499,
#  'Rider cancel': 0.17,
#  'Average trip length': 834.36,
#  'Average Matched Fare': 16.1,
#  'Total GB': 40147}

#     # gamma = 0.95 - MAIN I
#     df['new_of'] = - (0.2490 * df['d_proba'] \
#                       - 0.4376 * df['eventual_comp_prob'] \
#                       - 0.2893 * df['eta_one'] * df['cr_ratio'] \
#                       + 0.0160 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare_usd'] \
#                       - 0.0296 * df['eta_cube'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare_usd'] \
#                       + 0.9739 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare_usd'] / df['surge_mul'] \
#                       + 0.05
#                      )
    
# {'total_jobs': 6076,
#  'match_rate': 0.981,
#  'overwrite': 0.187,
#  'Average Matched ETA': 530.77,
#  'P90 Matched ETA': 1212.0,
#  'Driver AR': 0.494,
#  'Rider cancel': 0.17,
#  'Average trip length': 833.81,
#  'Average Matched Fare': 16.1,
#  'Total GB': 39846}

#     # gamma = 0.95 - MAIN I - Use local currency with a fixed exchange rate (which is ~ 1 at the median value)
#     df['new_of'] = - (0.2490 * df['d_proba'] \
#                       - 0.4376 * df['eventual_comp_prob'] \
#                       - 0.2893 * df['eta_one'] * df['cr_ratio'] \
#                       + 0.0160 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] \
#                       - 0.0296 * df['eta_cube'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] \
#                       + 0.9739 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] / df['surge_mul'] \
#                       + 0.05
#                      )

# {'total_jobs': 6076,
#  'match_rate': 0.981,
#  'overwrite': 0.189,
#  'Average Matched ETA': 531.08,
#  'P90 Matched ETA': 1212.1,
#  'Driver AR': 0.495,
#  'Rider cancel': 0.17,
#  'Average trip length': 833.83,
#  'Average Matched Fare': 16.1,
#  'Total GB': 39917}

#     # gamma = 0.90 - MAIN I
#     df['new_of'] = - (0.1968 * df['d_proba'] \
#                       - 0.3421 * df['eventual_comp_prob'] \
#                       + 0.0102 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare_usd'] \
#                       - 0.0194 * df['eta_cube'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare_usd'] \
#                       + 0.5001 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare_usd'] / df['surge_mul'] \
#                       + 0.05
#                      )
    
# {'total_jobs': 6076,
#  'match_rate': 0.981,
#  'overwrite': 0.19,
#  'Average Matched ETA': 530.24,
#  'P90 Matched ETA': 1211.0,
#  'Driver AR': 0.489,
#  'Rider cancel': 0.17,
#  'Average trip length': 833.39,
#  'Average Matched Fare': 16.09,
#  'Total GB': 39527}

#     # gamma = 0.90 - MAIN I - Use local currency with a fixed exchange rate (which is ~ 1 at the median value)
#     df['new_of'] = - (0.1968 * df['d_proba'] \
#                       - 0.3421 * df['eventual_comp_prob'] \
#                       + 0.0102 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] \
#                       - 0.0194 * df['eta_cube'] * (1 - df['network_contention_5']) * df['cr_ratio'] * df['fare'] \
#                       + 0.5001 * df['eta_one'] * (1 - df['network_contention_2']) * df['cr_ratio'] * df['fare'] / df['surge_mul']
#                       + 0.05
#                      )

# {'total_jobs': 6076,
#  'match_rate': 0.981,
#  'overwrite': 0.191,
#  'Average Matched ETA': 530.76,
#  'P90 Matched ETA': 1212.0,
#  'Driver AR': 0.49,
#  'Rider cancel': 0.17,
#  'Average trip length': 833.4,
#  'Average Matched Fare': 16.09,
#  'Total GB': 39637}

    return df


In [9]:
# local solver
def solve_dict(
    scan: dict, 
    cost_col: str, 
    job_singleton: float = 1500,
    infinity: float = 1000000
):
    """Solve one scan as a minimum-cost job-to-supply assignment.

    A cost matrix is built with one row per job and one column per supply,
    plus one extra "stay unmatched" column per job. linear_sum_assignment then
    picks exactly one column for every job so that the total cost is minimal.

    Parameters
    ----------
    scan : dict
        Maps a key whose first element is the job id and whose second element
        is the supply id to a row dict; ``row[cost_col]`` is the pair's cost.
    cost_col : str
        Name of the entry in each row dict to use as the cost.
    job_singleton : float
        Cost of leaving a job unmatched (its own dedicated column).
    infinity : float
        Cost given to every (job, supply) pair that is absent from ``scan``.

    Returns
    -------
    set
        One tuple per job: ``(job, supply)`` if matched, ``(job,)`` otherwise.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. The previous
    # list(set(...)) ordered rows/columns by string hash, which changes between
    # interpreter runs and made tie-breaking in the solver non-reproducible.
    job_list = list(dict.fromkeys(k[0] for k in scan))
    supply_list = list(dict.fromkeys(k[1] for k in scan))
    job_idx = {j: i for i, j in enumerate(job_list)}
    supply_idx = {s: i for i, s in enumerate(supply_list)}
    supply_count = len(supply_list)

    # Columns [0, supply_count) are real supplies; column supply_count + i is
    # job i's private "unmatched" slot. float32 is kept from the original code.
    utility = np.full((len(job_list), supply_count + len(job_list)), infinity, dtype=np.float32)
    for k, row in scan.items():
        utility[job_idx[k[0]], supply_idx[k[1]]] = row[cost_col]
    for i in range(len(job_list)):
        utility[i, supply_count + i] = job_singleton

    # solve
    job_sol, supply_sol = linear_sum_assignment(utility)

    result = set()
    for jidx, sidx in zip(job_sol, supply_sol):
        j = job_list[jidx]
        if sidx >= supply_count:
            # Assigned to a singleton column -> the job stays unmatched.
            result.add((j,))
        else:
            result.add((j, supply_list[sidx]))

    # Every job must appear exactly once in the solution.
    assert len(result) == len(job_list)
    return result

In [10]:
from dataclasses import field

@dataclass
class ScanMetrics:
    """Running totals of matching metrics, accumulated over one or more scans.

    Instances combine with ``+`` (returns a new object) and ``+=`` (updates the
    left operand in place); every numeric field is summed and ``list_etas`` is
    concatenated.
    """
    total_jobs: int = 0          # jobs in the solved matching(s)
    total_eta: float = 0.        # sum of row['eta'] over matched pairs
    total_offer: float = 0.      # number of matched (job, supply) pairs
    total_ar: float = 0.         # sum of (1 - d_proba) over matched pairs
    total_rc: float = 0.         # sum of r_proba over matched pairs
    total_trip: float = 0.       # sum of trip_length over matched pairs
    total_gb: float = 0.         # sum of (1 - d_proba) * (1 - r_proba) * fare_usd
    total_fare: float = 0.       # sum of fare_usd over matched pairs
    total_overwrite: int = 0     # matches that differ from the reference solution
    list_etas: list = field(default_factory = list)  # individual matched ETAs (for percentiles)
    
    def __add__(self, o: 'ScanMetrics') -> 'ScanMetrics':
        # Keyword arguments keep each sum attached to its own field. The earlier
        # positional call passed overwrite/gb/fare in the wrong slots and used
        # the non-existent list.expand, so `a + b` could never succeed.
        return ScanMetrics(
            total_jobs=self.total_jobs + o.total_jobs,
            total_eta=self.total_eta + o.total_eta,
            total_offer=self.total_offer + o.total_offer,
            total_ar=self.total_ar + o.total_ar,
            total_rc=self.total_rc + o.total_rc,
            total_trip=self.total_trip + o.total_trip,
            total_gb=self.total_gb + o.total_gb,
            total_fare=self.total_fare + o.total_fare,
            total_overwrite=self.total_overwrite + o.total_overwrite,
            # list + list builds a new list, so neither operand is mutated.
            list_etas=self.list_etas + o.list_etas,
        )

    def __iadd__(self, o: 'ScanMetrics') -> 'ScanMetrics':
        self.total_jobs += o.total_jobs
        self.total_eta += o.total_eta
        self.total_offer += o.total_offer
        self.total_ar += o.total_ar
        self.total_rc += o.total_rc
        self.total_trip += o.total_trip
        self.total_overwrite += o.total_overwrite
        self.total_gb += o.total_gb
        self.total_fare += o.total_fare
        # In-place extend of this instance's own list.
        self.list_etas += o.list_etas
        
        return self

In [11]:
# Metric Summary
def metric_summary_dict(
    scan_dict: Dict[str, Dict[str, Any]],
    matching: set, 
    overwrite: int,
) -> ScanMetrics:
    """Summarise one solved scan as a ScanMetrics object.

    ``scan_dict`` is looked up with ``(job, supply)`` tuples taken from
    ``matching``; one-element tuples in ``matching`` (unmatched jobs) only count
    towards ``total_jobs``. ``overwrite`` is stored as ``total_overwrite``.

    Trip length is only added when it is below 7200, and the fare / GB totals
    only when ``fare_usd`` is positive; the pair still counts as an offer.
    """
    summary = ScanMetrics()
    summary.total_jobs = len(matching)
    summary.total_overwrite = overwrite

    for match in matching:
        # Skip unmatched jobs, which are represented as 1-tuples.
        if len(match) != 2:
            continue

        row = scan_dict[(match[0], match[1])]
        eta = row['eta']
        driver_accept = 1 - row['d_proba']

        summary.total_offer += 1
        summary.total_eta += eta
        summary.total_ar += driver_accept
        summary.total_rc += row['r_proba']

        trip_length = row['trip_length']
        if trip_length < 7200:
            summary.total_trip += trip_length

        fare_usd = row['fare_usd']
        if fare_usd > 0:
            summary.total_gb += driver_accept * (1 - row['r_proba']) * fare_usd
            summary.total_fare += fare_usd

        summary.list_etas.append(eta)

    return summary

def solve_all_dict(df, solver: Callable[[dict], set]):
    """Solve every scan in ``df`` with ``solver`` and report aggregate metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows are grouped by 'scan_uuid'; within a group each row is keyed by
        ('job_uuid', 'supply_uuid').
    solver : callable
        Called with one scan dict ``{(job_uuid, supply_uuid): row_dict}``; its
        result is unpacked as ``(matching, overwrite)`` and handed to
        metric_summary_dict.

    Returns
    -------
    dict
        Rounded summary metrics. Ratios whose denominator is zero (no jobs, or
        no matched pair at all) and the P90 of an empty ETA list are reported
        as NaN instead of raising.
    """
    def _ratio(numerator, denominator, ndigits):
        # NaN instead of ZeroDivisionError when nothing was counted.
        if not denominator:
            return float('nan')
        return round(numerator * 1.0 / denominator, ndigits)

    sm = ScanMetrics()
    # groupby yields (scan_uuid, sub-frame) pairs in sorted key order.
    for _, scan_df in df.groupby('scan_uuid'):
        scan = scan_df.set_index(['job_uuid', 'supply_uuid']).to_dict(orient='index')
        matching, overwrite = solver(scan)
        sm += metric_summary_dict(scan, matching, overwrite)

    # np.percentile cannot summarise an empty sequence meaningfully.
    p90_eta = round(np.percentile(sm.list_etas, 90), 2) if sm.list_etas else float('nan')

    return {'total_jobs': round(sm.total_jobs),
            'match_rate': _ratio(sm.total_offer, sm.total_jobs, 3),
            'overwrite': _ratio(sm.total_overwrite, sm.total_jobs, 3), # different decisions compared to Markov
            'Average Matched ETA': _ratio(sm.total_eta, sm.total_offer, 2),
            'P90 Matched ETA': p90_eta,
            'Driver AR': _ratio(sm.total_ar, sm.total_offer, 3),
            'Rider cancel': _ratio(sm.total_rc, sm.total_offer, 3),
            'Average trip length': _ratio(sm.total_trip, sm.total_offer, 2),
            'Average Matched Fare': _ratio(sm.total_fare, sm.total_offer, 2),
            'Total GB': round(sm.total_gb)
           }

In [12]:
def different_matching_decision(m1,m2):
    """Return ``(in m1 but not m2, in m2 but not m1)`` for two matchings."""
    only_in_first = m1.difference(m2)
    only_in_second = m2.difference(m1)
    return only_in_first, only_in_second

def supply_cost_solve_dict(scan, is_markov = False, secondary_singleton = 0.0):
    """Solve one scan and report how far the result departs from the Markov matching.

    The Markov (primary) matching is always computed from the 'of_value' cost
    with an unmatched-job cost of 1500.

    * ``is_markov=True``  -> returns ``(markov_matching, 0)``.
    * ``is_markov=False`` -> additionally solves on the 'new_of' cost with
      ``secondary_singleton`` as the unmatched-job cost and returns
      ``(secondary_matching, n)``, where ``n`` is the number of Markov matches
      that are absent from the secondary matching.
    """
    # Markov
    markov_matching = solve_dict(scan, 'of_value', job_singleton = 1500)

    if not is_markov:
        # SCA solve
        sca_matching = solve_dict(scan, 'new_of', job_singleton = secondary_singleton)
        dropped_from_markov = markov_matching.difference(sca_matching)
        return sca_matching, len(dropped_from_markov)

    return markov_matching, 0


In [13]:
# Replay configuration: one Query is built per (city, vehicle view, date) combination.
prefix = 'replay'
# Only scans whose ctrequestuuid starts with these characters are kept by the SQL
# (see the substr(...) filter in QUERY) - this acts as a sampling knob.
hex_digits = '36'

# city_id -> vehicle-view id; the value is substituted verbatim into the SQL,
# parentheses included.
city_id_vvids = {38: '(3298)', 37: '(5235)', 36: '(570)'}

datestrs = [  # 1 week
    '2022-09-13',
    '2022-09-14',
    '2022-09-15',
    '2022-09-16',
    '2022-09-17',
    '2022-09-18',
    '2022-09-19'
]

# Cartesian product of the 3 city/vvid pairs and the 7 dates -> 21 queries.
queries = [
    Query(prefix=prefix, hex_digits=hex_digits, city_id=city_id, vvid=vvid, datestr=datestr)
    for (city_id, vvid), datestr in itertools.product(city_id_vvids.items(), datestrs)
]

# Query name -> SQL text, handed to the cached fetcher below.
cache_qry_map = {
    q.name: q.qry 
    for q in queries
}

cdf = CachedDataFetcher(
    data_fetcher=MyDataFetcher(
        user_email=USER_EMAIL,
        consumer_name=CONSUMER_NAME,
    ),
    cache_qry_map=cache_qry_map,
    datacenter='dca1',
    datasource='presto-secure',
)

# bust_cache=False: presumably reuses cached results where available (the recorded
# output reports all 21 frames loaded from cache) - confirm against mdstk.
cdf.fetch(bust_cache=False)

Loaded 21/21 dataframes from cache!


In [14]:
%%time
# clean data
# Stack every fetched per-query frame into one table (fresh 0..n-1 index), then
# run clean_df and add the 'new_of' cost column with compute_new_of.
scans = pd.concat(cdf.dfs.values(), axis=0, ignore_index=True) 
# `df` starts as an alias of `scans`; it is rebound to the cleaned frame on the next line.
df = scans
df = clean_df(df)
df = compute_new_of(df)

CPU times: user 236 ms, sys: 102 ms, total: 338 ms
Wall time: 336 ms


In [15]:
df['surge_mul'].describe()

count    172545.000000
mean          1.080144
std           0.184598
min           1.000000
25%           1.000000
50%           1.000000
75%           1.100000
max           2.900000
Name: surge_mul, dtype: float64

In [16]:
%%time
# SCA solve
# Re-match every scan on the 'new_of' cost column; the 'overwrite' metric is the
# share of jobs whose Markov ('of_value') match is not in the new matching.
sca_matching = solve_all_dict(df,lambda scan: supply_cost_solve_dict(scan, is_markov = False))
sca_matching

CPU times: user 8.65 s, sys: 36.2 ms, total: 8.69 s
Wall time: 8.69 s


{'total_jobs': 6076,
 'match_rate': 0.981,
 'overwrite': 0.19,
 'Average Matched ETA': 531.33,
 'P90 Matched ETA': 1212.0,
 'Driver AR': 0.501,
 'Rider cancel': 0.17,
 'Average trip length': 834.63,
 'Average Matched Fare': 16.12,
 'Total GB': 40213}

In [17]:
%%time
# Primary (Markov) solve
# Baseline: match every scan on the logged 'of_value' cost only, so 'overwrite' is 0 by construction.
markov_matching = solve_all_dict(df,lambda scan: supply_cost_solve_dict(scan, is_markov = True))
markov_matching

CPU times: user 8.46 s, sys: 0 ns, total: 8.46 s
Wall time: 8.46 s


{'total_jobs': 6076,
 'match_rate': 0.945,
 'overwrite': 0.0,
 'Average Matched ETA': 487.91,
 'P90 Matched ETA': 1122.0,
 'Driver AR': 0.496,
 'Rider cancel': 0.154,
 'Average trip length': 829.1,
 'Average Matched Fare': 16.0,
 'Total GB': 38381}

INFO:jaeger_tracing:Tracing sampler started with sampling refresh interval 60 sec
