In [1]:
%%bash
source $VIRTUAL_ENV_DIR/python3/bin/activate
 
# Install the packages this notebook needs. No versions are pinned, so pip installs the latest ones (to pin a version, use syntax: pip install <package>==<version>)
pip install dataclasses
pip install matching-ds-tools
 
deactivate

Looking in indexes: https://yoober11:****@pypi.uberinternal.com/index
Looking in indexes: https://yoober11:****@pypi.uberinternal.com/index


In [2]:
import json
import datetime
import re

import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from queryrunner_client import Client
# Identity used for every query issued from this notebook. The constants are
# defined before the client so the e-mail address is written in one place only.
USER_EMAIL = 'thai@uber.com'
CONSUMER_NAME = 'intelligentdispatch'
qclient = Client(user_email=USER_EMAIL)

import os
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from joblib import Parallel, delayed
#num_cores = multiprocessing.cpu_count()  -- 48
n_cores = 4

In [3]:
from dataclasses import dataclass
import itertools
from typing import *
import numpy as np
import pandas as pd
from queryrunner_client import Client as QRClient
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.optimize import linear_sum_assignment

In [4]:
import mdstk
from mdstk.data_fetcher.data_fetcher import DataFetcher
from mdstk.data_fetcher.cached_data_fetcher import CachedDataFetcher

In [5]:
# data collection
#
# SQL template for one (city, vehicle view, day, scan segment) slice of data.
# The placeholders are filled with str.format() in Query.__post_init__:
#   {datestr} - partition date, applied to dispatch, plangen, mgv and the fare table
#   {city_id} - city filter, applied to the same four sources
#   {vvid}    - inserted verbatim after "=" in the vehicle-view filters of the
#               dispatch and mgv CTEs (this notebook passes values like '(11279)')
#   {digits}  - prefix that the dispatch ctrequestuuid must start with
#
# CTEs:
#   dispatch - one row per job uuid of each dispatched plan (msg.jobuuid unnested)
#   plangen  - (scan, job, supply) triples of 'solo_batch' proposals whose status
#              is 'eligible'
#   mgv      - per (job, supply) log rows with ETA, probabilities and objective
#              value; adjustedeta is clamped to the range 0..1500
#   test     - mgv joined with plangen, dispatch and dwh.fact_trip_fare, plus the
#              derived eta_*, network_contention_*, cr_ratio and is_selected columns
#
# NOTE: values are spliced in as plain text rather than bound parameters, so only
# trusted values may be passed to Query.
# NOTE(review): the plangen CTE has no vehicle-view filter, and mgv reads
# msg.adjustedeta next to snake_case fields such as msg.unadjusted_eta - confirm
# both against the table schemas.

QUERY = """
with dispatch as (
select 
    datestr,
    msg.cityid,
    msg.ctplangenrequestuuid as plangen_uuid,
    msg.ctrequestuuid as scan_uuid,
    j as job_uuid,
    msg.supplyuuid,
    msg.planactiontype
from 
    rawdata_user.kafka_hp_multileg_dispatched_plan_nodedup
cross join 
    unnest(msg.jobuuid) jobs(j)
where 
    datestr = '{datestr}'
    and msg.cityid = {city_id}
    and msg.vehicleviewid = {vvid} 
    and msg.tenancy = 'uber/production'
    and CARDINALITY(msg.jobuuid) > 0
    and substr(msg.ctrequestuuid, 1, length('{digits}')) = '{digits}'
),
plangen as (
select 
    msg.scanuuid as plangen_uuid, 
    p.uuid as job_uuid,
    j.supplyuuid
from 
    rawdata_user.kafka_hp_multileg_matching_observability_proposals_v2_nodedup
cross join 
    unnest(msg.proposals) as job(j)
cross join 
    unnest(j.jobs) as plan(p)
where 
    datestr = '{datestr}'
    and msg.cityid = {city_id}
    and msg.flowtype = 'solo_batch'
    and msg.tenancy = 'uber/production'
    and j.status = 'eligible'
),
mgv as (
select datestr,
       msg.city_id,
       msg.job_uuid,
       msg.client_uuid,
       msg.ct_request_uuid as plangen_uuid,
       msg.supply_uuid,
       msg.supply_plan_uuid as plan_uuid,
       msg.unadjusted_eta as eta,
       (CASE
          WHEN msg.adjustedeta > 1500 THEN 1500.0
          WHEN msg.adjustedeta < 0 THEN 0.0
          ELSE msg.adjustedeta
       END) as adjustedeta,
       round(msg.job_surge, 4) as surge_mul,
       round(msg.eventual_completion_probability, 4) as eventual_comp_prob,
       msg.ranking_metric,
       round(1 - msg.solo_cancel_model_driver_accept_prob, 4) as d_proba,
       round(1 - msg.solo_cancel_model_rider_accept_prob, 4) as r_proba,
       round(1 - msg.spinner_survive_prob_before_next_scan, 4) as s_proba,
       msg.preferred_destination_adjustment,
       msg.objective_value as of_value,
       msg.inconvenience_etd - msg.ranking_metric as trip_length
from   
    rawdata.kafka_hp_multileg_mgv_log_nodedup
where  
    datestr = '{datestr}'
    and msg.city_id = {city_id}
    and msg.tenancy = 'uber/production'
    and msg.vehicle_view_id = {vvid} 
    and msg.flow_type = 'solo_batch'
    and msg.job_uuid <> msg.client_uuid
    and msg.calculator_type = 'markov_eta_v2'
),
test as (
select 
    mgv.datestr,
    mgv.city_id,
    dispatch.scan_uuid,
    mgv.plangen_uuid,
    mgv.job_uuid,
    dispatch.planactiontype,
    mgv.supply_uuid,
    case when dispatch.supplyuuid = mgv.supply_uuid then 1 else 0 end as is_selected,
    mgv.eta,
    mgv.adjustedeta,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1), 4) as eta_one,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 1.25), 4) as eta_one_quarter,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 2), 4) as eta_square,
    round(POWER(1 - mgv.adjustedeta / 1500.0, 3), 4) as eta_cube,
    mgv.surge_mul,
    mgv.eventual_comp_prob,
    round(1.0 / (1.0 + POWER(mgv.surge_mul, 2)), 4) as network_contention_2,
    round(1.0 / (1.0 + POWER(mgv.surge_mul, 3)), 4) as network_contention_3,
    round(1.0 / (1.0 + POWER(mgv.surge_mul, 5)), 4) as network_contention_5,
    mgv.ranking_metric,
    mgv.d_proba,
    mgv.r_proba,
    mgv.s_proba,
    round((1.0 - mgv.d_proba) * (1.0 - mgv.r_proba) * (1.0 - mgv.s_proba) + mgv.eventual_comp_prob * mgv.d_proba, 4) as cr_ratio,
    mgv.preferred_destination_adjustment,
    mgv.of_value,
    mgv.trip_length,
    fare.est_rider_quoted_final_fare as fare
from
    mgv
join
    plangen
on 
    mgv.plangen_uuid = plangen.plangen_uuid
    and mgv.job_uuid = plangen.job_uuid
    and mgv.supply_uuid = plangen.supplyuuid
join
    dispatch
on
    mgv.plangen_uuid = dispatch.plangen_uuid
    and mgv.job_uuid = dispatch.job_uuid
join
    dwh.fact_trip_fare fare 
on
    mgv.job_uuid = fare.trip_uuid
    and fare.datestr = '{datestr}'
    and fare.city_id = {city_id}
)
select * from test
"""

In [6]:
@dataclass
class Query:
    """One rendered instance of the QUERY template plus the name it is cached under.

    Attributes set by the constructor:
        prefix: leading part of the cache name.
        hex_digits: value substituted for the ``{digits}`` placeholder.
        city_id: value substituted for ``{city_id}``.
        vvid: value substituted for ``{vvid}``.
        datestr: value substituted for ``{datestr}``.

    Attributes derived in ``__post_init__``:
        name: ``<prefix>_city<city_id>_<vvid>_<datestr>_segment<hex_digits>``.
        qry: the QUERY template with all four placeholders filled in.
    """
    prefix: str
    hex_digits: str
    city_id: int
    vvid: str
    datestr: str

    def __post_init__(self):
        # Build the cache name piece by piece; the pieces are joined with "_".
        name_parts = (
            f'{self.prefix}',
            f'city{self.city_id}',
            f'{self.vvid}',
            f'{self.datestr}',
            f'segment{self.hex_digits}',
        )
        self.name = '_'.join(name_parts)

        # Map the template's placeholder names onto this instance's fields.
        substitutions = {
            'city_id': self.city_id,
            'vvid': self.vvid,
            'digits': self.hex_digits,
            'datestr': self.datestr,
        }
        self.qry = QUERY.format(**substitutions)

In [7]:
class MyDataFetcher(DataFetcher):
    """DataFetcher subclass that this notebook hands to CachedDataFetcher.

    It adds no behaviour of its own: the single override below forwards every
    call unchanged to the parent class.
    """
    def query_many_presto(self, *args, **kwargs):
        # Pure pass-through to DataFetcher.query_many_presto.
        # NOTE(review): presumably kept as an extension point for customising
        # the Presto calls - confirm, otherwise DataFetcher can be used directly.
        return super().query_many_presto(*args, **kwargs)

In [8]:
# Calculate new objective function
def clean_df(df, min_trip_length=100):
    """Return a cleaned copy of the raw query result; the input is not modified.

    Steps, in order:
      1. keep only rows whose ``fare`` is not null;
      2. raise every ``trip_length`` that is <= ``min_trip_length`` to
         ``min_trip_length``;
      3. keep the first row of each (``job_uuid``, ``supply_uuid``) pair;
      4. drop every row that still contains a null in any column.

    Args:
        df: DataFrame with at least the columns ``fare``, ``trip_length``,
            ``job_uuid`` and ``supply_uuid``.
        min_trip_length: floor applied to ``trip_length`` (default 100, the
            value that used to be hard-coded).

    Returns:
        A new DataFrame; the original index labels of the surviving rows are kept.
    """
    # Explicit copy: the rows are about to be written to, and writing through a
    # filtered frame is unreliable (SettingWithCopy / copy-on-write).
    cleaned = df[df['fare'].notnull()].copy()

    # A single .loc assignment replaces the former chained assignment
    # df['trip_length'][mask] = 100, which is a no-op under copy-on-write.
    too_short = cleaned['trip_length'] <= min_trip_length
    cleaned.loc[too_short, 'trip_length'] = min_trip_length

    cleaned = cleaned.drop_duplicates(subset=['job_uuid', 'supply_uuid'])
    return cleaned.dropna()

def compute_new_of(df, w_cr=0.3268, w_eta_cr=0.6322, w_gb=0.3956):
    """Add the candidate objective column ``new_of`` to ``df`` and return ``df``.

    The column is computed as::

        new_of = -( - w_cr     * cr_ratio
                    - w_eta_cr * eta_one * cr_ratio
                    + w_gb     * eta_one * (1 - network_contention_2) * cr_ratio * fare )

    ``new_of`` is later passed to ``solve_dict`` as the cost column.

    Args:
        df: DataFrame with the columns ``cr_ratio``, ``eta_one``,
            ``network_contention_2`` and ``fare``. It is modified in place.
        w_cr: weight of the ``cr_ratio`` term.
        w_eta_cr: weight of the ``eta_one * cr_ratio`` term.
        w_gb: weight of the fare-weighted term.
            The defaults are the coefficients that used to be hard-coded.

    Returns:
        The same DataFrame object, now carrying the ``new_of`` column.
    """
    cr = df['cr_ratio']
    eta = df['eta_one']
    value = (- w_cr * cr
             - w_eta_cr * eta * cr
             + w_gb * eta * (1 - df['network_contention_2']) * cr * df['fare'])
    # Negate the whole expression, exactly as the original formula did.
    df['new_of'] = - value
    return df

In [9]:
# local solver
def solve_dict(
    scan: dict,
    cost_col: str,
    job_singleton: float = 1500,
    infinity: float = 1000000
) -> set:
    """Solve one scan as a minimum-cost assignment of jobs to supplies.

    Every job is either matched to one supply or left unmatched at a cost of
    ``job_singleton``. The cost matrix has one row per job and one column per
    supply, followed by one "stay unmatched" column per job.

    Args:
        scan: mapping ``(job, supply) -> row`` where ``row[cost_col]`` is the
            cost of that pairing.
        cost_col: key of the cost value inside each row.
        job_singleton: cost of leaving a job unmatched.
        infinity: cost given to every pairing that is absent from ``scan`` and
            to every other job's "stay unmatched" column.

    Returns:
        A set with one entry per job: ``(job, supply)`` for a match, or the
        1-tuple ``(job,)`` for a job left unmatched. Empty for an empty scan.
    """
    if not scan:
        return set()

    # dict.fromkeys de-duplicates while keeping first-seen order. The former
    # list(set(...)) order depended on string hash randomisation, so ties could
    # be broken differently from one kernel to the next.
    job_list = list(dict.fromkeys(k[0] for k in scan))
    supply_list = list(dict.fromkeys(k[1] for k in scan))
    job_idx = {j: i for i, j in enumerate(job_list)}
    supply_idx = {s: i for i, s in enumerate(supply_list)}
    job_count = len(job_list)
    supply_count = len(supply_list)

    # dtype stays float32 so that costs are rounded exactly as before.
    utility = np.full((job_count, supply_count + job_count), infinity, dtype=np.float32)
    for key, row in scan.items():
        utility[job_idx[key[0]], supply_idx[key[1]]] = row[cost_col]
    for i in range(job_count):
        # Column supply_count + i is job i's own "stay unmatched" option.
        utility[i, supply_count + i] = job_singleton

    # solve
    job_sol, supply_sol = linear_sum_assignment(utility)

    result = set()
    for jidx, sidx in zip(job_sol, supply_sol):
        j = job_list[jidx]
        # A supply column only counts as a match when the pairing really exists
        # in the scan; an `infinity` filler cell means the job is unmatched.
        if sidx < supply_count and (j, supply_list[sidx]) in scan:
            result.add((j, supply_list[sidx]))
        else:
            result.add((j,))

    assert len(result) == job_count
    return result

In [10]:
@dataclass
class ScanMetrics:
    """Running totals of matching metrics over one or more scans.

    Instances can be combined with ``+`` (returns a new object) or ``+=``
    (updates the left operand); both add the totals field by field.
    """
    # Field meanings below follow how metric_summary_dict fills them in.
    total_jobs: int = 0          # number of entries in the matching
    total_eta: float = 0.        # sum of 'eta' over matched pairs
    total_offer: float = 0.      # number of matched (job, supply) pairs
    total_ar: float = 0.         # sum of (1 - 'd_proba') over matched pairs
    total_rc: float = 0.         # sum of 'r_proba' over matched pairs
    total_trip: float = 0.       # sum of 'trip_length' for values below 7200
    total_gb: float = 0.         # sum of (1 - d_proba) * (1 - r_proba) * fare for fare > 0
    total_overwrite: int = 0     # count of decisions that differ from the primary solve

    def __add__(self, o: 'ScanMetrics') -> 'ScanMetrics':
        # Keyword arguments keep every total in its own field. The former
        # positional call passed total_overwrite and total_gb in swapped order.
        return ScanMetrics(
            total_jobs=self.total_jobs + o.total_jobs,
            total_eta=self.total_eta + o.total_eta,
            total_offer=self.total_offer + o.total_offer,
            total_ar=self.total_ar + o.total_ar,
            total_rc=self.total_rc + o.total_rc,
            total_trip=self.total_trip + o.total_trip,
            total_gb=self.total_gb + o.total_gb,
            total_overwrite=self.total_overwrite + o.total_overwrite,
        )

    def __iadd__(self, o: 'ScanMetrics') -> 'ScanMetrics':
        self.total_jobs += o.total_jobs
        self.total_eta += o.total_eta
        self.total_offer += o.total_offer
        self.total_ar += o.total_ar
        self.total_rc += o.total_rc
        self.total_trip += o.total_trip
        self.total_gb += o.total_gb
        self.total_overwrite += o.total_overwrite
        return self

In [11]:
# Metric Summary
def metric_summary_dict(
    scan_dict: Dict[str, Dict[str, Any]],
    matching: set,
    overwrite: int,
) -> ScanMetrics:
    """Summarise one scan's matching as a ScanMetrics object.

    Args:
        scan_dict: rows of the scan, looked up here with ``(job, supply)``
            tuple keys. Each row must provide 'eta', 'd_proba', 'r_proba',
            'trip_length' and 'fare'.
        matching: set of 2-tuples ``(job, supply)`` for matched jobs; entries
            of any other length are counted as jobs but add nothing else.
        overwrite: value stored as-is in ``total_overwrite``.

    Returns:
        ScanMetrics whose ``total_jobs`` is ``len(matching)`` and whose other
        totals are summed over the matched pairs.
    """
    summary = ScanMetrics()
    summary.total_jobs = len(matching)
    summary.total_overwrite = overwrite

    for match in matching:
        if len(match) != 2:
            # Not a (job, supply) pair: nothing to accumulate.
            continue

        row = scan_dict[(match[0], match[1])]
        summary.total_offer += 1
        summary.total_eta += row['eta']
        summary.total_ar += 1 - row['d_proba']
        summary.total_rc += row['r_proba']

        trip_length = row['trip_length']
        if trip_length < 7200:
            # Trip lengths of 7200 or more are left out of the total.
            summary.total_trip += trip_length

        fare = row['fare']
        if fare > 0:
            summary.total_gb += (1 - row['d_proba']) * (1 - row['r_proba']) * fare

    return summary

def solve_all_dict(df, solver: Callable[[dict], Tuple[set, int]]):
    """Run ``solver`` on every scan in ``df`` and report aggregate metrics.

    Args:
        df: DataFrame with the columns ``scan_uuid``, ``job_uuid`` and
            ``supply_uuid``, plus whatever ``solver`` and
            ``metric_summary_dict`` read from each row.
        solver: callable that takes one scan as a dict keyed by
            ``(job_uuid, supply_uuid)`` and returns ``(matching, overwrite)``.

    Returns:
        Dict with the job count and the per-job / per-offer averages. A ratio
        whose denominator is zero (no jobs, or no offers) is reported as NaN
        instead of raising ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # NaN marks "undefined", e.g. for an empty frame or an all-unmatched run.
        if denominator == 0:
            return float('nan')
        return numerator * 1.0 / denominator

    sm = ScanMetrics()
    # Iterate the groups lazily instead of materialising them all in a dict.
    for _, scan_df in df.groupby('scan_uuid'):
        scan = scan_df.set_index(['job_uuid', 'supply_uuid']).to_dict(orient='index')
        matching, overwrite = solver(scan)
        sm += metric_summary_dict(scan, matching, overwrite)

    return {'total_jobs': sm.total_jobs,
            'match_rate': _ratio(sm.total_offer, sm.total_jobs),
            'overwrite': _ratio(sm.total_overwrite, sm.total_jobs), # different decisions compared to Markov
            'Average Matched ETA': _ratio(sm.total_eta, sm.total_offer),
            'Driver AR': _ratio(sm.total_ar, sm.total_offer),
            'Rider cancel': _ratio(sm.total_rc, sm.total_offer),
            'Average trip length': _ratio(sm.total_trip, sm.total_offer),
            'Average GB': _ratio(sm.total_gb, sm.total_offer)
           }

In [12]:
def different_matching_decision(m1,m2):
    """Return the entries unique to each matching.

    Returns:
        A 2-tuple: the entries of ``m1`` that are not in ``m2``, then the
        entries of ``m2`` that are not in ``m1``.
    """
    only_in_first = m1.difference(m2)
    only_in_second = m2.difference(m1)
    return only_in_first, only_in_second

def supply_cost_solve_dict(scan, markov = False, secondary_singleton = 0):
    """Solve a scan with the 'of_value' cost and, optionally, the 'new_of' cost.

    The 'of_value' solve (unmatched cost 1500) always runs first.

    Args:
        scan: scan dict as accepted by ``solve_dict``.
        markov: when true, return the 'of_value' matching with an overwrite
            count of 0.
        secondary_singleton: unmatched cost used for the 'new_of' solve.

    Returns:
        ``(matching, overwrite)``. With ``markov=False`` the matching comes
        from the 'new_of' solve and ``overwrite`` is the number of entries of
        the 'of_value' matching that are missing from it.
    """
    # Markov solve
    markov_matching = solve_dict(scan, 'of_value', job_singleton=1500)
    if markov:
        return markov_matching, 0

    # SCA solve
    sca_matching = solve_dict(scan, 'new_of', job_singleton=secondary_singleton)
    dropped_by_sca, _ = different_matching_decision(markov_matching, sca_matching)
    return sca_matching, len(dropped_by_sca)


In [13]:
# Replay configuration: cache-name prefix, scan-uuid prefix used to select
# scans, and the vehicle-view filter text for each city id.
prefix = 'replay'
hex_digits = '36'
city_id_vvids = {1269: '(10004148)', 789: '(11279)'}

# 2 weeks: every day from 2022-08-09 to 2022-08-22, both ends included
datestrs = pd.date_range('2022-08-09', '2022-08-22', freq='D').strftime('%Y-%m-%d').tolist()

# One query per (city, day), ordered city by city.
queries = [
    Query(prefix=prefix, hex_digits=hex_digits, city_id=city_id, vvid=vvid, datestr=datestr)
    for city_id, vvid in city_id_vvids.items()
    for datestr in datestrs
]

# Cache name -> rendered SQL text.
cache_qry_map = {query.name: query.qry for query in queries}

cdf = CachedDataFetcher(
    data_fetcher=MyDataFetcher(user_email=USER_EMAIL, consumer_name=CONSUMER_NAME),
    cache_qry_map=cache_qry_map,
    datacenter='dca1',
    datasource='presto-secure',
)

cdf.fetch(bust_cache=False)

Loaded 28/28 dataframes from cache!


In [14]:
%%time
# clean data: stack every fetched frame into `scans`, then clean it and add
# the new objective column to obtain `df`
scans = pd.concat(cdf.dfs.values(), axis=0, ignore_index=True)
df = compute_new_of(clean_df(scans))

CPU times: user 3 s, sys: 827 ms, total: 3.83 s
Wall time: 3.83 s


In [15]:
%%time
# SCA solve: aggregate metrics when every scan is matched on the new objective
sca_matching = solve_all_dict(
    df,
    solver=lambda scan: supply_cost_solve_dict(scan, markov=False),
)

INFO:jaeger_tracing:Tracing sampler started with sampling refresh interval 60 sec


CPU times: user 32.1 s, sys: 331 ms, total: 32.4 s
Wall time: 32.4 s


In [16]:
# %%time
# # Markov solve
# markov_matching = solve_all_dict(df,lambda scan: supply_cost_solve_dict(scan, markov = True))

In [17]:
sca_matching

# EFOF single OF
# {'total_jobs': 59067.0,
#  'match_rate': 0.9922968832004334,
#  'overwrite': 0.3599302487006281,
#  'Average Matched ETA': 325.8672285538797,
#  'Driver AR': 0.30801089196751497,
#  'Rider cancel': 0.101107730498874,
#  'Average trip length': 868.1338292499829,
#  'Average GB': 4.953158812096452}

{'total_jobs': 59067.0,
 'match_rate': 0.9887754583777744,
 'overwrite': 0.4162053261550443,
 'Average Matched ETA': 335.22525854393535,
 'Driver AR': 0.33310006677624865,
 'Rider cancel': 0.10168994075748239,
 'Average trip length': 870.3670981439627,
 'Average GB': 5.424491636593753}

In [18]:
# markov_matching

# {'total_jobs': 59067.0,
#  'match_rate': 0.9908070496216161,
#  'overwrite': 0.0,
#  'Average Matched ETA': 308.2193117353564,
#  'Driver AR': 0.27074625281935616,
#  'Rider cancel': 0.09404085845123371,
#  'Average trip length': 866.5610689631604,
#  'Average GB': 4.402908360236581}