In [1]:
# !install_package_python3.sh add dsw_qr=0.1.13
# %pip install galileo
# %pip install galileo-py

In [2]:
import os

import pandas as pd
from dsw_qr import dsw_qr
from queryrunner_client import Client
# Query Runner client used by the cells below. The submitting identity can be
# overridden per user with the QUERYRUNNER_USER_EMAIL environment variable so
# the notebook does not have to be edited to run under a different account;
# when the variable is unset the original address is used.
qr = Client(user_email=os.environ.get('QUERYRUNNER_USER_EMAIL', 'thai@uber.com'))



In [3]:
def prepare_query(city_list, sample_percentage, start_date, end_date):
    """Render the Presto SQL that computes hourly market features.

    The query reads matching plans from
    ``rawdata.kafka_hp_multileg_mgv_log_nodedup``, keeps only plans whose
    timestamp falls in minute 0 of an hour, joins the latest plan per
    (supply_uuid, job_uuid) to completed trips in ``dwh.fact_trip``, joins a
    per-day fare scale table, and aggregates by (datestr, city_id, hour).

    Args:
        city_list: Iterable of city ids. Each is rendered with ``str()`` and
            comma-joined into ``msg.city_id in (...)``.
        sample_percentage: Number of hash buckets (out of 100) of ``job_uuid``
            to keep. Buckets are ``abs(mod(hash, 100))`` i.e. 0..99, and a row
            is kept when its bucket is strictly below this value, so 100 keeps
            everything and 10 keeps 10 buckets.
        start_date: Lower bound, inserted verbatim between single quotes into
            every ``datestr between`` predicate.
        end_date: Upper bound, inserted the same way as ``start_date``.

    Returns:
        The SQL text as a ``str``.

    Raises:
        ValueError: If ``city_list`` yields no ids (``in ()`` is not valid SQL
            and would otherwise only fail after the query is submitted).

    Notes:
        * ``scale_tab`` is restricted to the same ``start_date``..``end_date``
          range and is joined with a one-day offset (scale of the previous
          day). Plans dated ``start_date`` therefore have no matching scale
          row and are dropped by the inner join.
        * The inline SQL comment says "left join", but the statement is a
          plain ``join``; ``plans.rank = 1`` in the ON clause acts as a filter.
        * Values are interpolated as text, not bound as parameters; pass only
          trusted inputs.
    """
    city_ids = ",".join(str(city_id) for city_id in city_list)
    if not city_ids:
        raise ValueError("city_list must contain at least one city id")

    QUERY = """
    SET session hash_partition_count=64;
    
    -- calculate features based on average plan value for first 60s of the hour
    -- Feature 1: C/R * eta
    -- Feature 2: C/R * eta * fare / scale
    select
        plans.datestr,
        plans.city_id,
        hour(from_unixtime(cast(plans.ts as bigint))) as hour_of_day,
        count(*) as num_plans,
        avg(plans.surge) as market_surge,
        avg(plans.cr_ratio) as market_cr,
        avg(LOG2(plans.cr_ratio)) as market_log_cr,
        avg(plans.eta) as market_eta,
        avg(LOG2(plans.eta)) as market_log_eta,
        avg(completed.client_upfront_fare_local) as market_fare,
        avg(LOG2(completed.client_upfront_fare_local)) as market_log_fare,
        avg(completed.client_upfront_fare_local / scale_tab.max_scale) as market_fare_max_scale,
        avg(completed.client_upfront_fare_local / scale_tab.p99_scale) as market_fare_p99_scale,
        avg(completed.client_upfront_fare_local / scale_tab.p95_scale) as market_fare_p95_scale,
        avg(completed.client_upfront_fare_local / scale_tab.p90_scale) as market_fare_p90_scale,
        avg(completed.client_upfront_fare_local / scale_tab.p75_scale) as market_fare_p75_scale,
        avg(completed.client_upfront_fare_local / scale_tab.p50_scale) as market_fare_p50_scale,
        avg(LOG2(completed.client_upfront_fare_local / scale_tab.max_scale)) as market_log_fare_max_scale,
        avg(LOG2(completed.client_upfront_fare_local / scale_tab.p99_scale)) as market_log_fare_p99_scale,
        avg(LOG2(completed.client_upfront_fare_local / scale_tab.p95_scale)) as market_log_fare_p95_scale,
        avg(LOG2(completed.client_upfront_fare_local / scale_tab.p90_scale)) as market_log_fare_p90_scale,
        avg(LOG2(completed.client_upfront_fare_local / scale_tab.p75_scale)) as market_log_fare_p75_scale,
        avg(LOG2(completed.client_upfront_fare_local / scale_tab.p50_scale)) as market_log_fare_p50_scale
    from
      (
        select
            distinct mgv.datestr,
            mgv.city_id,
            mgv.supply_plan_uuid,
            mgv.job_uuid,
            mgv.supply_uuid,
            mgv.job_creation_time_ms,
            1 - mgv.eta / 1500.0 as eta,
            mgv.surge as surge,
            (1.0 - mgv.driver_cancel_prob) * (1.0 - mgv.rider_cancel_prob) * (1.0 - mgv.spinner_cancel_prob) + mgv.eventual_comp_prob * mgv.driver_cancel_prob as cr_ratio,
            rank() over (
            PARTITION BY mgv.supply_uuid,
            mgv.job_uuid
            ORDER BY
              mgv.ts desc
            ) as rank,
            mgv.ts
        from
          (
            select
              distinct datestr,
              msg.job_uuid,
              msg.supply_uuid,
              msg.supply_plan_uuid,
              msg.city_id,
              msg.ct_request_uuid,
              msg.job_creation_time_ms,
              1.0 - msg.solo_cancel_model_driver_accept_prob as driver_cancel_prob,
              1.0 - msg.solo_cancel_model_rider_accept_prob as rider_cancel_prob,
              1.0 - msg.spinner_survive_prob_before_next_scan as spinner_cancel_prob,
              (CASE
                WHEN msg.adjustedeta >= 1500 THEN 1499.0
                WHEN msg.adjustedeta < 0 THEN 0.0
                ELSE msg.adjustedeta
              END) as eta,
              msg.job_surge as surge,
              msg.eventual_completion_probability as eventual_comp_prob,
              msg.job_type,
              msg.flow_type,
              ts
            from
              rawdata.kafka_hp_multileg_mgv_log_nodedup
            where
              msg.tenancy = 'uber/production'
              and msg.solo_cancel_model_driver_accept_prob is not NULL
              and msg.solo_cancel_model_rider_accept_prob is not NULL
              and msg.spinner_survive_prob_before_next_scan is not NULL
              and msg.eventual_completion_probability is not NULL
              and msg.city_id in ({city_ids})
              and datestr between '{start_date}' and '{end_date}'
          ) mgv
        where
          mgv.job_type = 'PERSONAL_TRANSPORT'
          and mgv.flow_type in ('solo_batch', 'solo')
          and minute(from_unixtime(cast(mgv.ts as bigint))) = 0 and second(from_unixtime(cast(mgv.ts as bigint))) between 0 and 60
          and abs(
            mod(
              from_big_endian_64(xxhash64(CAST(mgv.job_uuid AS varbinary))),
              100
            )
          ) < {sample_percentage}
      ) as plans
      join
        dwh.fact_trip as completed 
      on
        plans.job_uuid = completed.uuid
        and plans.supply_uuid = completed.driver_uuid
        and plans.datestr = completed.datestr
        and plans.rank = 1 -- left join fares for last plan
        and completed.datestr between '{start_date}' and '{end_date}'
        and completed.status = 'completed'
        and completed.client_upfront_fare_usd > 0
        and completed.client_upfront_fare_local > 0
    join
        (
        select
            datestr,
            max(client_upfront_fare_local) as max_scale,
            approx_percentile(client_upfront_fare_local, 0.99) as p99_scale,
            approx_percentile(client_upfront_fare_local, 0.95) as p95_scale,
            approx_percentile(client_upfront_fare_local, 0.90) as p90_scale,
            approx_percentile(client_upfront_fare_local, 0.75) as p75_scale,
            approx_percentile(client_upfront_fare_local, 0.50) as p50_scale
        from
            dwh.fact_trip
        where
            datestr between '{start_date}' and '{end_date}'
        group by
            1
        ) as scale_tab
    on
        date(plans.datestr) = date(scale_tab.datestr) + INTERVAL '1' DAY
    group by
      plans.datestr,
      plans.city_id,
      hour(from_unixtime(cast(plans.ts as bigint)))
    order by
      plans.datestr,
      plans.city_id,
      hour_of_day
    """.format(
        city_ids=city_ids,
        start_date=start_date,
        end_date=end_date,
        sample_percentage=sample_percentage,
    )
    return QUERY

In [4]:
# Inputs for prepare_query(city_list, sample_percentage, start_date, end_date).
city_list = [
    1313, 205, 1329, 1398, 1389, 1287, 1289, 1394, 588, 1402,
    1031, 206, 1760, 1438, 1021, 929, 741, 739, 1383, 1129,
    1137, 961, 1384, 1151, 1423, 1479, 1149, 1359, 1357, 1275,
    1291, 1395, 1379, 1333, 1297, 1175, 1408, 1025,
]
sample_percentage = 100

# Window boundaries every two days from 2023-01-15 through 2023-02-12.
# Consecutive boundaries are paired into (start_date, end_date) tuples, so
# each window's end date is also the next window's start date.
_window_edges = (
    pd.date_range('2023-01-15', '2023-02-12', freq='2D')
    .strftime('%Y-%m-%d')
    .tolist()
)
dates_list = list(zip(_window_edges[:-1], _window_edges[1:]))

In [5]:
# Submit one query per date window and write each result set to its own CSV.
# Output files are numbered from 1 in the order the windows appear in dates_list.
for i, (start_date, end_date) in enumerate(dates_list):
    QUERY = prepare_query(city_list, sample_percentage, start_date, end_date)
    cursor = qr.execute("presto-secure", QUERY)
    result = cursor.fetchall()
    features_frame = pd.DataFrame(result)
    features_frame.to_csv(f"lcof_hourly_features_{i+1}.csv", index=False)

02/14/2023 04:27:06 AM Send empty tier_metadata {} to Queryrunner V2.
02/14/2023 04:27:07 AM [93m [Polling] 574d075b-b85a-4679-8006-6c47191afb5d [0m
02/14/2023 04:27:07 AM [93m [Status] created [0m
02/14/2023 04:27:08 AM [93m [Status] started auth check [0m
02/14/2023 04:27:09 AM [93m [Status] started waiting to execute [0m
02/14/2023 04:27:10 AM [93m [Status] started execution [0m
02/14/2023 04:42:07 AM [93m [Status] completed success [0m
02/14/2023 04:42:07 AM [92m [Query Success] completed success [0m
02/14/2023 04:42:07 AM Send empty tier_metadata {} to Queryrunner V2.
02/14/2023 04:42:07 AM [93m [Polling] 1aa47d2f-8279-47b7-aad1-6b625a86d155 [0m
02/14/2023 04:42:07 AM [93m [Status] created [0m
02/14/2023 04:42:08 AM [93m [Status] started validation [0m
02/14/2023 04:42:09 AM [93m [Status] started waiting to execute [0m
02/14/2023 04:42:13 AM [93m [Status] started execution [0m
02/14/2023 04:54:53 AM [93m [Status] completed success [0m
02/14/2023 04:54:53 