In [None]:
import os

import pandas as pd
from dsw_qr import dsw_qr
from queryrunner_client import Client
# Query Runner client shared by the cells below. The e-mail passed to Client can
# be overridden through the QUERYRUNNER_USER_EMAIL environment variable so the
# notebook can be run by someone else without editing this cell; the original
# address stays as the default.
qr = Client(user_email=os.environ.get("QUERYRUNNER_USER_EMAIL", "thai@uber.com"))

In [None]:
def prepare_query(city_list, sample_percentage, start_date, end_date):
    """Build the Presto SQL that aggregates hourly labels per city and date.

    The query reads plan rows from rawdata.kafka_hp_multileg_mgv_log_nodedup,
    keeps the most recent plan (by ``ts``) for each (supply_uuid, job_uuid)
    pair, inner-joins it to dwh.fact_trip and secure_finance.fds_rides_vc, and
    groups by datestr, city_id and hour of day. For each group it returns the
    plan count plus averages of gross bookings and client upfront fare, both
    raw and log2, each additionally weighted by ``gamma ** minute_of_hour``
    for gamma in 0.99, 0.95 and 0.90.

    This function only renders the SQL text; it does not execute anything.

    Args:
        city_list: Non-empty iterable of city ids. Each id is passed through
            ``int()`` so only numeric literals are written into the
            ``msg.city_id in (...)`` filter.
        sample_percentage: Sampling threshold. Each job_uuid is hashed into a
            bucket ``abs(mod(hash, 100))`` (a value in 0..99) and only buckets
            strictly below this value are kept, so 100 keeps every job and 0
            keeps none.
        start_date: First datestr (inclusive) of the window, e.g. '2023-01-16'.
        end_date: Last datestr (inclusive) of the window.

    Returns:
        The SQL statement as a string.

    Raises:
        ValueError: If ``city_list`` is empty (``in ()`` is not valid SQL) or
            contains a value that ``int()`` cannot convert.
    """
    city_ids = [str(int(city_id)) for city_id in city_list]
    if not city_ids:
        raise ValueError("city_list must contain at least one city id")

    # Named placeholders: the date window is used in three places, and keeping
    # positional slots in sync with a hand-repeated argument list is error-prone.
    query = """
    SET session hash_partition_count=64;

    -- calculate hourly average gb per plan per scan at city-level
    select distinct
      plans.datestr,
      plans.city_id,
      hour(from_unixtime(cast(plans.ts as bigint))) as hour_of_day,
      count(*) as num_plans,
      avg(LOG2(gbs.gross_bookings_usd)) as hour_log2_gb_gamma_100,
      avg(LOG2(gbs.gross_bookings_usd) * POWER(0.99, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_log2_gb_gamma_99,
      avg(LOG2(gbs.gross_bookings_usd) * POWER(0.95, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_log2_gb_gamma_95,
      avg(LOG2(gbs.gross_bookings_usd) * POWER(0.90, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_log2_gb_gamma_90,
      avg(LOG2(completed.client_upfront_fare_usd)) as hour_log2_fare_gamma_100,
      avg(LOG2(completed.client_upfront_fare_usd) * POWER(0.99, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_log2_fare_gamma_99,
      avg(LOG2(completed.client_upfront_fare_usd) * POWER(0.95, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_log2_fare_gamma_95,
      avg(LOG2(completed.client_upfront_fare_usd) * POWER(0.90, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_log2_fare_gamma_90,
      avg(gbs.gross_bookings_usd) as hour_gb_gamma_100,
      avg(gbs.gross_bookings_usd * POWER(0.99, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_gb_gamma_99,
      avg(gbs.gross_bookings_usd * POWER(0.95, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_gb_gamma_95,
      avg(gbs.gross_bookings_usd * POWER(0.90, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_gb_gamma_90,
      avg(completed.client_upfront_fare_usd) as hour_fare_gamma_100,
      avg(completed.client_upfront_fare_usd * POWER(0.99, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_fare_gamma_99,
      avg(completed.client_upfront_fare_usd * POWER(0.95, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_fare_gamma_95,
      avg(completed.client_upfront_fare_usd * POWER(0.90, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_fare_gamma_90
    from
      (
        select
          distinct mgv.datestr,
          mgv.city_id,
          mgv.supply_plan_uuid,
          mgv.job_uuid,
          mgv.supply_uuid,
          mgv.job_creation_time_ms,
          rank() over (
            PARTITION BY mgv.supply_uuid,
            mgv.job_uuid
            ORDER BY
              mgv.ts desc
          ) as rank,
          mgv.ts
        from
          (
            select
              distinct datestr,
              msg.job_uuid,
              msg.supply_uuid,
              msg.supply_plan_uuid,
              msg.city_id,
              msg.ct_request_uuid,
              msg.job_creation_time_ms,
              msg.job_type,
              msg.flow_type,
              ts
            from
              rawdata.kafka_hp_multileg_mgv_log_nodedup
            where
              msg.tenancy = 'uber/production'
              and msg.solo_cancel_model_driver_accept_prob is not NULL
              and msg.solo_cancel_model_rider_accept_prob is not NULL
              and msg.spinner_survive_prob_before_next_scan is not NULL
              and msg.eventual_completion_probability is not NULL
              and msg.city_id in ({city_ids})
              and datestr between '{start_date}' and '{end_date}'
          ) mgv
        where
          mgv.job_type = 'PERSONAL_TRANSPORT'
          and mgv.flow_type in ('solo_batch', 'solo')
          -- hash bucket is in 0..99; strict '<' keeps exactly sample_percentage buckets
          and abs(
            mod(
              from_big_endian_64(xxhash64(CAST(mgv.job_uuid AS varbinary))),
              100
            )
          ) < {sample_percentage}
      ) plans
      join
        dwh.fact_trip completed
      on
        plans.job_uuid = completed.uuid
        and plans.supply_uuid = completed.driver_uuid
        and plans.datestr = completed.datestr
        and plans.rank = 1 -- inner join: keep only the latest plan per (supply, job)
        and completed.datestr between '{start_date}' and '{end_date}'
        and completed.status = 'completed'
        and completed.client_upfront_fare_usd > 0
      join
        secure_finance.fds_rides_vc gbs
      on
        plans.job_uuid = gbs.job_uuid
        and plans.supply_uuid = gbs.driver_uuid
        and plans.datestr = gbs.operational_date
        and plans.rank = 1 -- inner join: keep only the latest plan per (supply, job)
        and gbs.operational_date between '{start_date}' and '{end_date}'
        and gbs.is_completed = true
        and gbs.gross_bookings_usd > 0
    group by
      plans.datestr,
      plans.city_id,
      hour(from_unixtime(cast(plans.ts as bigint)))
    order by
      plans.datestr,
      plans.city_id,
      hour_of_day
    """.format(
        city_ids=",".join(city_ids),
        sample_percentage=sample_percentage,
        start_date=start_date,
        end_date=end_date,
    )
    return query

In [None]:
# Inputs for prepare_query: city_list, sample_percentage, start_date, end_date.

# City ids substituted into the query's `msg.city_id in (...)` filter.
city_list = [
    1313, 205, 1329, 1398, 1389, 1287, 1289, 1394,
    588, 1402, 1031, 206, 1760, 1438, 1021, 929,
    741, 739, 1383, 1129, 1137, 961, 1384, 1151,
    1423, 1479, 1149, 1359, 1357, 1275, 1291, 1395,
    1379, 1333, 1297, 1175, 1408, 1025,
]

# Sampling threshold handed to prepare_query.
sample_percentage = 100

# Back-to-back two-day (start_date, end_date) windows covering
# 2023-01-16 through 2023-02-12; the extraction loop runs one query per window.
dates_list = [
    (
        window_start.strftime("%Y-%m-%d"),
        (window_start + pd.Timedelta(days=1)).strftime("%Y-%m-%d"),
    )
    for window_start in pd.date_range("2023-01-16", "2023-02-11", freq="2D")
]



In [None]:
# Run the label query once per date window and write each result set to its
# own CSV in the current working directory.
for i, (start_date, end_date) in enumerate(dates_list):
    QUERY = prepare_query(city_list, sample_percentage, start_date, end_date)
    cursor = qr.execute("presto-secure", QUERY)
    result = cursor.fetchall()
    # Output files are numbered from 1 in window order.
    output_path = f"lcof_hourly_labels_{i+1}.csv"
    labels_frame = pd.DataFrame(result)
    labels_frame.to_csv(output_path, index=False)