# install packages

In [None]:
%%bash
# Install the notebook's Python dependencies into the python3 virtualenv that
# lives under $VIRTUAL_ENV_DIR.

# Stop with a clear message when the environment root is not configured;
# otherwise every path below would silently resolve to /python3/bin/...
: "${VIRTUAL_ENV_DIR:?VIRTUAL_ENV_DIR is not set}"

# Quoted so a directory name containing spaces does not split into several words.
PYTHON3_BIN="$VIRTUAL_ENV_DIR/python3/bin"

source "$PYTHON3_BIN/activate"

install_package_python3.sh add dsw_qr==0.1.13

# Call pip through the virtualenv's interpreter so the packages land in the same
# environment regardless of what `pip` is first on PATH.
# TODO(review): these three installs are unpinned; pin versions for reproducible runs.
"$PYTHON3_BIN/python" -m pip install galileo
"$PYTHON3_BIN/python" -m pip install galileo-py
"$PYTHON3_BIN/python" -m pip install tchannel

# run query

In [None]:
import os

import pandas as pd
from dsw_qr import dsw_qr
from queryrunner_client import Client
# Query Runner client used to execute the queries further down. The user e-mail
# can be overridden with the QUERYRUNNER_USER_EMAIL environment variable so the
# notebook can be run by someone else without editing this cell; the original
# address stays as the default.
qr = Client(user_email=os.environ.get('QUERYRUNNER_USER_EMAIL', 'thai@uber.com'))

In [None]:
def prepare_query(city_list, sample_percentage, start_date, end_date):
    """Build the SQL text for the hourly, city-level fare aggregation.

    The query takes plan rows from ``rawdata.kafka_hp_multileg_mgv_log_nodedup``,
    keeps the most recent plan (``rank = 1`` by ``ts`` descending) per
    ``(supply_uuid, job_uuid)``, inner-joins it to ``dwh.fact_trip`` rows whose
    status is ``'completed'``, and groups by ``datestr``, ``city_id`` and hour of
    day. For each group it returns the plan count plus averages of
    ``client_upfront_fare_local`` and ``base_fare_local``, both as-is and
    multiplied by ``POWER(gamma, minute_of_hour)`` for gamma 0.99, 0.95 and 0.90.

    Args:
        city_list: Iterable of city ids; each is rendered with ``str()`` and
            joined with commas into the ``msg.city_id in (...)`` filter.
        sample_percentage: Number of hash buckets (out of 100) of ``job_uuid`` to
            keep. Buckets are ``abs(mod(hash, 100))``, i.e. 0..99, so the filter
            is a strict ``<``: 10 keeps buckets 0..9 and 100 keeps every job.
        start_date: First ``datestr`` to include (inclusive), substituted
            verbatim between single quotes.
        end_date: Last ``datestr`` to include (inclusive), substituted verbatim
            between single quotes.

    Returns:
        str: The SQL text with all placeholders filled in.

    Raises:
        ValueError: If ``city_list`` is empty, which would otherwise render the
            invalid SQL fragment ``in ()``.
    """
    city_ids = ",".join(str(city_id) for city_id in city_list)
    if not city_ids:
        raise ValueError("city_list must contain at least one city id")

    # Named placeholders keep each value tied to the spot it fills; the date range
    # is used twice (plan log filter and fact_trip filter).
    query = """
    SET session hash_partition_count=64;
    -- calculate hourly average gb per plan per scan at city-level
    select distinct
      plans.datestr,
      plans.city_id,
      hour(from_unixtime(cast(plans.ts as bigint))) as hour_of_day,
      count(*) as num_plans,
      avg(completed.client_upfront_fare_local) as hour_gb_gamma_100,
      avg(completed.client_upfront_fare_local * POWER(0.99, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_gb_gamma_99,
      avg(completed.client_upfront_fare_local * POWER(0.95, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_gb_gamma_95,
      avg(completed.client_upfront_fare_local * POWER(0.90, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_gb_gamma_90,
      avg(completed.base_fare_local) as hour_gub_gamma_100,
      avg(completed.base_fare_local * POWER(0.99, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_gub_gamma_99,
      avg(completed.base_fare_local * POWER(0.95, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_gub_gamma_95,
      avg(completed.base_fare_local * POWER(0.90, minute(from_unixtime(cast(plans.ts as bigint))))) as hour_gub_gamma_90
    from
      (
        select
          distinct mgv.datestr,
          mgv.city_id,
          mgv.supply_plan_uuid,
          mgv.job_uuid,
          mgv.supply_uuid,
          mgv.job_creation_time_ms,
          rank() over (
            PARTITION BY mgv.supply_uuid,
            mgv.job_uuid
            ORDER BY
              mgv.ts desc
          ) as rank,
          mgv.ts
        from
          (
            select
              distinct datestr,
              msg.job_uuid,
              msg.supply_uuid,
              msg.supply_plan_uuid,
              msg.city_id,
              msg.ct_request_uuid,
              msg.job_creation_time_ms,
              msg.job_type,
              msg.flow_type,
              ts
            from
              rawdata.kafka_hp_multileg_mgv_log_nodedup
            where
              msg.tenancy = 'uber/production'
              and msg.solo_cancel_model_driver_accept_prob is not NULL
              and msg.solo_cancel_model_rider_accept_prob is not NULL
              and msg.spinner_survive_prob_before_next_scan is not NULL
              and msg.eventual_completion_probability is not NULL
              and msg.city_id in ({city_ids})
              and datestr between '{start_date}' and '{end_date}'
          ) mgv
        where
          mgv.job_type = 'PERSONAL_TRANSPORT'
          and mgv.flow_type in ('solo_batch', 'solo')
          and abs(
            mod(
              from_big_endian_64(xxhash64(CAST(mgv.job_uuid AS varbinary))),
              100
            )
          ) < {sample_percentage}
      ) plans
      join
        dwh.fact_trip completed 
      on plans.job_uuid = completed.uuid
          and plans.supply_uuid = completed.driver_uuid
          and plans.datestr = completed.datestr
          and plans.rank = 1 -- left join fares for last plan
          and completed.datestr between '{start_date}' and '{end_date}'
          and completed.status = 'completed'
    group by
      plans.datestr,
      plans.city_id,
      hour(from_unixtime(cast(plans.ts as bigint)))
    order by
      plans.datestr,
      plans.city_id,
      hour_of_day
    """.format(
        city_ids=city_ids,
        start_date=start_date,
        end_date=end_date,
        sample_percentage=sample_percentage,
    )
    return query

In [None]:
# Run configuration: the city ids, the sample percentage and the
# (start_date, end_date) windows that are fed to prepare_query.
city_list = [
    34, 38, 37, 138, 450,
    245, 453, 540, 36, 47,
]
sample_percentage = 100

# One (start_date, end_date) pair per query run; each window spans two
# consecutive days and the windows do not overlap.
dates_list = [
    ('2022-08-23', '2022-08-24'),
    ('2022-08-25', '2022-08-26'),
    ('2022-08-27', '2022-08-28'),
    ('2022-08-29', '2022-08-30'),
    ('2022-08-31', '2022-09-01'),
    ('2022-09-02', '2022-09-03'),
    ('2022-09-04', '2022-09-05'),
    ('2022-09-06', '2022-09-07'),
    ('2022-09-08', '2022-09-09'),
    ('2022-09-10', '2022-09-11'),
    ('2022-09-12', '2022-09-13'),
    ('2022-09-14', '2022-09-15'),
    ('2022-09-16', '2022-09-17'),
    ('2022-09-18', '2022-09-19'),
]

In [None]:
# Execute the query once per date window and write each result set to its own
# CSV file; files are numbered from 1 in the order the windows appear in dates_list.
for i, (start_date, end_date) in enumerate(dates_list):
    QUERY = prepare_query(city_list, sample_percentage, start_date, end_date)
    cursor = qr.execute("presto-secure", QUERY)
    result = cursor.fetchall()
    result_frame = pd.DataFrame(result)
    result_frame.to_csv(
        f"emea_hourly_labels_local_currency_{i+1}.csv",
        index=False,
    )