# install packages

In [None]:
%%bash
# Activate the environment found under $VIRTUAL_ENV_DIR/python3.
source $VIRTUAL_ENV_DIR/python3/bin/activate

# dsw_qr is installed at a pinned version through the install_package_python3.sh helper.
install_package_python3.sh add dsw_qr==0.1.13

# The remaining packages are installed unpinned, one pip invocation per package,
# always through the interpreter located inside the environment directory.
venv_python=$VIRTUAL_ENV_DIR/python3/bin/python
for pkg in galileo galileo-py tchannel; do
    $venv_python -m pip install $pkg
done

# run query

In [None]:
import os

import pandas as pd
from dsw_qr import dsw_qr
from queryrunner_client import Client
# Query Runner client used by the cells below. The caller's e-mail can be
# overridden with the QR_USER_EMAIL environment variable so that other users
# can run the notebook without editing code; when the variable is unset the
# original address is used, so existing behaviour is unchanged.
qr = Client(user_email=os.environ.get('QR_USER_EMAIL', 'thai@uber.com'))

In [None]:
def prepare_query(city_list, sample_percentage, start_date, end_date):
    """Build the Presto SQL that aggregates hourly market features per city.

    The generated query:

    * reads plan records from ``rawdata.kafka_hp_multileg_mgv_log_nodedup``
      for the requested cities and ``datestr`` range,
    * keeps only ``PERSONAL_TRANSPORT`` solo / solo_batch plans logged during
      minute 0 of an hour,
    * samples jobs by hashing ``job_uuid`` into buckets 0..99,
    * joins (plain ``join``, i.e. inner) the latest plan per
      ``(supply_uuid, job_uuid)`` to completed trips in ``dwh.fact_trip``,
    * averages the derived features per ``(datestr, city_id, hour_of_day)``.

    Args:
        city_list: Iterable of city ids. Each id is rendered with ``str()``
            into the ``msg.city_id in (...)`` filter. Must not be empty.
        sample_percentage: Number of hash buckets (out of 100) to keep.
            Buckets ``0 .. sample_percentage - 1`` are selected, so 100 keeps
            every job and 0 keeps none.
        start_date: Inclusive lower bound of ``datestr``, e.g. ``'2022-07-29'``.
        end_date: Inclusive upper bound of ``datestr``.

    Returns:
        str: The complete SQL text.

    Raises:
        ValueError: If ``city_list`` yields no ids (the resulting ``in ()``
            clause would be invalid SQL).

    Note:
        All arguments are interpolated into the SQL text verbatim; only pass
        trusted values.
    """
    city_ids = ",".join(str(city_id) for city_id in city_list)
    if not city_ids:
        raise ValueError("city_list must contain at least one city id")

    # Named placeholders are used because the date bounds appear twice.
    # The sampling filter uses a strict '<': abs(mod(hash, 100)) lies in 0..99,
    # so '< p' keeps exactly p buckets ('<= p' would keep p + 1).
    query_template = """
    SET session hash_partition_count=64;
    -- calculate features based on average plan value for first 60s of the hour
    select
        plans.datestr,
        plans.city_id,
        hour(from_unixtime(cast(plans.ts as bigint))) as hour_of_day,
        count(*) as num_plans,
        avg(plans.driver_cancel_prob) as market_driver_cancel_prob,
        avg(plans.rider_cancel_prob) as market_rider_cancel_prob,
        avg(plans.spinner_cancel_prob) as market_spinner_cancel_prob,
        avg(plans.eta) as market_eta,
        avg(plans.surge_mul) as market_surge_mul,
        avg(plans.eventual_comp_prob) as market_eventual_comp_prob,
        avg(completed.client_upfront_fare_usd) as market_gb,
        avg(completed.base_fare_usd) as market_gub,
        avg(plans.eta_one) as market_eta_one,
        avg(plans.eta_one_twenty) as market_eta_one_twenty,
        avg(plans.eta_one_quarter) as market_eta_one_quarter,
        avg(plans.eta_square) as market_eta_square,
        avg(plans.eta_cube) as market_eta_cube,
        avg(plans.network_contention_2) as market_network_contention_2,
        avg(plans.network_contention_3) as market_network_contention_3,
        avg(plans.network_contention_5) as market_network_contention_5,
        avg(plans.cr_ratio) as market_cr_ratio,
        avg(plans.eta_one * plans.cr_ratio) as market_eta_one_cr_ratio,
        avg(plans.eta_one_twenty * plans.cr_ratio) as market_eta_one_twenty_cr_ratio,
        avg(plans.eta_one_quarter * plans.cr_ratio) as market_eta_one_quarter_cr_ratio,
        avg(plans.eta_square * plans.cr_ratio) as market_eta_square_cr_ratio,
        avg(plans.eta_cube * plans.cr_ratio) as market_eta_cube_cr_ratio,
        avg(plans.network_contention_2 * plans.eta_one) as market_network_2_eta_1,
        avg(plans.network_contention_3 * plans.eta_one) as market_network_3_eta_1,
        avg(plans.network_contention_5 * plans.eta_one) as market_network_5_eta_1,
        avg(plans.network_contention_2 * plans.eta_one_twenty) as market_network_2_eta_1_20,
        avg(plans.network_contention_3 * plans.eta_one_twenty) as market_network_3_eta_1_20,
        avg(plans.network_contention_5 * plans.eta_one_twenty) as market_network_5_eta_1_20,
        avg(plans.network_contention_2 * plans.eta_one_quarter) as market_network_2_eta_1_25,
        avg(plans.network_contention_3 * plans.eta_one_quarter) as market_network_3_eta_1_25,
        avg(plans.network_contention_5 * plans.eta_one_quarter) as market_network_5_eta_1_25,
        avg(plans.network_contention_2 * plans.eta_square) as market_network_2_eta_2,
        avg(plans.network_contention_3 * plans.eta_square) as market_network_3_eta_2,
        avg(plans.network_contention_5 * plans.eta_square) as market_network_5_eta_2,
        avg(plans.network_contention_2 * plans.eta_cube) as market_network_2_eta_3,
        avg(plans.network_contention_3 * plans.eta_cube) as market_network_3_eta_3,
        avg(plans.network_contention_5 * plans.eta_cube) as market_network_5_eta_3,
        avg((1 - plans.network_contention_2) * plans.cr_ratio) as market_network_2_cr_ratio,
        avg((1 - plans.network_contention_3) * plans.cr_ratio) as market_network_3_cr_ratio,
        avg((1 - plans.network_contention_5) * plans.cr_ratio) as market_network_5_cr_ratio,
        avg(plans.eta_one * plans.network_contention_2 * plans.cr_ratio) as market_network_2_eta_1_cr_ratio,
        avg(plans.eta_one * plans.network_contention_3 * plans.cr_ratio) as market_network_3_eta_1_cr_ratio,
        avg(plans.eta_one * plans.network_contention_5 * plans.cr_ratio) as market_network_5_eta_1_cr_ratio,
        avg(plans.eta_one_twenty * plans.network_contention_2 * plans.cr_ratio) as market_network_2_eta_1_20_cr_ratio,
        avg(plans.eta_one_twenty * plans.network_contention_3 * plans.cr_ratio) as market_network_3_eta_1_20_cr_ratio,
        avg(plans.eta_one_twenty * plans.network_contention_5 * plans.cr_ratio) as market_network_5_eta_1_20_cr_ratio,
        avg(plans.eta_one_quarter * plans.network_contention_2 * plans.cr_ratio) as market_network_2_eta_1_25_cr_ratio,
        avg(plans.eta_one_quarter * plans.network_contention_3 * plans.cr_ratio) as market_network_3_eta_1_25_cr_ratio,
        avg(plans.eta_one_quarter * plans.network_contention_5 * plans.cr_ratio) as market_network_5_eta_1_25_cr_ratio,
        avg(plans.eta_square * plans.network_contention_2 * plans.cr_ratio) as market_network_2_eta_2_cr_ratio,
        avg(plans.eta_square * plans.network_contention_3 * plans.cr_ratio) as market_network_3_eta_2_cr_ratio,
        avg(plans.eta_square * plans.network_contention_5 * plans.cr_ratio) as market_network_5_eta_2_cr_ratio,
        avg(plans.eta_cube * plans.network_contention_2 * plans.cr_ratio) as market_network_2_eta_3_cr_ratio,
        avg(plans.eta_cube * plans.network_contention_3 * plans.cr_ratio) as market_network_3_eta_3_cr_ratio,
        avg(plans.eta_cube * plans.network_contention_5 * plans.cr_ratio) as market_network_5_eta_3_cr_ratio,
        avg(plans.eta_one * (1 - plans.network_contention_2) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_2_eta_1_gb_cr_ratio,
        avg(plans.eta_one * (1 - plans.network_contention_3) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_3_eta_1_gb_cr_ratio,
        avg(plans.eta_one * (1 - plans.network_contention_5) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_5_eta_1_gb_cr_ratio,
        avg(plans.eta_one_twenty * (1 - plans.network_contention_2) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_2_eta_1_20_gb_cr_ratio,
        avg(plans.eta_one_twenty * (1 - plans.network_contention_3) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_3_eta_1_20_gb_cr_ratio,
        avg(plans.eta_one_twenty * (1 - plans.network_contention_5) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_5_eta_1_20_gb_cr_ratio,
        avg(plans.eta_one_quarter * (1 - plans.network_contention_2) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_2_eta_1_25_gb_cr_ratio,
        avg(plans.eta_one_quarter * (1 - plans.network_contention_3) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_3_eta_1_25_gb_cr_ratio,
        avg(plans.eta_one_quarter * (1 - plans.network_contention_5) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_5_eta_1_25_gb_cr_ratio,
        avg(plans.eta_square * (1 - plans.network_contention_2) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_2_eta_2_gb_cr_ratio,
        avg(plans.eta_square * (1 - plans.network_contention_3) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_3_eta_2_gb_cr_ratio,
        avg(plans.eta_square * (1 - plans.network_contention_5) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_5_eta_2_gb_cr_ratio,
        avg(plans.eta_cube * (1 - plans.network_contention_2) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_2_eta_3_gb_cr_ratio,
        avg(plans.eta_cube * (1 - plans.network_contention_3) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_3_eta_3_gb_cr_ratio,
        avg(plans.eta_cube * (1 - plans.network_contention_5) * plans.cr_ratio * completed.client_upfront_fare_usd) as market_network_5_eta_3_gb_cr_ratio,
        avg(plans.eta_one * (1 - plans.network_contention_2) * plans.cr_ratio * completed.base_fare_usd) as market_network_2_eta_1_gub_cr_ratio,
        avg(plans.eta_one * (1 - plans.network_contention_3) * plans.cr_ratio * completed.base_fare_usd) as market_network_3_eta_1_gub_cr_ratio,
        avg(plans.eta_one * (1 - plans.network_contention_5) * plans.cr_ratio * completed.base_fare_usd) as market_network_5_eta_1_gub_cr_ratio,
        avg(plans.eta_one_twenty * (1 - plans.network_contention_2) * plans.cr_ratio * completed.base_fare_usd) as market_network_2_eta_1_20_gub_cr_ratio,
        avg(plans.eta_one_twenty * (1 - plans.network_contention_3) * plans.cr_ratio * completed.base_fare_usd) as market_network_3_eta_1_20_gub_cr_ratio,
        avg(plans.eta_one_twenty * (1 - plans.network_contention_5) * plans.cr_ratio * completed.base_fare_usd) as market_network_5_eta_1_20_gub_cr_ratio,
        avg(plans.eta_one_quarter * (1 - plans.network_contention_2) * plans.cr_ratio * completed.base_fare_usd) as market_network_2_eta_1_25_gub_cr_ratio,
        avg(plans.eta_one_quarter * (1 - plans.network_contention_3) * plans.cr_ratio * completed.base_fare_usd) as market_network_3_eta_1_25_gub_cr_ratio,
        avg(plans.eta_one_quarter * (1 - plans.network_contention_5) * plans.cr_ratio * completed.base_fare_usd) as market_network_5_eta_1_25_gub_cr_ratio,
        avg(plans.eta_square * (1 - plans.network_contention_2) * plans.cr_ratio * completed.base_fare_usd) as market_network_2_eta_2_gub_cr_ratio,
        avg(plans.eta_square * (1 - plans.network_contention_3) * plans.cr_ratio * completed.base_fare_usd) as market_network_3_eta_2_gub_cr_ratio,
        avg(plans.eta_square * (1 - plans.network_contention_5) * plans.cr_ratio * completed.base_fare_usd) as market_network_5_eta_2_gub_cr_ratio,
        avg(plans.eta_cube * (1 - plans.network_contention_2) * plans.cr_ratio * completed.base_fare_usd) as market_network_2_eta_3_gub_cr_ratio,
        avg(plans.eta_cube * (1 - plans.network_contention_3) * plans.cr_ratio * completed.base_fare_usd) as market_network_3_eta_3_gub_cr_ratio,
        avg(plans.eta_cube * (1 - plans.network_contention_5) * plans.cr_ratio * completed.base_fare_usd) as market_network_5_eta_3_gub_cr_ratio
    from
      (
        select
            distinct mgv.datestr,
            mgv.city_id,
            mgv.supply_plan_uuid,
            mgv.job_uuid,
            mgv.supply_uuid,
            mgv.job_creation_time_ms,
            mgv.driver_cancel_prob,
            mgv.rider_cancel_prob,
            mgv.spinner_cancel_prob,
            mgv.eta,
            mgv.surge_mul,
            mgv.eventual_comp_prob,
            POWER(1 - mgv.eta / 1500.0, 1) as eta_one,
            POWER(1 - mgv.eta / 1500.0, 1.20) as eta_one_twenty,
            POWER(1 - mgv.eta / 1500.0, 1.25) as eta_one_quarter,
            POWER(1 - mgv.eta / 1500.0, 2) as eta_square,
            POWER(1 - mgv.eta / 1500.0, 3) as eta_cube,
            1.0 / (1.0 + POWER(mgv.surge_mul, 2)) as network_contention_2,
            1.0 / (1.0 + POWER(mgv.surge_mul, 3)) as network_contention_3,
            1.0 / (1.0 + POWER(mgv.surge_mul, 5)) as network_contention_5,
            (1.0 - mgv.driver_cancel_prob) * (1.0 - mgv.rider_cancel_prob) * (1.0 - mgv.spinner_cancel_prob) + mgv.eventual_comp_prob * mgv.driver_cancel_prob as cr_ratio,
            rank() over (
            PARTITION BY mgv.supply_uuid,
            mgv.job_uuid
            ORDER BY
              mgv.ts desc
            ) as rank,
            mgv.ts
        from
          (
            select
              distinct datestr,
              msg.job_uuid,
              msg.supply_uuid,
              msg.supply_plan_uuid,
              msg.city_id,
              msg.ct_request_uuid,
              msg.job_creation_time_ms,
              1.0 - msg.solo_cancel_model_driver_accept_prob as driver_cancel_prob,
              1.0 - msg.solo_cancel_model_rider_accept_prob as rider_cancel_prob,
              1.0 - msg.spinner_survive_prob_before_next_scan as spinner_cancel_prob,
              (CASE
                WHEN msg.adjustedeta > 1500 THEN 1500.0
                WHEN msg.adjustedeta < 0 THEN 0.0
                ELSE msg.adjustedeta
              END) as eta,
              msg.job_surge as surge_mul,
              msg.eventual_completion_probability as eventual_comp_prob,
              msg.job_type,
              msg.flow_type,
              ts
            from
              rawdata.kafka_hp_multileg_mgv_log_nodedup
            where
              msg.tenancy = 'uber/production'
              and msg.solo_cancel_model_driver_accept_prob is not NULL
              and msg.solo_cancel_model_rider_accept_prob is not NULL
              and msg.spinner_survive_prob_before_next_scan is not NULL
              and msg.eventual_completion_probability is not NULL
              and msg.city_id in ({city_ids})
              and datestr between '{start_date}' and '{end_date}'
          ) mgv
        where
          mgv.job_type = 'PERSONAL_TRANSPORT'
          and mgv.flow_type in ('solo_batch', 'solo')
          and minute(from_unixtime(cast(mgv.ts as bigint))) = 0 and second(from_unixtime(cast(mgv.ts as bigint))) between 0 and 60
          and abs(
            mod(
              from_big_endian_64(xxhash64(CAST(mgv.job_uuid AS varbinary))),
              100
            )
          ) < {sample_percentage}
      ) as plans
      join
        dwh.fact_trip as completed
      on
        plans.job_uuid = completed.uuid
        and plans.supply_uuid = completed.driver_uuid
        and plans.datestr = completed.datestr
        and plans.rank = 1 -- left join fares for last plan
        and completed.datestr between '{start_date}' and '{end_date}'
        and completed.status = 'completed'
    group by
      plans.datestr,
      plans.city_id,
      hour(from_unixtime(cast(plans.ts as bigint)))
    order by
      plans.datestr,
      plans.city_id,
      hour_of_day
    """
    return query_template.format(
        city_ids=city_ids,
        sample_percentage=sample_percentage,
        start_date=start_date,
        end_date=end_date,
    )

In [None]:
# Inputs passed to prepare_query: the cities to include and the job-sampling
# threshold (100 keeps every job).
city_list = [
    1269, 789, 797, 801, 803,
    204, 144, 787, 148, 933,
]
sample_percentage = 100

# Consecutive two-day (start_date, end_date) windows covering
# 2022-07-29 through 2022-08-25, as 'YYYY-MM-DD' strings.
_window_days = pd.date_range('2022-07-29', '2022-08-25').strftime('%Y-%m-%d').tolist()
dates_list = list(zip(_window_days[::2], _window_days[1::2]))

In [None]:
# Run one query per date window and write each result set to its own CSV,
# numbered from 1 in the order the windows appear in dates_list.
for i, (start_date, end_date) in enumerate(dates_list):
    QUERY = prepare_query(city_list=city_list,
                          sample_percentage=sample_percentage,
                          start_date=start_date,
                          end_date=end_date)
    cursor = qr.execute("presto-secure", QUERY)
    result = cursor.fetchall()
    frame = pd.DataFrame(result)
    frame.to_csv(f"latam_hourly_more_features_{i+1}.csv", index=False)