In [1]:
# This notebook performs stratified city selection by pulling MX availability and performing cluster-based stratified sampling

# Please download the cluster files from https://drive.google.com/drive/folders/1dXFwzw0A-oyN3dX88CXV9-KprOAtcmBt

# Install packages (one-time)

In [52]:
%%bash
# Activate the Python 3 virtualenv under $VIRTUAL_ENV_DIR before installing.
source $VIRTUAL_ENV_DIR/python3/bin/activate

# Install dsw_qr at a pinned version through the install_package_python3.sh helper.
install_package_python3.sh add dsw_qr==0.1.13

# NOTE(review): these three pip installs are unpinned. The saved output of this
# cell reports dependency conflicts (tornado / cachetools vs. streamlit), so
# consider pinning versions to keep the environment reproducible.
$VIRTUAL_ENV_DIR/python3/bin/python -m pip install galileo
$VIRTUAL_ENV_DIR/python3/bin/python -m pip install galileo-py
$VIRTUAL_ENV_DIR/python3/bin/python -m pip install tchannel


Updating dependencies
Resolving dependencies...


Package operations: 1 install, 9 updates, 0 removals

  - Updating tornado (4.5.3 -> 5.1.1)
  - Updating cachetools (3.1.1 -> 4.1.1)
  - Updating idna (2.8 -> 2.10)
  - Updating protobuf (3.12.2 -> 3.13.0)
  - Updating pytz (2022.1 -> 2021.3)
  - Updating requests (2.22.0 -> 2.24.0)
  - Updating pandas (1.1.5 -> 0.25.3)
  - Installing wheel (0.35.1)
  - Updating queryrunner-client (3.5.0 -> 3.4.1)
  - Updating h3 (3.6.4 -> 3.7.0)
Looking in indexes: https://yoober11:****@pypi.uberinternal.com/index
Looking in indexes: https://yoober11:****@pypi.uberinternal.com/index
Collecting cachetools~=3.0
  Using cached https://pypi.uberinternal.com/packages/packages/2f/a6/30b0a0bef12283e83e58c1d6e7b5aabc7acfc4110df81a4471655d33e704/cachetools-3.1.1-py2.py3-none-any.whl (11 kB)
Collecting tornado<5,>=4.4
  Using cached tornado-4.5.3-cp36-cp36m-linux_x86_64.whl
Installing collected packages: tornado, cachetools
  Attempting uninstall: tornado
    F

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 0.71.0 requires cachetools>=4.0, but you have cachetools 3.1.1 which is incompatible.
streamlit 0.71.0 requires tornado>=5.0, but you have tornado 4.5.3 which is incompatible.


In [31]:
%%bash
# Activate the Python 3 virtualenv under $VIRTUAL_ENV_DIR before changing packages.
source $VIRTUAL_ENV_DIR/python3/bin/activate

# Remove whatever scikit-learn is installed, then add it back pinned to 0.22.2.
install_package_python3.sh remove scikit-learn
install_package_python3.sh add scikit-learn==0.22.2

Updating dependencies
Resolving dependencies...

Writing lock file


Package operations: 1 install, 2 updates, 1 removal

  - Updating tornado (4.5.3 -> 5.1.1)
  - Updating cachetools (3.1.1 -> 4.1.1)
  - Installing wheel (0.35.1)
  - Removing scikit-learn (0.22.2)

Updating dependencies
Resolving dependencies...

Writing lock file


Package operations: 2 installs, 0 updates, 0 removals

  - Installing wheel (0.35.1)
  - Installing scikit-learn (0.22.2)


# Input user params

In [32]:
# User-tunable parameters for the city selection; edit these before running the notebook.
XP_start_date = "2022-08-01 00:00:00" # XP start time, later parsed with the format "%Y-%m-%d %H:%M:%S"
DAYS = 14 # duration of the XP in days
region = "global" # global/apac/emea/latam/usc; selects which clusters/<region>.pkl file is loaded
n_cities = 25 # approximate number of cities to be selected
blacklisted_city_ids = {18} # city ids to exclude manually, regardless of their availability
MIN_CITIES_PER_CLUSTER = 1 # added on top of each cluster's proportional share of n_cities
CLUSTER_COVERAGE_THRESHOLD = 50 # minimum percentage of clusters that must contribute at least one city

In [33]:
import os

import pandas as pd
from dsw_qr import dsw_qr

In [34]:
# Create the local directory that holds the cached query results.
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
os.makedirs("data", exist_ok=True)

# fetch city bookings data

In [35]:
# Email address of the notebook user (intended for the query client's
# user_email argument — TODO confirm); replace with your own before running.
USER_EMAIL = 'targupt@uber.com'

In [36]:
# SQL that lists every switchback experiment together with the cities it books:
# one row per (configuration, city) with the experiment window (start_at / end_at),
# lane and contact email, newest experiments first.
QUERY = """
select
  configurations.`_key`,
  configurations_mx.start_at,
  configurations_mx.end_at,
  configurations_mx.experiment_lane,
  configurations_mx.contact_email,
  cities_mx.city_id,
  cities_mx.city_label
from
  cities_mx
  join configurations_mx on configurations_mx.configuration_id = cities_mx.configuration_id
  join configurations on configurations.id = configurations_mx.configuration_id
where
  configurations_mx.experiment_type = 'switchback'
order by start_at desc
"""

In [37]:
from queryrunner_client import Client

# Identify ourselves with the USER_EMAIL constant instead of repeating the
# address as a second literal, so there is a single place to change it.
qr = Client(user_email=USER_EMAIL)
# Return value is discarded; kept from the original cell
# (presumably a connectivity check — TODO confirm it is still needed).
qr.list_datasources()
# Run QUERY against the "xpc" datasource and pull all rows into memory.
cursor = qr.execute("xpc", QUERY)
result = cursor.fetchall()

05/31/2022 07:15:44 PM Send empty tier_metadata {} to queryrunner. Query is default to tier 5.
05/31/2022 07:15:44 PM [93m [Polling] bd524553-9ff3-423b-99b2-5239e2fc7668 [0m
05/31/2022 07:15:44 PM [93m [Status] started validation [0m
05/31/2022 07:15:45 PM [93m [Status] started execution [0m
05/31/2022 07:15:46 PM [93m [Status] completed success [0m
05/31/2022 07:15:46 PM [92m [Query Success] completed success [0m


In [38]:
# Persist the fetched rows as CSV (without the index) so the availability
# calculation can read them back from data/mx_cities_and_xps.csv.
pd.DataFrame(result).to_csv("data/mx_cities_and_xps.csv", index=False)

# calculate city availability

In [39]:
import datetime

In [40]:
# Parse the user-supplied start time. This cell rebinds XP_start_date from str
# to datetime, so the type guard keeps it re-runnable: calling strptime on the
# already-parsed datetime would raise TypeError.
if isinstance(XP_start_date, str):
    XP_start_date = datetime.datetime.strptime(XP_start_date, "%Y-%m-%d %H:%M:%S")

# [start, end] of the planned experiment window, DAYS days long.
target_interval = [XP_start_date, XP_start_date + datetime.timedelta(days=DAYS)]

In [41]:
# Load the cached experiment bookings and keep only the ones that matter for
# availability: experiments that end after the planned start and are not in
# the 'eats' lane. Built as one chain that ends in a non-inplace sort, so
# `xps` is a fresh frame rather than a slice of the raw data; this avoids
# SettingWithCopyWarning on later column assignments.
xps_raw = pd.read_csv("data/mx_cities_and_xps.csv")
xps = (
    xps_raw
    .assign(
        start_at=lambda df: pd.to_datetime(df['start_at']),
        end_at=lambda df: pd.to_datetime(df['end_at']),
    )
    .loc[lambda df: (df.end_at > XP_start_date) & (df.experiment_lane != 'eats')]
    .sort_values('start_at')
)

In [42]:
# Attach each booking's [start_at, end_at] pair as a list-valued column.
# Built with zip instead of a row-wise DataFrame.apply: apply(axis=1) on an
# empty frame returns an empty DataFrame, which cannot be assigned to a single
# column, and zip is also much faster. dtype=object keeps each cell a list.
xps['interval'] = pd.Series(
    [[start, end] for start, end in zip(xps['start_at'], xps['end_at'])],
    index=xps.index,
    dtype=object,
)

In [43]:
# Map each city_id to the list of its booked [start_at, end_at] intervals.
booked_intervals = xps.groupby('city_id')['interval'].apply(list).to_dict()

In [44]:
# Earlier, day-granularity overlap implementation. It is disabled by being
# wrapped in a string literal, so it is never executed; the active is_overlap
# is the function defined right after this literal.
"""def overlap(a, b):
    if a[0] > b[1] or b[0] > a[1]:
        return 0
    else:
        l = max(a[0], b[0])
        u = min(a[1], b[1])
        #print(u-l, (u-l).days)
        return (u-l).days

def is_overlap(a, b):
    return overlap(a, b) > 0
"""

def is_overlap(a, b):
    """Tell whether two intervals share more than a boundary point.

    Each interval is an indexable pair ``(start, end)``. Intervals that merely
    touch (one ends exactly where the other starts) do not count as overlapping.

    Returns:
        bool: True if the intervals overlap, False otherwise.
    """
    # The intervals are disjoint when either one starts at or after the other's end.
    disjoint = a[0] >= b[1] or b[0] >= a[1]
    return not disjoint

def is_city_available(city_id, target_interval=target_interval, booked_interval=booked_intervals):
    """Return whether a city is free for an experiment over ``target_interval``.

    A city is unavailable when it is in ``blacklisted_city_ids`` or when any of
    its booked intervals overlaps the target interval (per ``is_overlap``).

    Args:
        city_id: Id of the city to check.
        target_interval: ``[start, end]`` pair of the planned experiment.
            The default is bound when this function is defined, so re-run this
            cell after changing the experiment dates.
        booked_interval: Mapping of city_id to a list of booked ``[start, end]``
            pairs. The original body ignored this argument and always read the
            module-level ``booked_intervals``; it is now honoured. The default is
            likewise bound at definition time.

    Returns:
        bool: True if the city can be selected, False otherwise.
    """
    if city_id in blacklisted_city_ids:
        return False
    # Cities with no bookings at all are always available.
    if city_id not in booked_interval:
        return True

    return not any(
        is_overlap(target_interval, city_booking)
        for city_booking in booked_interval[city_id]
    )
    
# is_city_available(3, target_interval, booked_intervals)

# City selection results

In [45]:
import joblib
import math

In [46]:
# load cluster data
# Reads clusters/<region>.pkl relative to the working directory, so the cluster
# files must already be downloaded into ./clusters.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load cluster files obtained from a trusted source.
clusters = joblib.load("clusters/{}.pkl".format(region))

In [47]:
# Sum of 'cluster_gb' over all clusters; the denominator for each cluster's proportional share.
total_cluster_gb = sum(cluster['cluster_gb'] for cluster in clusters)

In [48]:
# Disabled alternative that made the per-cluster minimum depend on cluster size:
# def MIN_CITIES_PER_CLUSTER(gb):
#     return 1 if gb > 5 else 0

# Stratified selection: each cluster gets a quota proportional to its share of
# total 'cluster_gb' plus MIN_CITIES_PER_CLUSTER, filled with available cities
# in the order they appear in the cluster.
selected_cites = []  # (sic) spelling kept because later cells read `selected_cites`
unavailable_cities = []
clusters_covered = set()
for i, c in enumerate(clusters):
    n_cluster_cities = int(n_cities * c['cluster_gb'] / total_cluster_gb) + MIN_CITIES_PER_CLUSTER
    cluster_selected_cities = 0
    for city in c['cities']:
        # Check availability once per city (the original evaluated it twice).
        if not is_city_available(city['data.city_id']):
            unavailable_cities.append(city)
            continue
        # Keep scanning after the quota is filled so that every unavailable
        # city in the cluster is still recorded.
        if cluster_selected_cities < n_cluster_cities:
            city['cluster'] = i
            selected_cites.append(city)
            cluster_selected_cities += 1
            clusters_covered.add(i)

# Percentage of clusters that contributed at least one city. An empty cluster
# list yields 0 coverage instead of a ZeroDivisionError.
clusters_coverage = len(clusters_covered) / len(clusters) * 100 if clusters else 0.0

In [49]:
# return selected cities if conditions are met
if len(selected_cites) >= n_cities and clusters_coverage >= CLUSTER_COVERAGE_THRESHOLD:
    selected_cites = pd.DataFrame(selected_cites)
    print(selected_cites[['cluster', 'data.city_id', 'city.city_name', 'city.mega_region',
           'city.country_name', 'city.region', 'city.sub_region',
           'offers_pct_of_total', 'gb_pct_of_total', 'city_w_offers_pct',
           'city_w_gb_pct', 'num_offers', 'GB_in_MM']])
else:
    print("Not enough cities available. Please try a different date!")

    cluster  data.city_id             city.city_name city.mega_region  \
0         0             5              New York City      US & Canada   
1         0            14                      Miami      US & Canada   
2         0             1              San Francisco      US & Canada   
3         0             8            Washington D.C.      US & Canada   
4         0            13                    Toronto      US & Canada   
5         0             6                     Boston      US & Canada   
6         0            20               Philadelphia      US & Canada   
7         1             3                      Paris             EMEA   
8         1           244             Birmingham, UK             EMEA   
9         1            36                  Stockholm             EMEA   
10        1            34                  Amsterdam             EMEA   
11        2           458                  Sao Paulo            LatAm   
12        2           791               Porto Alegr

#### add power analysis

In [53]:
# Ref: https://michelangelo-studio.uberinternal.com/file/b87b0fff-0b91-49b1-bfa2-698900394be0
# Setup for the power analysis section.
# NOTE(review): os and pandas are already imported earlier in the notebook;
# the repeated imports here are harmless but redundant.
import os
import multiprocessing
from joblib import Parallel, delayed
# NOTE(review): num_cores is not used by any cell visible in this notebook.
num_cores = 4


import matplotlib.pyplot as plt
import pandas as pd


# NOTE(review): the saved output of this cell is "ModuleNotFoundError: No module
# named 'mxpkg'", so mxpkg must be installed in the kernel environment before
# this section can run.
from mxpkg.lib.dsw_utils import notebook_init, log_level
notebook_init()


from mxpkg.api.switchback import SwitchbackAnalysis
from mxpkg.api.power import SwitchbackPowerAnalysis

ModuleNotFoundError: No module named 'mxpkg'

In [None]:
# Metrics that were considered for the power analysis, de-duplicated and kept
# for reference. Previously this list was assigned to `x_metrics` and then
# overwritten straight away, so it never had any effect.
candidate_x_metrics = [
    'requests_completed_rate',
    'completed_trips',
    'completed_to_session_ratio',
    'offered_eta_per_offer',
    'driver_acceptance_rate',
    'rider_cancel_rate',
    'rider_pre_dispatch_cancels_per_request',
    'driver_cancellation_rate',
    'utilization',
    'supply_hours',
    'supply_efficiency',
    'unfulfilled_rate',
    'unique_session_count_bb8_hp',
]
# Disabled candidates:
#     'driver_accept_to_completion_rate_rides',
#     'gross_billings_usd_finance',

# Metrics actually passed to the power analysis.
x_metrics = [
    'completed_trips'
]


In [None]:
# City ids of the selected cities. This indexing needs selected_cites to be the
# DataFrame built by the results cell; if that cell took its "Not enough
# cities" branch, selected_cites is still a list and this line fails.
cities = selected_cites['data.city_id']
# Configure the switchback power analysis for the selected cities and x_metrics.
# NOTE(review): the meaning/unit of switch_granularity=96 is not visible in this
# notebook — confirm against the mxpkg documentation.
pa = SwitchbackPowerAnalysis(
            switch_granularity=96,     
            segment_filters={'city': cities},
            metric_names=x_metrics                
        )
# Collect the data, run the analysis, then read the resulting MDE table.
pa.collect_data()
pa.run_analysis() 
df = pa.mde_table()
#df.to_csv(f'power-analysis/mde_results_2022_model_coverage.csv', index=False)

# Display the MDE table as the cell output.
df