In [1]:
#This notebook performs representative city selection for SIM/CFP based on stratified sampling from city clusterings with city vectors encoding marketplace conditions

# Install packages (one-time)

In [1]:
%%bash
source $VIRTUAL_ENV_DIR/python3/bin/activate

install_package_python3.sh add dsw_qr==0.1.13

$VIRTUAL_ENV_DIR/python3/bin/python -m pip install galileo
$VIRTUAL_ENV_DIR/python3/bin/python -m pip install galileo-py
$VIRTUAL_ENV_DIR/python3/bin/python -m pip install tchannel


Updating dependencies
Resolving dependencies...


Package operations: 1 install, 0 updates, 0 removals

  - Installing wheel (0.35.1)
Looking in indexes: https://yoober11:****@pypi.uberinternal.com/index
Looking in indexes: https://yoober11:****@pypi.uberinternal.com/index
Collecting cachetools~=3.0
  Using cached https://pypi.uberinternal.com/packages/packages/2f/a6/30b0a0bef12283e83e58c1d6e7b5aabc7acfc4110df81a4471655d33e704/cachetools-3.1.1-py2.py3-none-any.whl (11 kB)
Collecting tornado<5,>=4.4
  Using cached tornado-4.5.3-cp36-cp36m-linux_x86_64.whl
Installing collected packages: tornado, cachetools
  Attempting uninstall: tornado
    Found existing installation: tornado 5.1.1
    Uninstalling tornado-5.1.1:
      Successfully uninstalled tornado-5.1.1
  Attempting uninstall: cachetools
    Found existing installation: cachetools 4.1.1
    Uninstalling cachetools-4.1.1:
      Successfully uninstalled cachetools-4.1.1
Successfully installed cachetools-3.1.1 tornado-4.5.3
Looking i

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 0.71.0 requires cachetools>=4.0, but you have cachetools 3.1.1 which is incompatible.
streamlit 0.71.0 requires tornado>=5.0, but you have tornado 4.5.3 which is incompatible.
mxpkg 1.1.49 requires bcrypt==3.1.7, but you have bcrypt 3.2.0 which is incompatible.
mxpkg 1.1.49 requires certifi==2020.4.5.1, but you have certifi 2020.6.20 which is incompatible.
mxpkg 1.1.49 requires cffi==1.14.0, but you have cffi 1.14.3 which is incompatible.
mxpkg 1.1.49 requires clay-config-file==1.2.0, but you have clay-config-file 1.2.1 which is incompatible.
mxpkg 1.1.49 requires click==7.1.1, but you have click 7.1.2 which is incompatible.
mxpkg 1.1.49 requires colorama==0.4.3, but you have colorama 0.4.4 which is incompatible.
mxpkg 1.1.49 requires cryptography==2.9, but you have cryptography 3.2 which is incompatible

# Input user params

In [7]:
# input user params
region = "global" # global/apac/emea/latam/usc
n_cities = 10 # approximate number of cities to be selected
blacklisted_city_ids = {18} # if manually want to black certain cities
MIN_CITIES_PER_CLUSTER = 1 # min cities that we try to sample per cluster
CLUSTER_COVERAGE_THRESHOLD = 50 # percentage of clusters to be covered
min_cluster_gb_for_sampling = 1 # minimim GB for us to sample from cluster

In [8]:
import os

import pandas as pd
from dsw_qr import dsw_qr

In [9]:
if not os.path.exists("data"):
    os.mkdir("data")

# Run sampling

In [10]:
import joblib
import math

In [11]:
# load cluster data
clusters = joblib.load("clusters/{}.pkl".format(region))

In [12]:
total_cluster_gb = sum([c['cluster_gb'] for c in clusters])

In [14]:
def is_city_available(city_id):
    return True

selected_cites = []
unavailable_cities = []
clusters_covered = set()
for i, c in enumerate(clusters):
    if c['cluster_gb'] > min_cluster_gb_for_sampling:
        n_cluster_cities = max(int(n_cities*c['cluster_gb']/total_cluster_gb), MIN_CITIES_PER_CLUSTER)
        cluster_selected_cities = 0
        for city in c['cities']:
            if not is_city_available(city['data.city_id']):
                unavailable_cities.append(city)
            if cluster_selected_cities < n_cluster_cities and is_city_available(city['data.city_id']):
                city['cluster'] = i
                selected_cites.append(city)
                cluster_selected_cities += 1
                clusters_covered.add(i)
            
            
clusters_coverage = len(clusters_covered)/len(clusters)*100

In [15]:
len(selected_cites)

14

In [16]:
# return selected cities if conditions are met
if len(selected_cites) >= n_cities and clusters_coverage >= CLUSTER_COVERAGE_THRESHOLD:
    selected_cites = pd.DataFrame(selected_cites)
    print(selected_cites[['cluster', 'data.city_id', 'city.city_name', 'city.mega_region',
           'city.country_name', 'city.region', 'city.sub_region',
           'offers_pct_of_total', 'gb_pct_of_total', 'city_w_offers_pct',
           'city_w_gb_pct', 'num_offers', 'GB_in_MM']])
else:
    print("Not enough cities available. Please try a different date!")
    
#selected_cites.to_csv("sim_cities/stratified_sampling_selected_cities_for_sim_{}.csv".format(region), index=False)

    cluster  data.city_id city.city_name city.mega_region city.country_name  \
0         0             5  New York City      US & Canada     United States   
1         0            14          Miami      US & Canada     United States   
2         1            18         London             EMEA    United Kingdom   
3         2           458      Sao Paulo            LatAm            Brazil   
4         3             7        Chicago      US & Canada     United States   
5         4            12    Los Angeles      US & Canada     United States   
6         5            90    Mexico City            LatAm            Mexico   
7         6            39      Melbourne             APAC         Australia   
8         7           143      Hong Kong             APAC         Hong Kong   
9         8            16         Madrid             EMEA             Spain   
10       10           148       Santiago            LatAm             Chile   
11       11           531          Cairo            

In [197]:
%%bash

zip -r sim_cities.zip sim_cities

  adding: sim_cities/ (stored 0%)
  adding: sim_cities/stratified_sampling_selected_cities_for_sim_emea.csv (deflated 49%)
  adding: sim_cities/stratified_sampling_selected_cities_for_sim_global.csv (deflated 51%)
  adding: sim_cities/stratified_sampling_selected_cities_for_sim_apac.csv (deflated 50%)
  adding: sim_cities/stratified_sampling_selected_cities_for_sim_latam.csv (deflated 52%)
  adding: sim_cities/stratified_sampling_selected_cities_for_sim_usc.csv (deflated 58%)


