In [1]:
%%bash
# Activate the Python 3 virtualenv under $VIRTUAL_ENV_DIR before calling pip3
# (presumably so the package lands in the notebook's environment -- confirm).
source $VIRTUAL_ENV_DIR/python3/bin/activate
# The version is pinned, so re-running this cell always installs 0.7.5.
pip3 install -U matching-ds-tools==0.7.5

Looking in indexes: https://yoober11:****@pypi.uberinternal.com/index


In [2]:
import mdstk
mdstk.__version__

'0.7.5'

In [3]:
# PYTHON 3.6

import datetime
import numpy as np
import pandas as pd
import requests

import json
import os
import errno
import copy
import hashlib

import itertools as it

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm

from mdstk import (
    SimulationJob,
    BatchJob,
    DataFetcher,
    CanonicalTimeBucket,
    MultiverseClient,
    __version__ as MDSTK_VERSION
)


print('Matching DS Toolkit v{}'.format(MDSTK_VERSION))

# Compare only the leading digits of each dotted component, so that suffixed
# versions such as '0.8.0rc1' or '0.8.0.dev3' reach the comparison below instead
# of crashing inside int(). A component without leading digits counts as 0.
_installed_version = tuple(
    int(''.join(it.takewhile(lambda ch: ch in '0123456789', part)) or 0)
    for part in MDSTK_VERSION.split('.')
)
assert _installed_version >= (0, 5, 12), (
    'MDSTK out-of-date.\nNeed at least matching-ds-tools >= 0.5.12 to use the script.\nUpdate by running the following command in a DSW terminal:\n\nsource ${VIRTUAL_ENV_DIR}/bin/activate && pip3 install -U matching-ds-tools && deactivate'
)
print('\nAlways get the latest template at t.uber.com/batch-sim-notebook')

Matching DS Toolkit v0.7.5

Always get the latest template at t.uber.com/batch-sim-notebook


# NOTEBOOK-WIDE CONSTANTS

In [131]:
# Identity of the person submitting the sims -- replace with your own values.
USER_EMAIL = 'mshehata@uber.com'  # your email
USER_UUID = 'e87649b7-67cd-4b25-b061-6307b527fb43'  # your uuid (go to whober and look under your name)

# Directory that will contain the sim metadata for this project.
PROJECT_DIR = 'notebook_test_sims'

# Project prefix -- sim names look like <PROJECT_PRE>_<batch_uuid>.
# Prefixes used for earlier sweeps: 'UnfulfilMaxRetries', 'UnfulfilJobAge', 'OFDelta'.
PROJECT_PRE = 'UnfulfilJobAgeAndMaxRetries'

# Name of the project, used as a batch id tag.
PROJECT_NAME = 'ParamTuning'

# Every constant above has to be filled in before anything else runs.
assert all((USER_EMAIL, USER_UUID, PROJECT_DIR, PROJECT_PRE, PROJECT_NAME))

def safe_create_new_dir(name):
    """Create the directory `name`, including any missing parent directories.

    A directory that already exists is left untouched and is not an error.

    Args:
        name: path of the directory to create.

    Raises:
        FileExistsError: if `name` exists but is not a directory. The previous
            implementation silently ignored this case, which left callers
            without the directory they asked for.
        OSError: for any other failure reported by the operating system.
    """
    os.makedirs(name, exist_ok=True)

# Create the project directory up front; later cells write the batch JSON and
# the trip-stats CSV files into it.
safe_create_new_dir(PROJECT_DIR)

# helper functions

In [132]:
# Modifies raw trip input from the query.
#
# Feel free to customize by adding an argument to this function (in this example, "shared_rides_trips_only"),
# then add the keyword argument-value pair to the configuration_space dictionary below.
#
# City settings get passed in here by default; to use them, just add them to the arguments in the function
# definition (e.g. hel_vvid, pool_vvid, x_vvid below).

def modify_trip_data(trip_df, hel_vvid, pool_vvid, x_vvid,
                     shared_rides_trips_only=False, uberx_trips_only=False, salt='a83y4thaih',**kwargs):
    """Return a filtered, type-normalised copy of the raw trip data.

    Args:
        trip_df: raw trips DataFrame. It is copied and never modified.
        hel_vvid: vehicle view id kept when `shared_rides_trips_only` is set.
        pool_vvid: vehicle view id kept when `shared_rides_trips_only` is set.
        x_vvid: vehicle view id kept when `uberx_trips_only` is set.
        shared_rides_trips_only: keep only rows whose `vehicle_view_id` is
            `hel_vvid` or `pool_vvid`.
        uberx_trips_only: keep only rows whose `vehicle_view_id` is `x_vvid`.
        salt: string mixed into the hash used by the per-job random helper.
        **kwargs: accepted and ignored, so the same merged keyword set can be
            passed to both this function and `modify_sim_settings`.

    Returns:
        A new DataFrame in which rows with a null in any of the integer
        columns (`city_id`, `capacity`, `vehicle_view_id`, `eyeball_eta`,
        `max_wait_time_sec`) are dropped and those columns are cast to int.
        Integer columns that are absent from the input are skipped.
    """

    # Job-based random number generator, might be helpful if you want to randomize anything.
    # md5 only accepts bytes, so the salted id is encoded before hashing.
    fn = lambda x: np.random.RandomState(
        int(hashlib.md5((x + salt).encode('utf-8')).hexdigest(), base=16) % (2**23)
    ).rand()

    df = trip_df.copy()

    # For example, this is how you might randomly but predictably turn pool trips into helium trips.

    # rand_series = df['uuid'].apply(fn)

    # df = df[ (rand_series <  helium_pct + pool_pct) ]

    # df.loc[  (rand_series <  helium_pct) , 'product_type' ] = 'helium'
    # df.loc[  (rand_series >= helium_pct) , 'product_type' ] = 'pool'

    # df.loc[ (df['product_type']=='helium') , 'product_type_uuid' ] = hel_prod_uuid
    # df.loc[ (df['product_type']=='pool'  ) , 'product_type_uuid' ] = pool_prod_uuid

    # df.loc[ (df['product_type']=='helium') , 'max_wait_time_sec' ] = helium_wait
    # df.loc[ (df['product_type']=='pool'  ) , 'max_wait_time_sec' ] = pool_wait

    if uberx_trips_only:
        df = df[df['vehicle_view_id'] == x_vvid].reset_index(drop=True)

    if shared_rides_trips_only:
        df = df[df['vehicle_view_id'].isin((hel_vvid, pool_vvid))].reset_index(drop=True)

    # Only look at the integer columns that are actually present, so the null
    # filter and the cast below agree on which columns are optional.
    _int_cols = [_col for _col in ('city_id',
                                   'capacity',
                                   'vehicle_view_id',
                                   'eyeball_eta',
                                   'max_wait_time_sec')
                 if _col in df.columns]

    # .copy() detaches the filtered rows, so the casts below write to an
    # independent frame instead of a slice (no SettingWithCopyWarning).
    df = df[pd.notnull(df[_int_cols]).all(axis=1)].copy()

    for _col in _int_cols:
        df[_col] = df[_col].astype(int)

    return df


# Modifies the fliprs, branches, and other settings in the sim.
#
# Feel free to customize by adding an argument to this function (in this example, "max_retries" and "job_age"),
# then add the keyword argument-value pair to the configuration_space dictionary below.

def modify_sim_settings(new_sim_job,
                        city_id,
                        city_radar_dict,
                        max_retries,
                        job_age,
                        **kwargs):
    """Point a sim job at one city and apply the flipr overrides of one configuration.

    Args:
        new_sim_job: sim job object to configure; it is modified in place.
        city_id: id passed to `set_city_id`.
        city_radar_dict: value sent for the `ct_radars_v1` flipr.
        max_retries: value sent for `jit.max_retries_for_unfulfil`.
        job_age: value sent for `jit.job_age_before_unfulfil`.
        **kwargs: accepted and ignored.

    Returns:
        The same `new_sim_job` object that was passed in.
    """
    flipr_namespace = 'rt-control-tower'
    postprocessing_scripts = ['pt_fare_calculation_postprocessing.py',
                              'experience_metrics_postprocessing.py',
                              'report_generation_postprocessing.py']

    # City first, then the pre/post-processing scripts (no preprocessing is run).
    new_sim_job.set_city_id(city_id)
    new_sim_job.update_preprocessing_scripts([])
    new_sim_job.update_postprocessing_scripts(postprocessing_scripts)

    # Other things that can be customised here:
    #   new_sim_job.update_dependencies({'mitm-perf': 'v2.4.2'})
    #   new_sim_job.update_settings({'pickup_walking_radius': 350,
    #                                'dropoff_walking_radius': 350})
    #   new_sim_job.update_fliprs({'multileg':{'parallel_solving.of_delta': of_delta}})
    #   new_sim_job.update_fliprs({'multileg':{'parallel_solving.contention_threshold': contention_threshold}})

    # Radars first, then the two tuned parameters; each override is sent in
    # its own update_fliprs() call, in this order.
    flipr_overrides = (
        ('ct_radars_v1', city_radar_dict),
        ('jit.max_retries_for_unfulfil', max_retries),
        ('jit.job_age_before_unfulfil', job_age),
    )
    for flipr_key, flipr_value in flipr_overrides:
        new_sim_job.update_fliprs({flipr_namespace: {flipr_key: flipr_value}})

    return new_sim_job


# Takes a single baseline sim and modifies it with the provided parameters.
# This includes modifying the settings, flipr config, trip data, and simulation name.

def batch_sim_mapper(sim_job, batch_uuid, param_name, **params):
    """Turn one baseline sim into the sim for a single parameter combination.

    Args:
        sim_job: baseline sim job; it is modified in place.
        batch_uuid: uuid of the batch, used in the sim name and in a batch id.
        param_name: name of the parameter combination, used in the sim name.
        **params: must contain 'city', 'time_bucket' and 'configuration';
            'city' and 'time_bucket' need a 'key', 'city' and 'configuration'
            need an 'args' dict.

    Returns:
        The same `sim_job`, with description, batch ids, settings and demand
        input set.

    Raises:
        AssertionError: if no trips are left after `modify_trip_data`.
    """
    # Sim name and batch id tags.
    sim_name = '{}_{}_{}'.format(PROJECT_PRE, batch_uuid, param_name)
    tags = [
        PROJECT_NAME,
        params['city']['key'].upper(),
        params['time_bucket']['key'].upper(),
        '{}_{}'.format(PROJECT_PRE,str(batch_uuid).upper()),
    ]
    sim_job.set_description(sim_name)
    sim_job.set_batch_ids(tags)

    # City args and configuration args are merged (configuration wins on a
    # clash) and the same set is handed to both modifier functions.
    merged_args = {**params['city']['args'], **params['configuration']['args']}

    # Flipr and sim settings.
    modify_sim_settings(sim_job, **merged_args)

    # Trip data for this (city, time bucket) pair.
    bucket_trips = processed_trips_dict[params['city']['key'], params['time_bucket']['key']]
    new_trip_data = modify_trip_data(bucket_trips, **merged_args)

    assert len(new_trip_data), (
        'There are no trips for {} simulation'.format(param_name)
    )
    sim_job.set_demand_input(new_trip_data)

    return sim_job


# Time commitment preprocessing
#
# Before running, run the following command while ssh'ed into DSW server, 
# replacing <ldap> with your own username,
# making sure the proper dependencies are installed.
# 
# git clone gitolite@code.uber.internal:meff/mitm-perf /mnt/cephfs/hadoop-compute/phoenix/<ldap>/mitm-perf

def offline_preprocessing(df_dict, script_list=['time_commitment_preprocessing.py'], yaml_dict=None):
    """Run mitm-perf preprocessing scripts over every trips DataFrame.

    For each entry the trips are written to `<PROJECT_DIR>/trip_data`, every
    script is run as `python -m scripts.processing.<script>` from the
    mitm-perf checkout with `<PROJECT_DIR>` as both input and output
    directory, and `trip_data` is read back as the processed result.

    Args:
        df_dict: maps a key to its raw trips DataFrame.
        script_list: script file names; a trailing '.py' is stripped to get
            the module name. The default list is never mutated.
        yaml_dict: optional map from key to a settings object that is dumped
            to `<PROJECT_DIR>/simulation.yaml` before the scripts run.

    Returns:
        dict mapping each key of `df_dict` to its processed trips DataFrame.

    Raises:
        AssertionError: if VIRTUAL_ENV_DIR is unset, the mitm-perf checkout is
            missing, or a DataFrame is empty.
        subprocess.CalledProcessError: if a preprocessing script exits with a
            non-zero status. Previously the failure was silent and the
            unprocessed trips were returned as if they had been processed.
    """
    import subprocess  # only this helper shells out

    venv_dir = os.getenv('VIRTUAL_ENV_DIR')
    assert venv_dir, 'VIRTUAL_ENV_DIR is not set, cannot locate the python3 interpreter'
    # NOTE(review): the install cell at the top of the notebook activates
    # $VIRTUAL_ENV_DIR/python3/bin/activate -- confirm 'bin/python3' is the
    # right interpreter location for this environment.
    PYTHON_PATH = os.path.join(venv_dir, 'bin/python3')
    MITM_PERF_PATH = '../mitm-perf'
    assert os.path.isdir(MITM_PERF_PATH), (
        'mitm-perf checkout not found at {}'.format(MITM_PERF_PATH)
    )

    processed_trips_dict = {}

    # Module names are the script names without their '.py' suffix.
    module_names = [name[:-3] if name.endswith('.py') else name for name in script_list]
    yaml_dict = yaml_dict or {}
    if yaml_dict:
        import yaml  # PyYAML; only needed when per-key settings are supplied

    io_dir = os.path.join(os.getcwd(), PROJECT_DIR)
    trip_path = os.path.join(io_dir, 'trip_data')
    yaml_path = os.path.join(io_dir, 'simulation.yaml')

    for key, df in tqdm(df_dict.items()):
        assert len(df), f'No trips found for {key}'
        has_yaml = key in yaml_dict
        try:
            df.to_csv(trip_path, index=False)
            if has_yaml:
                with open(yaml_path, 'w') as f:
                    yaml.safe_dump(yaml_dict[key], f)

            for module in module_names:
                # stdout is discarded as before; check=True surfaces failures.
                subprocess.run(
                    [PYTHON_PATH, '-m', 'scripts.processing.{}'.format(module),
                     '-i', io_dir, '-o', io_dir],
                    cwd=MITM_PERF_PATH,
                    stdout=subprocess.DEVNULL,
                    check=True,
                )

            processed_trips_dict[key] = pd.read_csv(trip_path, na_values=['\\N'])
        finally:
            # Always remove the scratch files, even when a script failed.
            if os.path.exists(trip_path):
                os.remove(trip_path)
            if has_yaml and os.path.exists(yaml_path):
                os.remove(yaml_path)

    return processed_trips_dict

# Start sim

In [177]:
## SIM CONSTANTS

# Clients used by the rest of the notebook.
data_fetcher = DataFetcher(user_email=USER_EMAIL, consumer_name='intelligentdispatch')
sim_client = MultiverseClient(user_email=USER_EMAIL, user_uuid=USER_UUID)

# None defaults to the baseline corresponding to FLOW_TYPE; feel free to use your own uuid string.
BASELINE_UUID = None
# Choose between pool, solo, rt-surge (reach out to sim team if you have a new baseline use case).
FLOW_TYPE = 'solo'

PRIORITY = 1
# 'rt-sim01' ... 'rt-sim32'
PIPELINES = ['rt-sim{:02d}'.format(k) for k in range(1, 33)]

# Sampling window and seed for the time-bucket sampling further down.
START_DATE = '2022-06-01 00:00:00'
END_DATE = '2022-06-07 00:00:00'
NUM_HOURS = 1
RNG_SEED = 2835602352


# These are the cities for which metadata (e.g. vvids, radars) will be fetched, feel free to change.
# Full list used for wider sweeps:
#   [12, 7, 90, 1542, 20, 39, 6, 25, 31, 296, 135, 5, 531, 14, 27, 198, 791, 1, 211, 214, 143, 8,
#    18, 16, 493, 458, 244, 23, 3, 13, 148, 24]
CITY_IDS = [25, 20]
CITY_SETTINGS = data_fetcher.get_city_settings(
    CITY_IDS,
    datestr=START_DATE.split()[0],
    filter_fifo_radars=True,
    filter_hcv_radars=True,
)

06/28/2022 05:04:36 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:36 PM [93m [Polling] e8ab26cd-1559-4ac3-afd7-f7fd73fa409b [0m
06/28/2022 05:04:36 PM [93m [Status] created [0m
06/28/2022 05:04:37 PM [93m [Status] started waiting to execute [0m
06/28/2022 05:04:38 PM [93m [Status] started execution [0m
06/28/2022 05:04:46 PM [93m [Status] completed success [0m
06/28/2022 05:04:46 PM [92m [Query Success] completed success [0m


In [178]:
# Show the keys of the fetched city settings and how many there are.
CITY_SETTINGS.keys(), len(CITY_SETTINGS.keys())

(dict_keys(['PHL', 'DAL']), 2)

In [180]:
##  PARAMETER SPACE
# Use the CanonicalTimeBucket object to sample different intervals
# during the given time range for each of the 7 time buckets.
# A fresh RandomState with the same seed is created for every bucket.

bucketer = CanonicalTimeBucket()
tb_space = {
    bucket: dict(zip(
        ('start_timestamp_local', 'end_timestamp_local'),
        bucketer.sample_from_bucket(bucket, START_DATE, END_DATE, NUM_HOURS,
                                    random_number_gen=np.random.RandomState(seed=RNG_SEED)),
    ))
    for bucket in bucketer
}


# Sample city space

city_space = CITY_SETTINGS


# Sample param space for fliprs and trip modifications.

# This is a dictionary where the keys are parameter names, and the values are themselves dictionaries of
# keyword arguments that get sent to modify_sim_settings() and modify_trip_data()

# Spaces used for earlier sweeps:
#   {'MaxRetries16': {'max_retries': 16}, 'MaxRetries12': {'max_retries': 12}, 'MaxRetries8': {'max_retries': 8}}
#   {'JobAge720000': {'job_age': 720000}, 'JobAge480000': {'job_age': 480000}, 'JobAge960000': {'job_age': 960000},
#    'JobAge1200000': {'job_age': 1200000}, 'JobAge1440000': {'job_age': 1440000}}
#   {'Delta40': {'of_delta': 40}, 'Delta0': {'of_delta': 0}, 'Delta20': {'of_delta': 20}}
#   {'Contention1': {'contention_threshold': 1}, 'Contention2': {'contention_threshold': 2},
#    'Contention3': {'contention_threshold': 3}}

configuration_space = {
    'JobAge16minsMaxRetries24': {'max_retries': 24, 'job_age': 960000},
    'JobAge16minsMaxRetries16': {'max_retries': 16, 'job_age': 960000},
    'JobAge16minsMaxRetries12': {'max_retries': 12, 'job_age': 960000},
    'JobAge16minsMaxRetries8': {'max_retries': 8, 'job_age': 960000},
}


# Cartesian product of all cities, time buckets and configurations;
# customize the product on the last line to fit your needs.

param_space_v2 = {
    '{}_{}_{}'.format(city_key, tb_key, cfg_key): {
        'city': {'key': city_key, 'args': city_kwargs},
        'time_bucket': {'key': tb_key, 'args': tb_kwargs},
        'configuration': {'key': cfg_key, 'args': cfg_kwargs},
    }
    for (city_key, city_kwargs), (tb_key, tb_kwargs), (cfg_key, cfg_kwargs)
    in it.product(city_space.items(), tb_space.items(), configuration_space.items())
}

In [181]:
# Prints the number of parameter combinations followed by their names in
# sorted order, one per line. Each name is one sim that will be submitted.

print('Total configurations: {}\n'.format(len(param_space_v2)))
print('\n'.join(sorted(param_space_v2.keys())))

Total configurations: 56

DAL_Graveyard_JobAge16minsMaxRetries12
DAL_Graveyard_JobAge16minsMaxRetries16
DAL_Graveyard_JobAge16minsMaxRetries24
DAL_Graveyard_JobAge16minsMaxRetries8
DAL_Weekday_AM_Commute_JobAge16minsMaxRetries12
DAL_Weekday_AM_Commute_JobAge16minsMaxRetries16
DAL_Weekday_AM_Commute_JobAge16minsMaxRetries24
DAL_Weekday_AM_Commute_JobAge16minsMaxRetries8
DAL_Weekday_Day_JobAge16minsMaxRetries12
DAL_Weekday_Day_JobAge16minsMaxRetries16
DAL_Weekday_Day_JobAge16minsMaxRetries24
DAL_Weekday_Day_JobAge16minsMaxRetries8
DAL_Weekday_Evening_JobAge16minsMaxRetries12
DAL_Weekday_Evening_JobAge16minsMaxRetries16
DAL_Weekday_Evening_JobAge16minsMaxRetries24
DAL_Weekday_Evening_JobAge16minsMaxRetries8
DAL_Weekday_PM_Commute_JobAge16minsMaxRetries12
DAL_Weekday_PM_Commute_JobAge16minsMaxRetries16
DAL_Weekday_PM_Commute_JobAge16minsMaxRetries24
DAL_Weekday_PM_Commute_JobAge16minsMaxRetries8
DAL_Weekend_Off-Peak_JobAge16minsMaxRetries12
DAL_Weekend_Off-Peak_JobAge16minsMaxRetries16
DAL

In [182]:
# Concurrently queries for trips for all cities + time buckets specified above
# Now fetches both Pool and X trips, but modify_trip_data() can filter based on vvid/product_type

# One query per (city, time bucket) pair; configurations share the same trips,
# so duplicates collapse onto the same dictionary key.
query_param_dict = {
    (params['city']['key'], params['time_bucket']['key']):
        {**params['time_bucket']['args'], **params['city']['args']}
    for params in param_space_v2.values()
}

# The queries are generated lazily and handed to query_many_presto in key order,
# so the results can be zipped back onto the keys.
raw_trips_dict = dict(zip(
    query_param_dict.keys(),
    data_fetcher.query_many_presto(
        map(lambda p: data_fetcher.generate_demand_query(**p), query_param_dict.values())
    ),
))

06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.
06/28/2022 05:04:53 PM Send empty tier_metadata {} to Queryrunner V2.


In [183]:
# Chooses which trips the sims will use.
# As written, the raw trips are used unchanged. To preprocess them with the local
# mitm-perf repository instead, uncomment the first line and comment out the second.

# processed_trips_dict = offline_preprocessing(raw_trips_dict)
processed_trips_dict = raw_trips_dict


In [184]:
# Shows how many trips there are for each (city, time bucket) key.
# Good to confirm that none of the buckets is empty: batch_sim_mapper()
# asserts that every sim has at least one trip.

dict(zip(processed_trips_dict.keys(), map(len, processed_trips_dict.values())))

{('PHL', 'Graveyard'): 968,
 ('PHL', 'Weekday_AM_Commute'): 4249,
 ('PHL', 'Weekday_Day'): 4154,
 ('PHL', 'Weekday_Evening'): 3367,
 ('PHL', 'Weekday_PM_Commute'): 5306,
 ('PHL', 'Weekend_Off-Peak'): 3964,
 ('PHL', 'Weekend_Peak'): 4357,
 ('DAL', 'Graveyard'): 1025,
 ('DAL', 'Weekday_AM_Commute'): 3269,
 ('DAL', 'Weekday_Day'): 3895,
 ('DAL', 'Weekday_Evening'): 2822,
 ('DAL', 'Weekday_PM_Commute'): 4094,
 ('DAL', 'Weekend_Off-Peak'): 3310,
 ('DAL', 'Weekend_Peak'): 3552}

In [185]:
# Creates the batch job from the baseline selected by BASELINE_UUID / FLOW_TYPE.

batch_job = BatchJob(baseline_uuid=BASELINE_UUID,
                     flow_type=FLOW_TYPE,
                     sim_client=sim_client)

# Keep the batch uuid: batch_sim_mapper() puts it into the sim names and the
# trip-stats file names below are built from it.
batch_uuid = batch_job.get_batch_uuid()


In [186]:
# Applies the batch_sim_mapper() function to the param space dictionary
# (one entry per sim to be created).

batch_job.apply(batch_sim_mapper, param_space_v2)

In [187]:
# Submits the entire batch at once with the specified priority and pipelines.
batch_job.submit(priority=PRIORITY, pipelines=PIPELINES)

# Dumps the batch job to <PROJECT_DIR>/<PROJECT_PRE>_<batch_uuid>.json, including
# the map between sim uuid and different configs. The reload cell further down
# reads the file back from the same path pattern.
batch_job.to_json('{}/{}_{}.json'.format(PROJECT_DIR, PROJECT_PRE, batch_job.get_batch_uuid()))


In [142]:
# At this point, check thermometer.uberinternal.com to visually confirm sim submission.

# Post submission

In [None]:
# kill jobs
# batch_job.cancel_unfinished()

In [188]:
# Displays the status of all the sims in the batch as indented JSON
# (sim name -> state, e.g. "postproc_completed" or "sim_failed").
print(json.dumps(batch_job.get_state(), indent=2))

{
  "PHL_Graveyard_JobAge16minsMaxRetries24": "sim_failed",
  "PHL_Graveyard_JobAge16minsMaxRetries16": "postproc_completed",
  "PHL_Graveyard_JobAge16minsMaxRetries12": "postproc_completed",
  "PHL_Graveyard_JobAge16minsMaxRetries8": "postproc_completed",
  "PHL_Weekday_AM_Commute_JobAge16minsMaxRetries24": "postproc_completed",
  "PHL_Weekday_AM_Commute_JobAge16minsMaxRetries16": "postproc_completed",
  "PHL_Weekday_AM_Commute_JobAge16minsMaxRetries12": "postproc_completed",
  "PHL_Weekday_AM_Commute_JobAge16minsMaxRetries8": "postproc_completed",
  "PHL_Weekday_Day_JobAge16minsMaxRetries24": "postproc_completed",
  "PHL_Weekday_Day_JobAge16minsMaxRetries16": "postproc_completed",
  "PHL_Weekday_Day_JobAge16minsMaxRetries12": "postproc_completed",
  "PHL_Weekday_Day_JobAge16minsMaxRetries8": "postproc_completed",
  "PHL_Weekday_Evening_JobAge16minsMaxRetries24": "postproc_completed",
  "PHL_Weekday_Evening_JobAge16minsMaxRetries16": "postproc_completed",
  "PHL_Weekday_Evening_JobAge

In [189]:
# Prints the batch job itself; after submission each simulation entry
# carries its sim_uuid.
print(batch_job)

{
  "batch_uuid": "57058252-c6fb-4c93-92ca-4c51b39b8009",
  "baseline_uuid": null,
  "baseline_description": "AutoRelease Solo Baseline 06/26/2022",
  "flow_type": "solo",
  "simulations": {
    "PHL_Graveyard_JobAge16minsMaxRetries24": {
      "parameters": {
        "city": "PHL",
        "time_bucket": "Graveyard",
        "configuration": "JobAge16minsMaxRetries24"
      },
      "sim_uuid": "dff36d39-ac4d-4f9e-9dea-1e3264df9403"
    },
    "PHL_Graveyard_JobAge16minsMaxRetries16": {
      "parameters": {
        "city": "PHL",
        "time_bucket": "Graveyard",
        "configuration": "JobAge16minsMaxRetries16"
      },
      "sim_uuid": "14407463-3958-4371-b9d7-66a42f5d82b1"
    },
    "PHL_Graveyard_JobAge16minsMaxRetries12": {
      "parameters": {
        "city": "PHL",
        "time_bucket": "Graveyard",
        "configuration": "JobAge16minsMaxRetries12"
      },
      "sim_uuid": "0815411f-53c0-4cff-b4b5-b25684553ac8"
    },
    "PHL_Graveyard_JobAge16minsMaxRetries8": {


In [190]:
# List the sims that should be excluded from the analysis: every sim whose
# state is anything other than 'postproc_completed'.

state = batch_job.get_state()
failed = {sim_name for sim_name, status in state.items() if status != 'postproc_completed'}

failed

{'DAL_Graveyard_JobAge16minsMaxRetries16',
 'DAL_Weekday_PM_Commute_JobAge16minsMaxRetries24',
 'DAL_Weekend_Off-Peak_JobAge16minsMaxRetries16',
 'DAL_Weekend_Off-Peak_JobAge16minsMaxRetries24',
 'PHL_Graveyard_JobAge16minsMaxRetries24',
 'PHL_Weekend_Off-Peak_JobAge16minsMaxRetries16',
 'PHL_Weekend_Off-Peak_JobAge16minsMaxRetries24'}

In [191]:
# Remove failed jobs: every sim whose state is not 'postproc_completed' is
# dropped from the batch job, so the trip-stats fetch below only touches
# sims that finished post-processing.

state = batch_job.get_state()
failed = {sim_name for sim_name, status in state.items() if status != 'postproc_completed'}

batch_job.param_dict = {name: params for name, params in batch_job.param_dict.items()
                        if name not in failed}
batch_job.sim_dict = {name: sim for name, sim in batch_job.sim_dict.items()
                      if name not in failed}

In [192]:
# When the sims are done, we can fetch all their trip_stats output with a single command.

trip_stats_dict = batch_job.get_trip_stats()

# One CSV per sim: <PROJECT_DIR>/<PROJECT_PRE>_<batch_uuid>_<sim_name>_trip_stats.csv.
# The reload cell at the end of the notebook reads the files back using the same pattern.
for sim_name, trip_stats_df in trip_stats_dict.items():
    trip_stats_df.to_csv('{}/{}_{}_{}_trip_stats.csv'.format(PROJECT_DIR, PROJECT_PRE, batch_uuid, sim_name),index=False)

In [193]:
# analyze trip stats

#trip_stats_dict['TO_Graveyard_MaxRetries16'].shape, trip_stats_dict['TO_Graveyard_MaxRetries16'].columns.tolist()#

In [194]:
#trip_stats_dict['TO_Graveyard_MaxRetries16'][(trip_stats_dict['TO_Graveyard_MaxRetries16']['unfulfilled']==False) & (trip_stats_dict['TO_Graveyard_MaxRetries16']['driver_cancelled']==False) & (trip_stats_dict['TO_Graveyard_MaxRetries16']['rider_cancelled']==False)].shape

In [195]:
from collections import defaultdict

#trip_stats_dict.keys()
def parse_key(key):
    """Split a sim name of the form <city>_<bucket>_<param> into its parts.

    The first '_'-separated token is the city and the last one is the
    parameter name; everything in between is the time bucket, which may
    itself contain underscores.

    Returns:
        Tuple (city, param, bucket).
    """
    tokens = key.split("_")
    city, param = tokens[0], tokens[-1]
    return city, param, "_".join(tokens[1:-1])

# (city, bucket) pairs in which at least one configuration did not complete.
# The whole pair is left out of the totals so that every configuration is
# summed over the same set of buckets. A set keeps the membership test cheap.
failed_city_buckets = {(city, bucket) for city, _, bucket in map(parse_key, failed)}

# city -> configuration -> number of trips that were neither unfulfilled nor
# cancelled by the driver or the rider.
city_totals = defaultdict(lambda: defaultdict(int))
for key, trip_stats in trip_stats_dict.items():
    city, param, bucket = parse_key(key)
    if (city, bucket) in failed_city_buckets:  # add: or 'Weekend' in bucket
        continue
    completed = (trip_stats['unfulfilled'].eq(False)
                 & trip_stats['driver_cancelled'].eq(False)
                 & trip_stats['rider_cancelled'].eq(False))
    city_totals[city][param] += int(completed.sum())

In [196]:
# Completed-trip totals per city and configuration, summed over the
# (city, bucket) pairs in which no configuration failed.
city_totals

defaultdict(<function __main__.<lambda>()>,
            {'PHL': defaultdict(int,
                         {'JobAge16minsMaxRetries24': 19258,
                          'JobAge16minsMaxRetries16': 19200,
                          'JobAge16minsMaxRetries12': 19193,
                          'JobAge16minsMaxRetries8': 19219}),
             'DAL': defaultdict(int,
                         {'JobAge16minsMaxRetries24': 10587,
                          'JobAge16minsMaxRetries16': 10529,
                          'JobAge16minsMaxRetries12': 10470,
                          'JobAge16minsMaxRetries8': 10397})})

In [78]:
# Don't worry about losing progress, you can always reload the BatchJob object from the previously saved json.
# NOTE: this cell overwrites PROJECT_PRE and batch_uuid with the values of an
# earlier batch, so every cell run after it works on that batch.

PROJECT_DIR = 'notebook_test_sims' # name of project directory that will contain sim metadata for this project
# Other prefixes that have been used: 'UnfulfilMaxRetries', 'OFDelta'
PROJECT_PRE = 'UnfulfilJobAge' # name of the project prefix -- sim name will look like <PROJECT_PRE>_<batch_uuid>

PROJECT_NAME = 'ParamTuning' # name of the project, used as a batch id tag
batch_uuid = '049e99c0-a479-4b74-815c-e1d90b9a8e0a'

# The path must match the one written by batch_job.to_json() at submission time.
batch_job = BatchJob.from_json('{}/{}_{}.json'.format(PROJECT_DIR, PROJECT_PRE, batch_uuid), submitter_email=USER_EMAIL, submitter_uuid=USER_UUID)

In [79]:
# Remove failed jobs from the reloaded batch: every sim whose state is not
# 'postproc_completed' is dropped, so only finished sims are loaded below.

state = batch_job.get_state()
failed = {sim_name for sim_name, status in state.items() if status != 'postproc_completed'}

batch_job.param_dict = {name: params for name, params in batch_job.param_dict.items()
                        if name not in failed}
batch_job.sim_dict = {name: sim for name, sim in batch_job.sim_dict.items()
                      if name not in failed}

In [80]:
# Loads all the previously saved trip stats from disk, one CSV per sim that is
# still part of the batch job.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The truncated warning printed under this cell
# looks like pandas' mixed-type DtypeWarning, and a column with mixed types
# could break the `== False` filters used in the analysis.

trip_stats_dict = {}

for param_name in batch_job.get_param_list():
    trip_stats_dict[param_name] = pd.read_csv(
        '{}/{}_{}_{}_trip_stats.csv'.format(PROJECT_DIR, PROJECT_PRE, batch_uuid, param_name),
        low_memory=False,
    )

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
