In [1]:
import json
import datetime
import re
import itertools

import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from queryrunner_client import Client as QRClient
#from batch_utils.batch_querier import BatchQuerier
from mdstk.data_fetcher.data_fetcher import DataFetcher
from mdstk.data_fetcher.cached_data_fetcher import CachedDataFetcher

from dataclasses import dataclass


#from queryrunner_client import Client
#qclient = Client(user_email='mehrdad@uber.com')

import os
import pulp
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from joblib import Parallel, delayed
#num_cores = multiprocessing.cpu_count()
n_cores = 4


# Fetch data

In [2]:
# Identity handed to the data fetcher when it is constructed further down
# (passed as `user_email=` / `consumer_name=` to MyDataFetcher).
USER_EMAIL = 'mehrdadb@uber.com'
CONSUMER_NAME = 'intelligentdispatch'

# Presto SQL template; filled in by `Query.__post_init__` via `str.format`.
# Placeholders:
#   {datestr} - partition date applied to every table read below
#   {city_id} - city filter on the dispatch / plangen / mgv logs
#   {vvid}    - parenthesised vehicle-view-id list used in the `in {vvid}` filters
#   {digits}  - prefix the dispatch row's msg.ctrequestuuid must start with
#               (presumably a cheap way to sample a subset of scans - TODO confirm)
# CTEs:
#   dispatch - dispatched plans (scan uuid, first job uuid, supply, plan action type)
#   plangen  - plan-generator rows with planstatus = 'eligible'
#   mgv      - per-plan rows from the mgv log for flow_type 'solo_batch'
#              (eta, probabilities, objective value, pickup / supply coordinates)
#   test     - mgv joined to plangen, dispatch, trip fare and trip facts, and
#              left-joined to scheduled trips; `is_selected` is 1 when the
#              dispatched supply equals the mgv row's supply.
QUERY = """
with dispatch as (
    select 
        datestr,
        msg.cityid,
        msg.ctplangenrequestuuid as plangen_uuid,
        msg.ctrequestuuid as scan_uuid,
        msg.jobuuid[1] as job_uuid,
        msg.supplyuuid,
        msg.planactiontype
    from rawdata_user.kafka_hp_multileg_dispatched_plan_nodedup
    where datestr = '{datestr}'
    and msg.cityid = {city_id}
    and msg.vehicleviewid in {vvid} 
    and msg.tenancy = 'uber/production'
    and CARDINALITY(msg.jobuuid) > 0
    and substr(msg.ctrequestuuid, 1, length('{digits}')) = '{digits}'
),
plangen as (
    select 
        msg.jobs[1].uuid as job_uuid,
        msg.supplyuuid,
        msg.scanuuid as plangen_uuid,
        msg.waypoints[1].latitude as pickup_latitude
    from rawdata_user.kafka_hp_plangenerator_matching_plans_log_nodedup
    where datestr = '{datestr}'
    and msg.cityid = {city_id}
    and msg.tenancy = 'uber/production'
    and CARDINALITY(msg.jobs) > 0
    and msg.planstatus = 'eligible'
),
mgv as (
    select datestr,
           msg.city_id,
           msg.job_uuid,
           msg.client_uuid,
           msg.ct_request_uuid as plangen_uuid,
           msg.supply_uuid,
           msg.supply_plan_uuid as plan_uuid,
           msg.unadjusted_eta as eta,
           msg.fd_eta,
           msg.adjustedeta,
           msg.ranking_metric,
           round(1 - msg.solo_cancel_model_driver_accept_prob, 4) as d_proba,
           round(1 - msg.solo_cancel_model_rider_accept_prob, 4) as r_proba,
           round(1 - msg.spinner_survive_prob_before_next_scan, 4) as s_proba,
           msg.preferred_destination_adjustment,
           msg.objective_value as of_value,
           msg.inconvenience_etd - msg.ranking_metric as trip_length,
           msg.pickup_latitude as pickup_latitude,
           msg.pickup_longitude as pickup_longitude,
           msg.supply_latitude as supply_latitude,
           msg.supply_longitude as supply_longitude
    from   rawdata.kafka_hp_multileg_mgv_log_nodedup
    where  datestr = '{datestr}'
    and    msg.city_id = {city_id}
    and    msg.tenancy = 'uber/production'
    and    msg.vehicle_view_id in {vvid} 
    and    msg.flow_type = 'solo_batch'
    and    msg.job_uuid <> msg.client_uuid
),
test as (
    select 
        mgv.datestr,
        mgv.city_id,
        plangen.pickup_latitude as plangen_pickup_lat,
        dispatch.scan_uuid,
        mgv.plangen_uuid,
        mgv.job_uuid,
        dispatch.planactiontype,
        mgv.supply_uuid,
        case when dispatch.supplyuuid = mgv.supply_uuid then 1 else 0 end as is_selected,
        mgv.eta,
        mgv.adjustedeta,
        mgv.fd_eta,
        mgv.ranking_metric,
        mgv.d_proba,
        mgv.r_proba,
        mgv.s_proba,
        mgv.preferred_destination_adjustment,
        mgv.of_value,
        mgv.trip_length,
        ftf.est_rider_quoted_final_fare as upfront_fare,
        ftf.est_rider_rsp_multiplier,
        ft.surge_multiplier,
        case when fst.trip_uuid is null then 0 else 1 end as is_scheduled_trip,
        fst.upfront_fare as fst_upfront_fare,
        fst.reservation_variant,
        mgv.pickup_latitude as mgv_pickup_lat,
        mgv.pickup_longitude as mgv_pickup_lng,
        mgv.supply_latitude as mgv_supply_lat,
        mgv.supply_longitude as mgv_supply_lng
    from mgv
    join plangen
    on mgv.plangen_uuid = plangen.plangen_uuid
    and mgv.job_uuid = plangen.job_uuid
    and mgv.supply_uuid = plangen.supplyuuid
    join dispatch
    on mgv.plangen_uuid = dispatch.plangen_uuid
    and mgv.job_uuid = dispatch.job_uuid
    join dwh.fact_trip_fare ftf
    on mgv.job_uuid = ftf.trip_uuid
    and ftf.datestr = mgv.datestr
    and ftf.datestr = '{datestr}'
    join dwh.fact_trip ft
    on mgv.job_uuid = ft.uuid
    and mgv.datestr = ft.datestr
    and ft.datestr = '{datestr}'
    left join rider.fact_scheduled_trip fst
    on mgv.job_uuid = fst.trip_uuid
    and mgv.datestr = fst.datestr
    and fst.datestr = '{datestr}'
)
select * from test
"""

@dataclass
class Query:
    """One concrete instantiation of the module-level ``QUERY`` template.

    Fields:
        prefix: leading label of the generated cache name.
        hex_digits: substituted for ``{digits}`` in the template and appended
            to the name as ``segment<hex_digits>``.
        city_id: substituted for ``{city_id}``.
        vvid: substituted for ``{vvid}`` (inserted verbatim into the SQL).
        datestr: substituted for ``{datestr}``.

    After construction two extra attributes exist:
        qry: the fully formatted SQL text.
        name: the identifier ``<prefix>_city<city_id>_<vvid>_<datestr>_segment<hex_digits>``.
    """

    prefix: str
    hex_digits: str
    city_id: int
    vvid: str
    datestr: str

    def __post_init__(self):
        # Render the SQL first, then derive the cache name; the two are independent.
        template_values = dict(
            city_id=self.city_id,
            vvid=self.vvid,
            digits=self.hex_digits,
            datestr=self.datestr,
        )
        self.qry = QUERY.format(**template_values)
        self.name = f'{self.prefix}_city{self.city_id}_{self.vvid}_{self.datestr}_segment{self.hex_digits}'

In [3]:
# Query for VVIDs: https://querybuilder-ea.uberinternal.com/r/rEQz18SJz/run/DoTpgoE81/edit
prefix = 'replay'    # first component of every generated query/cache name
hex_digits = '35'    # uuid prefix substituted into the query template

# 6, 1, 21, 15, 26,
#    235, 35, 93, 50, 30

# Display names by city id.
# NOTE(review): 'Cabro Frio' looks like a typo for 'Cabo Frio' - left unchanged, confirm.
city_name = {
    1: 'SF',
    3: 'Paris',
    12: 'LA',
    52: 'Lisbon',
    458: 'Sao Paulo',
    90: 'Mexico City',
    146: 'Bogota',
    803: 'Goiania',
    218: 'Tijuana',
    1379: 'Cabro Frio',
    799: 'Recife',
}

# city id -> parenthesised vehicle-view-id list; un-comment a line to include that city.
city_id_vvids = {
    1: '(8)',  # SF
#     3: '(235)', # Paris
#     6: '(116)',
    12: '(125)',  # Los Angeles
#     21: '(184)',
#     15: '(1783)',
#     26: '(347)',
#     235: '(685)',
#     35: '(442)',
#     93: '(353)',
#     50: '(425)',
#     52: '(120)', # Lisbon
#     30: '(350)',
#     458: '(3825)', # Sao Paulo
#     493: '(5433)', # Belo Horizonte
#     90: '(651)',
#     146: '(1934)',
#     803: '(10369)',
#     218: '(837)',
#     1379: '(10002430)',
#     799: '(11047)',
}

# Dates to query; the disabled entries complete a one-week window.
datestrs = [
#     '2021-09-21',
#     '2021-09-22',
#     '2021-09-23',
#     '2021-09-24',
#     '2021-09-25',
#     '2021-09-26',
    '2021-09-27',
]

# One Query per (city, date) pair: cities vary slowest, dates fastest.
queries = [
    Query(prefix=prefix, hex_digits=hex_digits, city_id=city_id, vvid=vvid, datestr=datestr)
    for city_id, vvid in city_id_vvids.items()
    for datestr in datestrs
]

# Cache name -> rendered SQL, in the shape handed to CachedDataFetcher.
cache_qry_map = dict((query.name, query.qry) for query in queries)

In [4]:
class MyDataFetcher(DataFetcher):
    """DataFetcher subclass that forwards ``query_many_presto`` to the parent unchanged."""

    def query_many_presto(self, *args, **kwargs):
        # Pure pass-through. A commented-out `timeout=2000` argument used to sit
        # on this call, so this override is presumably the place to re-add one.
        parent = super()
        return parent.query_many_presto(*args, **kwargs)  # , timeout=2000)


# Cache-backed fetcher for every query registered in `cache_qry_map`.
cdf = CachedDataFetcher(
    datasource='presto-secure',
    # datacenter='dca1',
    cache_qry_map=cache_qry_map,
    data_fetcher=MyDataFetcher(
        consumer_name=CONSUMER_NAME,
        user_email=USER_EMAIL,
    ),
)

# bust_cache=False: do not force a cache refresh.
cdf.fetch(bust_cache=False)


Loaded 2/2 dataframes from cache!


In [5]:
# Stack every fetched frame into one table with a fresh 0..n-1 index ...
scans = pd.concat(cdf.dfs.values(), axis=0, ignore_index=True)
# ... then keep only rows whose objective value exceeds 1.
scans = scans.query("of_value > 1")
scans.shape

(39132, 29)

In [6]:
# Preview the first rows of the filtered scans table.
scans.head()

Unnamed: 0,datestr,city_id,plangen_pickup_lat,scan_uuid,plangen_uuid,job_uuid,planactiontype,supply_uuid,is_selected,eta,...,upfront_fare,est_rider_rsp_multiplier,surge_multiplier,is_scheduled_trip,fst_upfront_fare,reservation_variant,mgv_pickup_lat,mgv_pickup_lng,mgv_supply_lat,mgv_supply_lng
0,2021-09-27,1,37.79411,353d6eef-601f-4c30-9275-e39c863b53b9,aebaaafb-3c34-4afb-bbad-9222a53540ec,d8e6439d-4b62-4387-8a9c-136699df1754,OFFER,fc099e06-4f97-43c0-bd7e-bc1876650ab8,0,818.0,...,15.82,0.96521,1.3,0,,,37.785175,-122.40537,37.781803,-122.401458
2,2021-09-27,1,37.78415,35830cae-a0ea-4618-a9f8-42103035e55b,4040b712-d02f-4824-aa64-16acb88c4610,8533c894-8085-4bdf-8617-a1551ae27064,OFFER,ddea8314-8cc3-4840-8fbe-c8d082cbae8b,0,523.0,...,18.17,1.074415,2.0,0,,,37.78415,-122.40456,37.77903,-122.401484
4,2021-09-27,1,37.776104,353d6eef-601f-4c30-9275-e39c863b53b9,aebaaafb-3c34-4afb-bbad-9222a53540ec,d8e6439d-4b62-4387-8a9c-136699df1754,OFFER,c59fb151-d271-4860-9024-1b44d6589378,0,733.0,...,15.82,0.96521,1.3,0,,,37.785175,-122.40537,37.788347,-122.416866
5,2021-09-27,1,37.776104,353d6eef-601f-4c30-9275-e39c863b53b9,aebaaafb-3c34-4afb-bbad-9222a53540ec,d8e6439d-4b62-4387-8a9c-136699df1754,OFFER,7793e7fb-f6c8-4cae-98bd-69f189cd4060,0,708.0,...,15.82,0.96521,1.3,0,,,37.785175,-122.40537,37.776562,-122.394405
8,2021-09-27,1,37.406757,351aac3b-2c8e-4323-adf0-146b0bff57da,217eb3f5-ec1d-49cf-b975-57dcb2dc7ef5,9e5de763-34e0-43fb-9e7c-28e98bd0d1dd,KEEP_ALIVE,98d603d9-a479-415b-9c0d-bd328edd6803,0,785.0,...,56.93,,4.9,0,,,37.406757,-121.97379,37.35244,-121.973464


In [7]:
import geopandas as gpd
import osmnx as ox


In [8]:
import pandas as pd
import zipfile
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import contextily as ctx
from shapely.geometry import Point, LineString

In [9]:
# Unpack the zipped roads shapefile into the local `shapefiles` directory.
with zipfile.ZipFile('tl_2017_06075_roads.zip', mode='r') as roads_archive:
    roads_archive.extractall(path='shapefiles')

In [10]:
# Base-map geometry drawn underneath every scan plot. Exactly one source is
# active; the commented lines are alternative shapefiles that were tried.
# geo_df = gpd.read_file('shapefiles/tl_2017_06075_roads.shp')
geo_df = gpd.read_file('stanford_shapefile/zk060kc5726.shp')
### geo_df = gpd.read_file('bay_area_shapefile/geo_export_8129597e-2c44-4cec-b815-c997d3a423d1.shp')
### geo_df = gpd.read_file('bay_area_shapefile_2/bayarea_zipcodes.shp')


INFO:fiona.ogrext:Failed to auto identify EPSG: 7


In [11]:
from flipr_client.clients.remote_client import RemoteClient
from batch_utils import helpers

# Client for the flipr service reached through a local port.
flipr = RemoteClient(
    application_identifier='autolaszlo',
    host='localhost',
    port=14570,
)

# Radar definitions returned for the first argument `1`
# (presumably the city id, i.e. SF - TODO confirm against helpers.get_radars).
radars = helpers.get_radars(1, flipr)
print(len(radars['radars']))

# Distinct (latitude, longitude, maxRadius) triples of the 'uberx' radars.
radar_set = set()
for radar in radars['radars']:
    if radar['radarFlow'] != 'uberx':
        continue
    radar_set.add((radar['center']['latitude'], radar['center']['longitude'], radar['maxRadius']))
print(len(radar_set))

88
15


In [28]:
# Draw one map per dispatch scan: every candidate (driver, rider) plan of the
# scan becomes an edge between the driver's position and the rider's pickup,
# overlaid on an OSM basemap, and the figure is written to OUTPUT_DIR.

from matplotlib.patches import Circle  # only needed by the disabled per-radar Circle drawing


# --- configuration -----------------------------------------------------------
only_sf = False       # True: keep the view on the shapefile; False: fixed Bay Area box
show_radars = False   # overlay the radar centers collected in `radar_set`

blue_color = 'b'# "#390099"#"#3a86ff"
red_color = "#d81159"

prune_of = 1500                   # plans with of_value above this are not drawn
PLOT_CITY_IDS = {1}               # other candidates tried before: 3, 12, 52, 458
SCAN_SLICE = slice(5, 30)         # which of the (sorted) scans get a figure
OUTPUT_DIR = 'new_bayarea_pics2'
# View box used when `only_sf` is False, in EPSG:3857 coordinates.
BAY_AREA_XLIM = (-1.364*1e7, -1.356*1e7)
BAY_AREA_YLIM = (4.47*1e6, 4.56*1e6)


def _city_id_from_key(scans_key):
    """Return the integer city id embedded in a cache key like '<prefix>_city<ID>_...'.

    Raises:
        ValueError: if the key has no '_city<digits>_' component.
    """
    found = re.search(r'_city(\d+)_', scans_key)
    if found is None:
        raise ValueError('cannot find a city id in cache key %r' % (scans_key,))
    return int(found.group(1))


def _to_web_mercator(geometries):
    """Wrap lon/lat (EPSG:4326) shapely geometries in a GeoDataFrame re-projected to EPSG:3857."""
    frame = gpd.GeoDataFrame(geometry=list(geometries), crs="EPSG:4326")
    return frame.to_crs(epsg=3857)


def _plan_geometries(scan, max_of_value):
    """Build the drawable geometry for one scan.

    Args:
        scan: rows of a single scan with the columns of_value, mgv_supply_lng,
            mgv_supply_lat, mgv_pickup_lng and mgv_pickup_lat.
        max_of_value: rows whose of_value is greater than this are dropped.

    Returns:
        (driver_points, rider_points, edges): three equally long lists with one
        entry per kept row; each edge joins that row's driver and rider point.
    """
    # `~(x > limit)` rather than `x <= limit`: same rows are kept as by the
    # original per-row `if of_value > limit: continue` test.
    kept = scan[~(scan['of_value'] > max_of_value)]
    driver_points = [
        Point(float(lng), float(lat))
        for lng, lat in zip(kept['mgv_supply_lng'], kept['mgv_supply_lat'])
    ]
    rider_points = [
        Point(float(lng), float(lat))
        for lng, lat in zip(kept['mgv_pickup_lng'], kept['mgv_pickup_lat'])
    ]
    edges = [
        LineString([driver, rider])
        for driver, rider in zip(driver_points, rider_points)
    ]
    return driver_points, rider_points, edges


def plot_scan_graph(scan, base_map, radar_circles=(), only_sf=False,
                    show_radars=False, max_of_value=1500):
    """Plot the driver/rider plan graph of one scan and return ``(fig, ax)``.

    Args:
        scan: DataFrame holding the rows of a single scan.
        base_map: GeoDataFrame already in EPSG:3857, drawn faintly underneath.
        radar_circles: iterable of (latitude, longitude, radius) triples, used
            only when ``show_radars`` is true.
        only_sf: when true the Bay Area limits are not applied and the base map
            is drawn slightly less transparent.
        show_radars: overlay one marker per radar triple.
        max_of_value: plans with an of_value above this are skipped.

    The caller owns the returned figure and is responsible for saving/closing it.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    base_map.plot(ax=ax, alpha=0.05 if only_sf else 0.01)
    if only_sf:
        # Kept in its original position, before the plan layers are drawn.
        # NOTE(review): presumably this pins the view to the shapefile extent - confirm.
        ctx.add_basemap(ax, source=ctx.sources.OSM_A)

    driver_points, rider_points, edges = _plan_geometries(scan, max_of_value)
    _to_web_mercator(driver_points).plot(
        ax=ax, color=red_color, marker='.', markersize=50, edgecolor='black')
    _to_web_mercator(edges).plot(ax=ax, color=blue_color, linewidth=0.4)
    _to_web_mercator(rider_points).plot(
        ax=ax, color='gray', marker='.', markersize=250, edgecolor='black')

    # Background tiles; the original cell flagged this call as "important".
    ctx.add_basemap(ax, source=ctx.sources.OSM_A)

    if show_radars:
        radar_circles = list(radar_circles)
        radar_points = [Point(lng, lat) for lat, lng, _radius in radar_circles]
        # NOTE(review): the radius is passed as a marker size, not as a map
        # distance, so the drawn circle is not to scale - confirm intent.
        radar_sizes = [radius * 0.9 for _lat, _lng, radius in radar_circles]
        radars_geo_df = _to_web_mercator(radar_points)
        radars_geo_df['sizes'] = radar_sizes
        radars_geo_df.plot(ax=ax, color=red_color, marker='o', markersize=radar_sizes,
                           edgecolor='red', alpha=0.2)

    if not only_sf:
        ax.set_xlim(*BAY_AREA_XLIM)
        ax.set_ylim(*BAY_AREA_YLIM)
    ax.set_axis_off()
    fig.tight_layout()
    return fig, ax


# --- driver ------------------------------------------------------------------
print(cdf.dfs.keys())

os.makedirs(OUTPUT_DIR, exist_ok=True)     # savefig does not create directories
base_map_3857 = geo_df.to_crs(epsg=3857)   # projected once; `geo_df` itself is left untouched

title = 'automated_new_'
title += "SF_City" if only_sf else "BayArea"
if show_radars:
    title += '_show_radars'

for scans_key in cdf.dfs.keys():
    city_id = _city_id_from_key(scans_key)
    if city_id not in PLOT_CITY_IDS:
        continue
    city_scans = cdf.dfs[scans_key]
    markov_df = city_scans[city_scans['of_value'] > 1]
    # Sorted so that SCAN_SLICE selects the same scans on every kernel run; the
    # previous list(set(...)) order depended on string hashing, so indices such
    # as the old "OG figure is scan[19]" note do not carry over.
    scan_uuids = sorted(markov_df['scan_uuid'].dropna().unique())
    for scan_count, scan_uuid in enumerate(scan_uuids[SCAN_SLICE]):
        scan = markov_df[markov_df['scan_uuid'] == scan_uuid]
        fig, ax = plot_scan_graph(
            scan,
            base_map_3857,
            radar_circles=radar_set,
            only_sf=only_sf,
            show_radars=show_radars,
            max_of_value=prune_of,
        )
        fig.savefig('%s/%s_%s_%s.png' % (OUTPUT_DIR, title, scans_key, scan_count), dpi=300)
        plt.close(fig)   # release the figure; many are created in this loop
    break   # only the first matching cache key is plotted

# Turn off axis
# ax.set_axis_off()




dict_keys(['replay_city1_(8)_2021-09-27_segment35', 'replay_city12_(125)_2021-09-27_segment35'])


SyntaxError: 'return' outside function (<ipython-input-28-a2d6b19fd363>, line 176)

In [29]:
# Re-frame and save the figure left behind by the plotting cell. Uses the
# globals `fig`, `ax`, `scans_key`, `scan_count`, `only_sf` and `show_radars`
# from that cell. The figure must not have been cleared beforehand, otherwise
# the image written here is blank.

# Tighter bounds once tried for the SF-only view:
# if only_sf:
#     miny = 4.54*1e6
#     maxy = 4.555*1e6
#     minx = -1.364*1e7
#     maxx = -1.362*1e7

if not only_sf:
    # Bay Area view box (same numbers as in the plotting cell).
    miny = 4.47*1e6
    maxy = 4.56*1e6
    minx = -1.364*1e7
    maxx = -1.356*1e7
    ax.set_xlim(minx, maxx)
    ax.set_ylim(miny, maxy)

title = 'New_'
title += "SF_City" if only_sf else "BayArea"
if show_radars:
    title += '_show_radars'
fig.tight_layout()
# Save through the figure object: `plt.savefig` targets pyplot's *current*
# figure, which after `plt.close()` is a freshly created empty one (the old
# saved output of this cell was "<Figure size 432x288 with 0 Axes>").
fig.savefig('%s_%s_%s.png' % (title, scans_key, scan_count), dpi=300)


<Figure size 432x288 with 0 Axes>