<span style="color:blue">Thanks for using Drogon for your interactive Spark application. We update Drogon/SparkMagic as often as possible to make it easier, faster and more reliable for you. Have a question or feedback? Ping us on [uChat](https://uchat.uberinternal.com/uber/channels/spark).</span>

What's New
- Now you can use `%%configure` and `%%spark` magics to configure and start a Spark session (deprecating hard-to-use `%load_ext sparkmagic.magics` and `manage_spark` magics). Check out [this example](https://workbench.uberinternal.com/explore/knowledge/localfile/cwang/sparkmagic_python2_example.ipynb) for more details.
- Improved `%%configure` magic. You can now use it to set all Spark and Drogon configurations from within the notebook itself. Check out our [latest documentation & examples](https://docs.google.com/document/d/1mkYtDHquh4FjqTeA0Fxii8lyV-P6qzmoABhmmRwm_00/edit#heading=h.xn14pmoorsn0) for more details.
- Bug fixes and performance updates.


In [None]:
%%configure -f
{
  "pyFiles": [],
  "kind": "pyspark",
  "proxyUser": "abhishek.sharma",
  "sparkEnv": "SPARK_24",
  "driverMemory": "12g",
  "queue": "maps_automation",
  "numExecutors": 1000,
  "conf": {
    "spark.hadoop.fs.s3a.attempts.maximum": "10",
    "spark.hadoop.fs.s3a.multipart.size": "100857600",
    "spark.hadoop.fs.s3a.block.size": "33554432",
    "spark.hadoop.fs.s3a.threads.keepalivetime": "60",
    "spark.hadoop.fs.s3a.threads.core": "64",
    "spark.hadoop.fs.s3a.threads.max": "128",
    "spark.hadoop.fs.s3a.connection.maximum": "64",
    "spark.sql.catalogImplementation": "hive",
    "spark.yarn.nodemanager.vmem-check-enabled": "false",
    "spark.default.parallelism": "2000",
    "spark.sql.shuffle.partitions": "2000",
    "spark.yarn.queue": "maps_automation"
  },
  "executorCores": 2,
  "driverCores": 2,
  "jars": [
    "hdfs:///lib/hive/jars/udf/fraud/hadoop-aws-2.8.2.jar",
    "hdfs:///lib/hive/jars/udf/fraud/aws-java-sdk-bundle-1.11.574.jar"
  ],
  "executorMemory": "14g",
  "drogonHeaders": {
    "X-Drogon-Auth-HDFS-DT": "<TOKEN_HERE>",
    "X-DROGON-CLUSTER": "dca1/nonsecure"
  }
}

In [None]:
%%spark

In [None]:
# Experiment window. Exactly one (experiment, date_filter, date_filter_streaks)
# triple is active at a time; the commented-out triples below are the other
# windows. In every triple the streak window is the 14 days that end the day
# before the experiment window starts.
experiment = '_23_07_29_07_20c'
date_filter = "between '2020-07-23' and '2020-07-29'"
date_filter_streaks = "between '2020-07-09' and '2020-07-22'"

# experiment = '_30_07_05_08_20c2'
# date_filter = "between '2020-07-30' and '2020-08-05'"
# date_filter_streaks =  "between '2020-07-16' and '2020-07-29'"

# experiment = '_06_08_12_08_20c'
# date_filter = "between '2020-08-06' and '2020-08-12'"
# date_filter_streaks =  "between '2020-07-23' and '2020-08-05'"

# SQL fragment appended after a numeric city-id column in the queries.
city_filter = "in (20, 8, 14, 198, 5, 12, 134, 26, 23, 25, 24, 208, 27, 1541, 7, 6, 45, 227, 4, 35)"


def quote_in_list(in_clause):
    """Return `in_clause` with every value wrapped in single quotes.

    "in (20, 8)" becomes "in ('20', '8')", for use against string-typed
    columns. The values are parsed as plain text rather than with eval(), so
    nothing in the clause is executed and a single-value list ("in (20)")
    works as well.

    Raises:
        ValueError: if the clause has no parentheses or no values in them.
    """
    # str.index / str.rindex raise ValueError themselves when a paren is missing.
    start = in_clause.index('(') + 1
    end = in_clause.rindex(')')
    values = [value.strip() for value in in_clause[start:end].split(',')]
    values = [value for value in values if value]
    if not values:
        raise ValueError('no values found in IN clause: {!r}'.format(in_clause))
    return 'in (' + ', '.join("'{}'".format(value) for value in values) + ')'


# Same city list with every id quoted as a string literal.
city_filter_apostrophe = quote_in_list(city_filter)
base_dir = 'hdfs:///user/abhishek.sharma/'
db = 'maps_automation'

# Substitution values for the `{name}` placeholders in the query and path
# templates, applied with `.format(**config)`.
config = {
    'experiment': experiment,
    'date_filter': date_filter,
    'date_filter_streaks': date_filter_streaks,
    'city_filter': city_filter,
    'city_filter_apostrophe': city_filter_apostrophe,
    'base_dir': base_dir,
    'db': db,
}

# SQL entry point used by the query cells below (`hive_context.sql(...)`).
# `sc` is not defined in this notebook; presumably it is the SparkContext
# provided by the session started with `%%spark` — TODO confirm.
# NOTE(review): HiveContext looks deprecated on Spark 2.x in favour of a
# Hive-enabled SparkSession; the write cells already use `spark` — confirm
# whether `spark.sql(...)` could replace this.
from pyspark.sql import HiveContext
hive_context = HiveContext(sc)

In [None]:
# Builds, per map ticket, the list of candidate road segments.
#
# CTEs, in order:
#   subset_tickets     - production / mobile / REPORTED tickets in the experiment
#                        date window, assigned to a city by testing the reported
#                        point against the city's simplified shape, and limited
#                        to the configured cities.
#   subset_segments    - start and end coordinates of every segment of one pinned
#                        map build (hard-coded builduuid); stored e7 integers are
#                        divided by 1e7.
#   suggested_segments - per ticket, the set of segment uuids found in the route
#                        logs of the ticket's trip.
#   near_segments      - per ticket, segments with an endpoint closer than 300 to
#                        the reported point (near_segments) and, of those, the
#                        ones closer than 50 (nearer_segments).
#                        NOTE(review): thresholds are presumably metres, as
#                        returned by ST_GeodesicLengthWGS84 - confirm.
#
# Final select: (suggested INTERSECT near) UNION nearer, going by the names of
# the etl.bhouse_* helpers (their implementation is not visible here). The join
# to suggested_segments is an inner join, so tickets without any route-log
# segments produce no row.
#
# The lateral view in suggested_segments is aliased `seg`. It used to be
# aliased `t`, the same alias as subset_tickets, which made `t.<column>`
# ambiguous between the ticket row and the exploded segment.
query_candidate_segments = """
with subset_tickets as (
  select
    t.msg.map_ticket_id,
    t.msg.trip_id,
    c.city_id,
    t.msg.problem_location_point.lat as report_lat,
    t.msg.problem_location_point.long as report_long,
    t.datestr as report_date
  from
    rawdata_user.kafka_hp_umm_hotfix_backend_living_maps_nodedup t
    join dwh.dim_city c on esri.ST_Contains(
      esri.ST_GeomFromText(c.simplified_shape),
      esri.ST_Point(
        t.msg.problem_location_point.long,
        t.msg.problem_location_point.lat
      )
    )
  where
    t.msg.env = 'production'
    and t.msg.reporter_context = 'mobile'
    and t.msg.map_ticket_state = 'REPORTED'
    and t.datestr {date_filter}
    and c.city_id {city_filter}
),
subset_segments as (
  select
    uuid as segment_id,
    geometry.polyline.points [0].lnge7 / 1e7 as segment_start_long,
    geometry.polyline.points [0].late7 / 1e7 as segment_start_lat,
    geometry.polyline.points [size(geometry.polyline.points) - 1].lnge7 / 1e7 as segment_end_long,
    geometry.polyline.points [size(geometry.polyline.points) - 1].late7 / 1e7 as segment_end_lat
  FROM
    umm.map_feature_segments_tomtom
  where
    builduuid = '6dc8e44c-be20-11ea-bb97-000af7f88c50'
),
suggested_segments as (
  select
    t.map_ticket_id as map_ticket_id,
    collect_set(seg.segment.segmentuuid) as suggested_segments
  from
    subset_tickets t
    join rawdata_user.kafka_hp_gurafu_route_logs_nodedup su on su.msg.tripuuid = t.trip_id
    lateral view explode (su.msg.segments) seg as segment
  where
    su.datestr {date_filter}
    and su.msg.cityid {city_filter}
  group by
    1
),
near_segments as (
  select
    t.map_ticket_id,
    t.trip_id,
    t.city_id,
    t.report_lat,
    t.report_long,
    t.report_date,
    collect_list(s.segment_id) as near_segments,
    collect_list(
      case
        when(
          esri.ST_GeodesicLengthWGS84(
            esri.ST_SetSRID(
              esri.ST_Linestring(
                s.segment_start_long,
                s.segment_start_lat,
                t.report_long,
                t.report_lat
              ),
              4326
            )
          ) < 50
          or esri.ST_GeodesicLengthWGS84(
            esri.ST_SetSRID(
              esri.ST_Linestring(
                s.segment_end_long,
                s.segment_end_lat,
                t.report_long,
                t.report_lat
              ),
              4326
            )
          ) < 50
        )
        then s.segment_id
        else null
      end
    ) as nearer_segments
  from
    subset_tickets t
    join subset_segments s on esri.ST_GeodesicLengthWGS84(
      esri.ST_SetSRID(
        esri.ST_Linestring(
          s.segment_start_long,
          s.segment_start_lat,
          t.report_long,
          t.report_lat
        ),
        4326
      )
    ) < 300
    or esri.ST_GeodesicLengthWGS84(
      esri.ST_SetSRID(
        esri.ST_Linestring(
          s.segment_end_long,
          s.segment_end_lat,
          t.report_long,
          t.report_lat
        ),
        4326
      )
    ) < 300
  group by
    1, 2, 3, 4, 5, 6
)
select
  n.map_ticket_id,
  n.trip_id,
  n.city_id,
  n.report_lat,
  n.report_long,
  n.report_date,
  etl.bhouse_array_union(
    etl.bhouse_intersect_array(su.suggested_segments, n.near_segments),
    n.nearer_segments
  ) as candidate_segments
from
  suggested_segments su
  join near_segments n on n.map_ticket_id = su.map_ticket_id
""".format(**config)

# Only defines the DataFrame; the query runs when the result is written.
candidate_segments_df = hive_context.sql(query_candidate_segments)

In [None]:
# Materialise the candidate segments as parquet under base_dir, then publish
# the materialised files as a table. The table is built from the files read
# back, not from candidate_segments_df.
parquet_location = '{base_dir}{db}_candidate_segments{experiment}'.format(**config)
candidate_segments_df.coalesce(1000).write.mode("overwrite").parquet(parquet_location)
df = spark.read.parquet(parquet_location)
# Overwrite the table too: the parquet write above already overwrites, and with
# the default save mode a re-run of this cell fails once the table exists.
df.coalesce(10).write.mode("overwrite").saveAsTable("{db}.candidate_segments{experiment}".format(**config))

In [None]:
# Counts, per candidate segment and day of the streak window, how many
# route-log rows contain that segment.
#
# CTEs, in order:
#   candidate_segments - distinct segment ids exploded from the
#                        {db}.candidate_segments{experiment} table.
#   suggested_routes   - route-log segment lists in the streak date window for
#                        the configured cities, with no experiment name, joined
#                        to personal_transport trips from dwh.fact_trip.
#   suggested_segments - one row per segment of each suggested route, with its
#                        start and end junction uuids.
#
# Final select: count(1) grouped by segment uuid, date and both junction uuids,
# limited to candidate segments by the inner join.
query_agg_suggested_streaks = '''
with candidate_segments as (
  select
    distinct t.candidate_segment
  from
    {db}.candidate_segments{experiment} c lateral view explode(c.candidate_segments) t as candidate_segment
),
suggested_routes as (
  select
    t.datestr,
    t.segments as suggested_route
  FROM
    (
      select
        uuid
      from
        dwh.fact_trip ft
      where
        datestr {date_filter_streaks}
        and city_id {city_filter}
        and marketplace = 'personal_transport'
    ) ft
    join (
      select
        msg.tripUuid,
        datestr,
        msg.segments
      from
        rawdata_user.kafka_hp_gurafu_route_logs_nodedup
      where
        datestr {date_filter_streaks}
        and msg.cityid {city_filter}
        and msg.experimentname is NULL
    ) t on t.tripUuid = ft.uuid
),
suggested_segments as (
  select
    datestr,
    x.segment.segmentuuid,
    x.segment.startjunctionuuid,
    x.segment.endjunctionuuid
  from
    suggested_routes lateral view explode (suggested_route) x as segment
)
select
  s.segmentuuid,
  s.datestr,
  s.startjunctionuuid,
  s.endjunctionuuid,
  count(1) as suggested_streaks
from
  candidate_segments c
  join suggested_segments s on s.segmentuuid = c.candidate_segment
group by
  1, 2, 3, 4
'''.format(**config)

# `config` supplies the {db}, {experiment}, {date_filter_streaks} and
# {city_filter} placeholders used in the template above.
agg_suggested_streaks_df = hive_context.sql(query_agg_suggested_streaks)

In [None]:
# Materialise the aggregated streak counts as parquet under base_dir, then
# publish the materialised files as a table. The table is built from the files
# read back, not from agg_suggested_streaks_df.
parquet_location = '{base_dir}{db}_agg_suggested_streaks{experiment}'.format(**config)
agg_suggested_streaks_df.coalesce(1000).write.mode("overwrite").parquet(parquet_location)
df = spark.read.parquet(parquet_location)
# Overwrite the table too: the parquet write above already overwrites, and with
# the default save mode a re-run of this cell fails once the table exists.
df.coalesce(10).write.mode("overwrite").saveAsTable("{db}.agg_suggested_streaks{experiment}".format(**config))

In [None]:
# Close the Spark session by stopping the SparkContext.
sc.stop()