<span style="color:blue">Thanks for using Drogon for your interactive Spark application. We update Drogon/SparkMagic as often as possible to make it easier, faster and more reliable for you. Have a question or feedback? Ping us on [uChat](https://uchat.uberinternal.com/uber/channels/spark).</span>

What's New
- Now you can use `%%configure` and `%%spark` magics to configure and start a Spark session (deprecating hard-to-use `%load_ext sparkmagic.magics` and `manage_spark` magics). Check out [this example](https://workbench.uberinternal.com/explore/knowledge/localfile/cwang/sparkmagic_python2_example.ipynb) for more details.
- Improved `%%configure` magic. You can now use it to set all Spark and Drogon configurations from within the notebook itself. Check out our [latest documentation & examples](https://docs.google.com/document/d/1mkYtDHquh4FjqTeA0Fxii8lyV-P6qzmoABhmmRwm_00/edit#heading=h.xn14pmoorsn0) for more details.
- Bug fixes and performance updates.


In [None]:
/**
 * Computes the pre-fix trip impact for a given region or city.
 */

In [1]:
%%configure -f
{
  "kind": "spark", 
  "proxyUser": "dhruven.vora", 
  "sparkEnv": "SPARK_24", 
  "driverMemory": "12g", 
  "queue": "maps_route_analytics", 
  "numExecutors": 200, 
  "executorCores": 1, 
  "driverCores": 4,
  "conf": {
    "spark.driver.maxResultSize": "10g",
    "spark.executor.memoryOverhead": 3072, 
    "spark.locality.wait": "0",
    "spark.default.parallelism":10000
  },
  "executorMemory": "24g",
  "drogonHeaders": {
    "X-DROGON-CLUSTER": "phx2/Secure"
  }
}

In [2]:
%%spark

Starting Spark application (can take 60s or more)...
Starting heartbeat thread...done.
Waiting for Drogon session to be ready...............................................
Drogon session is ready.


Drogon Session ID,Spark Application ID,Kind,State,Spark UI,Driver log
518766339,application_1669142328971_34228,spark,idle,Link,Link


SparkSession available as 'spark'.


Cell execution took 92 seconds.


In [3]:
// Case class definitions shared by the cells of this notebook.

/**
 * Identifies a segment by its own UUID together with the UUIDs of its two end junctions.
 *
 * The snake_case field names match the columns that SegmentTraversalCountLoader.makeDataset
 * reads from route_corpus_features.segment_traversal_counts.
 *
 * @param segment_uuid        UUID of the segment
 * @param start_junction_uuid UUID of the junction at the start of the segment
 * @param end_junction_uuid   UUID of the junction at the end of the segment
 */
case class Segment (
    segment_uuid: String,
    start_junction_uuid: String,
    end_junction_uuid: String
)

/**
 * A latitude/longitude pair.
 *
 * NOTE(review): not constructed in the cells shown here; units are presumably decimal
 * degrees — confirm.
 *
 * @param latitude  latitude of the point
 * @param longitude longitude of the point
 */
case class Location (
    latitude: Double,
    longitude: Double
)

/**
 * Traversal counters for one [[Segment]].
 *
 * SegmentTraversalCountLoader.makeDataset fills the counters from the per-segment sums of the
 * suggested_traversals, overlap_traversals and actual_traversals columns, converted from
 * Long to Int.
 *
 * @param segment        the segment the counters belong to
 * @param suggestedCount summed suggested_traversals for the segment
 * @param overlapCount   summed overlap_traversals for the segment
 * @param actualCount    summed actual_traversals for the segment
 */
case class SegmentTraversalCount (
    segment: Segment,
    suggestedCount: Int,
    overlapCount: Int,
    actualCount: Int
)

/**
 * A road-furniture map feature attached to a segment, as built by
 * UmmMapFeatureLoader.makeDataset from umm.map_feature_road_furnitures_tomtom.
 *
 * @param uuid        value of the uuid column
 * @param featureType value of data.roadfurniture.type
 * @param segment     UUID read from data.roadfurniture.onsegment.uuid
 * @param direction   value read from data.roadfurniture.onsegment.direction
 * @param isCondition whether the feature carries a condition; the loader in this notebook
 *                    always sets it to false because its query keeps only rows whose
 *                    condition is null
 */
case class MapFeature (
    uuid: String,
    featureType: String,
    segment: String,
    direction: String,
    isCondition: Boolean
)

/**
 * One issue record, as built by UmmIssueLoader.makeDataset from
 * map_creation.meds_umm_issues. Field names mirror the lowercase column names of that table.
 *
 * @param issueuuid       value of the issueuuid column
 * @param ummbuilduuid    value of the ummbuilduuid column
 * @param latitude        value of the latitude column
 * @param longitude       value of the longitude column
 * @param sampletripuuids sample trip UUIDs associated with the issue
 * @param featureuuids    a single feature UUID; the loader keeps only the first element of
 *                        the featureuuids array column
 * @param numberoftrips   value of the numberoftrips column
 * @param cityid          value of the cityid column
 * @param detectorname    value of the detectorname column
 */
case class UMMIssue (
    issueuuid: String,
    ummbuilduuid: String,
    latitude: Double,
    longitude: Double,
    sampletripuuids: List[String],
    featureuuids: String,
    numberoftrips: Int,
    cityid: Int,
    detectorname: String
)

/**
 * A route divergence described by four segments plus the trips it was observed on.
 *
 * NOTE(review): not constructed in the cells shown here; the field descriptions below are
 * inferred from the field names — confirm against the code that populates this class.
 * The snake_case names presumably mirror source column names.
 *
 * @param trip_id                    trip identifiers (a list, despite the singular name)
 * @param pre_div_segment            segment before the divergence
 * @param div_segment                segment at which the divergence happens
 * @param post_div_suggested_segment segment the suggested route continues on
 * @param post_div_traversed_segment segment that was actually traversed
 * @param observations               number of observations of this divergence
 */
case class NavRouteDivergence (
    trip_id: List[String],
    pre_div_segment: Segment,
    div_segment: Segment,
    post_div_suggested_segment: Segment,
    post_div_traversed_segment: Segment,
    observations: Int
)

/**
 * Observation count for one route divergence, with a list of sample trips.
 *
 * NOTE(review): not constructed in the cells shown here; the field descriptions below are
 * inferred from the field names — confirm against the code that populates this class.
 *
 * @param preDivSegment           segment before the divergence
 * @param divSegment              segment at which the divergence happens
 * @param postDivSuggestedSegment segment the suggested route continues on
 * @param postDivTraversedSegment segment that was actually traversed
 * @param observations            number of observations of this divergence
 * @param sampleTrips             identifiers of sample trips showing the divergence
 */
case class NavRouteDivergenceCount (
    preDivSegment: Segment,
    divSegment: Segment,
    postDivSuggestedSegment: Segment,
    postDivTraversedSegment: Segment,
    observations: Int,
    sampleTrips: List[String]
)

defined class NavRouteDivergenceCount

In [4]:
/**
 * Input parameters for this run. Dates are yyyy-MM-dd strings.
 */
// Date range handed to the segment traversal count query (both ends are used with `between`).
val startDate: String = "2022-11-09"
val endDate: String = "2022-11-15"

// City whose issues are loaded.
val cityId: Int = 202

// Creation date used to select issues from map_creation.meds_umm_issues.
val issueCreationDate: String = "2022-11-16"

issueCreationDate: String = 2022-11-16

In [5]:
/**
Defines UmmIssueLoader, which loads issues from map_creation.meds_umm_issues.
*/

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import spark.implicits._
import org.apache.spark.sql.functions._
import scala.collection.mutable.ListBuffer
import org.apache.spark.sql._


/**
 * Loads issues from map_creation.meds_umm_issues and converts them to [[UMMIssue]] records.
 */
object UmmIssueLoader {

  /** Runs the query that loads production-run issues of one city within a creation-date range.
    *
    * The arguments are interpolated verbatim into the SQL text, so they must come from
    * trusted notebook parameters.
    *
    * @param utcFromDateStr lower bound for `createddate` (inclusive, SQL `between`)
    * @param utcToDateStr   upper bound for `createddate` (inclusive, SQL `between`)
    * @param cityId         value matched against the `cityid` column
    * @return the raw query result, one row per issue
    * */
  def loadUmmIssues(utcFromDateStr: String, utcToDateStr: String, cityId: Integer): DataFrame = {

    val query =
      s"""select 
         | issueuuid,
         | ummbuilduuid,
         | latitude,
         | longitude,
         | sampletripuuids,
         | featureuuids,
         | numberoftrips,
         | cityid,
         | detectorname
         | from map_creation.meds_umm_issues
         | where createddate between '$utcFromDateStr' and '$utcToDateStr'
         | and productionrun = true
         | and cityid = $cityId""".stripMargin
        .replaceAll("\n", " ")

    spark.sql(query)
  }

  /** Converts the raw rows returned by [[loadUmmIssues]] into typed [[UMMIssue]] records.
    *
    * Only the first element of `featureuuids` is kept; it is null when the array is null or
    * empty, so such rows no longer abort the job. `sampletripuuids` is copied from the
    * column of the same name (empty list when the column is null).
    *
    * @param rawDataset rows carrying the columns selected by [[loadUmmIssues]]
    * @return one [[UMMIssue]] per input row
    * */
  def makeDataset(rawDataset: DataFrame): Dataset[UMMIssue] = {

    rawDataset.map { row =>
      // Row.getAs returns null for a null array column, so wrap before touching the values.
      val featureUuids = Option(row.getAs[Seq[String]]("featureuuids")).getOrElse(Seq.empty)
      // NOTE(review): assumes sampletripuuids is an array<string> column like featureuuids —
      // confirm against the table schema.
      val sampleTrips = Option(row.getAs[Seq[String]]("sampletripuuids")).getOrElse(Seq.empty)

      UMMIssue(
        issueuuid = row.getAs[String]("issueuuid"),
        ummbuilduuid = row.getAs[String]("ummbuilduuid"),
        latitude = row.getAs[Double]("latitude"),
        longitude = row.getAs[Double]("longitude"),
        sampletripuuids = sampleTrips.toList,
        featureuuids = featureUuids.headOption.orNull,
        numberoftrips = row.getAs[Int]("numberoftrips"),
        cityid = row.getAs[Int]("cityid"),
        detectorname = row.getAs[String]("detectorname")
      )
    }
  }
}

defined object UmmIssueLoader

In [6]:
/**
 * Load the issues created on issueCreationDate for cityId, convert them to UMMIssue records
 * and count them.
 */
val ummIssuesRaw: DataFrame =
  UmmIssueLoader.loadUmmIssues(issueCreationDate, issueCreationDate, cityId)
val ummIssues: Dataset[UMMIssue] = UmmIssueLoader.makeDataset(ummIssuesRaw)
ummIssues.count()

res16: Long = 1315

In [None]:
// categorize issues as segments or features

In [None]:
/**
Defines SegmentTraversalCountLoader, which loads route_corpus_features.segment_traversal_counts.
*/
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import spark.implicits._

/**
 * Loads per-segment traversal totals from route_corpus_features.segment_traversal_counts and
 * converts them to [[SegmentTraversalCount]] records.
 */
object SegmentTraversalCountLoader {

  /** Runs the query that sums the traversal counters per segment over a date range.
    *
    * Only rows with a non-null segment_uuid, line_of_business 'rides' and vehicle_type 'CAR'
    * are included. The arguments are interpolated verbatim into the SQL text, so they must
    * come from trusted notebook parameters.
    *
    * @param utcFromDateStr lower bound for `datestr` (inclusive, SQL `between`)
    * @param utcToDateStr   upper bound for `datestr` (inclusive, SQL `between`)
    * @return one row per (segment_uuid, start_junction_uuid, end_junction_uuid) group
    * */
  def load(utcFromDateStr: String, utcToDateStr: String): DataFrame = {

    val query =
      s"""select segment_uuid, start_junction_uuid, end_junction_uuid, 
         | sum(suggested_traversals) as suggested_traversals,
         | sum(overlap_traversals) as overlap_traversals,
         | sum(actual_traversals) as actual_traversals
         | from route_corpus_features.segment_traversal_counts
         | where segment_uuid is not null 
         | and line_of_business = 'rides'
         | AND vehicle_type in ('CAR')
         | and datestr between '$utcFromDateStr' and '$utcToDateStr'
         | group by segment_uuid, start_junction_uuid, end_junction_uuid""".stripMargin
        .replaceAll("\n", " ")

    spark.sql(query)
  }

  /** Converts the raw rows returned by [[load]] into typed [[SegmentTraversalCount]] records.
    *
    * The summed counters arrive as Long and are narrowed to Int with `Math.toIntExact`, which
    * throws ArithmeticException instead of silently wrapping when a total exceeds the Int
    * range. It is called inline (not through a helper on this object) so the closure does not
    * have to capture the enclosing object.
    *
    * @param rawDataset rows carrying the columns selected by [[load]]
    * @return one [[SegmentTraversalCount]] per input row
    * */
  def makeDataset(rawDataset: DataFrame): Dataset[SegmentTraversalCount] = {

    rawDataset.map { row =>
      SegmentTraversalCount(
        segment = Segment(
          row.getAs[String]("segment_uuid"),
          row.getAs[String]("start_junction_uuid"),
          row.getAs[String]("end_junction_uuid")
        ),
        suggestedCount = Math.toIntExact(row.getAs[Long]("suggested_traversals")),
        overlapCount = Math.toIntExact(row.getAs[Long]("overlap_traversals")),
        actualCount = Math.toIntExact(row.getAs[Long]("actual_traversals"))
      )
    }
  }
}

In [None]:
/**
 * Load the segment traversal counts for startDate..endDate, cache the typed dataset and
 * count it.
 */
val stcRaw: DataFrame = SegmentTraversalCountLoader.load(startDate, endDate)
val stc: Dataset[SegmentTraversalCount] =
  SegmentTraversalCountLoader.makeDataset(stcRaw).cache()
stc.count()

In [None]:
/**
Defines UmmMapFeatureLoader, which loads umm.map_feature_road_furnitures_tomtom.
*/
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import spark.implicits._

/**
 * Loads road-furniture map features from umm.map_feature_road_furnitures_tomtom and converts
 * them to [[MapFeature]] records.
 */
object UmmMapFeatureLoader {

  /** Queries the PERMANENT_BARRIER road furniture of one build.
    *
    * Rows are kept only when uuid, onsegment and onsegment.uuid are non-null and the
    * feature has no condition.
    *
    * @param buildId value matched against the `builduuid` column
    * @return rows with the columns uuid, type, condition and onsegment
    * */
  def load(buildId: String): DataFrame = {
    val multiLineSql =
      s"""select uuid, 
         | data.roadfurniture.type as type, 
         | data.roadfurniture.condition as condition, 
         | data.roadfurniture.onsegment as onsegment
         | from umm.map_feature_road_furnitures_tomtom
         | where uuid is not null
         | AND builduuid = '$buildId'
         | AND data.roadfurniture.onsegment is not null
         | AND data.roadfurniture.onsegment.uuid is not null
         | AND data.roadfurniture.condition is null
         | AND data.roadfurniture.type = 'PERMANENT_BARRIER'""".stripMargin

    // Collapse the statement onto a single line before handing it to Spark.
    spark.sql(multiLineSql.replaceAll("\n", " "))
  }

  /** Converts the raw rows returned by [[load]] into typed [[MapFeature]] records.
    *
    * @param rawDataset rows carrying the columns selected by [[load]]
    * @return one [[MapFeature]] per input row, with isCondition fixed to false
    * */
  def makeDataset(rawDataset: DataFrame): Dataset[MapFeature] =
    rawDataset.map { row =>
      // The segment reference is a nested struct; read it once and pick its fields.
      val onSegment = row.getAs[Row]("onsegment")

      MapFeature(
        uuid = row.getAs[String]("uuid"),
        featureType = row.getAs[String]("type"),
        segment = onSegment.getAs[String]("uuid"),
        direction = onSegment.getAs[String]("direction"),
        isCondition = false
      )
    }
}

In [None]:
/**
 * Load the road-furniture features of one hard-coded build from
 * umm.map_feature_road_furnitures_tomtom, cache the typed dataset and count it.
 */
val mapFeaturesRaw: DataFrame =
  UmmMapFeatureLoader.load("418384f4-6680-11ed-ae4a-506b4bb1373e")
val mapFeatures: Dataset[MapFeature] =
  UmmMapFeatureLoader.makeDataset(mapFeaturesRaw).cache()
mapFeatures.count()