In [None]:
// Invalid Permanent Block Passage Detector (IBPD)

In [None]:
%%configure -f
{
  "kind": "spark", 
  "proxyUser": "dhruven.vora", 
  "sparkEnv": "SPARK_24", 
  "driverMemory": "12g", 
  "queue": "maps_route_analytics", 
  "numExecutors": 100, 
  "executorCores": 1, 
  "driverCores": 4,
  "conf": {
    "spark.driver.maxResultSize": "10g",
    "spark.executor.memoryOverhead": 3072, 
    "spark.locality.wait": "0",
    "spark.default.parallelism":10000
  },
  "executorMemory": "24g",
  "drogonHeaders": {
    "X-DROGON-CLUSTER": "phx2/Secure"
  }
}

In [None]:
%%spark

In [None]:
/**
Case class definitions used throughout this notebook.
*/
/**
 * A segment identified by its own UUID plus the UUIDs of the junctions at its two ends.
 *
 * The loaders in this notebook populate it from the `segment_uuid`,
 * `start_junction_uuid` and `end_junction_uuid` fields of the traversal-count tables.
 * Field names are snake_case, matching those source fields.
 */
case class Segment (
    segment_uuid: String,
    start_junction_uuid: String,
    end_junction_uuid: String
)

/**
 * A latitude/longitude pair.
 *
 * NOTE(review): not referenced by any cell visible in this notebook.
 */
case class Location (
    latitude: Double,
    longitude: Double
)

/**
 * Traversal counts for one segment, as produced by `SegmentTraversalCountLoader.makeDataset`.
 *
 * The three counts come from the summed `suggested_traversals`, `overlap_traversals`
 * and `actual_traversals` columns, narrowed from Long to Int.
 */
case class SegmentTraversalCount (
    segment: Segment,
    suggestedCount: Int,
    overlapCount: Int,
    actualCount: Int
)

/**
 * Traversal counts for one transition (first segment -> via segment -> last segment),
 * as produced by `TransitionTraversalCountsLoader.makeDataset`.
 *
 * `viaSegment` holds only the first entry of the source `via_segments` array.
 * The three counts come from the summed `suggested_traversals`, `overlap_traversals`
 * and `actual_traversals` columns, narrowed from Long to Int.
 */
case class TransitionTraversalCount (
    firstSegment: Segment,
    lastSegment: Segment,
    viaSegment: Segment,
    suggestedCount: Int,
    overlapCount: Int,
    actualCount: Int
)

/**
 * A road-furniture map feature, as produced by `UmmMapFeatureLoader.makeDataset`.
 *
 * - `uuid`: the feature's own uuid
 * - `featureType`: value of `data.roadfurniture.type`
 * - `segment` / `direction`: the `uuid` and `direction` fields of `data.roadfurniture.onsegment`
 * - `isCondition`: always set to `false` by the loader in this notebook
 */
case class MapFeature (
    uuid: String,
    featureType: String,
    segment: String,
    direction: String,
    isCondition: Boolean
)

/**
 * One issue row read from `map_creation.meds_umm_issues` by `UmmIssueLoader`.
 *
 * Field names are lower-case to match the column names of that table.
 * `featureuuids` is a single String: the loader stores only the first element of the
 * source `featureuuids` array, and the notebook uses it as the join key against
 * map-feature uuids. See `UmmIssueLoader.makeDataset` for how `sampletripuuids`
 * is populated.
 */
case class UMMIssue (
    issueuuid: String,
    ummbuilduuid: String,
    latitude: Double,
    longitude: Double,
    sampletripuuids: List[String],
    featureuuids: String,
    numberoftrips: Int,
    cityid: Int
)

/**
 * A route divergence described by four segments (before the divergence, at the
 * divergence, and the suggested vs. traversed segment after it), together with trip
 * ids and an observation count.
 *
 * NOTE(review): never constructed in the cells visible here. The `div` dataset used
 * later in the notebook is accessed through the `post_div_traversed_segment` and
 * `observations` fields, so it presumably has this shape. Confirm where `div` is
 * defined.
 */
case class NavRouteDivergence (
    trip_id: List[String],
    pre_div_segment: Segment,
    div_segment: Segment,
    post_div_suggested_segment: Segment,
    post_div_traversed_segment: Segment,
    observations: Int
)

/**
 * Camel-case counterpart of `NavRouteDivergence`: the same four segments, an
 * observation count and a list of sample trips.
 *
 * NOTE(review): not referenced by any cell visible in this notebook.
 */
case class NavRouteDivergenceCount (
    preDivSegment: Segment,
    divSegment: Segment,
    postDivSuggestedSegment: Segment,
    postDivTraversedSegment: Segment,
    observations: Int,
    sampleTrips: List[String]
)

In [None]:
/**
 * Input parameters: the date range handed to the traversal-count loaders, which
 * compare it against the `datestr` column with `between` (both ends included).
 */
val startDate = "2022-11-20"
val endDate = "2022-11-26"

In [None]:
/**
This object loads route_corpus_features.segment_traversal_counts.
*/
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import spark.implicits._

/**
 * Loads per-segment traversal counts from
 * `route_corpus_features.segment_traversal_counts`.
 */
object SegmentTraversalCountLoader {

  /** Query traversal counts summed per segment over a date range.
    *
    * Only rows with a non-null `segment_uuid`, `line_of_business = 'rides'` and
    * `vehicle_type in ('CAR')` are included. The range is matched with SQL `between`,
    * so both end dates are included.
    *
    * @param utcFromDateStr first `datestr` value to include
    * @param utcToDateStr last `datestr` value to include
    * @return one row per (segment_uuid, start_junction_uuid, end_junction_uuid) with the
    *         summed `suggested_traversals`, `overlap_traversals` and `actual_traversals`
    * */
  def load(utcFromDateStr: String, utcToDateStr: String): DataFrame = {

    // Never reassigned, so a val is enough.
    val query =
      s"""select segment_uuid, start_junction_uuid, end_junction_uuid, 
         | sum(suggested_traversals) as suggested_traversals,
         | sum(overlap_traversals) as overlap_traversals,
         | sum(actual_traversals) as actual_traversals
         | from route_corpus_features.segment_traversal_counts
         | where segment_uuid is not null 
         | and line_of_business = 'rides'
         | AND vehicle_type in ('CAR')
         | and datestr between '$utcFromDateStr' and '$utcToDateStr'
         | group by segment_uuid, start_junction_uuid, end_junction_uuid""".stripMargin
        .replaceAll("\n", " ")

    spark.sql(query)
  }

  /** Convert the raw query result into typed [[SegmentTraversalCount]] records.
    *
    * The summed counts are read as Long and narrowed to Int; a sum above
    * `Int.MaxValue` would wrap around.
    *
    * @param rawDataset rows shaped like the output of [[load]]
    * @return one [[SegmentTraversalCount]] per input row
    * */
  def makeDataset(rawDataset: DataFrame): Dataset[SegmentTraversalCount] = {

    rawDataset.map { r =>
      SegmentTraversalCount(
        segment = Segment(
          r.getAs[String]("segment_uuid"),
          r.getAs[String]("start_junction_uuid"),
          r.getAs[String]("end_junction_uuid")),
        suggestedCount = r.getAs[Long]("suggested_traversals").toInt,
        overlapCount = r.getAs[Long]("overlap_traversals").toInt,
        actualCount = r.getAs[Long]("actual_traversals").toInt
      )
    }
  }
}

In [None]:
/**
 * Load segment traversal counts for startDate..endDate, convert them to the typed
 * dataset and mark it for caching; the count() forces the computation.
 */
val stcRaw = SegmentTraversalCountLoader.load(utcFromDateStr = startDate, utcToDateStr = endDate)
val stc = SegmentTraversalCountLoader.makeDataset(rawDataset = stcRaw).cache()
stc.count()

In [None]:
/**
This object loads transitions from route_corpus_features.transition_traversal_counts.
*/

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession, Column}
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.Map


/**
 * Loads transition traversal counts from
 * `route_corpus_features.transition_traversal_counts`.
 */
object TransitionTraversalCountsLoader {

  /** Query traversal counts summed per transition over a date range.
    *
    * Only rows with non-null `first_segment`, `last_segment`, `via_segments` and
    * `via_segments[0]`, `line_of_business = 'rides'` and `vehicle_type in ('CAR')`
    * are included. The range is matched with SQL `between`, so both end dates are
    * included.
    *
    * @param utcFromDateStr first `datestr` value to include
    * @param utcToDateStr last `datestr` value to include
    * @return one row per (first_segment, last_segment, via_segments) with the summed
    *         `suggested_traversals`, `actual_traversals` and `overlap_traversals`
    * */
  def loadTTC(utcFromDateStr: String, utcToDateStr: String): DataFrame = {

    // Never reassigned, so a val is enough.
    val query =
      s"""select first_segment, last_segment, via_segments, 
         | sum(suggested_traversals) as suggested_traversals, 
         | sum(actual_traversals) as actual_traversals, 
         | sum(overlap_traversals) as overlap_traversals 
         | from route_corpus_features.transition_traversal_counts
         | where datestr between '$utcFromDateStr' and '$utcToDateStr'
         | AND first_segment is not NULL
         | AND last_segment is not NULL
         | AND via_segments is not NULL
         | AND via_segments[0] is not NULL
         | and line_of_business = 'rides'
         | AND vehicle_type in ('CAR')
         | group by first_segment, last_segment, via_segments""".stripMargin.replaceAll("\n", " ")

    spark.sql(query)
  }

  /** Convert the raw query result into typed [[TransitionTraversalCount]] records.
    *
    * Only the first entry of `via_segments` is used as the via segment. Rows where
    * `first_segment`, `last_segment` or that first via entry is missing are dropped
    * instead of failing the job; [[loadTTC]] already excludes such rows. The result
    * keeps only transitions whose segments connect end to start:
    * first.end == via.start and via.end == last.start.
    *
    * The summed counts are read as Long and narrowed to Int.
    *
    * @param rawDataset rows shaped like the output of [[loadTTC]]
    * @return the connected transitions with their traversal counts
    * */
  def makeDataset(rawDataset: DataFrame): Dataset[TransitionTraversalCount] = {
    import spark.implicits._

    rawDataset.flatMap { r =>
      // Local to the closure: keeps the lambda free of references to the enclosing object.
      def toSegment(s: Row): Segment =
        Segment(
          s.getAs[String]("segment_uuid"),
          s.getAs[String]("start_junction_uuid"),
          s.getAs[String]("end_junction_uuid"))

      val transition = for {
        first <- Option(r.getAs[Row]("first_segment"))
        last <- Option(r.getAs[Row]("last_segment"))
        vias <- Option(r.getAs[Seq[Row]]("via_segments"))
        via <- vias.headOption.flatMap(v => Option(v))
      } yield TransitionTraversalCount(
        firstSegment = toSegment(first),
        lastSegment = toSegment(last),
        viaSegment = toSegment(via),
        suggestedCount = r.getAs[Long]("suggested_traversals").toInt,
        overlapCount = r.getAs[Long]("overlap_traversals").toInt,
        actualCount = r.getAs[Long]("actual_traversals").toInt)

      transition.toList
    }
      .filter(t => t.firstSegment.end_junction_uuid == t.viaSegment.start_junction_uuid &&
            t.viaSegment.end_junction_uuid == t.lastSegment.start_junction_uuid)
  }
}

In [None]:
/**
 * Load transition traversal counts for startDate..endDate, convert them to the typed
 * dataset and mark it for caching; the count() forces the computation.
 */
val ttcRaw = TransitionTraversalCountsLoader.loadTTC(utcFromDateStr = startDate, utcToDateStr = endDate)
val ttc = TransitionTraversalCountsLoader.makeDataset(rawDataset = ttcRaw).cache()
ttc.count()

In [None]:
/**
This object loads umm.map_feature_road_furnitures_tomtom.
*/
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import spark.implicits._

/**
 * Loads permanent-barrier road-furniture features from
 * `umm.map_feature_road_furnitures_tomtom`.
 */
object UmmMapFeatureLoader {

  /** Query the unconditional `PERMANENT_BARRIER` road furnitures of one build.
    *
    * Only rows with a non-null `uuid`, a non-null `data.roadfurniture.onsegment`
    * (and `onsegment.uuid`), a null `data.roadfurniture.condition` and
    * `data.roadfurniture.type = 'PERMANENT_BARRIER'` are returned.
    *
    * @param buildId value matched against the `builduuid` column
    * @return rows with columns `uuid`, `type`, `condition` and `onsegment`
    * */
  def load(buildId: String): DataFrame = {

    // Never reassigned, so a val is enough.
    val query =
      s"""select uuid, 
         | data.roadfurniture.type as type, 
         | data.roadfurniture.condition as condition, 
         | data.roadfurniture.onsegment as onsegment
         | from umm.map_feature_road_furnitures_tomtom
         | where uuid is not null
         | AND builduuid = '$buildId'
         | AND data.roadfurniture.onsegment is not null
         | AND data.roadfurniture.onsegment.uuid is not null
         | AND data.roadfurniture.condition is null
         | AND data.roadfurniture.type = 'PERMANENT_BARRIER'""".stripMargin
        .replaceAll("\n", " ")

    spark.sql(query)
  }

  /** Convert the raw query result into typed [[MapFeature]] records.
    *
    * The `condition` column is not read here.
    *
    * @param rawDataset rows shaped like the output of [[load]]
    * @return one [[MapFeature]] per input row
    * */
  def makeDataset(rawDataset: DataFrame): Dataset[MapFeature] = {

    rawDataset.map { r =>
      // Look the struct up once instead of once per extracted field.
      val onSegment = r.getAs[Row]("onsegment")
      MapFeature(
        uuid = r.getAs[String]("uuid"),
        featureType = r.getAs[String]("type"),
        segment = onSegment.getAs[String]("uuid"),
        direction = onSegment.getAs[String]("direction"),
        // load() only selects rows whose condition is null.
        isCondition = false
      )
    }
  }
}

In [None]:
/**
 * Load the permanent-barrier features of one hard-coded UMM build, convert them to the
 * typed dataset and mark it for caching; the count() forces the computation.
 */
val mapFeaturesRaw = UmmMapFeatureLoader.load(buildId = "41837d28-6a6e-11ed-b801-b026282546b0")
val mapFeatures = UmmMapFeatureLoader.makeDataset(rawDataset = mapFeaturesRaw).cache()
mapFeatures.count()

In [None]:
/**
 * Segment-based features: keep one map feature per segment, join it with that segment's
 * traversal counts and emit (feature uuid, actual traversal count).
 *
 * NOTE(review): the following cell defines `ibpdFeatures` again from transition counts;
 * when both cells are run in order, that later definition is the one downstream cells see.
 */
val ibpdFeatures = mapFeatures.
    filter(f => f.segment != null).alias("F").
    dropDuplicates("segment").
    joinWith(stc.alias("S"), col("F.segment") === col("S.segment.segment_uuid")).
    map(pair => (pair._1.uuid, pair._2.actualCount)).
    cache()

ibpdFeatures.count()

In [None]:
/**
 * Transition-based features: keep one map feature per segment, join it with every
 * transition whose via segment is that segment, and sum the transitions' actual
 * traversal counts per feature uuid. The result is a DataFrame with columns
 * `uuid` and `actualCount`.
 */
val ibpdFeatures = mapFeatures.
    filter(f => f.segment != null).alias("F").
    dropDuplicates("segment").
    joinWith(ttc.alias("T"), col("F.segment") === col("T.viaSegment.segment_uuid")).
    groupBy(col("_1.uuid").as("uuid")).
    agg(sum(col("_2.actualCount")).as("actualCount")).
    cache()

ibpdFeatures.count()

In [None]:
/**
 * Keep only the features whose summed actual traversal count is at least 20,
 * then count them.
 */
val issues = ibpdFeatures.filter(row => row.getAs[Long]("actualCount") >= 20L)
issues.count()

In [None]:
/**
 * Divergence-based features.
 *
 * 1. `seg_div` pairs every divergence with the traversal counts of the segment that was
 *    traversed after the divergence.
 * 2. `ibpdFeatures_Div` keeps one map feature per segment and joins it with the pairs
 *    whose traversed segment is the feature's segment and whose suggested count is 0,
 *    emitting (feature uuid, divergence observations).
 *
 * NOTE(review): `div` is not defined in any cell visible here; presumably a
 * Dataset[NavRouteDivergence] -- confirm where it is created.
 */

val seg_div = div.alias("D").
    joinWith(stc.alias("S"), col("D.post_div_traversed_segment.segment_uuid") === col("S.segment.segment_uuid"))

val ibpdFeatures_Div = mapFeatures.
    filter(f => f.segment != null).alias("F").
    dropDuplicates("segment").
    joinWith(
      seg_div.alias("D"),
      col("F.segment") === col("D._1.post_div_traversed_segment.segment_uuid") &&
        col("D._2.suggestedCount") === 0).
    map(pair => (pair._1.uuid, pair._2._1.observations)).
    cache()

ibpdFeatures_Div.count()

In [None]:
/**
 * Keep only the (feature uuid, observations) pairs with at least 3 observations,
 * then count them.
 */
val issues_Div = ibpdFeatures_Div.filter(pair => pair._2 >= 3)
issues_Div.count()

In [None]:
/**
This object loads issues from map_creation.meds_umm_issues.
*/

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import spark.implicits._
import org.apache.spark.sql.functions._
import scala.collection.mutable.ListBuffer
import org.apache.spark.sql._


/**
 * Loads issues from `map_creation.meds_umm_issues`.
 */
object UmmIssueLoader {

  /** Query the non-production issues of the `InvalidPermanentBlockPassageDetector`
    * created within a date range.
    *
    * The range is matched against `createddate` with SQL `between`, so both end dates
    * are included. Only rows with `productionrun = false` are returned.
    *
    * @param utcFromDateStr first `createddate` value to include
    * @param utcToDateStr last `createddate` value to include
    * @return rows with columns `issueuuid`, `ummbuilduuid`, `latitude`, `longitude`,
    *         `sampletripuuids`, `featureuuids`, `numberoftrips` and `cityid`
    * */
  def loadUmmIssues(utcFromDateStr: String, utcToDateStr: String): DataFrame = {

    // Never reassigned, so a val is enough.
    val query =
      s"""select 
         | issueuuid,
         | ummbuilduuid,
         | latitude,
         | longitude,
         | sampletripuuids,
         | featureuuids,
         | numberoftrips,
         | cityid
         | from map_creation.meds_umm_issues
         | where createddate between '$utcFromDateStr' and '$utcToDateStr'
         | and productionrun = false
         | and detectorname = 'InvalidPermanentBlockPassageDetector'""".stripMargin
        .replaceAll("\n", " ")

    spark.sql(query)
  }

  /** Convert the raw query result into typed [[UMMIssue]] records.
    *
    * `featureuuids` keeps only the first element of the source array; it is `null` when
    * the array is missing or empty, so such a row can never match a join on that key
    * but is still reported.
    *
    * `sampletripuuids` is copied from the column of the same name (empty when the column
    * is null). Previously this field was always left empty even though the column is
    * selected by [[loadUmmIssues]].
    * NOTE(review): assumes `sampletripuuids` is an array of strings like
    * `featureuuids` -- confirm against the table schema.
    *
    * @param rawDataset rows shaped like the output of [[loadUmmIssues]]
    * @return one [[UMMIssue]] per input row
    * */
  def makeDataset(rawDataset: DataFrame): Dataset[UMMIssue] = {

    rawDataset.map { r =>
      // Null-safe read of a string-array column as an immutable list.
      def stringList(column: String): List[String] =
        Option(r.getAs[Seq[String]](column)).map(_.toList).getOrElse(Nil)

      UMMIssue(
        issueuuid = r.getAs[String]("issueuuid"),
        ummbuilduuid = r.getAs[String]("ummbuilduuid"),
        latitude = r.getAs[Double]("latitude"),
        longitude = r.getAs[Double]("longitude"),
        sampletripuuids = stringList("sampletripuuids"),
        // headOption instead of head: an empty array must not fail the whole job.
        featureuuids = stringList("featureuuids").headOption.orNull,
        numberoftrips = r.getAs[Int]("numberoftrips"),
        cityid = r.getAs[Int]("cityid")
      )
    }

  }
}

In [None]:
/**
 * Load the UMM issues created on the hard-coded date 2022-07-21 and convert them to the
 * typed dataset.
 *
 * The dataset is cached like the other loaded datasets in this notebook: it is counted
 * here and then used in two separate joins, which would otherwise each re-run the
 * query.
 *
 * NOTE(review): this date is independent of startDate/endDate -- confirm that is intended.
 */
val ummIssuesRaw = UmmIssueLoader.loadUmmIssues("2022-07-21", "2022-07-21")
val ummIssues = UmmIssueLoader.makeDataset(ummIssuesRaw).cache
ummIssues.count()

In [None]:
/**
 * Full outer join of the computed issues with the UMM issues on feature uuid.
 *
 * Each result is a pair (computed issue, UMM issue); one side is null when it has no
 * match on the other side.
 *
 * `issues` is a DataFrame with the columns `uuid` and `actualCount` (it is filtered by
 * reading `actualCount` from a Row), so the join key is `I.uuid`. A column `I._1`
 * does not exist on it and cannot be resolved.
 */
val joinedIssues = issues.alias("I").joinWith(ummIssues.alias("U"), col("I.uuid") === col("U.featureuuids"), "outer")
joinedIssues.count()

In [None]:
/**
 * Print up to 10 joined pairs that have a UMM issue but no computed issue.
 */
joinedIssues.filter(pair => pair._2 != null && pair._1 == null).limit(10).collect().foreach(println)

In [None]:
// Same comparison for the divergence-based issues: full outer join with the UMM issues
// on feature uuid, then print up to 10 pairs that have a UMM issue but no computed issue.
issues_Div.alias("I").
    joinWith(ummIssues.alias("U"), col("I._1") === col("U.featureuuids"), "outer").
    filter(pair => pair._2 != null && pair._1 == null).
    limit(10).collect().foreach(println)