In [None]:
%%configure -f
{
  "kind": "spark", 
  "proxyUser": "dhruven.vora", 
  "sparkEnv": "SPARK_24", 
  "driverMemory": "12g", 
  "queue": "maps_route_analytics", 
  "numExecutors": 400, 
  "executorCores": 1, 
  "driverCores": 4,
  "conf": {
    "spark.driver.maxResultSize": "10g",
    "spark.executor.memoryOverhead": 3072, 
    "spark.locality.wait": "0",
    "spark.default.parallelism":10000
  },
  "executorMemory": "24g",
  "drogonHeaders": {
    "X-DROGON-CLUSTER": "PHX2/Secure"
  }
}

In [None]:
%%spark

In [None]:
/**
 * Data model shared by the cells of this notebook.
 */

/** A segment identified by its own uuid together with the uuids of its start and end junctions. */
case class Segment(segment_uuid: String, start_junction_uuid: String, end_junction_uuid: String)

/** A latitude/longitude pair. Not referenced by the visible cells. */
case class Location(latitude: Double, longitude: Double)

/** Suggested, overlap and actual counters for a single segment. Not referenced by the visible cells. */
case class SegmentTraversalCount(
  segment: Segment,
  suggestedCount: Int,
  overlapCount: Int,
  actualCount: Int
)

/**
 * One divergence record: the segments before, at and after the divergence point.
 * NOTE(review): `trip_id` holds a list of ids despite its singular name - confirm intent.
 */
case class NavRouteDivergence(
  trip_id: List[String],
  pre_div_segment: Segment,
  div_segment: Segment,
  post_div_suggested_segment: Segment,
  post_div_traversed_segment: Segment
)

/** Divergence segments together with an observation count and a list of sample trips. */
case class NavRouteDivergenceCount(
  preDivSegment: Segment,
  divSegment: Segment,
  postDivSuggestedSegment: Segment,
  postDivTraversedSegment: Segment,
  observations: Int,
  sampleTrips: List[String]
)

/**
 * Traversal counters for a transition firstSegment -> viaSegment -> lastSegment.
 * This is the element type of the dataset built by TransitionTraversalCountsLoader.
 */
case class TransitionTraversalCount(
  firstSegment: Segment,
  lastSegment: Segment,
  viaSegment: Segment,
  suggestedCount: Int,
  overlapCount: Int,
  actualCount: Int
)

/** An actual/suggested transition pair plus the divergence segments and an observation count. */
case class TransitionDivergenceFeature(
  actualTransition: TransitionTraversalCount,
  suggestedTransition: TransitionTraversalCount,
  divSegment: Segment,
  postDivSuggestedSegment: Segment,
  postDivTraversedSegment: Segment,
  observations: Int
)

/** An actual transition paired with a suggested transition. */
case class TransitionTraversalCountPair(
  actualTransition: TransitionTraversalCount,
  suggestedTransition: TransitionTraversalCount
)

/** Traversal counters and sample trip uuids for a list of segment ids. */
case class TurnPermittedFeature(
  segmentIds: List[String],
  actualTraversalsCountOnTransition: Int,
  suggestedTraversalsCountOnTransition: Int,
  actualTraversalsCountOnSuggestedSegment: Int,
  sampleTripsUuids: List[String]
)

/**
 * Record emitted by the publishing cell.
 * Field names are all lower-case, presumably to match the output column names - TODO confirm.
 */
case class UMMIssue(
  issueuuid: String,
  ummbuilduuid: String,
  latitude: Double,
  longitude: Double,
  sampletripuuids: String,
  featureuuids: String,
  numberoftrips: Int,
  cityid: Int
)

/** A map feature: its uuid and the uuids of the segments it lists, in source order. */
case class MapFeature(uuid: String, segments: List[String])

/** A road-furniture barrier attached to one segment, with a direction and a condition flag. */
case class PermanentBarrier(
  uuid: String,
  featureType: String,
  segment: String,
  direction: String,
  isCondition: Boolean
)

In [None]:
/**
 * Input parameters for the algorithm.
 *
 * startDate / endDate are handed to TransitionTraversalCountsLoader.loadTTC, which places them in
 * a `datestr between ... and ...` predicate; ummVersion is handed to the map-feature loaders,
 * which match it against the `builduuid` column.
 */
val startDate: String = "2022-12-11"
val endDate: String = "2022-12-17"
val ummVersion: String = "f905b99a-8137-11ed-9118-000af7d19b40"

In [None]:
/**
This object loads transition traversal counts from route_corpus_features.transition_traversal_counts
*/

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession, Column}
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.Map


object TransitionTraversalCountsLoader {

  /** Aggregates traversal counts per transition over a `datestr` range.
    *
    * Reads route_corpus_features.transition_traversal_counts, keeping only rows with
    * `line_of_business = 'rides'` and `vehicle_type in ('CAR')` whose first segment, last segment,
    * via-segment array and first via segment are all non-null. The three counters are summed per
    * (first_segment, last_segment, via_segments) group.
    *
    * @param utcFromDateStr lower bound of the `datestr between` predicate
    * @param utcToDateStr upper bound of the `datestr between` predicate
    * @return one row per group with columns first_segment, last_segment, via_segments,
    *         suggested_traversals, actual_traversals and overlap_traversals
    */
  def loadTTC(utcFromDateStr: String, utcToDateStr: String): DataFrame = {

    val query =
      s"""select first_segment, last_segment, via_segments, 
         | sum(suggested_traversals) as suggested_traversals, 
         | sum(actual_traversals) as actual_traversals, 
         | sum(overlap_traversals) as overlap_traversals 
         | from route_corpus_features.transition_traversal_counts
         | where datestr between '$utcFromDateStr' and '$utcToDateStr'
         | AND first_segment is not NULL
         | AND last_segment is not NULL
         | AND via_segments is not NULL
         | AND via_segments[0] is not NULL
         | and line_of_business = 'rides'
         | AND vehicle_type in ('CAR')
         | group by first_segment, last_segment, via_segments""".stripMargin.replaceAll("\n", " ")

    spark.sql(query)
  }

  /** Converts the rows produced by [[loadTTC]] into typed transitions.
    *
    * Only the first entry of `via_segments` is used as the via segment. Rows whose `via_segments`
    * array is empty or starts with a null entry are dropped (the query in [[loadTTC]] already
    * excludes them). The result is then restricted to connected transitions: the first segment
    * must end at the junction where the via segment starts, and the via segment must end at the
    * junction where the last segment starts.
    *
    * @param rawDataset rows with the columns selected by [[loadTTC]]
    * @return connected first -> via -> last transitions with their summed counters
    */
  def makeDataset(rawDataset: DataFrame): Dataset[TransitionTraversalCount] = {
    import spark.implicits._

    rawDataset
      .flatMap { r =>
        // Declared inside the closure so that the closure does not need to reference this object.
        def toSegment(row: Row): Segment =
          Segment(
            row.getAs[String]("segment_uuid"),
            row.getAs[String]("start_junction_uuid"),
            row.getAs[String]("end_junction_uuid"))

        // Look at the first via segment only; later array entries (which may be null) are never
        // needed, so they are no longer converted.
        val firstVia = r.getAs[Seq[Row]]("via_segments").headOption.flatMap(Option(_))

        firstVia.map { via =>
          TransitionTraversalCount(
            toSegment(r.getAs[Row]("first_segment")),
            toSegment(r.getAs[Row]("last_segment")),
            toSegment(via),
            // The sums arrive as Long; the case class stores Int, so values are narrowed here.
            r.getAs[Long]("suggested_traversals").toInt,
            r.getAs[Long]("overlap_traversals").toInt,
            r.getAs[Long]("actual_traversals").toInt)
        }.toList
      }
      .filter(t => t.firstSegment.end_junction_uuid == t.viaSegment.start_junction_uuid &&
            t.viaSegment.end_junction_uuid == t.lastSegment.start_junction_uuid)
  }
}

In [None]:
/**
 * Load the aggregated transition traversal counts from
 * route_corpus_features.transition_traversal_counts for the startDate..endDate range.
 */
val ttcRaw: DataFrame = TransitionTraversalCountsLoader.loadTTC(startDate, endDate)
val ttc: Dataset[TransitionTraversalCount] =
  TransitionTraversalCountsLoader.makeDataset(ttcRaw).cache()
// Running an action here populates the cache and reports the number of transitions.
ttc.count()

In [None]:
/**
This object loads forbidden-maneuver map features from umm.map_feature_maneuvers_tomtom
*/

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import spark.implicits._
import org.apache.spark.sql.functions._
import scala.collection.mutable.ListBuffer
import org.apache.spark.sql._


object MapFeatureLoader {

  /** Selects the forbidden maneuvers of one UMM build.
    *
    * Reads umm.map_feature_maneuvers_tomtom and keeps the features whose maneuver type is
    * 'FORBIDDEN_MANEUVER' or 'FORBIDDEN_U_TURN'.
    *
    * @param builduuid value matched against the `builduuid` column
    * @return rows with columns `uuid` and `segments` (taken from data.maneuver.segments)
    */
  def loadTurnRestrictions(builduuid: String): DataFrame = {
    val sqlText =
      s"""select 
         | uuid,
         | data.maneuver.segments as segments
         | from umm.map_feature_maneuvers_tomtom
         | where builduuid = '$builduuid'
         | and data.maneuver.type in ('FORBIDDEN_MANEUVER','FORBIDDEN_U_TURN')""".stripMargin
        .replaceAll("\n", " ")

    spark.sql(sqlText)
  }

  /** Maps each raw row to a [[MapFeature]].
    *
    * The `uuid` field of every entry in the `segments` array is collected, preserving array order.
    *
    * @param rawDataset rows with the columns selected by [[loadTurnRestrictions]]
    * @return one MapFeature per input row
    */
  def makeDataset(rawDataset: DataFrame): Dataset[MapFeature] =
    rawDataset.map { r =>
      val segmentUuids = r.getAs[Seq[Row]]("segments").map(_.getAs[String]("uuid")).toList
      MapFeature(uuid = r.getAs[String]("uuid"), segments = segmentUuids)
    }
}

In [None]:
/**
 * Load the forbidden-maneuver map features from umm.map_feature_maneuvers_tomtom
 * for the build given by ummVersion.
 */
val mapFeaturesRaw: DataFrame = MapFeatureLoader.loadTurnRestrictions(ummVersion)
val turnRestrictionMapFeatures: Dataset[MapFeature] =
  MapFeatureLoader.makeDataset(mapFeaturesRaw).cache()
// Running an action here populates the cache and reports the number of turn restrictions.
turnRestrictionMapFeatures.count()

In [None]:
/**
This object loads permanent barriers from umm.map_feature_road_furnitures_tomtom
*/
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import spark.implicits._

object PermanentBarrierLoader {

  /** Fetches the unconditional permanent barriers of one UMM build.
    *
    * Reads umm.map_feature_road_furnitures_tomtom and keeps rows of type 'PERMANENT_BARRIER'
    * that have a uuid, an on-segment with a uuid, and a null condition.
    *
    * @param buildId value matched against the `builduuid` column
    * @return rows with columns `uuid`, `type`, `condition` and `onsegment`
    */
  def load(buildId: String): DataFrame = {
    val sqlText =
      s"""select uuid, 
         | data.roadfurniture.type as type, 
         | data.roadfurniture.condition as condition, 
         | data.roadfurniture.onsegment as onsegment
         | from umm.map_feature_road_furnitures_tomtom
         | where uuid is not null
         | AND builduuid = '$buildId'
         | AND data.roadfurniture.onsegment is not null
         | AND data.roadfurniture.onsegment.uuid is not null
         | AND data.roadfurniture.condition is null
         | AND data.roadfurniture.type = 'PERMANENT_BARRIER'""".stripMargin
        .replaceAll("\n", " ")

    spark.sql(sqlText)
  }

  /** Maps each raw row to a [[PermanentBarrier]].
    *
    * The segment uuid and direction are read from the `onsegment` struct. `isCondition` is always
    * false: the query in [[load]] only returns rows whose condition is null.
    *
    * @param rawDataset rows with the columns selected by [[load]]
    * @return one PermanentBarrier per input row
    */
  def makeDataset(rawDataset: DataFrame): Dataset[PermanentBarrier] =
    rawDataset.map { r =>
      val onSegment = r.getAs[Row]("onsegment")
      PermanentBarrier(
        r.getAs[String]("uuid"),
        r.getAs[String]("type"),
        onSegment.getAs[String]("uuid"),
        onSegment.getAs[String]("direction"),
        isCondition = false)
    }
}

In [None]:
/**
 * Load the permanent barriers from umm.map_feature_road_furnitures_tomtom
 * for the build given by ummVersion.
 */
val permanentBarriersRaw: DataFrame = PermanentBarrierLoader.load(ummVersion)
val permanentBarriers: Dataset[PermanentBarrier] =
  PermanentBarrierLoader.makeDataset(permanentBarriersRaw).cache()
// Running an action here populates the cache and reports the number of barriers.
permanentBarriers.count()

In [None]:
// Pair every turn restriction with each permanent barrier that sits on the restriction's
// first listed segment (array index 0).
val trWithPb = turnRestrictionMapFeatures.alias("TR").
  joinWith(
    permanentBarriers.alias("PB"),
    col("TR.segments").getItem(0) === col("PB.segment")).
  cache()

In [None]:
// Pair every turn restriction with each permanent barrier that sits on the restriction's
// second listed segment (array index 1).
val trWithPb2 = turnRestrictionMapFeatures.alias("TR").
  joinWith(
    permanentBarriers.alias("PB"),
    col("TR.segments").getItem(1) === col("PB.segment")).
  cache()

In [None]:
// Number of (turn restriction, barrier) matches on the first segment; duplicates are included.
trWithPb.map(_._1).count()

In [None]:
// Show up to 20 distinct two-segment turn restrictions that have a barrier on their first
// segment, without truncating column values.
trWithPb.
  map(_._1).
  distinct().
  filter(_.segments.size == 2).
  limit(20).
  show(truncate = false)

In [None]:
// Number of (turn restriction, barrier) matches on the second segment; duplicates are included.
trWithPb2.map(_._1).count()

In [None]:
// Number of distinct turn restrictions with a barrier on their second segment.
trWithPb2.
  map(_._1).
  distinct().
  count()

In [None]:
// Number of distinct turn restrictions with a barrier on their first or second segment.
trWithPb.map(_._1).distinct().
  union(trWithPb2.map(_._1).distinct()).
  distinct().
  count()

In [None]:
// ================================================================================================================
// DATA LOADING DONE...
// ================================================================================================================

In [None]:
/* Traversal totals per turn restriction.
 * 1. Take the turn restrictions that do not appear on the restriction side of trWithPb
 *    (i.e. no permanent barrier on their first listed segment) and that list at most 3 segments.
 * 2. Join each transition whose first / via / last segment uuid equals the restriction's
 *    1st / 2nd / 3rd segment, then sum the actual and suggested counts per restriction uuid.
 *
 * NOTE(review): the name says "2 segments", but the join below compares three positions, so a
 * restriction with fewer than three segments presumably never matches - confirm the intent.
 */

val mapFeaturesWith2Segments = turnRestrictionMapFeatures.
  except(trWithPb.map(_._1)).
  filter(_.segments.size <= 3).
  cache()

val features = {
  val restrictionSegments = col("TRF.segments")

  // element_at uses 1-based positions into the segments array.
  val transitionMatchesRestriction =
    (col("TTF.firstSegment.segment_uuid") === element_at(restrictionSegments, 1)) &&
      (col("TTF.viaSegment.segment_uuid") === element_at(restrictionSegments, 2)) &&
      (col("TTF.lastSegment.segment_uuid") === element_at(restrictionSegments, 3))

  // joinWith yields (transition, restriction) pairs exposed as columns _1 and _2.
  ttc.alias("TTF").
    joinWith(mapFeaturesWith2Segments.alias("TRF"), transitionMatchesRestriction).
    groupBy(col("_2.uuid")).
    agg(
      sum(col("_1.actualCount")).alias("netActualCount"),
      sum(col("_1.suggestedCount")).alias("netSuggestedCount"))
}.cache()

features.count()

In [None]:
/**
 * Preview the UMM issues derived from `features`: restrictions with more than
 * minActualTraversals actual traversals are converted to UMMIssue rows and the first two are
 * printed. The CSV export to HDFS is kept below, commented out.
 *
 * `features` only carries uuid, netActualCount and netSuggestedCount, so no sample trips are
 * available here and sampletripuuids is left empty. The build uuid is taken from ummVersion,
 * the same build the turn restrictions were loaded from.
 */
val minActualTraversals = 21L

features.
filter(r => r.getAs[Long]("netActualCount") > minActualTraversals).
map(r => {

    UMMIssue (
        issueuuid = "",
        ummbuilduuid = ummVersion,
        latitude = 0.0,
        longitude = 0.0,
        sampletripuuids = "",
        featureuuids = r.getAs[String]("uuid"),
        // The aggregated sum arrives as Long; the case class stores Int.
        numberoftrips = r.getAs[Long]("netActualCount").toInt,
        cityid = -1
    )
}).
limit(2).collect().foreach(println)
// repartition(1).
// write.
// mode(SaveMode.Overwrite).
// option("mapreduce.fileoutputcommitter.marksuccessfuljobs","false").
// option("header","true").
// csv("/user/dhruven.vora/turn_permitted_issues_2_seg.csv")