In [None]:
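/** 
Configure the Spark session used by this notebook (applied by the %%configure magic below).
NOTE(review): the "X-Drogon-Auth-HDFS-DT" header value below appears to be an authentication token; avoid committing live credentials and confirm this one is no longer valid.
*/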
%%configure -f
{
  "kind": "spark", 
  "proxyUser": "dhruven.vora", 
  "sparkEnv": "SPARK_24", 
  "driverMemory": "12g", 
  "queue": "maps_route_analytics", 
  "numExecutors": 400, 
  "executorCores": 1, 
  "driverCores": 4,
  "conf": {
    "spark.driver.maxResultSize": "10g",
    "spark.executor.memoryOverhead": 3072, 
    "spark.locality.wait": "0",
    "spark.default.parallelism":10000
  },
  "executorMemory": "24g",
  "drogonHeaders": {
    "X-DROGON-CLUSTER": "phx2/Secure",
    "X-Drogon-Auth-HDFS-DT": "MgAMZGhydXZlbi52b3JhDGRocnV2ZW4udm9yYQCKAYIYec6zigGCPIZSs4w4RLhujgaSFJ9eA_I2f4OlQS1wfHF6EcZDvoIDEldFQkhERlMgZGVsZWdhdGlvbhExMC44MC42Ni4xMzU6ODAyMA"
  }
}

In [None]:
%%spark

In [None]:
/**
 * This section defines all the case classes that are used by the algorithm below.
 */
/**
 * Reference to a road segment.
 *
 * Elsewhere in this notebook two segments are treated as connected when the
 * `end_junction_uuid` of one equals the `start_junction_uuid` of the next.
 * Field names are snake_case because they are used as column names in join conditions.
 *
 * @param segment_uuid        identifier of the segment
 * @param start_junction_uuid identifier of the junction at the start of the segment
 * @param end_junction_uuid   identifier of the junction at the end of the segment
 */
case class Segment (
    segment_uuid: String,
    start_junction_uuid: String,
    end_junction_uuid: String
)

/**
 * A latitude/longitude pair.
 * Not referenced anywhere else in the visible part of this notebook.
 */
case class Location (
    latitude: Double,
    longitude: Double
)

/**
 * Traversal counts for one segment, as produced by SegmentTraversalCountLoader:
 * each count is the sum of the corresponding column over the queried date range.
 *
 * @param segment        the segment the counts belong to
 * @param suggestedCount summed `suggested_traversals`
 * @param overlapCount   summed `overlap_traversals`
 * @param actualCount    summed `actual_traversals`
 */
case class SegmentTraversalCount (
    segment: Segment,
    suggestedCount: Int,
    overlapCount: Int,
    actualCount: Int
)

/**
 * One aggregated route divergence, as produced by NavRouteDivergenceLoader.
 * Field names are snake_case because they are used as column names in join conditions.
 *
 * @param trip_id                    sample job uuids for this divergence (the query keeps at most 5)
 * @param pre_div_segment            segment taken from the head of the pre-divergence traversed segments
 * @param div_segment                segment on which the divergence occurred
 * @param post_div_suggested_segment segment the suggested route continued on
 * @param post_div_traversed_segment segment that was actually traversed next
 */
case class NavRouteDivergence (
    trip_id: List[String],
    pre_div_segment: Segment,
    div_segment: Segment,
    post_div_suggested_segment: Segment,
    post_div_traversed_segment: Segment
)

/**
 * Divergence segments together with an observation count and sample trips.
 *
 * Not constructed anywhere in the visible part of this notebook.
 * NOTE(review): the CSV publishing cell reads fields with these camelCase names,
 * so it was presumably written against this class — confirm.
 */
case class NavRouteDivergenceCount (
    preDivSegment: Segment,
    divSegment: Segment,
    postDivSuggestedSegment: Segment,
    postDivTraversedSegment: Segment,
    observations: Int,
    sampleTrips: List[String]
)

/**
 * Traversal counts for a transition firstSegment -> viaSegment -> lastSegment,
 * as produced by TransitionTraversalCountsLoader.
 *
 * @param firstSegment   segment the transition starts on
 * @param lastSegment    segment the transition ends on
 * @param viaSegment     first element of the source row's `via_segments` list
 * @param suggestedCount summed `suggested_traversals`
 * @param overlapCount   summed `overlap_traversals`
 * @param actualCount    summed `actual_traversals`
 */
case class TransitionTraversalCount (
    firstSegment: Segment,
    lastSegment: Segment,
    viaSegment: Segment,
    suggestedCount: Int,
    overlapCount: Int,
    actualCount: Int
)

/**
 * An actual/suggested transition pair combined with divergence segments and an
 * observation count.
 * Not constructed anywhere in the visible part of this notebook.
 */
case class TransitionDivergenceFeature (
    actualTransition: TransitionTraversalCount,
    suggestedTransition: TransitionTraversalCount,
    divSegment: Segment,
    postDivSuggestedSegment: Segment,
    postDivTraversedSegment: Segment,
    observations: Int
)

/**
 * Two transitions paired by the ttc self-join: both share the same first and via
 * segment but end on different last segments.
 *
 * @param actualTransition    the "ACT" side of the self-join
 * @param suggestedTransition the "SUG" side of the self-join
 */
case class TransitionTraversalCountPair (
    actualTransition: TransitionTraversalCount,
    suggestedTransition: TransitionTraversalCount
)

/**
 * Candidate turn-restriction feature for one suggested transition.
 *
 * @param segmentIds                              segment uuids of the suggested transition, in the order
 *                                                first, via, last
 * @param actualTraversalsCountOnTransition       actual traversal count of the suggested transition
 * @param suggestedTraversalsCountOnTransition    suggested traversal count of the suggested transition
 * @param actualTraversalsCountOnSuggestedSegment actual traversal count of the post-divergence suggested segment
 * @param sampleTripsUuids                        sample trip uuids taken from the matching divergence
 */
case class TurnRestrictionFeature (
    segmentIds: List[String],
    actualTraversalsCountOnTransition: Int,
    suggestedTraversalsCountOnTransition: Int,
    actualTraversalsCountOnSuggestedSegment: Int,
    sampleTripsUuids: List[String]
)

/**
 * One row read from `map_creation.meds_umm_issues` by UmmIssueLoader.
 * Each field carries the value of the column with the same name.
 */
case class UMMIssue (
    issueuuid: String,
    ummbuilduuid: String,
    latitude: Double,
    longitude: Double,
    sampletripuuids: List[String],
    featureuuids: List[String],
    numberoftrips: Int,
    cityid: Int
)

In [None]:
/**
 * Input parameters for the algorithm.
 * startDate and endDate are passed to the loaders below, which use them as the
 * bounds of a `datestr between ... and ...` filter (both ends included).
 */
// First date of the analysis window.
val startDate= "2022-10-16"
// Last date of the analysis window.
val endDate = "2022-10-22"
// Not referenced in the visible part of this notebook.
val umm_version = "9cfbd494-5212-11ed-9455-5c6f6910eaea"

In [None]:
/**
Load divergences from maps_intel.navigation_route_divergence
*/

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import spark.implicits._
import org.apache.spark.sql.functions._
import scala.collection.mutable.ListBuffer
import org.apache.spark.sql._


/**
  * Loads aggregated route divergences from `maps_intel.navigation_route_divergence`
  * and converts them into typed [[NavRouteDivergence]] records.
  */
object NavRouteDivergenceLoader {

  /** Query the divergence table, grouping identical divergences together.
    *
    * Only rows with `divergence_type = 'validDivergenceFound'`, line of business
    * 'rides', vehicle type 'CAR', non-null segment columns and three pairwise
    * different segment uuids (divergence / suggested / traversed) are kept.
    *
    * @param utcFromDateStr lower bound (inclusive) of the `datestr` filter
    * @param utcToDateStr   upper bound (inclusive) of the `datestr` filter
    * @param cityIds        city ids to restrict the query to; an empty array means no city filter
    * @return one row per distinct combination of the five segment columns, with the number
    *         of observations and up to 5 sample job uuids
    */
  def loadDivergences(utcFromDateStr: String, utcToDateStr: String, cityIds: Array[Int]): DataFrame = {

    // The city predicate has to be part of the WHERE clause. Appending it to the
    // finished statement would place it after GROUP BY, which is not valid SQL.
    val cityFilter =
      if (cityIds.isEmpty) "" else s"AND city_id in (${cityIds.mkString(",")})"

    val query =
      s"""select 
         | pre_divergence_traversed_segments,
         | divergence_segment,
         | post_divergence_suggested_segment,
         | post_divergence_suggested_segments,
         | post_divergence_traversed_segment,
         | count(*) as observations,
         | slice(collect_set(distinct job_uuid),1,5) as sample_trips_uuids 
         | from maps_intel.navigation_route_divergence
         | where datestr between '$utcFromDateStr' and '$utcToDateStr'
         | AND divergence_type = 'validDivergenceFound'
         | AND pre_divergence_traversed_segments is not null
         | and divergence_segment is not null 
         | and post_divergence_suggested_segment is not null 
         | and post_divergence_suggested_segments is not null 
         | and post_divergence_traversed_segment is not null 
         | AND divergence_segment.segment_uuid != post_divergence_suggested_segment.segment_uuid
         | AND divergence_segment.segment_uuid != post_divergence_traversed_segment.segment_uuid
         | AND post_divergence_suggested_segment.segment_uuid != post_divergence_traversed_segment.segment_uuid
         | and lineofbusiness in ('rides')
         | AND vehicle_type in ('CAR')
         | $cityFilter
         | GROUP BY 1,2,3,4,5""".stripMargin
        .replaceAll("\n", " ")

    spark.sql(query)
  }

  /** Convert the raw query result into [[NavRouteDivergence]] records.
    *
    * Rows without at least one complete pre-divergence segment (non-null entry with
    * non-null segment uuid and junction uuids) are dropped. For the remaining rows the
    * first complete pre-divergence segment becomes `pre_div_segment`.
    *
    * @param rawDataset result of [[loadDivergences]]
    * @return typed divergences
    */
  def makeDataset(rawDataset: DataFrame): Dataset[NavRouteDivergence] = {

    // Defined as local function values so that the filter and the map closure
    // below share exactly the same parsing and validation logic.
    val toSegment: Row => Segment = struct =>
      Segment(
        struct.getAs[String]("segment_uuid"),
        struct.getAs[String]("start_junction_uuid"),
        struct.getAs[String]("end_junction_uuid")
      )

    val isComplete: Segment => Boolean = s =>
      s.segment_uuid != null && s.start_junction_uuid != null && s.end_junction_uuid != null

    // Null list entries and segments with missing ids are skipped instead of
    // causing a NullPointerException or leaking into the output.
    val completePreDivSegments: Row => Seq[Segment] = r =>
      Option(r.getAs[Seq[Row]]("pre_divergence_traversed_segments"))
        .getOrElse(Seq.empty[Row])
        .filter(_ != null)
        .map(toSegment)
        .filter(isComplete)

    rawDataset
      .filter(r => completePreDivSegments(r).nonEmpty)
      .map(r =>
        NavRouteDivergence(
          trip_id = r.getAs[Seq[String]]("sample_trips_uuids").toList,
          // Safe: the preceding filter guarantees at least one complete segment.
          pre_div_segment = completePreDivSegments(r).head,
          div_segment = toSegment(r.getAs[Row]("divergence_segment")),
          post_div_suggested_segment = toSegment(r.getAs[Row]("post_divergence_suggested_segment")),
          post_div_traversed_segment = toSegment(r.getAs[Row]("post_divergence_traversed_segment"))
        )
      )
  }
}

In [None]:
/**
Query the divergences for the configured date range (no city filter), convert them
to typed records, cache the result and force its evaluation with a count.
*/
val navDivergencesRaw = NavRouteDivergenceLoader.loadDivergences(
  utcFromDateStr = startDate,
  utcToDateStr = endDate,
  cityIds = Array()
)
val div = NavRouteDivergenceLoader.makeDataset(rawDataset = navDivergencesRaw).cache()
div.count()

In [None]:
/**
This object loads transitions from route_corpus_features.transition_traversal_counts
*/
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession, Column}
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.Map

/**
  * Loads transition traversal counts from
  * `route_corpus_features.transition_traversal_counts` and converts them into typed
  * [[TransitionTraversalCount]] records.
  */
object TransitionTraversalCountsLoader {

  /** Query the summed traversal counts per (first_segment, last_segment, via_segments).
    *
    * Only rows for line of business 'rides' and vehicle type 'CAR' with non-null
    * first/last segment and a non-null first via segment are read.
    *
    * @param utcFromDateStr lower bound (inclusive) of the `datestr` filter
    * @param utcToDateStr   upper bound (inclusive) of the `datestr` filter
    * @return one row per transition with summed suggested, actual and overlap traversals
    */
  def loadTTC(utcFromDateStr: String, utcToDateStr: String): DataFrame = {

    val query =
      s"""select first_segment, last_segment, via_segments, 
         | sum(suggested_traversals) as suggested_traversals, 
         | sum(actual_traversals) as actual_traversals, 
         | sum(overlap_traversals) as overlap_traversals 
         | from route_corpus_features.transition_traversal_counts
         | where datestr between '$utcFromDateStr' and '$utcToDateStr'
         | AND first_segment is not NULL
         | AND last_segment is not NULL
         | AND via_segments is not NULL
         | AND via_segments[0] is not NULL
         | and line_of_business = 'rides'
         | AND vehicle_type in ('CAR')
         | group by first_segment, last_segment, via_segments""".stripMargin.replaceAll("\n", " ")

    spark.sql(query)
  }

  /** Convert the raw query result into [[TransitionTraversalCount]] records.
    *
    * Only transitions whose three segments are connected are kept: the first segment
    * must end where the via segment starts, and the via segment must end where the
    * last segment starts.
    *
    * @param rawDataset result of [[loadTTC]]
    * @return typed, connected transitions
    */
  def makeDataset(rawDataset: DataFrame): Dataset[TransitionTraversalCount] = {
    import spark.implicits._

    val toSegment: Row => Segment = struct =>
      Segment(
        struct.getAs[String]("segment_uuid"),
        struct.getAs[String]("start_junction_uuid"),
        struct.getAs[String]("end_junction_uuid")
      )

    rawDataset.map(r => {
      TransitionTraversalCount(
        firstSegment = toSegment(r.getAs[Row]("first_segment")),
        lastSegment = toSegment(r.getAs[Row]("last_segment")),
        // Only the first via segment is used, and it is the only element the query
        // guarantees to be non-null, so the other elements are deliberately not
        // touched (converting a null element would throw a NullPointerException).
        viaSegment = toSegment(r.getAs[Seq[Row]]("via_segments").head),
        suggestedCount = r.getAs[Long]("suggested_traversals").toInt,
        overlapCount = r.getAs[Long]("overlap_traversals").toInt,
        actualCount = r.getAs[Long]("actual_traversals").toInt)
    })
      .filter(t => t.firstSegment.end_junction_uuid == t.viaSegment.start_junction_uuid &&
            t.viaSegment.end_junction_uuid == t.lastSegment.start_junction_uuid)
  }
}

In [None]:
/**
Load transitions from route_corpus_features.transition_traversal_counts for the
configured date range, cache the typed result and force its evaluation with a count.
*/
val ttcRaw = TransitionTraversalCountsLoader.loadTTC(
  utcFromDateStr = startDate,
  utcToDateStr = endDate
)
val ttc = TransitionTraversalCountsLoader.makeDataset(rawDataset = ttcRaw).cache()
ttc.count()

In [None]:
/**
this class loads segments from route_corpus_features.segment_traversal_counts
*/
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import spark.implicits._

/**
  * Loads per-segment traversal counts from
  * `route_corpus_features.segment_traversal_counts`.
  */
object SegmentTraversalCountLoader {

  /** Query the summed traversal counts per (segment uuid, start junction, end junction)
    * for line of business 'rides' and vehicle type 'CAR'.
    *
    * @param utcFromDateStr lower bound (inclusive) of the `datestr` filter
    * @param utcToDateStr   upper bound (inclusive) of the `datestr` filter
    * @return one row per segment with summed suggested, overlap and actual traversals
    */
  def loadSegments(utcFromDateStr: String, utcToDateStr: String): DataFrame = {
    val sqlText =
      s"""select segment_uuid, start_junction_uuid, end_junction_uuid, 
         | sum(suggested_traversals) as suggested_traversals,
         | sum(overlap_traversals) as overlap_traversals,
         | sum(actual_traversals) as actual_traversals
         | from route_corpus_features.segment_traversal_counts
         | where segment_uuid is not null 
         | and line_of_business = 'rides'
         | AND vehicle_type in ('CAR')
         | and datestr between '$utcFromDateStr' and '$utcToDateStr'
         | group by segment_uuid, start_junction_uuid, end_junction_uuid""".stripMargin
        .replaceAll("\n", " ")

    spark.sql(sqlText)
  }

  /** Convert the raw query result into [[SegmentTraversalCount]] records.
    *
    * @param rawDataset result of [[loadSegments]]
    * @return typed segment counts; the summed Long values are narrowed to Int
    */
  def makeDataset(rawDataset: DataFrame): Dataset[SegmentTraversalCount] =
    rawDataset.map { r =>
      val segment = Segment(
        r.getAs[String]("segment_uuid"),
        r.getAs[String]("start_junction_uuid"),
        r.getAs[String]("end_junction_uuid")
      )
      val suggested = r.getAs[Long]("suggested_traversals").toInt
      val overlap = r.getAs[Long]("overlap_traversals").toInt
      val actual = r.getAs[Long]("actual_traversals").toInt

      SegmentTraversalCount(segment, suggested, overlap, actual)
    }
}

In [None]:
/**
Load segments from route_corpus_features.segment_traversal_counts for the configured
date range, cache the typed result and force its evaluation with a count.
*/
val stcRaw = SegmentTraversalCountLoader.loadSegments(
  utcFromDateStr = startDate,
  utcToDateStr = endDate
)
val stc = SegmentTraversalCountLoader.makeDataset(rawDataset = stcRaw).cache()
stc.count()

In [None]:
/**
Keep only the divergences whose divergence, post-divergence suggested and
post-divergence traversed segments all have a matching row in stc (inner joins) and
where both post-divergence segments start at the junction the divergence segment ends on.
*/
val filteredDiv = {
  // True when the segment struct at `divSegmentPath` equals the segment of the
  // STC row under `stcAlias` on segment uuid and both junction uuids.
  def sameSegment(divSegmentPath: String, stcAlias: String): Column =
    col(s"$divSegmentPath.segment_uuid") === col(s"$stcAlias.segment.segment_uuid") &&
      col(s"$divSegmentPath.start_junction_uuid") === col(s"$stcAlias.segment.start_junction_uuid") &&
      col(s"$divSegmentPath.end_junction_uuid") === col(s"$stcAlias.segment.end_junction_uuid")

  // Every joinWith nests the previous result under `_1`, hence the growing paths.
  div.alias("DIV").
    joinWith(stc.alias("STC"), sameSegment("DIV.div_segment", "STC")).
    joinWith(stc.alias("STC_SUG"), sameSegment("_1.post_div_suggested_segment", "STC_SUG")).
    joinWith(stc.alias("STC_ACT"), sameSegment("_1._1.post_div_traversed_segment", "STC_ACT")).
    map(nested => nested._1._1._1).
    filter(d => {
      val divergenceEnd = d.div_segment.end_junction_uuid
      divergenceEnd == d.post_div_suggested_segment.start_junction_uuid &&
        divergenceEnd == d.post_div_traversed_segment.start_junction_uuid
    }).
    cache
}
filteredDiv.count()

In [None]:
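/**
Compute the TTC pairs of actual transition (ACT) and suggested transition (SUG): two transitions are paired when they share the same first and via segment but end on different last segments.
NOTE(review): the join below applies no traversal-count condition; the "no actual traversals on the suggested transition" criterion is only applied later, when the aggregated features are filtered.
*/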

import org.apache.spark.sql.Column

val ttcPairs = {
  // Same entry into the junction (first + via segment), different exit (last segment).
  val sameEntryDifferentExit =
    col("ACT.firstSegment") === col("SUG.firstSegment") &&
      col("ACT.viaSegment") === col("SUG.viaSegment") &&
      col("ACT.lastSegment") =!= col("SUG.lastSegment")

  ttc.alias("ACT").
    joinWith(ttc.alias("SUG"), sameEntryDifferentExit).
    map(pair => TransitionTraversalCountPair(actualTransition = pair._1, suggestedTransition = pair._2))
}

In [None]:
// Forces evaluation of the ttcPairs self-join and returns its row count
// (ttcPairs itself is not cached).
ttcPairs.count()

In [None]:
/** 
Join transition pairs with divergences to compute the feature 
*/

import org.apache.spark.sql.Column

val tdf = {
  // True when the TTC-side segment and the DIV-side segment agree on segment uuid
  // and on both junction uuids.
  def sameSegment(ttcSegment: String, divSegment: String): Column =
    col(s"TTC.$ttcSegment.segment_uuid") === col(s"DIV.$divSegment.segment_uuid") &&
      col(s"TTC.$ttcSegment.start_junction_uuid") === col(s"DIV.$divSegment.start_junction_uuid") &&
      col(s"TTC.$ttcSegment.end_junction_uuid") === col(s"DIV.$divSegment.end_junction_uuid")

  // The actual transition must be the driven path of the divergence (pre -> div ->
  // traversed) and the suggested transition must end on the suggested segment.
  val matchesDivergence =
    sameSegment("actualTransition.firstSegment", "pre_div_segment") &&
      sameSegment("actualTransition.viaSegment", "div_segment") &&
      sameSegment("actualTransition.lastSegment", "post_div_traversed_segment") &&
      sameSegment("suggestedTransition.lastSegment", "post_div_suggested_segment")

  ttcPairs.alias("TTC").joinWith(filteredDiv.alias("DIV"), matchesDivergence)
}

In [None]:
// Forces evaluation of the transition-pair/divergence join and returns its row count
// (tdf itself is not cached).
tdf.count()

In [None]:
/**
Compute the features for turn restrictions: attach to every (transition pair,
divergence) row the stc counts of the divergence's post-divergence suggested segment.
The result is cached and evaluated with a count.
*/
val features = {
  val onSuggestedSegment =
    col("TDF._2.post_div_suggested_segment.segment_uuid") === col("STC.segment.segment_uuid") &&
      col("TDF._2.post_div_suggested_segment.start_junction_uuid") === col("STC.segment.start_junction_uuid") &&
      col("TDF._2.post_div_suggested_segment.end_junction_uuid") === col("STC.segment.end_junction_uuid")

  tdf.alias("TDF").joinWith(stc.alias("STC"), onSuggestedSegment).cache
}
features.count()

In [None]:
/**
Map every feature row to a TurnRestrictionFeature describing the suggested transition:
its three segment uuids (first, via, last), its actual and suggested traversal counts,
the actual traversal count of the suggested segment and the divergence's sample trips.
*/
val trIssues = features.map { ft =>
  val suggested = ft._1._1.suggestedTransition
  val divergence = ft._1._2
  val suggestedSegmentCounts = ft._2

  TurnRestrictionFeature(
    segmentIds = List(
      suggested.firstSegment.segment_uuid,
      suggested.viaSegment.segment_uuid,
      suggested.lastSegment.segment_uuid
    ),
    actualTraversalsCountOnTransition = suggested.actualCount,
    suggestedTraversalsCountOnTransition = suggested.suggestedCount,
    actualTraversalsCountOnSuggestedSegment = suggestedSegmentCounts.actualCount,
    sampleTripsUuids = divergence.trip_id
  )
}

trIssues.count()

In [None]:
/**
Aggregate the turn-restriction features into one issue per suggested transition
(identified by segmentIds).

Per group: max of the transition's actual traversals, sum of the transition's suggested
traversals, max of the suggested segment's actual traversals, and the distinct union of
all sample trip uuids. collect_list over the per-row trip lists yields an array of
arrays, so it is flattened before being read back as a flat list (previously the
collected trips were discarded and every issue had an empty sample list).

NOTE(review): suggestedTraversalsCountOnTransition is summed while the other two counts
use max. If several rows of a group carry the same suggested transition, the sum counts
its suggested traversals more than once — confirm that sum is intended.
*/
val aggIssues = trIssues.
  groupBy(col("segmentIds")).
  agg(
    max(col("actualTraversalsCountOnTransition")).alias("actualTraversalsCountOnTransition"),
    sum(col("suggestedTraversalsCountOnTransition")).alias("suggestedTraversalsCountOnTransition"),
    max(col("actualTraversalsCountOnSuggestedSegment")).alias("actualTraversalsCountOnSuggestedSegment"),
    array_distinct(flatten(collect_list(col("sampleTripsUuids")))).alias("sampleTripsUuids")
  ).
  map(row =>
    TurnRestrictionFeature(
      segmentIds = row.getAs[Seq[String]]("segmentIds").toList,
      actualTraversalsCountOnTransition = row.getAs[Int]("actualTraversalsCountOnTransition"),
      // sum() over an Int column yields a Long
      suggestedTraversalsCountOnTransition = row.getAs[Long]("suggestedTraversalsCountOnTransition").toInt,
      actualTraversalsCountOnSuggestedSegment = row.getAs[Int]("actualTraversalsCountOnSuggestedSegment"),
      sampleTripsUuids = row.getAs[Seq[String]]("sampleTripsUuids").toList
    )
  )

aggIssues.count()

In [None]:
/** Keep the aggregated features that satisfy all of the following criteria
 1. suggested transition: suggested traversal count >= 21
 2. suggested transition: actual traversal count <= 0
 3. suggested segment: actual traversal count >= 6
The result is cached and evaluated with a count.
*/

val turnRestrictionIssues = aggIssues.filter { f =>
  val suggestedOftenEnough = f.suggestedTraversalsCountOnTransition >= 21
  val neverActuallyDriven = f.actualTraversalsCountOnTransition <= 0
  val suggestedSegmentIsDriven = f.actualTraversalsCountOnSuggestedSegment >= 6

  suggestedOftenEnough && neverActuallyDriven && suggestedSegmentIsDriven
}.cache

turnRestrictionIssues.count()

In [None]:
// Number of distinct issues, where an issue is identified by the segment uuids of its
// suggested transition (segmentIds). turnRestrictionIssues holds TurnRestrictionFeature
// rows, which have no tuple fields, so the duplicates are dropped on segmentIds directly.
val uniqueIssues = turnRestrictionIssues.
  dropDuplicates("segmentIds").
  count()

In [None]:
/**
Publish up to 100 distinct feature rows as CSV with a header row to the path
"turn_restriction.csv", overwriting existing output. (The original note says the
destination is HDFS; the path is relative, so the actual location depends on the
session's default filesystem — TODO confirm.)

NOTE(review): this cell does not match the definitions visible in this notebook and
will presumably fail as written — confirm before running:
 - `features` rows have the shape
   ((TransitionTraversalCountPair, NavRouteDivergence), SegmentTraversalCount), so
   `row._1._2` is a NavRouteDivergence. Its fields are snake_case (div_segment,
   post_div_suggested_segment, post_div_traversed_segment, trip_id) and it has no
   `observations` or `sampleTrips`; the camelCase names used below exist on
   NavRouteDivergenceCount instead. The orderBy column paths have the same problem.
 - `Output` is not defined in the visible part of this notebook.
 - the pre_divergence_* output fields are filled from postDivTraversedSegment, which
   looks unintended — verify.
*/
features.
orderBy(col("_1._2.preDivSegment"), col("_1._2.divSegment"), col("_1._2.postDivSuggestedSegment")).
map(row => {
    
    val feature = row._1._2
    
    Output(
        divergence_segment_uuid = feature.divSegment.segment_uuid,
        divergence_segment_start_junction_uuid = feature.divSegment.start_junction_uuid,
        divergence_segment_end_junction_uuid = feature.divSegment.end_junction_uuid,
        
        post_divergence_segment_uuid = feature.postDivSuggestedSegment.segment_uuid,
        post_divergence_segment_start_junction_uuid = feature.postDivSuggestedSegment.start_junction_uuid,
        post_divergence_segment_end_junction_uuid = feature.postDivSuggestedSegment.end_junction_uuid,
        
        pre_divergence_segment_uuid = feature.postDivTraversedSegment.segment_uuid,
        pre_divergence_segment_start_junction_uuid = feature.postDivTraversedSegment.start_junction_uuid,
        pre_divergence_segment_end_junction_uuid = feature.postDivTraversedSegment.end_junction_uuid,
        
        observations = feature.observations,
        
        sampleTrips = feature.sampleTrips.mkString(",")
    )
    
}).
distinct.
limit(100).
write.
mode(SaveMode.Overwrite).
option("mapreduce.fileoutputcommitter.marksuccessfuljobs","false").
option("header","true").
csv("turn_restriction.csv")

In [None]:
/**
this class loads turn restriction issues from UMM issues table
*/
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import spark.implicits._
import org.apache.spark.sql.functions._
import scala.collection.mutable.ListBuffer
import org.apache.spark.sql._


/**
  * Loads turn-restriction issues from `map_creation.meds_umm_issues`.
  */
object UmmIssueLoader {

  /** Query the non-production issues created by the 'TurnRestrictionDetector'.
    *
    * @param utcFromDateStr lower bound (inclusive) of the `createddate` filter
    * @param utcToDateStr   upper bound (inclusive) of the `createddate` filter
    * @return one row per issue with the columns listed in the select clause
    */
  def loadUmmIssues(utcFromDateStr: String, utcToDateStr: String): DataFrame = {
    val sqlText =
      s"""select 
         | issueuuid,
         | ummbuilduuid,
         | latitude,
         | longitude,
         | sampletripuuids,
         | featureuuids,
         | numberoftrips,
         | cityid
         | from map_creation.meds_umm_issues
         | where createddate between '$utcFromDateStr' and '$utcToDateStr'
         | and productionrun = false
         | and detectorname = 'TurnRestrictionDetector'""".stripMargin
        .replaceAll("\n", " ")

    spark.sql(sqlText)
  }

  /** Convert the raw query result into [[UMMIssue]] records; the two array columns
    * are copied into immutable lists.
    *
    * @param rawDataset result of [[loadUmmIssues]]
    * @return typed issues
    */
  def makeDataset(rawDataset: DataFrame): Dataset[UMMIssue] =
    rawDataset.map { r =>
      val featureIds = r.getAs[Seq[String]]("featureuuids").toList
      val tripIds = r.getAs[Seq[String]]("sampletripuuids").toList

      UMMIssue(
        issueuuid = r.getAs[String]("issueuuid"),
        ummbuilduuid = r.getAs[String]("ummbuilduuid"),
        latitude = r.getAs[Double]("latitude"),
        longitude = r.getAs[Double]("longitude"),
        sampletripuuids = tripIds,
        featureuuids = featureIds,
        numberoftrips = r.getAs[Int]("numberoftrips"),
        cityid = r.getAs[Int]("cityid")
      )
    }
}

In [None]:
/**
Load the turn restriction issues created between 2022-07-07 and 2022-07-08 from the
UMM issues table, convert them to typed records and count them.
*/
val ummIssuesRaw = UmmIssueLoader.loadUmmIssues(
  utcFromDateStr = "2022-07-07",
  utcToDateStr = "2022-07-08"
)
val ummIssues = UmmIssueLoader.makeDataset(rawDataset = ummIssuesRaw)
ummIssues.count()

In [None]:
/**
Match the issues of the current run against the issues of a previous run.

Every current issue is keyed by its segmentIds (first, via and last segment uuid of the
suggested transition) and left-joined with the UMM issues on featureuuids. Current
issues without a matching UMM issue are kept, with a null right side.

turnRestrictionIssues holds TurnRestrictionFeature rows, so the key is read from
`segmentIds` rather than from tuple fields, which those rows do not have.

NOTE(review): the join compares the two arrays for equality, so it only matches when
featureuuids lists the same uuids in the same order — confirm against the UMM data.
*/
val commonIssues = turnRestrictionIssues.
  map(issue => (issue.segmentIds, issue)).
  alias("T").
  joinWith(ummIssues.alias("U"),
           col("T._1") === col("U.featureuuids"),
           "left")

commonIssues.count()

In [None]:
/**
Print up to 10 rows of commonIssues whose right side is null, i.e. current-run issues
that found no matching UMM issue in the left join.
NOTE(review): the original header said "display top common issues"; if the matched
issues are wanted instead, the predicate would have to be `row._2 != null` — confirm
which one is intended.
*/
commonIssues.filter(row => row._2 == null).limit(10).collect.foreach(println)