In [None]:
# Spark Configuration

%%configure -f
{
  "pyFiles": [], 
  "kind": "pyspark", 
  "proxyUser": "dhruven.vora", 
  "sparkEnv": "SPARK_24",
  "queue": "maps_trueta",
  "numExecutors": 800,
  "driverMemory": "12g",
  "executorMemory": "12g",
  "driverCores": 4,
  "executorCores": 1, 
  "jars": [], 
  "conf": {
    "spark.executor.memoryOverhead": "4g",
    "spark.driver.memoryOverhead": "4g",
    "spark.driver.maxResultSize": "10g",
    "hive.exec.dynamic.partition": "true",
    "hive.exec.dynamic.partition.mode": "nonstrict",
    "spark.locality.wait": "6s",
    "spark.maxRemoteBlockSizeFetchToMem": "200m",
    "spark.network.timeout": "2400s",
    "spark.executor.heartbeatInterval": "120s",
    "spark.yarn.scheduler.heartbeat.interval-ms": 120000,
    "spark.driver.extraJavaOptions": "-XX:+UseCompressedOops -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps",
    "spark.executor.extraJavaOptions": "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=6 -XX:+UseCompressedOops -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintHeapAtGC",
    "spark.sql.autoBroadcastJoinThreshold":-1
    }, 
  "drogonHeaders": {
    "X-DROGON-CLUSTER": "DCA1/NonSecure"  
  }    
}

In [None]:
%%spark

In [None]:
from pyspark.sql.functions import concat, col
import pyspark.sql.functions as F
from pyspark.sql.functions import when

In [None]:
# Load the model whose segment coverage is being measured.
# NOTE(review): the earlier note called this the TomTom prod model, but the
# path names an OSM bike model -- confirm which one is intended.
model_path = "hdfs:///user/dhruven.vora/osm_bike_1031_parquet"
tt_model = spark.read.parquet(model_path)

In [None]:
# Print the column names and types of the loaded model, to confirm that the
# `segmentuuid` column used as the join key further down is present.
tt_model.printSchema()

In [None]:
# Number of rows in the loaded model; the bare expression is displayed as the
# cell's output.
tt_model.count()

In [None]:
# Fetch one (segmentid, cityid) row per streak from
# rawdata_user.kafka_hp_maps_historical_streaks_osm_nodedup.
# (An earlier note named the `_tomtom_nodedup` table; the query reads the
# `_osm_nodedup` one.)
# Filters: datestr from 2021-12-16 through 2022-01-02, classification 'valid',
# vehicle type 'BICYCLE', and strictly positive length and speed.

tt_segments = spark.sql("""
select 
    msg.graphsegment.segmentuuid as segmentid, msg.cityid as cityid
from 
    rawdata_user.kafka_hp_maps_historical_streaks_osm_nodedup
where
    datestr between '2021-12-16' and '2022-01-02'
    and msg.classification = 'valid'
    and msg.vehicletype = 'BICYCLE'
    and msg.lengthmeters > 0
    and msg.speedkmph > 0
""")

In [None]:
# Print the schema of the streak rows; the query above selects two columns,
# `segmentid` and `cityid`.
tt_segments.printSchema()

In [None]:
# Number of streak rows that passed the query's filters; displayed as the
# cell's output.
tt_segments.count()

In [None]:
# Find out what share of streak segments is present in the model:
#   coverage % = (streaks whose segment is in the model) * 100 / (all streaks)
#
# Left-join so every streak row is kept; `segmentuuid` stays null for streaks
# whose segment is missing from the model.
# Only the key column is needed to test presence, and it is de-duplicated so a
# segment that occurs on several model rows cannot fan out the streak rows and
# inflate the per-city counts computed from this frame.
model_segment_ids = tt_model.select('segmentuuid').distinct()
joined_tt = tt_segments.join(
    model_segment_ids,
    model_segment_ids.segmentuuid == tt_segments.segmentid,
    'left',
)

In [None]:
# Flag each streak row: prod_model is 1 when the left join found the segment
# in the model (segmentuuid is populated) and 0 when it did not.
result_tt = joined_tt.withColumn(
    'prod_model',
    F.when(joined_tt['segmentuuid'].isNotNull(), 1).otherwise(0),
)

In [None]:
# Print the schema of the flagged frame to confirm the `prod_model` column
# was added.
result_tt.printSchema()

In [None]:
# Per city: total_streaks counts the streak rows, covered_streaks sums the
# 0/1 prod_model flag, i.e. the rows whose segment was found in the model.
final_result_tt = (
    result_tt
    .groupBy('cityid')
    .agg(
        F.count('prod_model').alias('total_streaks'),
        F.sum('prod_model').alias('covered_streaks'),
    )
)

In [None]:
# Print the schema of the per-city aggregate (cityid, total_streaks,
# covered_streaks).
final_result_tt.printSchema()

In [None]:
# Preview the per-city aggregate; show() without arguments prints only a
# limited number of rows.
final_result_tt.show()

In [None]:
# Add the coverage percentage per city: covered_streaks * 100 / total_streaks.
# The factor of 100 makes the value match the '% covered' column name; without
# it the column would hold a 0-1 fraction.
# withColumn replaces a same-named column, so re-running this cell is safe.
final_result_tt = final_result_tt.withColumn(
    '% covered',
    col('covered_streaks') * 100 / col('total_streaks'),
)

In [None]:
# Print up to 1000 cities ordered by city id, with column values shown in
# full rather than truncated.
final_result_tt.orderBy('cityid').show(n=1000, truncate=False)

In [None]:
# Stop the Spark session now that the analysis is done; cells above cannot be
# re-run after this without a new session.
spark.stop()