In [1]:
# Load the "Heartbeat" container from the Cosmos DB analytical store (through the
# Synapse linked service) into a Spark DataFrame, then preview 10 rows of it.
# For a multi-region Cosmos DB account, a preferred list of regions can be chosen by
# adding .option("spark.cosmos.preferredRegions", "<Region1>,<Region2>") to the reader.

cosmos_reader = (
    spark.read
    .format("cosmos.olap")
    .option("spark.synapse.linkedService", "ls_cosmos_db_nyc_taxi_data")
    .option("spark.cosmos.container", "Heartbeat")
)
df = cosmos_reader.load()

heartbeat_preview = df.limit(10)
display(heartbeat_preview)

In [3]:
# Idleness threshold, compared against `timeToDestinationMins` in the aggregation cell:
# a row counts as idle when its time to destination exceeds this value (10 mins here)
# while its `distanceToDestination` is below 1.
idle_threshold = 10

In [4]:
# Aggregations
# All four metrics are produced by ONE Spark job: the two row predicates are first
# materialised as boolean flag columns, then folded into conditional aggregates.
# Running a separate .collect()/.count() action per metric would scan the source
# once per action instead.
# NOTE(review): both counts are counts of rows in `df`; whether one row corresponds
# to one distinct driver cannot be told from this notebook - confirm.
flagged = df.select(
    df["distanceToDestination"],
    df["timeToDestinationMins"],
    (df["hired"] == True).alias("is_hired"),
    (
        (df["timeToDestinationMins"] > idle_threshold)
        & (df["distanceToDestination"] < 1)
    ).alias("is_idle"),
)

# A global aggregation (no GROUP BY) always yields exactly one row.
# CASE without ELSE yields NULL for non-matching rows, which sum/count ignore, so
# each aggregate only sees the rows its flag selects.
summary = flagged.selectExpr(
    "sum(CASE WHEN is_hired THEN distanceToDestination END) AS total_hired_distance",
    "avg(timeToDestinationMins) AS average_time_to_destination",
    "count(CASE WHEN is_hired THEN 1 END) AS hired_drivers_count",
    "count(CASE WHEN is_idle THEN 1 END) AS idle_drivers_count",
).first()

# sum/avg are None when no row contributes to them; the counts are always integers.
total_hired_distance = summary["total_hired_distance"]
average_time_to_destination = summary["average_time_to_destination"]
hired_drivers_count = summary["hired_drivers_count"]
idle_drivers_count = summary["idle_drivers_count"]

In [5]:
# Display results
# Spark's sum/avg aggregates come back as None when no row contributed to them
# (for example no hired rows, or an empty DataFrame). Formatting None with ":.2f"
# raises TypeError, so those two metrics are rendered as "n/a" in that case.
def _format_metric(value, unit):
    """Return `value` with two decimals followed by `unit`, or "n/a" when it is None."""
    if value is None:
        return "n/a"
    return f"{value:.2f} {unit}"


print(f"Total Distance for Hired Drivers: {_format_metric(total_hired_distance, 'units')}")
print(f"Average Time to Destination: {_format_metric(average_time_to_destination, 'mins')}")
print(f"Count of Hired Drivers: {hired_drivers_count}")
print(f"Count of Idle Drivers: {idle_drivers_count}")