## Project Template

In [5]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder \
    .appName("MyApp") \
    .config("spark.jars.packages", 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0') \
    .config("spark.sql.repl.eagerEval.enabled", True) \
    .getOrCreate()


Be sure to start the stream on Kafka!

In [16]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("vendor_id", StringType(), True),
    StructField("rate_code", IntegerType(), True),
    StructField("store_and_fwd_flag", StringType(), True),  
    StructField("pickup_datetime", TimestampType(), True),  
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("passenger_count", IntegerType(), True),
    StructField("trip_time_in_secs", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True)
])


In [17]:
kafka_server = "kafka1:9092"   
from pyspark.sql.functions import from_json

lines = (spark.readStream                        # Get the DataStreamReader
  .format("kafka")                                 # Specify the source format as "kafka"
  .option("kafka.bootstrap.servers", kafka_server) # Configure the Kafka server name and port
  .option("subscribe", "stock")                       # Subscribe to the "en" Kafka topic 
  .option("startingOffsets", "earliest")           # The start point when a query is started
  .option("maxOffsetsPerTrigger", 100)             # Rate limit on max offsets per trigger interval
  .load()
# Load the DataFrame
)
# Parse JSON data from Kafka's 'value' column
parsed_df = lines.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("parsed_value")) 

df = parsed_df.select("parsed_value.*") 


## The project starts here

You can create a

## [Query 1] Utilization over a window of 5, 10, and 15 minutes per taxi/driver. This can be computed by computing the idle time per taxi. How does it change? Is there an optimal window?

In [8]:
from pyspark.sql.window import Window

window_sizes = [5, 10, 15]  # Minutes
for size in window_sizes:
    window_duration = f"{size} minutes"

    utilization_df = df \
        .withWatermark("pickup_datetime", "10 minutes") \
        .groupBy(window("pickup_datetime", window_duration), "license") \
        .agg(
            (max("dropoff_datetime") - min("pickup_datetime")).cast("long").alias("total_time"),
            (min("pickup_datetime") - lag("dropoff_datetime").over(Window.partitionBy("license").orderBy("pickup_datetime"))).cast("long").alias("idle_time")
        ) \
        .withColumn("utilization", (100 * (col("total_time") - coalesce(col("idle_time"), lit(0))) / col("total_time")).cast("double"))

    # Write to sink (e.g., console, file, or in-memory table for a dashboard)
    utilization_df.writeStream \
        .outputMode("complete") \
        .format("console") \
        .option("truncate", "false") \
        .trigger(processingTime='10 seconds')\
        .start()


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `pickup_datetime` cannot be resolved. Did you mean one of the following? [`license`, `total_amount`, `timestamp`].;
'EventTimeWatermark 'pickup_datetime, 10 minutes
+- Project [parsed_value#54.license AS license#56, parsed_value#54.total_amount AS total_amount#57, parsed_value#54.timestamp AS timestamp#58]
   +- Project [from_json(StructField(license,StringType,true), StructField(total_amount,DoubleType,true), StructField(timestamp,TimestampType,true), value#52, Some(Etc/UTC)) AS parsed_value#54]
      +- Project [cast(value#39 as string) AS value#52]
         +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@1d1b055e, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@6eb68be6, [startingOffsets=earliest, kafka.bootstrap.servers=kafka1:9092, subscribe=stock, maxOffsetsPerTrigger=100], [key#38, value#39, topic#40, partition#41, offset#42L, timestamp#43, timestampType#44], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@12239223,kafka,List(),None,List(),None,Map(kafka.bootstrap.servers -> kafka1:9092, subscribe -> stock, startingOffsets -> earliest, maxOffsetsPerTrigger -> 100),None), kafka, [key#31, value#32, topic#33, partition#34, offset#35L, timestamp#36, timestampType#37]


## [Query 2] The average time it takes for a taxi to find its next fare(trip) per destination borough. This can be computed by finding the time difference, e.g. in seconds, between the trip's drop off and the next trip's pick up within a given unit of time

In [10]:
!pip install geopandas shapely

Collecting geopandas
  Using cached geopandas-0.14.4-py3-none-any.whl.metadata (1.5 kB)
Collecting shapely
  Using cached shapely-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting fiona>=1.8.21 (from geopandas)
  Using cached fiona-1.9.6-cp311-cp311-manylinux2014_x86_64.whl.metadata (50 kB)
Collecting pyproj>=3.3.0 (from geopandas)
  Using cached pyproj-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)
Collecting click-plugins>=1.0 (from fiona>=1.8.21->geopandas)
  Using cached click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Collecting cligj>=0.5 (from fiona>=1.8.21->geopandas)
  Using cached cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Using cached geopandas-0.14.4-py3-none-any.whl (1.1 MB)
Using cached shapely-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
Using cached fiona-1.9.6-cp311-cp311-manylinux2014_x86_64.whl (15.7 MB)
Using cached pyproj-3.6.1-cp311-cp311-manyl

In [11]:

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

# Load the GeoJSON file
boroughs_gdf = gpd.read_file('nyc-boroughs.geojson')

# Load the taxi data CSV file
taxi_df = pd.read_csv('sample.csv')

# Create GeoDataFrames for pickup and dropoff locations
pickup_gdf = gpd.GeoDataFrame(
    taxi_df, geometry=gpd.points_from_xy(taxi_df.pickup_longitude, taxi_df.pickup_latitude))

dropoff_gdf = gpd.GeoDataFrame(
    taxi_df, geometry=gpd.points_from_xy(taxi_df.dropoff_longitude, taxi_df.dropoff_latitude))

# Set the coordinate reference system (CRS) to match the GeoJSON file
pickup_gdf.set_crs(epsg=4326, inplace=True)
dropoff_gdf.set_crs(epsg=4326, inplace=True)

# Perform spatial join to find the borough for each pickup and dropoff location
pickup_with_borough = gpd.sjoin(pickup_gdf, boroughs_gdf, how='left', op='within')
dropoff_with_borough = gpd.sjoin(dropoff_gdf, boroughs_gdf, how='left', op='within')

# Extract the borough information
taxi_df['pickup_borough'] = pickup_with_borough['borough']
taxi_df['dropoff_borough'] = dropoff_with_borough['borough']

# Display the DataFrame with borough information
print(taxi_df.head(20))


  if await self.run_code(code, result, async_=asy):
  if await self.run_code(code, result, async_=asy):


                           medallion                      hack_license  \
0   89D227B655E5C82AECF13C3F540D4CF4  BA96DE419E711691B9445D6A6307C170   
1   0BD7C8F5BA12B88E0B67BED28BEA73D8  9FD8F69F0804BDB5549F40E9DA1BE472   
2   0BD7C8F5BA12B88E0B67BED28BEA73D8  9FD8F69F0804BDB5549F40E9DA1BE472   
3   DFD2202EE08F7A8DC9A57B02ACB81FE2  51EE87E3205C985EF8431D850C786310   
4   DFD2202EE08F7A8DC9A57B02ACB81FE2  51EE87E3205C985EF8431D850C786310   
5   20D9ECB2CA0767CF7A01564DF2844A3E  598CCE5B9C1918568DEE71F43CF26CD2   
6   496644932DF3932605C22C7926FF0FE0  513189AD756FF14FE670D10B92FAF04C   
7   0B57B9633A2FECD3D3B1944AFC7471CF  CCD4367B417ED6634D986F573A552A62   
8   2C0E91FF20A856C891483ED63589F982  1DA2F6543A62B8ED934771661A9D2FA0   
9   2D4B95E2FA7B2E85118EC5CA4570FA58  CD2F522EEE1FF5F5A8D8B679E23576B3   
10  E12F6AF991172EAC3553144A0AF75A19  06918214E951FA0003D1CC54955C2AB0   
11  E12F6AF991172EAC3553144A0AF75A19  06918214E951FA0003D1CC54955C2AB0   
12  78FFD9CD0CDA541F335EF8B38FB494D6  

## [Query 3] The number of trips that started and ended within the same borough in the last hour

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
import geopandas as gpd

# Define the schema for the incoming data
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("vendor_id", StringType(), True),
    StructField("rate_code", StringType(), True),
    StructField("store_and_fwd_flag", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("passenger_count", StringType(), True),
    StructField("trip_time_in_secs", StringType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True)
])

# Initialize Spark session
spark = SparkSession.builder \
    .appName("NYC Taxi Trips Analysis") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0") \
    .getOrCreate()

# Read the geojson file with GeoPandas
boroughs_gdf = gpd.read_file('nyc-boroughs.geojson')

# Broadcast the boroughs dataframe
boroughs_broadcast = spark.sparkContext.broadcast(boroughs_gdf)

# Read data from Kafka
kafka_server = "kafka1:9092"
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_server) \
    .option("subscribe", "taxi_topic") \
    .option("startingOffsets", "latest") \
    .load()

# Parse the JSON data and apply schema
parsed_df = df.selectExpr("CAST(value AS STRING) as json").select(from_json("json", schema).alias("data")).select("data.*")

# Filter trips within the last hour
current_time = current_timestamp()
trips_last_hour = parsed_df.withColumn("current_time", current_time).filter(
    (col("pickup_datetime") >= (col("current_time") - expr("INTERVAL 1 HOUR"))) &
    (col("dropoff_datetime") <= col("current_time"))
)

# Convert to Pandas for spatial join
def to_pandas(df):
    return df.toPandas()

def spatial_join(pandas_df):
    # Create GeoDataFrames for pickup and dropoff locations
    pickup_gdf = gpd.GeoDataFrame(
        pandas_df, geometry=gpd.points_from_xy(pandas_df.pickup_longitude, pandas_df.pickup_latitude))

    dropoff_gdf = gpd.GeoDataFrame(
        pandas_df, geometry=gpd.points_from_xy(pandas_df.dropoff_longitude, pandas_df.dropoff_latitude))

    # Set the CRS to match the GeoJSON file
    pickup_gdf.set_crs(epsg=4326, inplace=True)
    dropoff_gdf.set_crs(epsg=4326, inplace=True)

    # Perform spatial join to find the borough for each pickup and dropoff location
    pickup_with_borough = gpd.sjoin(pickup_gdf, boroughs_broadcast.value, how='left', op='within')
    dropoff_with_borough = gpd.sjoin(dropoff_gdf, boroughs_broadcast.value, how='left', op='within')

    # Add borough information to the original DataFrame
    pandas_df['pickup_borough'] = pickup_with_borough['borough']
    pandas_df['dropoff_borough'] = dropoff_with_borough['borough']

    return pandas_df

# Convert Spark DataFrame to Pandas DataFrame
pandas_df = to_pandas(trips_last_hour)

# Perform spatial join
joined_df = spatial_join(pandas_df)

# Convert back to Spark DataFrame
spark_df = spark.createDataFrame(joined_df)

# Filter trips that start and end in the same borough
same_borough_trips = spark_df.filter(col("pickup_borough") == col("dropoff_borough"))

# Group by borough and count trips
borough_trip_counts = same_borough_trips.groupBy("pickup_borough").count()

# Write the results to the console
query = borough_trip_counts.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

query.awaitTermination()


NameError: name 'expr' is not defined

In [13]:

# Query 3 (Corrected)
trips_within_borough_df = df_with_borough \
    .withWatermark("pickup_datetime", "10 minutes") \
    .filter(col("pickup_borough") == col("dropoff_borough")) \
    .groupBy(window("pickup_datetime", "1 hour"), "pickup_borough") \
    .count() \
    .withColumnRenamed("count", "num_trips")
    
trips_within_borough_df.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .trigger(processingTime='10 seconds') \
    .start()




NameError: name 'df_with_borough' is not defined

## [Query 4] The number of trips that started in one borough and ended in another one in the last hour

In [14]:
trips_between_boroughs_df = df \
    .withWatermark("pickup_datetime", "10 minutes") \
    .filter(col("pickup_borough") != col("dropoff_borough")) \
    .groupBy(window("pickup_datetime", "1 hour"), "pickup_borough", "dropoff_borough") \
    .count() \
    .withColumnRenamed("count", "num_trips")
    
trips_between_boroughs_df.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .trigger(processingTime='10 seconds') \
    .start()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `pickup_datetime` cannot be resolved. Did you mean one of the following? [`partition`, `key`, `offset`, `topic`, `value`].;
'EventTimeWatermark 'pickup_datetime, 10 minutes
+- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@64ebf539, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@587fc8cc, [startingOffsets=latest, kafka.bootstrap.servers=kafka1:9092, subscribe=taxi_topic], [key#69, value#70, topic#71, partition#72, offset#73L, timestamp#74, timestampType#75], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@12239223,kafka,List(),None,List(),None,Map(kafka.bootstrap.servers -> kafka1:9092, subscribe -> taxi_topic, startingOffsets -> latest),None), kafka, [key#62, value#63, topic#64, partition#65, offset#66L, timestamp#67, timestampType#68]


In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# Spark configuration
spark = SparkSession.builder \
    .appName("NYC Taxi Data Analysis") \
    .config("spark.jars.packages", 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0') \
    .getOrCreate()

# Define schema for the incoming data
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True)
])

# Read from Kafka
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9092,kafka2:9093") \
    .option("subscribe", "taxi_topic") \
    .option("startingOffsets", "earliest") \
    .load()

# Parse the JSON data and apply the schema
parsed_df = df.select(from_json(col("value").cast("string"), schema).alias("data")).select("data.*")

# Query 1: Utilization over a window of 5, 10, and 15 minutes per taxi/driver
utilization_df = parsed_df.withWatermark("pickup_datetime", "1 minute") \
    .groupBy(
        col("hack_license"),
        window(col("pickup_datetime"), "5 minutes", "1 minute")
    ).count()

# Query 2: Average time to find next fare per destination borough (simplified example)
from pyspark.sql.functions import lag
from pyspark.sql.window import Window

window_spec = Window.partitionBy("hack_license").orderBy("pickup_datetime")
parsed_df = parsed_df.withColumn("next_pickup_datetime", lag("pickup_datetime", -1).over(window_spec))
parsed_df = parsed_df.withColumn("time_to_next_fare", col("next_pickup_datetime").cast("long") - col("dropoff_datetime").cast("long"))

avg_time_next_fare_df = parsed_df.groupBy("hack_license").agg({"time_to_next_fare": "avg"})

# Query 3: Number of trips within the same borough in the last hour
same_borough_trips_df = parsed_df.filter(
    (col("pickup_longitude") == col("dropoff_longitude")) & (col("pickup_latitude") == col("dropoff_latitude"))
).groupBy(window(col("pickup_datetime"), "1 hour")).count()

# Query 4: Number of trips from one borough to another in the last hour
inter_borough_trips_df = parsed_df.filter(
    (col("pickup_longitude") != col("dropoff_longitude")) & (col("pickup_latitude") != col("dropoff_latitude"))
).groupBy(window(col("pickup_datetime"), "1 hour")).count()

# Start the streaming queries
utilization_query = utilization_df.writeStream.outputMode("append").format("console").start()
avg_time_next_fare_query = avg_time_next_fare_df.writeStream.outputMode("append").format("console").start()
same_borough_trips_query = same_borough_trips_df.writeStream.outputMode("append").format("console").start()
inter_borough_trips_query = inter_borough_trips_df.writeStream.outputMode("append").format("console").start()

# Await termination
utilization_query.awaitTermination()
avg_time_next_fare_query.awaitTermination()
same_borough_trips_query.awaitTermination()
inter_borough_trips_query.awaitTermination()


AnalysisException: Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;
Aggregate [hack_license#155], [hack_license#155, avg(time_to_next_fare#195L) AS avg(time_to_next_fare)#217]
+- Project [medallion#154, hack_license#155, pickup_datetime#156, dropoff_datetime#157, pickup_longitude#158, pickup_latitude#159, dropoff_longitude#160, dropoff_latitude#161, next_pickup_datetime#185, (cast(next_pickup_datetime#185 as bigint) - cast(dropoff_datetime#157 as bigint)) AS time_to_next_fare#195L]
   +- Project [medallion#154, hack_license#155, pickup_datetime#156, dropoff_datetime#157, pickup_longitude#158, pickup_latitude#159, dropoff_longitude#160, dropoff_latitude#161, next_pickup_datetime#185]
      +- Project [medallion#154, hack_license#155, pickup_datetime#156, dropoff_datetime#157, pickup_longitude#158, pickup_latitude#159, dropoff_longitude#160, dropoff_latitude#161, next_pickup_datetime#185, next_pickup_datetime#185]
         +- Window [lag(pickup_datetime#156, 1, null) windowspecdefinition(hack_license#155, pickup_datetime#156 ASC NULLS FIRST, specifiedwindowframe(RowFrame, 1, 1)) AS next_pickup_datetime#185], [hack_license#155], [pickup_datetime#156 ASC NULLS FIRST]
            +- Project [medallion#154, hack_license#155, pickup_datetime#156, dropoff_datetime#157, pickup_longitude#158, pickup_latitude#159, dropoff_longitude#160, dropoff_latitude#161]
               +- Project [data#152.medallion AS medallion#154, data#152.hack_license AS hack_license#155, data#152.pickup_datetime AS pickup_datetime#156, data#152.dropoff_datetime AS dropoff_datetime#157, data#152.pickup_longitude AS pickup_longitude#158, data#152.pickup_latitude AS pickup_latitude#159, data#152.dropoff_longitude AS dropoff_longitude#160, data#152.dropoff_latitude AS dropoff_latitude#161]
                  +- Project [from_json(StructField(medallion,StringType,true), StructField(hack_license,StringType,true), StructField(pickup_datetime,TimestampType,true), StructField(dropoff_datetime,TimestampType,true), StructField(pickup_longitude,DoubleType,true), StructField(pickup_latitude,DoubleType,true), StructField(dropoff_longitude,DoubleType,true), StructField(dropoff_latitude,DoubleType,true), cast(value#139 as string), Some(Etc/UTC)) AS data#152]
                     +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@3bc1c94c, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@aae7a92, [startingOffsets=earliest, kafka.bootstrap.servers=kafka1:9092,kafka2:9093, subscribe=taxi_topic], [key#138, value#139, topic#140, partition#141, offset#142L, timestamp#143, timestampType#144], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@12239223,kafka,List(),None,List(),None,Map(kafka.bootstrap.servers -> kafka1:9092,kafka2:9093, subscribe -> taxi_topic, startingOffsets -> earliest),None), kafka, [key#131, value#132, topic#133, partition#134, offset#135L, timestamp#136, timestampType#137]
