## Project Template

In [20]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder \
    .appName("MyApp") \
    .config("spark.jars.packages", 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0') \
    .config("spark.sql.repl.eagerEval.enabled", True) \
    .getOrCreate()


Be sure to start the stream on Kafka!

In [21]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, TimestampType, DateType
# TO MODIFY FOR YOUR SCHEMA
schema = StructType([
    StructField("license", StringType(), True),
    StructField("total_amount", DoubleType(), True),
    StructField("timestamp", TimestampType(), True)
])

In [22]:
kafka_server = "kafka1:9092"   
from pyspark.sql.functions import from_json

lines = (spark.readStream                        # Get the DataStreamReader
  .format("kafka")                                 # Specify the source format as "kafka"
  .option("kafka.bootstrap.servers", kafka_server) # Configure the Kafka server name and port
  .option("subscribe", "stock")                       # Subscribe to the "en" Kafka topic 
  .option("startingOffsets", "earliest")           # The start point when a query is started
  .option("maxOffsetsPerTrigger", 100)             # Rate limit on max offsets per trigger interval
  .load()
# Load the DataFrame
)
# Parse JSON data from Kafka's 'value' column
parsed_df = lines.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("parsed_value")) 

df = parsed_df.select("parsed_value.*") 


## The project starts here

You can create a

## [Query 1] Utilization over a window of 5, 10, and 15 minutes per taxi/driver. This can be computed by computing the idle time per taxi. How does it change? Is there an optimal window?

In [23]:
from pyspark.sql.window import Window

window_sizes = [5, 10, 15]  # Minutes
for size in window_sizes:
    window_duration = f"{size} minutes"

    utilization_df = df \
        .withWatermark("pickup_datetime", "10 minutes") \
        .groupBy(window("pickup_datetime", window_duration), "license") \
        .agg(
            (max("dropoff_datetime") - min("pickup_datetime")).cast("long").alias("total_time"),
            (min("pickup_datetime") - lag("dropoff_datetime").over(Window.partitionBy("license").orderBy("pickup_datetime"))).cast("long").alias("idle_time")
        ) \
        .withColumn("utilization", (100 * (col("total_time") - coalesce(col("idle_time"), lit(0))) / col("total_time")).cast("double"))

    # Write to sink (e.g., console, file, or in-memory table for a dashboard)
    utilization_df.writeStream \
        .outputMode("complete") \
        .format("console") \
        .option("truncate", "false") \
        .trigger(processingTime='10 seconds')\
        .start()


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `pickup_datetime` cannot be resolved. Did you mean one of the following? [`license`, `total_amount`, `timestamp`].;
'EventTimeWatermark 'pickup_datetime, 10 minutes
+- Project [parsed_value#341.license AS license#343, parsed_value#341.total_amount AS total_amount#344, parsed_value#341.timestamp AS timestamp#345]
   +- Project [from_json(StructField(license,StringType,true), StructField(total_amount,DoubleType,true), StructField(timestamp,TimestampType,true), value#339, Some(Etc/UTC)) AS parsed_value#341]
      +- Project [cast(value#326 as string) AS value#339]
         +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@48c6c486, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@477aff08, [startingOffsets=earliest, kafka.bootstrap.servers=kafka1:9092, subscribe=stock, maxOffsetsPerTrigger=100], [key#325, value#326, topic#327, partition#328, offset#329L, timestamp#330, timestampType#331], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@a21da24,kafka,List(),None,List(),None,Map(kafka.bootstrap.servers -> kafka1:9092, subscribe -> stock, startingOffsets -> earliest, maxOffsetsPerTrigger -> 100),None), kafka, [key#318, value#319, topic#320, partition#321, offset#322L, timestamp#323, timestampType#324]


## [Query 2] The average time it takes for a taxi to find its next fare(trip) per destination borough. This can be computed by finding the time difference, e.g. in seconds, between the trip's drop off and the next trip's pick up within a given unit of time

In [26]:
!pip install geopandas
import geopandas as gpd

# Read the GeoJSON file (replace with the actual path to your file)
filepath = 'nyc-boroughs.geojson'
gdf = gpd.read_file(filepath)

# Print the GeoDataFrame (contains the data in a tabular format)
print(gdf)

# Print specific columns or rows (example)
print(gdf['boro_name'])  # Print borough names
print(gdf.head())       # Print the first 5 rows


Collecting geopandas
  Downloading geopandas-0.14.4-py3-none-any.whl.metadata (1.5 kB)
Collecting fiona>=1.8.21 (from geopandas)
  Downloading fiona-1.9.6-cp311-cp311-manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m458.0 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting pyproj>=3.3.0 (from geopandas)
  Downloading pyproj-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)
Collecting shapely>=1.8.0 (from geopandas)
  Downloading shapely-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting click-plugins>=1.0 (from fiona>=1.8.21->geopandas)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Collecting cligj>=0.5 (from fiona>=1.8.21->geopandas)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Downloading geopandas-0.14.4-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

KeyError: 'boro_name'

In [27]:
import geopandas as gpd

# Read the GeoJSON file (replace with the actual path to your file)
filepath = 'nyc-boroughs.geojson'
gdf = gpd.read_file(filepath)

# Find the actual column name for borough
boro_name_col = 'borough' # Replace with the correct name found in the GeoJSON file

# Print the GeoDataFrame (contains the data in a tabular format)
print(gdf)

# Print specific columns or rows (example)
print(gdf[boro_name_col])  # Use the correct column name
print(gdf.head())       # Print the first 5 rows


     boroughCode        borough  \
0              5  Staten Island   
1              5  Staten Island   
2              5  Staten Island   
3              5  Staten Island   
4              4         Queens   
..           ...            ...   
99             2          Bronx   
100            2          Bronx   
101            2          Bronx   
102            2          Bronx   
103            2          Bronx   

                                                   @id  \
0    http://nyc.pediacities.com/Resource/Borough/St...   
1    http://nyc.pediacities.com/Resource/Borough/St...   
2    http://nyc.pediacities.com/Resource/Borough/St...   
3    http://nyc.pediacities.com/Resource/Borough/St...   
4    http://nyc.pediacities.com/Resource/Borough/Qu...   
..                                                 ...   
99   http://nyc.pediacities.com/Resource/Borough/Bronx   
100  http://nyc.pediacities.com/Resource/Borough/Bronx   
101  http://nyc.pediacities.com/Resource/Borough/Bronx   

In [None]:
import geopandas as gpd
from shapely.geometry import Point
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Load borough boundaries (replace with the correct path to your GeoJSON file)
boroughs_gdf = gpd.read_file('nyc-boroughs.geojson')

# Function to determine borough from coordinates (same as before)
def get_borough(longitude, latitude):
    point = Point(longitude, latitude)
    for idx, borough in boroughs_gdf.iterrows():
        if borough['geometry'].contains(point):
            return borough['boro_name']
    return 'Unknown'

# UDF to add dropoff_borough column
get_borough_udf = udf(get_borough, StringType())
df_with_borough = df.withColumn("dropoff_borough", get_borough_udf("dropoff_longitude", "dropoff_latitude"))

# Calculate average time to next fare (using df_with_borough instead of df)
time_to_next_fare_df = df_with_borough \
    .withWatermark("pickup_datetime", "10 minutes") \
    .select("license", "dropoff_datetime", "dropoff_borough") \
    .join(
        df_with_borough.selectExpr("license", "pickup_datetime AS next_pickup_datetime"), 
        on=["license"], how="inner"
    ) \
    .filter(col("dropoff_datetime") < col("next_pickup_datetime")) \
    .groupBy(window("dropoff_datetime", "1 hour"), "dropoff_borough") \
    .agg(
        (mean(col("next_pickup_datetime").cast("long") - col("dropoff_datetime").cast("long")) / 1000).alias("avg_time_to_next_fare")
    )

# Output results (same as before)
time_to_next_fare_df.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .trigger(processingTime='10 seconds') \
    .start()

In [29]:
!pip install geopandas shapely
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

# Load the GeoJSON file
boroughs_gdf = gpd.read_file('nyc-boroughs.geojson')

# Load the taxi data CSV file
taxi_df = pd.read_csv('sample.csv')

# Create GeoDataFrames for pickup and dropoff locations
pickup_gdf = gpd.GeoDataFrame(
    taxi_df, geometry=gpd.points_from_xy(taxi_df.pickup_longitude, taxi_df.pickup_latitude))

dropoff_gdf = gpd.GeoDataFrame(
    taxi_df, geometry=gpd.points_from_xy(taxi_df.dropoff_longitude, taxi_df.dropoff_latitude))

# Set the coordinate reference system (CRS) to match the GeoJSON file
pickup_gdf.set_crs(epsg=4326, inplace=True)
dropoff_gdf.set_crs(epsg=4326, inplace=True)

# Perform spatial join to find the borough for each pickup and dropoff location
pickup_with_borough = gpd.sjoin(pickup_gdf, boroughs_gdf, how='left', op='within')
dropoff_with_borough = gpd.sjoin(dropoff_gdf, boroughs_gdf, how='left', op='within')

# Extract the borough information
taxi_df['pickup_borough'] = pickup_with_borough['borough']
taxi_df['dropoff_borough'] = dropoff_with_borough['borough']

# Display the DataFrame with borough information
print(taxi_df.head())




  if await self.run_code(code, result, async_=asy):
  if await self.run_code(code, result, async_=asy):


                          medallion                      hack_license  \
0  89D227B655E5C82AECF13C3F540D4CF4  BA96DE419E711691B9445D6A6307C170   
1  0BD7C8F5BA12B88E0B67BED28BEA73D8  9FD8F69F0804BDB5549F40E9DA1BE472   
2  0BD7C8F5BA12B88E0B67BED28BEA73D8  9FD8F69F0804BDB5549F40E9DA1BE472   
3  DFD2202EE08F7A8DC9A57B02ACB81FE2  51EE87E3205C985EF8431D850C786310   
4  DFD2202EE08F7A8DC9A57B02ACB81FE2  51EE87E3205C985EF8431D850C786310   

  vendor_id  rate_code store_and_fwd_flag      pickup_datetime  \
0       CMT          1                  N  2013-01-01 15:11:48   
1       CMT          1                  N  2013-01-06 00:18:35   
2       CMT          1                  N  2013-01-05 18:49:41   
3       CMT          1                  N  2013-01-07 23:54:15   
4       CMT          1                  N  2013-01-07 23:25:03   

      dropoff_datetime  passenger_count  trip_time_in_secs  trip_distance  \
0  2013-01-01 15:18:10                4                382            1.0   
1  2013-01

## [Query 3] The number of trips that started and ended within the same borough in the last hour

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
import geopandas as gpd

# Define the schema for the incoming data
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("vendor_id", StringType(), True),
    StructField("rate_code", StringType(), True),
    StructField("store_and_fwd_flag", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("passenger_count", StringType(), True),
    StructField("trip_time_in_secs", StringType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True)
])

# Initialize Spark session
spark = SparkSession.builder \
    .appName("NYC Taxi Trips Analysis") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0") \
    .getOrCreate()

# Read the geojson file with GeoPandas
boroughs_gdf = gpd.read_file('nyc-boroughs.geojson')

# Broadcast the boroughs dataframe
boroughs_broadcast = spark.sparkContext.broadcast(boroughs_gdf)

# Read data from Kafka
kafka_server = "kafka1:9092"
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", kafka_server) \
    .option("subscribe", "taxi_topic") \
    .option("startingOffsets", "latest") \
    .load()

# Parse the JSON data and apply schema
parsed_df = df.selectExpr("CAST(value AS STRING) as json").select(from_json("json", schema).alias("data")).select("data.*")

# Filter trips within the last hour
current_time = current_timestamp()
trips_last_hour = parsed_df.withColumn("current_time", current_time).filter(
    (col("pickup_datetime") >= (col("current_time") - expr("INTERVAL 1 HOUR"))) &
    (col("dropoff_datetime") <= col("current_time"))
)

# Convert to Pandas for spatial join
def to_pandas(df):
    return df.toPandas()

def spatial_join(pandas_df):
    # Create GeoDataFrames for pickup and dropoff locations
    pickup_gdf = gpd.GeoDataFrame(
        pandas_df, geometry=gpd.points_from_xy(pandas_df.pickup_longitude, pandas_df.pickup_latitude))

    dropoff_gdf = gpd.GeoDataFrame(
        pandas_df, geometry=gpd.points_from_xy(pandas_df.dropoff_longitude, pandas_df.dropoff_latitude))

    # Set the CRS to match the GeoJSON file
    pickup_gdf.set_crs(epsg=4326, inplace=True)
    dropoff_gdf.set_crs(epsg=4326, inplace=True)

    # Perform spatial join to find the borough for each pickup and dropoff location
    pickup_with_borough = gpd.sjoin(pickup_gdf, boroughs_broadcast.value, how='left', op='within')
    dropoff_with_borough = gpd.sjoin(dropoff_gdf, boroughs_broadcast.value, how='left', op='within')

    # Add borough information to the original DataFrame
    pandas_df['pickup_borough'] = pickup_with_borough['borough']
    pandas_df['dropoff_borough'] = dropoff_with_borough['borough']

    return pandas_df

# Convert Spark DataFrame to Pandas DataFrame
pandas_df = to_pandas(trips_last_hour)

# Perform spatial join
joined_df = spatial_join(pandas_df)

# Convert back to Spark DataFrame
spark_df = spark.createDataFrame(joined_df)

# Filter trips that start and end in the same borough
same_borough_trips = spark_df.filter(col("pickup_borough") == col("dropoff_borough"))

# Group by borough and count trips
borough_trip_counts = same_borough_trips.groupBy("pickup_borough").count()

# Write the results to the console
query = borough_trip_counts.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

query.awaitTermination()


In [None]:

# Query 3 (Corrected)
trips_within_borough_df = df_with_borough \
    .withWatermark("pickup_datetime", "10 minutes") \
    .filter(col("pickup_borough") == col("dropoff_borough")) \
    .groupBy(window("pickup_datetime", "1 hour"), "pickup_borough") \
    .count() \
    .withColumnRenamed("count", "num_trips")
    
trips_within_borough_df.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .trigger(processingTime='10 seconds') \
    .start()




## [Query 4] The number of trips that started in one borough and ended in another one in the last hour

In [None]:
trips_between_boroughs_df = df \
    .withWatermark("pickup_datetime", "10 minutes") \
    .filter(col("pickup_borough") != col("dropoff_borough")) \
    .groupBy(window("pickup_datetime", "1 hour"), "pickup_borough", "dropoff_borough") \
    .count() \
    .withColumnRenamed("count", "num_trips")
    
trips_between_boroughs_df.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .trigger(processingTime='10 seconds') \
    .start()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# Spark configuration
spark = SparkSession.builder \
    .appName("NYC Taxi Data Analysis") \
    .config("spark.jars.packages", 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0') \
    .getOrCreate()

# Define schema for the incoming data
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True)
])

# Read from Kafka
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9092,kafka2:9093") \
    .option("subscribe", "taxi_topic") \
    .option("startingOffsets", "earliest") \
    .load()

# Parse the JSON data and apply the schema
parsed_df = df.select(from_json(col("value").cast("string"), schema).alias("data")).select("data.*")

# Query 1: Utilization over a window of 5, 10, and 15 minutes per taxi/driver
utilization_df = parsed_df.withWatermark("pickup_datetime", "1 minute") \
    .groupBy(
        col("hack_license"),
        window(col("pickup_datetime"), "5 minutes", "1 minute")
    ).count()

# Query 2: Average time to find next fare per destination borough (simplified example)
from pyspark.sql.functions import lag
from pyspark.sql.window import Window

window_spec = Window.partitionBy("hack_license").orderBy("pickup_datetime")
parsed_df = parsed_df.withColumn("next_pickup_datetime", lag("pickup_datetime", -1).over(window_spec))
parsed_df = parsed_df.withColumn("time_to_next_fare", col("next_pickup_datetime").cast("long") - col("dropoff_datetime").cast("long"))

avg_time_next_fare_df = parsed_df.groupBy("hack_license").agg({"time_to_next_fare": "avg"})

# Query 3: Number of trips within the same borough in the last hour
same_borough_trips_df = parsed_df.filter(
    (col("pickup_longitude") == col("dropoff_longitude")) & (col("pickup_latitude") == col("dropoff_latitude"))
).groupBy(window(col("pickup_datetime"), "1 hour")).count()

# Query 4: Number of trips from one borough to another in the last hour
inter_borough_trips_df = parsed_df.filter(
    (col("pickup_longitude") != col("dropoff_longitude")) & (col("pickup_latitude") != col("dropoff_latitude"))
).groupBy(window(col("pickup_datetime"), "1 hour")).count()

# Start the streaming queries
utilization_query = utilization_df.writeStream.outputMode("append").format("console").start()
avg_time_next_fare_query = avg_time_next_fare_df.writeStream.outputMode("append").format("console").start()
same_borough_trips_query = same_borough_trips_df.writeStream.outputMode("append").format("console").start()
inter_borough_trips_query = inter_borough_trips_df.writeStream.outputMode("append").format("console").start()

# Await termination
utilization_query.awaitTermination()
avg_time_next_fare_query.awaitTermination()
same_borough_trips_query.awaitTermination()
inter_borough_trips_query.awaitTermination()
