In [1]:
import findspark

# findspark.init() has to run before the first pyspark import: it locates the
# Spark installation and makes pyspark importable. Calling it after the
# imports (as before) defeats that purpose.
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as f

# Create (or reuse) a local SparkSession for the streaming job
spark = (
    SparkSession.builder
    .master("local")
    .appName("Spark Streaming")
    .getOrCreate()
)

# keep the size of shuffles small
spark.conf.set("spark.sql.shuffle.partitions", "2")

24/05/15 14:58:15 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
import os
# Working directory of the notebook kernel; the input, output and checkpoint
# locations used by later cells are built relative to it as file:// URIs.
cwd = os.getcwd()
cwd

'/home/dat_21127240/BigData/Big-Data-With-Seaborn-Bokeh-Plotly/Lab_03/src'

In [3]:
# Raw layout: 22 nullable string columns named _c0 .. _c21, i.e. as wide as
# the 22-field green schema (the yellow schema only has 20 fields).
full_schema = StructType(
    [StructField("_c{}".format(col_idx), StringType(), True) for col_idx in range(22)]
)

# Typed layout of a yellow-taxi record. Fields are listed positionally
# (field i is meant to be cast from raw column _c{i}). Only "type" is declared
# non-nullable; every other field keeps the default nullable=True.
yellow_taxi_schema = (
    StructType()
    .add("type", StringType(), nullable=False)
    .add("VendorID", IntegerType())
    .add("tpep_pickup_datetime", TimestampType())
    .add("tpep_dropoff_datetime", TimestampType())
    .add("passenger_count", IntegerType())
    .add("trip_distance", FloatType())
    .add("pickup_longitude", FloatType())
    .add("pickup_latitude", FloatType())
    .add("RatecodeID", FloatType())
    .add("store_and_fwd_flag", StringType())
    .add("dropoff_longitude", FloatType())
    .add("dropoff_latitude", FloatType())
    .add("payment_type", IntegerType())
    .add("fare_amount", FloatType())
    .add("extra", FloatType())
    .add("mta_tax", FloatType())
    .add("tip_amount", FloatType())
    .add("tolls_amount", FloatType())
    .add("improvement_surcharge", FloatType())
    .add("total_amount", FloatType())
)
# Typed layout of a green-taxi record. Fields are listed positionally
# (field i is meant to be cast from raw column _c{i}). Only "type" is declared
# non-nullable; every other field keeps the default nullable=True.
green_taxi_schema = (
    StructType()
    .add("type", StringType(), nullable=False)
    .add("VendorID", IntegerType())
    .add("lpep_pickup_datetime", TimestampType())
    .add("Lpep_dropoff_datetime", TimestampType())
    .add("Store_and_fwd_flag", StringType())
    .add("RateCodeID", IntegerType())
    .add("Pickup_longitude", FloatType())
    .add("Pickup_latitude", FloatType())
    .add("Dropoff_longitude", FloatType())
    .add("Dropoff_latitude", FloatType())
    .add("Passenger_count", IntegerType())
    .add("Trip_distance", FloatType())
    .add("Fare_amount", FloatType())
    .add("Extra", FloatType())
    .add("MTA_tax", FloatType())
    .add("Tip_amount", FloatType())
    .add("Tolls_amount", FloatType())
    .add("Ehail_fee", FloatType())
    .add("improvement_surcharge", FloatType())
    .add("Total_amount", FloatType())
    .add("Payment_type", IntegerType())
    .add("Trip_type", IntegerType())
)

In [4]:
# Stream the headerless CSV files from ../data/taxi-data as untyped string
# columns (full_schema), picking up at most one new file per trigger.
full_df = (
    spark.readStream
    .schema(full_schema)
    .option("maxFilesPerTrigger", 1)
    .option("header", False)
    .csv(f"file://{cwd}/../data/taxi-data")
)

# Filter rows based on the 'type' column
# yellow_df = full_df.where(full_df["_c0"] == "yellow").selectExpr(
#     *[f"cast(_c{i} as {field.dataType.simpleString()}) as {field.name}" \
#         for i, field in enumerate(yellow_taxi_schema.fields)]
# )
# green_df = full_df.where(full_df["_c0"] == "green").selectExpr(
#     *[f"cast(_c{i} as {field.dataType.simpleString()}) as {field.name}" \
#         for i, field in enumerate(green_taxi_schema.fields)]
# )

In [7]:
def foreach_batch_function(batch_df, epoch_id):
    """Append a micro-batch's window counts to per-hour JSON directories.

    Rows are split by the hour-of-day of ``window.start``. The ``count``
    values for start hour ``h`` are appended under
    ``../output/output-{(h + 1) * 60 * 60 * 1000}``, i.e. the directory is
    named after ``h + 1`` hours expressed in milliseconds.

    Args:
        batch_df: Micro-batch DataFrame; must expose a ``window`` struct
            column with a ``start`` field and a ``count`` column.
        epoch_id: Micro-batch id passed in by ``foreachBatch`` (unused).
    """
    # Tag every row with the hour-of-day at which its window starts
    hourly_df = batch_df.withColumn("startHour", f.hour(f.col("window.start")))

    # The batch is read once to list the hours and once more per hour to
    # write it out; persist it so the upstream plan is not recomputed for
    # each of those jobs.
    hourly_df.persist()
    try:
        start_hours = [
            row["startHour"]
            for row in hourly_df.select("startHour").distinct().collect()
        ]

        # One output directory per distinct start hour (no-op for an empty batch)
        for h_num in start_hours:
            output_dir = f"file://{cwd}/../output/output-{(h_num + 1) * 60 * 60 * 1000}"
            (
                hourly_df
                .where(f.col("startHour") == f.lit(h_num))
                .select("count")
                .write.mode("append")
                .json(output_dir)
            )
    finally:
        # Release the cached batch even if a write fails
        hourly_df.unpersist()

In [8]:
# Query: count records per 1-hour tumbling window of the drop-off time.
# _c3 is the drop-off timestamp column in both the yellow and green layouts.
full_df = full_df.withColumn("dropoff", f.expr("cast(_c3 as timestamp)"))
by_dropoff = (
    full_df
    .withWatermark("dropoff", "1 hour")
    .groupBy(f.window(f.col("dropoff"), "1 hour"))
    .count()
)

# Append mode: each window's count is handed to foreach_batch_function once
# the watermark allows the window to be finalized.
query = (
    by_dropoff.writeStream
    .outputMode("append")
    .foreachBatch(foreach_batch_function)
    .queryName("Event_Count")
    .option("checkpointLocation", f"file://{cwd}/../checkpoint")
    .start()
)

# awaitTermination() blocks until the query ends, which in a notebook usually
# means a kernel interrupt. Stop the query on the way out so an interrupted or
# failed run does not leave the stream running in the background.
try:
    query.awaitTermination()
finally:
    query.stop()

24/05/15 14:58:31 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/05/15 14:58:33 WARN HDFSBackedStateStoreProvider: The state for version 3 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/05/15 14:58:33 WARN HDFSBackedStateStoreProvider: The state for version 3 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/05/15 14:58:33 WARN HDFSBackedStateStoreProvider: The state for version 3 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/05/15 14:58:33 WARN HDFSBackedStateStoreProvider: The state for version 3 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first b

KeyboardInterrupt: 

In [9]:
# Stop the "Event_Count" streaming query if it is still running.
query.stop()