In [1]:
import findspark

# findspark.init() locates the Spark installation and puts its Python bindings on
# sys.path, so it has to run before the first pyspark import.
findspark.init()

# The star import is kept as-is: later cells use StructType, StructField and the
# data type classes unqualified.
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as f

# Create (or reuse) a local SparkSession for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Spark Streaming Task 2")
    .getOrCreate()
)

# Number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.shuffle.partitions", "10")

24/05/17 15:41:04 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
import os
# Working directory of the notebook kernel; the input, output and checkpoint
# paths used by the streaming job are built relative to it.
cwd = os.getcwd()
cwd

'/home/dat_21127240/BigData/Big-Data-With-Seaborn-Bokeh-Plotly/Lab_03/src'

In [3]:
# All locations are siblings of the working directory (one level up from it).
inputPath = f"{cwd}/../data/taxi-data"        # directory read by the streaming CSV source
outputPath = f"{cwd}/../output_2"             # root under which per-hour CSV output is written
checkpointPath = f"{cwd}/../checkpoint_2"     # streaming checkpoint location

In [4]:
# Raw CSV layout: 22 nullable string columns named _c0 .. _c21.
full_schema = StructType(
    [StructField("_c" + str(idx), StringType(), nullable=True) for idx in range(22)]
)

# Schema for yellow taxi trips.
# Fields are positional: the i-th field is cast from raw column _c{i} when the
# stream is split by taxi type. Only the leading "type" column is non-nullable.
yellow_taxi_schema = StructType(
    [StructField("type", StringType(), nullable=False)]
    + [
        StructField(col_name, col_type)
        for col_name, col_type in (
            ("VendorID", IntegerType()),
            ("tpep_pickup_datetime", TimestampType()),
            ("tpep_dropoff_datetime", TimestampType()),
            ("passenger_count", IntegerType()),
            ("trip_distance", FloatType()),
            ("pickup_longitude", FloatType()),
            ("pickup_latitude", FloatType()),
            ("RatecodeID", FloatType()),
            ("store_and_fwd_flag", StringType()),
            ("dropoff_longitude", FloatType()),
            ("dropoff_latitude", FloatType()),
            ("payment_type", IntegerType()),
            ("fare_amount", FloatType()),
            ("extra", FloatType()),
            ("mta_tax", FloatType()),
            ("tip_amount", FloatType()),
            ("tolls_amount", FloatType()),
            ("improvement_surcharge", FloatType()),
            ("total_amount", FloatType()),
        )
    ]
)
# Schema for green taxi trips.
# Fields are positional: the i-th field is cast from raw column _c{i} when the
# stream is split by taxi type. Only the leading "type" column is non-nullable.
green_taxi_schema = StructType(
    [StructField("type", StringType(), nullable=False)]
    + [
        StructField(col_name, col_type)
        for col_name, col_type in (
            ("VendorID", IntegerType()),
            ("lpep_pickup_datetime", TimestampType()),
            ("Lpep_dropoff_datetime", TimestampType()),
            ("Store_and_fwd_flag", StringType()),
            ("RateCodeID", IntegerType()),
            ("Pickup_longitude", FloatType()),
            ("Pickup_latitude", FloatType()),
            ("Dropoff_longitude", FloatType()),
            ("Dropoff_latitude", FloatType()),
            ("Passenger_count", IntegerType()),
            ("Trip_distance", FloatType()),
            ("Fare_amount", FloatType()),
            ("Extra", FloatType()),
            ("MTA_tax", FloatType()),
            ("Tip_amount", FloatType()),
            ("Tolls_amount", FloatType()),
            ("Ehail_fee", FloatType()),
            ("improvement_surcharge", FloatType()),
            ("Total_amount", FloatType()),
            ("Payment_type", IntegerType()),
            ("Trip_type", IntegerType()),
        )
    ]
)

In [5]:
def _select_typed_trips(raw_df, taxi_type, schema):
    """Keep rows whose first raw column equals ``taxi_type`` and cast them to ``schema``.

    Columns are matched by position: the i-th field of ``schema`` is cast from raw
    column ``_c{i}`` to that field's data type and renamed to the field's name.
    """
    cast_exprs = [
        f"cast(_c{pos} as {fld.dataType.simpleString()}) as {fld.name}"
        for pos, fld in enumerate(schema.fields)
    ]
    return raw_df.where(raw_df["_c0"] == taxi_type).selectExpr(*cast_exprs)


# Stream the raw CSV files as untyped string columns first; typing happens per
# taxi type below because the two feeds have different column layouts.
full_df = (
    spark.readStream
    .option("maxFilesPerTrigger", 100)
    .csv(f"file://{inputPath}", header=False, schema=full_schema)
)

# Split the raw stream on the value of the leading 'type' column.
yellow_df = _select_typed_trips(full_df, "yellow", yellow_taxi_schema)
green_df = _select_typed_trips(full_df, "green", green_taxi_schema)

In [6]:
# The two feeds name their drop-off timestamp differently; project each onto a
# single common column so the streams can be stacked.
sub_yellow = yellow_df.select(f.col("tpep_dropoff_datetime").alias("dropoff_datetime"))
sub_green = green_df.select(f.col("Lpep_dropoff_datetime").alias("dropoff_datetime"))

# Positional union of the two one-column streams.
sub_df = sub_yellow.union(sub_green)

In [7]:
# Count trips per 1-hour window of the drop-off timestamp.
hourly_window = f.window(f.col("dropoff_datetime"), "1 hour")
by_dropoff = sub_df.groupBy(hourly_window).count()

In [8]:
def foreach_batch_function(batch_df, epoch_id):
    """Write the windowed counts of one micro-batch into per-hour CSV directories.

    Args:
        batch_df: Micro-batch DataFrame; must expose a ``window`` struct column
            (its ``start`` field is read) and a ``count`` column.
        epoch_id: Micro-batch identifier passed in by ``foreachBatch``; unused.

    For every distinct window start hour present in the batch, the matching
    ``count`` rows are written as CSV to ``{outputPath}/output-<ms>``, where
    ``<ms>`` is ``(start hour + 1)`` converted to milliseconds. Any existing
    content of that directory is overwritten.
    """
    # Tag each row with the hour-of-day at which its window starts.
    hourly_df = batch_df.withColumn("startHour", f.hour(f.col("window.start")))

    # The batch is evaluated once to list the hours and once more per hour written.
    # Persisting avoids recomputing the micro-batch for each of those actions.
    hourly_df.persist()
    try:
        start_hours = [
            row["startHour"]
            for row in hourly_df.select("startHour").distinct().collect()
        ]

        # An empty batch yields no hours, so the loop simply writes nothing.
        for h_num in start_hours:
            hour_df = hourly_df.where(f.col("startHour") == f.lit(h_num)).select("count")
            output_dir = f"file://{outputPath}/output-{(h_num + 1) * 60 * 60 * 1000}"

            hour_df.write.mode("overwrite").csv(output_dir)
    finally:
        # Release the cached batch even if one of the writes fails.
        hourly_df.unpersist()

In [None]:
# Start the streaming query: every micro-batch of updated window counts is handed
# to foreach_batch_function, with progress tracked in the checkpoint directory.
query = (
    by_dropoff.writeStream
    .outputMode("update")
    .foreachBatch(foreach_batch_function)
    .queryName("Event_Count")
    .option("checkpointLocation", f"file://{checkpointPath}")
    .start()
)

# Block this cell until the query is stopped or fails.
query.awaitTermination()

24/05/17 15:41:06 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/05/17 15:41:15 WARN HDFSBackedStateStoreProvider: The state for version 1 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/05/17 15:41:15 WARN HDFSBackedStateStoreProvider: The state for version 1 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/05/17 15:41:15 WARN HDFSBackedStateStoreProvider: The state for version 1 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first batch of starting query.
24/05/17 15:41:15 WARN HDFSBackedStateStoreProvider: The state for version 1 doesn't exist in loadedMaps. Reading snapshot file and delta files if needed...Note that this is normal for the first b

In [10]:
# Stop the streaming query first, then shut down the Spark session it runs on.
query.stop()
spark.stop()