In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime, timezone

# Locations
# All pipeline locations live under one mount point.
_DATA_ROOT = "/mnt/data"
_CHECKPOINT_ROOT = f"{_DATA_ROOT}/checkpoints"

# Event data for each layer of the pipeline.
bronzeFilePath = f"{_DATA_ROOT}/bronze/events"
silverFilePath = f"{_DATA_ROOT}/silver/events"
quarantineFilePath = f"{_DATA_ROOT}/quarantine/events"

# Reference data used by the movie_id existence check.
silverMoviesPath = f"{_DATA_ROOT}/silver/imdb/titles"

# One streaming checkpoint location per output stream.
checkpointPath = f"{_CHECKPOINT_ROOT}/events"
checkpointPathQuarantine = f"{_CHECKPOINT_ROOT}/events_quarantine"
 
#TODO: Data Quality Checks

# Please fill in the gaps
# The "rules" dictionary stores every data quality rule as a key-value pair: the key is the rule name and the value is a SQL expression to be evaluated against the dataframe,
# e.g. {"valid_event_timestamp":"(event_timestamp IS NOT NULL)", ...}
# The "quarantine_rules" variable then holds a single expression string that combines all of the checks,
# like "NOT((event_timestamp IS NOT NULL) AND (other rule) AND (other rule))"
# Your goal for now is to fill in the rules

# • Rule#1: event_type in “START” or “FINISH” state.
# • Rule#2: event_uid is not empty.
# • Rule#3: event_id is not empty.
# • Rule#4: user_subscription_device_id is not empty.
# • Rule#5: movie_id is not empty.
# • Rule#6: movie_id value exists in "silver/imdb/titles/" entity (title_id column).

# Data quality rules: rule name -> Spark SQL predicate that is TRUE for a valid row.
# Every predicate starts with an explicit IS NOT NULL guard so that it evaluates to
# TRUE or FALSE and never to NULL. A NULL rule result would make the combined
# "NOT(...)" expression NULL as well, and a filter that compares a NULL flag with
# True or with False keeps the row in neither case.
rules = {}

# Rule#1: event_type must be one of the two known states.
rules["valid_event_type"] = "(event_type IS NOT NULL AND event_type IN ('START', 'FINISH'))"

# Rule#2 - Rule#5: the column must be present and must not be blank.
# The value is cast to STRING so the same check works whatever the column type is.
rules.update(
    {
        f"valid_{name}": f"({name} IS NOT NULL AND trim(CAST({name} AS STRING)) <> '')"
        for name in (
            "event_uid",
            "event_id",
            "user_subscription_device_id",
            "movie_id",
        )
    }
)

# A row is quarantined as soon as at least one rule fails.
quarantine_rules = "NOT({0})".format(" AND ".join(rules.values()))

#TODO: Read movies dataset from silver for dq check. 
# This dataframe will be used for Rule#6
# <read "title_id" column dataset from 'silver/imdb/titles' dataset>
# Static (non-streaming) lookup of known title ids, used only as an existence check
# for Rule#6. distinct() guarantees one row per title_id so that the join against
# the event stream can never emit the same event more than once.
moviesDF = spark.read.parquet(silverMoviesPath).select("title_id").distinct()

#TODO: Read bronze layer using Auto Loader
# Incrementally ingest new bronze parquet files with Auto Loader.
# Only genuine Auto Loader options are passed. The streaming checkpoint is configured
# on each writeStream ("checkpointLocation"); "cloudFiles.checkpointLocation" is not a
# reader option, so it is dropped together with "cloudFiles.validateOptions=false",
# which appears to have been there only to let the unknown option through validation.
bronzeDF = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "parquet")
    .option("cloudFiles.schemaLocation", checkpointPath)
    .load(bronzeFilePath)
    # TRUE when at least one of the row-level data quality rules fails.
    .withColumn("is_quarantined", expr(quarantine_rules))
)

# Left join against the title lookup: title_id stays NULL for events whose
# movie_id has no matching title, which is how Rule#6 is detected downstream.
bronzeDF = bronzeDF.join(
    moviesDF,
    on=bronzeDF.movie_id == moviesDF.title_id,
    how="left",
)

#Finalize dataset for silver layer 
# Rows that go to the silver layer: every row-level rule passed and the movie
# matched a known title. Helper columns are removed before writing.
_silver_rows = bronzeDF.where(
    (bronzeDF.is_quarantined == False) & bronzeDF.title_id.isNotNull()
).drop("is_quarantined", "title_id", "_rescued_data")

# Derive the year/month/day columns from the event timestamp.
silverDF = _silver_rows
for _name, _pattern in (("year", "yyyy"), ("month", "MM"), ("day", "dd")):
    silverDF = silverDF.withColumn(_name, date_format(col("event_timestamp"), _pattern))

#TODO: Finalize dataset for quarantine
# Rows which do not meet the data quality rules should be written to the quarantine folder
# <fill in code to generate dataframe with invalid data>
# Quarantine is the exact complement of the silver selection, so every input row
# lands in exactly one of the two outputs:
#   * rows that fail a row-level rule (is_quarantined is TRUE),
#   * rows whose movie_id has no match in the titles dataset (title_id is NULL, Rule#6),
#   * rows whose is_quarantined flag evaluated to NULL (coalesce treats "unknown" as "not valid").
# is_quarantined and title_id are not dropped here, so they remain in the quarantined rows.
_is_valid = (bronzeDF.is_quarantined == False) & bronzeDF.title_id.isNotNull()
silverDF_quarantine = (
        bronzeDF.filter(~coalesce(_is_valid, lit(False)))
        .withColumn("year", date_format(col("event_timestamp"), "yyyy"))
        .withColumn("month", date_format(col("event_timestamp"), "MM"))
        .withColumn("day", date_format(col("event_timestamp"), "dd"))
)

#TODO: Write to silver layer
#fill in code to write data from stream into silverFilePath in parquet format
#use checkpointPath for checkpoint location
#partition data by year, month, day
# Start the silver stream: append-only parquet files under silverFilePath,
# partitioned by year/month/day, with the checkpoint kept in checkpointPath.
(
        silverDF.writeStream
        .format("parquet")
        .outputMode("append")
        .partitionBy("year", "month", "day")
        .option("checkpointLocation", checkpointPath)
        .start(path=silverFilePath)
)

#TODO: Write to quarantine
#fill in code to write data from stream, in parquet format, into a subdirectory of the quarantineFilePath folder named by the loading datetime (format should be 2023-03-01-050000)
# Name the quarantine subdirectory after the load time, e.g. 2023-03-01-050000.
# The clock is read in UTC explicitly so the folder name does not depend on the
# driver's local time zone and is unaffected by daylight-saving changes.
loading_datetime = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H%M%S")
quarantineFilePathWithDatetime = f"{quarantineFilePath}/{loading_datetime}"

# Start the quarantine stream: append-only parquet files under the per-load
# quarantine directory, with the checkpoint kept in checkpointPathQuarantine.
(
        silverDF_quarantine.writeStream
        .format("parquet")
        .outputMode("append")
        .option("checkpointLocation", checkpointPathQuarantine)
        .start(path=quarantineFilePathWithDatetime)
)


<pyspark.sql.streaming.query.StreamingQuery at 0x7fd5f0b0cc10>