In [None]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.3 pyspark-shell'

import pyspark 
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


# Reuse an already-running SparkContext/SparkSession when this cell is
# executed again: constructing SparkContext() a second time in the same
# Python process raises "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
sc.setLogLevel("WARN")
spark = SparkSession.builder.getOrCreate()


# Streaming source: subscribe to the "clicks-cleaned" topic on the local
# Kafka broker, starting from the latest offsets and without failing the
# query on data loss. The message value is cast to a string and parsed
# further down.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "clicks-cleaned")
    .option("startingOffsets", "latest")
    .option("failOnDataLoss", "false")
    .load()
)

# Layout of one click event as carried in the JSON message value.
# Every field is nullable (the StructType.add default).
schema = (
    StructType()
    .add("visitor_platform", StringType())
    .add("ts_ingest", TimestampType())
    .add("article_title", StringType())
    .add("visitor_country", StringType())
    .add("visitor_os", StringType())
    .add("article", StringType())
    .add("visitor_browser", StringType())
    .add("visitor_page_timer", IntegerType())
    .add("visitor_page_height", IntegerType())
)

# Read the Kafka value as text and parse it as JSON against `schema`; the
# parsed result is a single struct column named "clicks".
dfs = (
    df.selectExpr("CAST(value AS STRING)")
    .select(from_json(col("value"), schema).alias("clicks"))
)

# Flatten the struct so that each schema field becomes a top-level column.
df_data = dfs.select("clicks.*")

@udf(returnType=BooleanType())
def forbidden_clicks(click_url):
    """Tell whether a clicked URL points at a forbidden (admin) page.

    Args:
        click_url: URL of the clicked article, or None when the column
            value is null (for example a field missing from the event).

    Returns:
        True when the URL ends with '/admin', otherwise False. A null URL
        is reported as False instead of raising, so a single incomplete
        record does not fail the whole streaming query.
    """
    # The explicit return type makes the result a real boolean column; a
    # bare @udf would declare the result as a string.
    if click_url is None:
        return False
    return click_url.endswith('/admin')

# For every article URL, check whether it is a forbidden URL.
# We cannot use map() here: DataFrames have not supported it since version 2.0.
# Under the hood, calling map() on a DataFrame would convert it to an RDD, which is not allowed in structured streaming.
# This means only DataFrame or SQL operations can be used; conversions to RDD (or DStream or local collections) are not supported.
# Instead, we use a User Defined Function (UDF) to execute some Python code on a column.
# Keep the article URL next to its boolean "forbidden" flag.
df_forbidden = df_data.select(
    'article',
    forbidden_clicks('article').cast('boolean').alias('forbidden'),
)

# DDoS detection
# Per time window, count the clicks where both 'visitor_page_timer' and 'visitor_page_height' == 0
@pandas_udf('boolean', PandasUDFType.SCALAR)
def ddos_flagged(page_timer, page_height):
    """Flag clicks whose page timer and page height are both zero.

    Both arguments are compared element-wise against 0; the result is True
    only where both comparisons hold.
    """
    timer_is_zero = page_timer == 0
    height_is_zero = page_height == 0
    return timer_is_zero & height_is_zero

# Add the boolean "flagged" column to every click event.
df_ddos = df_data.select(
    "*",
    ddos_flagged('visitor_page_timer', 'visitor_page_height').alias('flagged'),
)

# Count events per 30-second window of ts_ingest, split by the flag.
# NOTE(review): no watermark is set on ts_ingest, so state for old windows
# is presumably never dropped -- consider withWatermark(); TODO confirm.
df_ddos_window = (
    df_ddos
    .groupBy(window(df_ddos["ts_ingest"], '30 seconds'), df_ddos["flagged"])
    .count()
)

# Debug dataframes in terminal
# query_data = df_data.writeStream.outputMode("append").option("truncate", "false").format("console").start()
# query_forbidden = df_forbidden.writeStream.outputMode("append").option("truncate", "false").format("console").start()
# query_ddos = df_ddos_window.writeStream.outputMode("update").option("truncate", "true").format("console").start()

# Serialise each row to a JSON string in the "value" column and stream it
# to the "clicks-calculated-forbidden" Kafka topic.
query_forbidden = (
    df_forbidden.selectExpr("to_json(struct(*)) as value")
    .writeStream
    .format("kafka")
    .outputMode('update')
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "clicks-calculated-forbidden")
    .option("checkpointLocation", "checkpointsforbidden")
    .start()
)

# Serialise the windowed counts to JSON and stream them to the
# "clicks-calculated-ddos" Kafka topic, triggering every 30 seconds.
query_ddos = (
    df_ddos_window.selectExpr("to_json(struct(*)) as value")
    .writeStream
    .format("kafka")
    .trigger(processingTime='30 seconds')
    .outputMode('update')
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "clicks-calculated-ddos")
    .option("checkpointLocation", "checkpointsddos")
    .start()
)



In [None]:
# Block here until any one of the active streaming queries terminates.
spark.streams.awaitAnyTermination()