# In [None]:
import os

# Ask spark-submit to fetch the Kafka source for Structured Streaming
# (Scala 2.11 build, Spark 2.4.3). This is set before pyspark is imported and
# before the SparkContext is created further down.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join((
    '--packages',
    'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.3',
    'pyspark-shell',
))

import pyspark 
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


# Reuse the active SparkContext / SparkSession when one already exists.
# A bare SparkContext() raises ValueError if a context is already running,
# which is exactly what happens when this cell is executed a second time.
sc = SparkContext.getOrCreate()
# Keep the console readable: only warnings and errors from Spark itself.
sc.setLogLevel("WARN")
spark = SparkSession.builder.getOrCreate()


# Streaming DataFrame over the "calculated" Kafka topic, starting from the
# earliest offset available on the broker.
#
# The previous stand-alone `df.selectExpr("CAST(key AS STRING)", ...)` call
# was removed: DataFrames are immutable and its result was discarded, so it
# had no effect on `df`.
df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "10.10.139.63:9092")
    .option("subscribe", "calculated")
    .option("startingOffsets", "earliest")
    .load()
)

# Fields expected in one click record. Passed to from_json() below to parse
# the JSON payload of each Kafka message.
schema = (
    StructType()
    .add("visitor_platform", StringType())
    .add("ts_ingest", TimestampType())
    .add("article_title", StringType())
    .add("visitor_country", StringType())
    .add("visitor_os", StringType())
    .add("article", StringType())
    .add("visitor_browser", StringType())
    .add("visitor_page_timer", IntegerType())
    .add("visitor_page_height", IntegerType())
)

# Cast the Kafka `value` column to a string, parse it as JSON against
# `schema`, and keep the resulting struct in a single column named "clicks".
click_struct = from_json(col("value"), schema).alias("clicks")
dfs = df.selectExpr("CAST(value AS STRING)").select(click_struct)

# Expand the struct so every click field becomes its own top-level column.
df_data = dfs.select("clicks.*")

# TODO use Mailgun and prevent spamming yourself with emails
@udf(returnType="boolean")
def forbidden_clicks(click_url):
    """Tell whether a clicked URL points at the admin area.

    Registered as a boolean UDF: a bare ``@udf`` declares a string return
    type, which does not match the Python bools returned here.

    :param click_url: value of the ``article`` column; ``None`` when the
        field is missing or the JSON record could not be parsed.
    :return: ``True`` when the URL ends with ``/admin``, ``False`` otherwise
        (including for a missing URL).
    """
    print(click_url)
    # A null URL used to raise AttributeError on `.endswith` and take the
    # whole streaming query down; a missing URL is simply not forbidden.
    if click_url is None:
        return False
    if click_url.endswith('/admin'):
        print('SECURITY BREACH! Someone is trying to access {}\n'.format(click_url))
        return True
    return False

# For every article URL, check whether it is a forbidden URL.
# We cannot use map() here: DataFrames have not supported it since version 2.0.
# Under the hood, calling map() on a DataFrame would convert it to an RDD, which is not allowed in Structured Streaming.
# In other words, only the DataFrame and SQL APIs can be used; conversion to an RDD (or a DStream, or a local collection) is not supported.
# Instead, we use a User Defined Function (UDF) to run some Python code on a column.
# Keep the URL next to its verdict so each output row can be read on its own.
df_forbidden = df_data.select(
    col('article'),
    forbidden_clicks(col('article')).alias('forbidden'),
)

# DDoS detection
# Window over the last X minutes and count the clicks where both 'visitor_page_timer' and 'visitor_page_height' are 0.
# If the count exceeds a threshold, raise an alert.


# TODO use Mailgun and prevent spamming yourself with emails
@udf
def ddos_clicks(count):
    """Label a per-window click count as suspicious or not.

    :param count: number of clicks counted for one group.
    :return: ``"DDOS"`` when the count is greater than 1 (the hard-coded
        threshold), ``"--"`` otherwise. ``DDOS?`` is printed for every
        suspicious count.
    """
    if not (count > 1):
        return "--"
    print('DDOS?\n')
    return "DDOS"

# Count clicks per (10-second window, page timer, page height) combination.
# The window is 10 seconds long and advances every 10 seconds.
# NOTE(review): no watermark is set on ts_ingest, so aggregation state is
# presumably never dropped - confirm this is acceptable for long runs.
ten_second_window = window(df_data.ts_ingest, '10 seconds', '10 seconds')
clicks_per_group = df_data.groupBy(
    ten_second_window,
    df_data.visitor_page_timer,
    df_data.visitor_page_height
).count()

# Label every count, then keep only the groups where both the timer and the
# height are 0.
df_ddos = clicks_per_group.select(
    "window",
    "visitor_page_timer",
    "visitor_page_height",
    'count',
    ddos_clicks('count').alias('ddos')
).where('visitor_page_timer = 0 and visitor_page_height = 0')

def _start_console_query(frame, mode):
    """Start a streaming query that prints `frame` untruncated to the console.

    :param frame: streaming DataFrame to write.
    :param mode: output mode passed to ``outputMode`` ("append", "update", ...).
    :return: the started streaming query.
    """
    writer = frame.writeStream.outputMode(mode)
    return writer.option("truncate", "false").format("console").start()


# Debug sinks: dump every stream to the terminal.
# The two row-by-row frames use "append"; the windowed aggregation uses "update".
query_data = _start_console_query(df_data, "append")
query_forbidden = _start_console_query(df_forbidden, "append")
query_ddos = _start_console_query(df_ddos, "update")

# Block here until any of the queries above terminates.
spark.streams.awaitAnyTermination()