# In [None]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.3 pyspark-shell'

import pyspark 
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


# Reuse the active SparkContext when there is one (for example when this
# cell is re-executed in the same notebook kernel); a bare SparkContext()
# raises ValueError if a context is already running.
sc = SparkContext.getOrCreate()
sc.setLogLevel("WARN")
spark = SparkSession(sc)


# Streaming source: subscribe to the "inputs.testing" Kafka topic, starting
# from the earliest available offsets.
df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "10.10.139.63:9092")
    .option("subscribe", "inputs.testing")
    .option("startingOffsets", "earliest")
    .load()
)
# NOTE: a bare `df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")`
# used to follow here. DataFrames are immutable and the result was never
# assigned, so the call had no effect and has been removed.

# Layout of the JSON click document used to parse the Kafka record value.
# Fields are added in the same order as they should appear in the output.
schema = (
    StructType()
    .add("visitor_platform", StringType())
    .add("ts_ingest", LongType())
    .add("article_title", StringType())
    .add("visitor_country", StringType())
    .add("visitor_os", StringType())
    .add("article", StringType())
    .add("visitor_browser", StringType())
    .add("visitor_page_timer", IntegerType())
    .add("visitor_page_height", IntegerType())
)

# Decode the Kafka record value as text, then parse the JSON document into a
# single struct column named "clicks" using the schema defined above.
raw_value = df.selectExpr("CAST(value AS STRING)")
clicks_struct = from_json(col("value"), schema).alias("clicks")
dfs = raw_value.select(clicks_struct)

# Promote the struct members to top-level columns and discard every row that
# has a null in any field
# (https://spark.apache.org/docs/2.2.0/api/java/org/apache/spark/sql/DataFrameNaFunctions.html)
df_data = dfs.select("clicks.*").na.drop()

# ts_ingest arrives as epoch milliseconds. Reduce it to whole epoch seconds
# and cast the number straight to TimestampType. The previous implementation
# formatted the value as a session-local "yyyy-MM-dd HH:mm:ss" string and
# parsed it back, which is ambiguous during the DST fall-back hour and needed
# a redundant to_timestamp() on an already-cast column.
df_DateConverted = df_data.withColumn(
    "ts_ingest",
    (col("ts_ingest") / 1000).cast(LongType()).cast(TimestampType()),
)

# Ensure that country codes are upper case so the filter below is
# case-insensitive with respect to the incoming data
df_end = df_DateConverted.withColumn("visitor_country", upper(col("visitor_country")))

# Keep only BE and NL clicks
df_be = df_end.filter(col("visitor_country").isin("BE", "NL"))

# For debugging, the filtered stream can be printed to the terminal instead:
# query = df_be.writeStream.outputMode("append").option("truncate", "false").format("console").start()

# Serialise every row into one JSON string column named "value", which is the
# column the Kafka sink publishes as the message payload.
kafka_payload = df_be.selectExpr("to_json(struct(*)) as value")

# Configure the Kafka sink: target broker, output topic and the directory in
# which the query keeps its checkpoint data.
kafka_writer = (
    kafka_payload.writeStream.format("kafka")
    .option("kafka.bootstrap.servers", "10.10.139.63:9092")
    .option("topic", "calculated")
    .option("checkpointLocation", "checkpoints")
)
query = kafka_writer.start()

# Keep the driver alive until the streaming query terminates
query.awaitTermination()
