In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassificationModel


In [2]:
spark = SparkSession.\
        builder.\
        appName("pyspark-notebook").\
        config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.2").\
        getOrCreate()        
        

In [3]:
gender_model = DecisionTreeClassificationModel.load(
        "hdfs://namenode:9000//user/data/spark_ml_101/ec_web_logs_analysis/models/model_age_group_prediction/")

In [4]:
# connect to Kafka stream
logs_stream = spark.readStream \
                .format("kafka") \
                .option("kafka.bootstrap.servers", "broker:29092") \
                .option("subscribe", "ec_web_logs_stream") \
                .load()

# # schema
logs_stream = logs_stream.select(logs_stream["value"].cast("string")) \
                     .selectExpr("split(value,',')[0] as device_id",
                                 "split(value,',')[1] as timestamp",
                                 "cast(split(value,',')[2] as int) as product_category_id",
                                 "split(value,',')[3] as ip",
                                 "cast(split(value,',')[6] as int) device_type",
                                 "cast(split(value,',')[7] as int) connection_type")

# Prepare features and preprocessing
data_prep = logs_stream.select("device_id", "timestamp", "product_category_id", "device_type", "connection_type")

data_prep = VectorAssembler(inputCols=["product_category_id", "device_type", "connection_type"],
                            outputCol="features").transform(data_prep)

inferred_gender_added = gender_model.transform(data_prep)\
        .select(col("device_id"), col("timestamp"), col("prediction").alias("inferred_gender"), col("features"))

In [5]:
result = inferred_gender_added.selectExpr("cast(timestamp as string) as key",  
                                             "cast(concat(device_id, ',', timestamp, ',', inferred_gender, ',', inferred_gender) as string) as value")



In [6]:
 query = inferred_gender_added.select("device_id", "timestamp", "inferred_gender").\
                .writeStream \
                .format("csv") \
                .option("format", "append") \
                .option("path", "hdfs://namenode:9000//user/data/spark_ml_101/ec_web_logs_analysis/streaming") \
                .option("checkpointLocation", "hdfs://namenode:9000//user/data/checkpoint") \
                .outputMode("append") \
                .start()\
                .awaitTermination()


# .option("checkpointLocation", "/checkpoint_path") \

# query = inferred_gender_added.writeStream\
#                     .outputMode("append")\
#                     .format("console") \
#                     .option("truncate","false") \
#                     .start()\
#                     .awaitTermination()

StreamingQueryException: [STREAM_FAILED] Query [id = f7f568da-1425-4de7-9ef1-a58278e49533, runId = 803085ff-7b53-4c00-b7b1-ecf6e105f412] terminated with exception: [UNSUPPORTED_DATA_TYPE_FOR_DATASOURCE] The CSV datasource doesn't support the column `features` of the type "STRUCT<type: TINYINT, size: INT, indices: ARRAY<INT>, values: ARRAY<DOUBLE>>".