In [1]:
#import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans, LDA
from pyspark.ml.feature import VectorAssembler, RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
import joblib
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType
import pyspark.sql.functions as f



In [2]:
# Set up Spark Session
spark = SparkSession.builder \
    .appName("RealTimeModelTraining") \
    .config('spark.jars.packages','org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.4')\
    .getOrCreate()



In [3]:
# Read data from Kafka

streaming = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092").option("subscribe", "filtered_stream") \
    .option("startingOffsets", "latest").load()


In [4]:
# Extract the value (message) from the Kafka stream
messages = streaming.selectExpr("CAST(value AS STRING)")



In [5]:
# Parse the JSON message
schema = StructType([
    StructField("text", StringType(), True),
    StructField("timestamp", StringType(), True)
])



In [6]:
# Parse the JSON
messages_parsed = messages.select(from_json(messages["value"], schema).alias("data"))
df = messages_parsed.select("data.text", "data.timestamp")



In [7]:
# Clean the data (your pipeline steps here)
df_cleaned = df.withColumn("datetime_clean", f.regexp_replace("timestamp", r"^[A-Za-z]{3} ", "")) \
               .withColumn("datetime_column2", f.regexp_replace("datetime_clean", r"\s[A-Za-z]+$", "")) \
               .withColumn("datetime_final", f.to_timestamp("datetime_column2", "MMM dd HH:mm:ss z yyyy")) \
               .drop("datetime_clean", "datetime_column2")  # Drop unnecessary columns



In [9]:
from pyspark.ml.feature import CountVectorizerModel
from pyspark.ml.clustering import LDA, LDAModel, KMeansModel
from pyspark.ml.clustering import LocalLDAModel

# Load models
cv_model = CountVectorizerModel.load("bluesky_cv_model")
lda_model = LocalLDAModel.load("bluesky_lda_model")
kmeans_model = KMeansModel.load("bluesky_kmeans_model")

In [12]:
# # Process text (RegexTokenizer, StopWordsRemover, etc.)
# regex_token = RegexTokenizer(inputCol="text", outputCol="content_filtered", pattern="\\W+", toLowercase=True)

# default_stopwords = StopWordsRemover.loadDefaultStopWords('english')
# additional_stopwords = ['rt', 'via', 'amp', 'https', 'http', 'co', 'bsky', 'u', 'app', 's', 'www', 'com', 'de']
# all_stopwords = list(set(default_stopwords + additional_stopwords))

# stop_words_remover = StopWordsRemover(inputCol="content_filtered", outputCol="filtered_words", stopWords=all_stopwords)

# count_vectorizer = CountVectorizer(inputCol="filtered_words", outputCol="features")

# vector_assembler = VectorAssembler(inputCols=["features"], outputCol="final_features")

# # Build the pipeline
# pipeline = Pipeline(stages=[regex_token, stop_words_remover, count_vectorizer, vector_assembler])



In [10]:
# 2. Preprocessing Text (no fitting here)
regex_token = RegexTokenizer(inputCol="text", outputCol="content_filtered", pattern="\\W+", toLowercase=True)
df_tokenized = regex_token.transform(df_cleaned)

default_stopwords = StopWordsRemover.loadDefaultStopWords('english')
additional_stopwords = ['rt', 'via', 'amp', 'https', 'http', 'co', 'bsky', 'u', 'app', 's', 'www', 'com', 'de']
all_stopwords = list(set(default_stopwords + additional_stopwords))

stop_words_remover = StopWordsRemover(inputCol="content_filtered", outputCol="filtered_words", stopWords=all_stopwords)
df_filtered = stop_words_remover.transform(df_tokenized)



In [12]:
# After tokenizing and removing stopwords
df_vectorized = cv_model.transform(df_filtered)


In [13]:
# --- For LDA ---
lda_output = lda_model.transform(df_vectorized) 



In [14]:
# --- For KMeans ---
# Assembling 'features' into 'final_features' first (because KMeans was trained on final_features)
vector_assembler = VectorAssembler(inputCols=["features"], outputCol="final_features")
df_final = vector_assembler.transform(df_vectorized)

kmeans_output = kmeans_model.transform(df_final) 



In [15]:
# --- Output what you want ---
output = kmeans_output.select("text", "timestamp", "cluster") 



In [17]:
query = output.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

query.awaitTermination()

StreamingQueryException: [STREAM_FAILED] Query [id = 03025bea-e486-4983-9f84-7047d0c2a26b, runId = 5c1ec37e-e769-4f03-a651-1804c363f7c7] terminated with exception: Job aborted due to stage failure: Task 0 in stage 12.0 failed 1 times, most recent failure: Lost task 0.0 in stage 12.0 (TID 12) (recovery-vm.us-east4-a.c.strange-theme-447800-g8.internal executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`KMeansModel$$Lambda/0x00007fc55d15a960`: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => int).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$5(WriteToDataSourceV2Exec.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:491)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:430)
	at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:496)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:393)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.lang.IllegalArgumentException: requirement failed: Both vectors should have same length, found v1 is 4286 while v2 is 4285
	at scala.Predef$.require(Predef.scala:337)
	at org.apache.spark.mllib.util.MLUtils$.fastSquaredDistance(MLUtils.scala:541)
	at org.apache.spark.mllib.clustering.EuclideanDistanceMeasure$.fastSquaredDistance(DistanceMeasure.scala:414)
	at org.apache.spark.mllib.clustering.EuclideanDistanceMeasure.findClosest(DistanceMeasure.scala:309)
	at org.apache.spark.mllib.clustering.DistanceMeasure.findClosest(DistanceMeasure.scala:132)
	at org.apache.spark.mllib.clustering.KMeansModel.predict(KMeansModel.scala:91)
	at org.apache.spark.ml.clustering.KMeansModel.predict(KMeans.scala:177)
	at org.apache.spark.ml.clustering.KMeansModel.$anonfun$transform$1(KMeans.scala:159)
	at org.apache.spark.ml.clustering.KMeansModel.$anonfun$transform$1$adapted(KMeans.scala:159)
	... 21 more

Driver stacktrace: