In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, when, window, length, lit
from pyspark.sql.types import StructType, StructField, StringType
import time, os

# IMPORTANT: Spark runs on the host, so Kafka is reached at localhost:9092
# (not "kafka:9092"). Every endpoint/credential below can be overridden with an
# environment variable; when the variable is unset the original value is used.
KAFKA_BOOTSTRAP = os.environ.get("MASTODON_KAFKA_BOOTSTRAP", "localhost:9092")

JDBC_URL = os.environ.get("MASTODON_JDBC_URL", "jdbc:postgresql://localhost:5433/mastodon")
JDBC_PROPS = {
    "user": os.environ.get("MASTODON_DB_USER", "mastodon"),
    "password": os.environ.get("MASTODON_DB_PASSWORD", "mastodon"),
    "driver": "org.postgresql.Driver",
}

# The Kafka source and the PostgreSQL JDBC driver are requested as packages
# through spark.jars.packages when the session is built.
spark = (SparkSession.builder
    .appName("MastodonStreamProcessing")
    .config("spark.jars.packages",
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,org.postgresql:postgresql:42.7.3")
    .getOrCreate())

# Expected JSON payload: a username plus the toot body, which may arrive under
# either the "text" or the "content" key.
schema = StructType([
    StructField("username", StringType()),
    StructField("text",     StringType()),
    StructField("content",  StringType()),
])

# A new checkpoint directory for every launch (suffix = current epoch seconds)
# to avoid getting stuck on state left by an earlier run.
checkpoint_dir = f"/tmp/chkpt_masto_nb_{int(time.time())}"
checkpoint_dir

'/tmp/chkpt_masto_nb_1759837367'

In [5]:
# 1) Read from Kafka, starting at the newest offsets (i.e. from NOW)
kafka_options = {
    "kafka.bootstrap.servers": KAFKA_BOOTSTRAP,
    "subscribe": "mastodon_stream",
    "startingOffsets": "latest",
}
df_raw = spark.readStream.format("kafka").options(**kafka_options).load()

# 2) Parse the JSON payload {username, text|content}
json_rows = df_raw.selectExpr("CAST(value AS STRING) AS json")
decoded_rows = json_rows.select(from_json(col("json"), schema).alias("j"))

# The toot body is taken from "text" when present, otherwise from "content".
toot_body = when(col("j.text").isNotNull(), col("j.text")).otherwise(col("j.content"))

# Rows missing either the user or the body are dropped.
has_user_and_body = col("username").isNotNull() & col("content").isNotNull()

parsed = (
    decoded_rows
    .select(col("j.username").alias("username"), toot_body.alias("content"))
    .where(has_user_and_body)
)

# 3) foreachBatch sink -> appends to 2 tables:
#    - streamed_toot_counts    : toot count per 60-second window
#    - avg_toot_length_by_user : average toot length per user
def sink(batch_df, batch_id):
    """Append per-batch aggregates of one micro-batch to PostgreSQL.

    Args:
        batch_df: the micro-batch DataFrame handed over by ``foreachBatch``;
            only its ``username`` and ``content`` columns are used.
        batch_id: the micro-batch id; stored in every written row.

    Side effects:
        Appends to ``streamed_toot_counts`` (batch_id, window_start,
        window_end, cnt) and to ``avg_toot_length_by_user`` (batch_id,
        username, avg_length) through JDBC_URL / JDBC_PROPS.

    Notes:
        The incoming rows carry no event timestamp, so the 60-second window is
        derived from the processing time of the batch. Several batches that
        fall in the same minute therefore each add their own row for that
        window; ``batch_id`` tells them apart.
    """
    # Local import: current_timestamp is not part of the notebook's import cell.
    from pyspark.sql.functions import current_timestamp

    # Two separate writes follow; persisting keeps the batch from being
    # recomputed (and re-read from the source) for the second one.
    base = batch_df.select("username", "content").persist()
    try:
        # Count per 60-second window, keyed on processing time. The time has to
        # be a real, non-null timestamp column: window() discards null times.
        toots_per_minute = (base
            .withColumn("processed_at", current_timestamp())
            .groupBy(window(col("processed_at"), "60 seconds").alias("window"))
            .count()
            .select(
                lit(batch_id).alias("batch_id"),
                col("window.start").alias("window_start"),
                col("window.end").alias("window_end"),
                col("count").alias("cnt")
            )
        )
        toots_per_minute.write.mode("append").jdbc(JDBC_URL, "streamed_toot_counts", properties=JDBC_PROPS)

        # Average content length per user within this batch
        avg_length_per_user = (base
            .withColumn("length", length(col("content")))
            .groupBy("username")
            .avg("length")
            .select(
                lit(batch_id).alias("batch_id"),
                col("username"),
                col("avg(length)").alias("avg_length")
            )
        )
        avg_length_per_user.write.mode("append").jdbc(JDBC_URL, "avg_toot_length_by_user", properties=JDBC_PROPS)
    finally:
        # Always release the cached batch, even when a JDBC write fails.
        base.unpersist()

# 4) Start the stream: each micro-batch of `parsed` is handed to `sink`,
#    with progress tracked under `checkpoint_dir`.
stream_writer = (
    parsed.writeStream
    .option("checkpointLocation", checkpoint_dir)
    .foreachBatch(sink)
)
query = stream_writer.start()

# Cell output: liveness flag and current status of the streaming query
(query.isActive, query.status)

25/10/07 13:43:01 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


(True,
 {'message': 'Initializing sources',
  'isDataAvailable': False,
  'isTriggerActive': False})

25/10/07 13:43:01 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


In [None]:
import pandas as pd, sqlalchemy as sa, time
from html import escape
from IPython.display import clear_output, display, HTML

# Live view of the two result tables, refreshed in place once per second.
engine = sa.create_engine("postgresql+psycopg2://mastodon:mastodon@localhost:5433/mastodon")

for _ in range(120):  # one poll per second -> ~2 minutes
    clear_output(wait=True)
    try:
        df_counts = pd.read_sql(
            "SELECT * FROM streamed_toot_counts ORDER BY window_end DESC LIMIT 10", engine
        )
        df_avg = pd.read_sql(
            "SELECT * FROM avg_toot_length_by_user ORDER BY batch_id DESC, username LIMIT 10", engine
        )
    except Exception as e:
        # Deliberate best-effort: a failed read (presumably because the tables
        # do not exist until the stream has written to them) is shown as a
        # waiting message and retried on the next tick. The error text is
        # escaped so that '<', '>' or '&' in a driver message cannot be
        # interpreted as markup by HTML().
        display(HTML(f"<b>En attente de données…</b><br/>{escape(str(e))}"))
        time.sleep(1)
        continue

    display(HTML("<h3>streamed_toot_counts (dernières fenêtres)</h3>"))
    display(df_counts)
    display(HTML("<h3>avg_toot_length_by_user</h3>"))
    display(df_avg)
    time.sleep(1)

Unnamed: 0,batch_id,window_start,window_end,cnt


Unnamed: 0,batch_id,username,avg_length
