In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF
from pyspark.sql.window import Window as W
from pyspark.ml import Pipeline
import os
import uuid
from pyspark.sql.types import (
    ArrayType,
    DoubleType
)
from pyspark.sql.functions import col, window, collect_list, from_unixtime, concat_ws, explode,udf, array, lit, struct, row_number
from pyspark.sql import Row

In [2]:
spark = (
    SparkSession.builder.appName("Reddit TF-IDF Processing")
    .master("local[*]")
    .getOrCreate()
)

24/06/12 13:26:20 WARN Utils: Your hostname, Exporo-MBP-219.local resolves to a loopback address: 127.0.0.1; using 10.10.5.83 instead (on interface en0)
24/06/12 13:26:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/12 13:26:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/06/12 13:26:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
input_dir="data/raw"

In [4]:
df = spark.read.parquet("data/raw/*.parquet")

                                                                                

In [5]:
df.show(5)

                                                                                

+-------+------------------+-------------+-----+----------+---------+--------------------+--------------------+-------------------+
|     id|            author|  created_utc|score| parent_id|subreddit|           permalink|                text|          timestamp|
+-------+------------------+-------------+-----+----------+---------+--------------------+--------------------+-------------------+
|l70kegt|5chrodingers_pussy|1.717472683E9|    2|t1_l70ii9i| JoeRogan|/r/JoeRogan/comme...|You bring up a co...|2024-06-04 05:44:43|
|l70kegt|5chrodingers_pussy|1.717472683E9|    2|t1_l70ii9i| JoeRogan|/r/JoeRogan/comme...|You bring up a co...|2024-06-04 05:44:43|
|l70kegt|5chrodingers_pussy|1.717472683E9|    2|t1_l70ii9i| JoeRogan|/r/JoeRogan/comme...|You bring up a co...|2024-06-04 05:44:43|
|l6z1s7d|     Sardoodledome|1.717450311E9|    1|t1_l6z1cw9| JoeRogan|/r/JoeRogan/comme...|Well I forgot abo...|2024-06-03 23:31:51|
|l6z1s7d|     Sardoodledome|1.717450311E9|    1|t1_l6z1cw9| JoeRogan|/r/JoeR

In [6]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")

In [7]:
# Apply CountVectorizer to get term frequency
cv = CountVectorizer(
    inputCol="words", outputCol="rawFeatures", vocabSize=1000, minDF=1.0
)

In [8]:
# Apply IDF to get TF-IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [9]:
# Define a pipeline
pipeline = Pipeline(stages=[tokenizer, cv, idf])

In [10]:
# Apply windowing
windowed_df = df.groupBy(window(col("timestamp"), "60 seconds", "5 seconds")).agg(
    collect_list("text").alias("texts")
)

In [11]:
windowed_df.show(5)



CodeCache: size=131072Kb used=33140Kb max_used=33177Kb free=97931Kb
 bounds [0x0000000104a64000, 0x0000000106af4000, 0x000000010ca64000]
 total_blobs=13152 nmethods=12163 adapters=899
 compilation: disabled (not enough contiguous free space left)




+--------------------+--------------------+
|              window|               texts|
+--------------------+--------------------+
|{2024-06-03 21:11...|            [, , , ]|
|{2024-06-03 21:11...|            [, , , ]|
|{2024-06-03 21:12...|            [, , , ]|
|{2024-06-03 21:12...|            [, , , ]|
|{2024-06-03 21:17...|[As someone with ...|
+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [12]:
row = windowed_df.collect()[17]

                                                                                

In [13]:
row

Row(window=Row(start=datetime.datetime(2024, 6, 3, 21, 20, 45), end=datetime.datetime(2024, 6, 3, 21, 21, 45)), texts=['Maybe AI can finally explain exactly what “woke” is supposed to mean, since the people using it constantly have so far been unable to.', 'Maybe AI can finally explain exactly what “woke” is supposed to mean, since the people using it constantly have so far been unable to.', 'Maybe AI can finally explain exactly what “woke” is supposed to mean, since the people using it constantly have so far been unable to.', 'Maybe AI can finally explain exactly what “woke” is supposed to mean, since the people using it constantly have so far been unable to.', 'It is already my dude', 'It is already my dude', 'It is already my dude', 'It is already my dude'])

In [14]:
window_start, window_end = row["window"]["start"], row["window"]["end"]
texts = row["texts"]

In [15]:
texts_df = spark.createDataFrame([(text,) for text in texts], ["text"])

In [16]:
model = pipeline.fit(texts_df)

                                                                                

In [17]:
tfidf_df = model.transform(texts_df)

In [18]:
vocab = model.stages[1].vocabulary

In [19]:
vocab

['it',
 'is',
 'supposed',
 'constantly',
 'explain',
 'the',
 'been',
 'people',
 'since',
 'mean,',
 'unable',
 'far',
 'dude',
 '“woke”',
 'can',
 'to',
 'using',
 'finally',
 'have',
 'to.',
 'my',
 'maybe',
 'exactly',
 'already',
 'what',
 'ai',
 'so']

In [20]:
def extract_top_words(features):
    top_indices = features.indices
    top_values = features.values
    top_words = [(vocab[i], v) for i, v in zip(top_indices, top_values)]
    top_words = sorted(top_words, key=lambda x: x[1], reverse=True)[:10]
    return top_words

In [21]:
top_words = (
    tfidf_df.select("features")
    .rdd.flatMap(lambda row: extract_top_words(row["features"]))
    .collect()
)

                                                                                

In [23]:
top_words

[('supposed', 0.5877866649021191),
 ('constantly', 0.5877866649021191),
 ('explain', 0.5877866649021191),
 ('the', 0.5877866649021191),
 ('been', 0.5877866649021191),
 ('people', 0.5877866649021191),
 ('since', 0.5877866649021191),
 ('mean,', 0.5877866649021191),
 ('unable', 0.5877866649021191),
 ('far', 0.5877866649021191),
 ('supposed', 0.5877866649021191),
 ('constantly', 0.5877866649021191),
 ('explain', 0.5877866649021191),
 ('the', 0.5877866649021191),
 ('been', 0.5877866649021191),
 ('people', 0.5877866649021191),
 ('since', 0.5877866649021191),
 ('mean,', 0.5877866649021191),
 ('unable', 0.5877866649021191),
 ('far', 0.5877866649021191),
 ('supposed', 0.5877866649021191),
 ('constantly', 0.5877866649021191),
 ('explain', 0.5877866649021191),
 ('the', 0.5877866649021191),
 ('been', 0.5877866649021191),
 ('people', 0.5877866649021191),
 ('since', 0.5877866649021191),
 ('mean,', 0.5877866649021191),
 ('unable', 0.5877866649021191),
 ('far', 0.5877866649021191),
 ('supposed', 0.587

In [24]:
def to_array(v):
        if v is None:
            return None
        return v.toArray().tolist()

to_array_udf = udf(to_array, ArrayType(DoubleType()))
tfidf_df = tfidf_df.withColumn("tfidf_values", to_array_udf(col("features")))

In [25]:
tfidf_df.show(10)

                                                                                

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|               words|         rawFeatures|            features|        tfidf_values|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Maybe AI can fina...|[maybe, ai, can, ...|(27,[0,1,2,3,4,5,...|(27,[0,1,2,3,4,5,...|[0.0, 0.0, 0.5877...|
|Maybe AI can fina...|[maybe, ai, can, ...|(27,[0,1,2,3,4,5,...|(27,[0,1,2,3,4,5,...|[0.0, 0.0, 0.5877...|
|Maybe AI can fina...|[maybe, ai, can, ...|(27,[0,1,2,3,4,5,...|(27,[0,1,2,3,4,5,...|[0.0, 0.0, 0.5877...|
|Maybe AI can fina...|[maybe, ai, can, ...|(27,[0,1,2,3,4,5,...|(27,[0,1,2,3,4,5,...|[0.0, 0.0, 0.5877...|
|It is already my ...|[it, is, already,...|(27,[0,1,12,20,23...|(27,[0,1,12,20,23...|[0.0, 0.0, 0.0, 0...|
|It is already my ...|[it, is, already,...|(27,[0,1,12,20,23...|(27,[0,1,12,20,23...|[0.0, 0.0, 0.0, 0...|
|It is already my ...|[it, is, alread

In [26]:
exploded_df = tfidf_df.select(explode(
            array([struct(lit(vocab[i]).alias("word"), col("tfidf_values")[i].alias("tfidf")) for i in range(len(vocab))])
        ).alias("word_tfidf"))

In [27]:
exploded_df.show(10)

24/06/11 21:11:40 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------------+
|          word_tfidf|
+--------------------+
|           {it, 0.0}|
|           {is, 0.0}|
|{supposed, 0.5877...|
|{constantly, 0.58...|
|{explain, 0.58778...|
|{the, 0.587786664...|
|{been, 0.58778666...|
|{people, 0.587786...|
|{since, 0.5877866...|
|{mean,, 0.5877866...|
+--------------------+
only showing top 10 rows



In [28]:
top_words_df = exploded_df.select("word_tfidf.word", "word_tfidf.tfidf")

In [29]:
window_spec = W.orderBy(col("tfidf").desc())
top_words_df = top_words_df.withColumn("rank", row_number().over(window_spec)).filter(col("rank") <= 10)

In [30]:
top_words_df = top_words_df.withColumn("window_start", lit(window_start)).withColumn("window_end", lit(window_end))

In [32]:
top_words_df.show(10)

24/06/11 21:11:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/11 21:11:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/11 21:11:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.

+----------+------------------+----+-------------------+-------------------+
|      word|             tfidf|rank|       window_start|         window_end|
+----------+------------------+----+-------------------+-------------------+
|    unable|0.5877866649021191|   1|2024-06-03 21:20:45|2024-06-03 21:21:45|
|     maybe|0.5877866649021191|   2|2024-06-03 21:20:45|2024-06-03 21:21:45|
|       far|0.5877866649021191|   3|2024-06-03 21:20:45|2024-06-03 21:21:45|
|  supposed|0.5877866649021191|   4|2024-06-03 21:20:45|2024-06-03 21:21:45|
|    “woke”|0.5877866649021191|   5|2024-06-03 21:20:45|2024-06-03 21:21:45|
|constantly|0.5877866649021191|   6|2024-06-03 21:20:45|2024-06-03 21:21:45|
|       can|0.5877866649021191|   7|2024-06-03 21:20:45|2024-06-03 21:21:45|
|       the|0.5877866649021191|   8|2024-06-03 21:20:45|2024-06-03 21:21:45|
|        to|0.5877866649021191|   9|2024-06-03 21:20:45|2024-06-03 21:21:45|
|    people|0.5877866649021191|  10|2024-06-03 21:20:45|2024-06-03 21:21:45|

                                                                                

    def process_window(row):
        window_start, window_end = row["window"]["start"], row["window"]["end"]
        texts = row["texts"]

        # Create a DataFrame for the texts in the window
        texts_df = spark.createDataFrame([(text,) for text in texts], ["text"])

        # Fit the pipeline to the data
        model = pipeline.fit(texts_df)

        # Transform the data
        tfidf_df = model.transform(texts_df)

        # Extract the vocabulary and TF-IDF features
        vocab = model.stages[1].vocabulary

        # Convert sparse vector to dense vector
        def to_array(v):
            if v is None:
                return None
            return v.toArray().tolist()

        to_array_udf = udf(to_array, ArrayType(DoubleType()))
        tfidf_df = tfidf_df.withColumn("tfidf_values", to_array_udf(col("features")))

        # Explode the features column to get individual words and their TF-IDF scores
        exploded_df = tfidf_df.select(
            explode(
                array(
                    [
                        struct(
                            lit(vocab[i]).alias("word"),
                            col("tfidf_values")[i].alias("tfidf"),
                        )
                        for i in range(len(vocab))
                    ]
                )
            ).alias("word_tfidf")
        )

        # Select word and tfidf score
        top_words_df = exploded_df.select("word_tfidf.word", "word_tfidf.tfidf")

        # Get top 10 words based on TF-IDF scores
        window_spec = W.orderBy(col("tfidf").desc())
        top_words_df = top_words_df.withColumn(
            "rank", row_number().over(window_spec)
        ).filter(col("rank") <= 10)

        # Add window information
        top_words_df = top_words_df.withColumn(
            "window_start", lit(window_start)
        ).withColumn("window_end", lit(window_end))

        return top_words_df

In [None]:
def process_window(row):
    window_start, window_end = row["window"]["start"], row["window"]["end"]
    texts = row["texts"]

    # Create a DataFrame for the texts in the window
    texts_df = spark.createDataFrame([(text,) for text in texts], ["text"])

    # Fit the pipeline to the data
    model = pipeline.fit(texts_df)

    # Transform the data
    tfidf_df = model.transform(texts_df)

    # Extract the vocabulary and TF-IDF features
    vocab = model.stages[1].vocabulary

    # Convert sparse vector to dense vector
    def to_array(v):
        if v is None:
            return None
        return v.toArray().tolist()

    to_array_udf = udf(to_array, ArrayType(DoubleType()))
    tfidf_df = tfidf_df.withColumn("tfidf_values", to_array_udf(col("features")))

    # Explode the features column to get individual words and their TF-IDF scores
    exploded_df = tfidf_df.select(
        explode(
            array(
                [
                    struct(
                        lit(vocab[i]).alias("word"),
                        col("tfidf_values")[i].alias("tfidf"),
                    )
                    for i in range(len(vocab))
                ]
            )
        ).alias("word_tfidf")
    )

    # Select word and tfidf score
    top_words_df = exploded_df.select("word_tfidf.word", "word_tfidf.tfidf")

    # Get top 10 words based on TF-IDF scores
    window_spec = W.orderBy(col("tfidf").desc())
    top_words_df = top_words_df.withColumn(
        "rank", row_number().over(window_spec)
    ).filter(col("rank") <= 10)

    # Add window information
    top_words_df = top_words_df.withColumn(
        "window_start", lit(window_start)
    ).withColumn("window_end", lit(window_end))

    # Write the results to Parquet files
    # partition_columns = ["window_start", "window_end"]
    top_words_df.write.mode("append").partitionBy(*partition_columns).parquet(output_dir)

for row in windowed_df.rdd.collect():
        process_window(row)

test

In [3]:
df_result = spark.read.parquet("data/tfidf/*.parquet")

                                                                                

In [4]:
df_result.show(20)

+--------+-----+----+-------------------+-------------------+
|    word|tfidf|rank|       window_start|         window_end|
+--------+-----+----+-------------------+-------------------+
|      ai|  0.0|   1|2024-06-03 21:16:55|2024-06-03 21:17:55|
|    take|  0.0|   2|2024-06-03 21:16:55|2024-06-03 21:17:55|
|  jobs….|  0.0|   3|2024-06-03 21:16:55|2024-06-03 21:17:55|
|creative|  0.0|   4|2024-06-03 21:16:55|2024-06-03 21:17:55|
|      ha|  0.0|   5|2024-06-03 21:16:55|2024-06-03 21:17:55|
|  menial|  0.0|   6|2024-06-03 21:16:55|2024-06-03 21:17:55|
|     ha.|  0.0|   7|2024-06-03 21:16:55|2024-06-03 21:17:55|
|    away|  0.0|   8|2024-06-03 21:16:55|2024-06-03 21:17:55|
|  mocked|  0.0|   9|2024-06-03 21:16:55|2024-06-03 21:17:55|
|   years|  0.0|  10|2024-06-03 21:16:55|2024-06-03 21:17:55|
| someone|  0.0|   1|2024-06-03 21:17:10|2024-06-03 21:18:10|
|      ha|  0.0|   2|2024-06-03 21:17:10|2024-06-03 21:18:10|
|      ai|  0.0|   3|2024-06-03 21:17:10|2024-06-03 21:18:10|
|  mocke