## Dependencies

In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt update
!apt install openjdk-8-jdk-headless -qq
#!apt install default-jre
#!apt install default-jdk
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=c5780489fa8ab57bf79f8e49653460aad63b02b513cd0751dbb6a6b72d3d4f3a
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Ign:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:4 http://archive.ubuntu.com/ubuntu 

In [4]:
import pyspark
import string
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer
from pyspark.ml.clustering import LDA

# Data processing
import pandas as pd
import numpy as np

## Text processing

In [None]:
import re

# Characters that `clean` replaces with a single space: ASCII punctuation,
# typographic quotes and dashes, invisible Unicode spaces/modifiers and emoji.
punct = {'💁', '[', '’', '~', '💪', '📚', '🏡', '-', '🐣', '🇺', '”', '̶', '\u200a', ';', '🍕', ' ', '!', '%', ',', '👇', '®', '🌈', '?', '🏽', '=', '💨', '✅', '✔', ')', '|', '‘', '\xa0', '🗽', '&', '🏼', '¿', '…', '🎓', '👉', '❌', '🎧', '👈', '🚂', '+', '🤖', '👎', '→', '¡', '🤔', '️', '👸', '@', '🇸', ':', '“', '•', '🏿', '🏻', '👀', '👏', '—', ']', '✓', '"', '\u200b', '🎤', '\n', '.', '(', '$', '❤', '⬇', '#', '👍', "'", '/', '*', '🏾', '–', '👿'}

punct.remove(' ')  # keep spaces
punct.remove('\'') # keep single quotes (in order to retain I'm, isn't, etc.)

def clean(text):
    """Normalise a piece of free text before tokenisation.

    Steps, in order: lower-case, drop http(s) URLs, collapse every run of
    digits to a single ``0``, turn slanted apostrophes into straight ones,
    replace every character in ``punct`` with a space, collapse whitespace
    runs to one space and strip the ends.

    Parameters
    ----------
    text : str
        The raw text. Any non-string value (for example the ``NaN`` that
        pandas produces for an empty CSV field, or ``None``) is treated as
        missing text.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing / non-string input.
    """
    # Missing values reach this function through `Series.apply`; without this
    # guard `.lower()` raises AttributeError on a float NaN.
    if not isinstance(text, str):
        return ''
    temp_text = text.lower()
    temp_text = re.sub(r'https?://\S+', '', temp_text)
    temp_text = re.sub(r'\d+', '0', temp_text)
    temp_text = temp_text.replace('’', '\'')  # some single quotes are slanted, and we want to retain them
    for p in punct:
        temp_text = temp_text.replace(p, ' ')
    temp_text = re.sub(r'\s+', ' ', temp_text)
    cleaned_text = temp_text.strip()
    return cleaned_text

In [None]:
def tokenize(text_vector, col_input , col_output):
  """Run a `Tokenizer` over one column of `text_vector`.

  Builds a `Tokenizer` that reads `col_input` and writes `col_output`, and
  returns the result of its `transform` call on `text_vector`.
  """
  return Tokenizer(inputCol=col_input, outputCol=col_output).transform(text_vector)

## Train LDA model

In [None]:
# Load the articles with pandas and normalise the free-text column.
data = pd.read_csv('Nombre_documento')
data["cleaned"] = data["description"].apply(clean)

# Start (or reuse) the Spark session and hand the pandas frame over to Spark.
spark = SparkSession.builder.appName("LDA").getOrCreate()
spark_df = spark.createDataFrame(data)

# Split the cleaned text into tokens ("key_words" column).
words_data = tokenize(spark_df, "cleaned", "key_words")

# Turn the token lists into word-count vectors ("vectors" column).
vectorizer = CountVectorizer(inputCol="key_words", outputCol="vectors")
cv_model = vectorizer.fit(words_data)
count_vectors = cv_model.transform(words_data)

# Train the LDA model.
# `featuresCol` must name the column produced by CountVectorizer above
# ("vectors"); the previous value "features" does not exist in
# `count_vectors`, so `fit` could not find its input column.
# A fixed seed makes the fitted topics reproducible across re-runs.
lda = LDA(k=3, maxIter=10, featuresCol="vectors", seed=42)
lda_model = lda.fit(count_vectors)

# Per-topic term indices and weights reported by the fitted model
topics = lda_model.describeTopics()
print("Topics with their top-weighted words:")
topics.show(truncate=False)

k = 5  # how many leading terms to keep for each topic
vocab = cv_model.vocabulary  # index -> word lookup learned by CountVectorizer

# topic id -> its first `k` terms, translated from vocabulary indices to words
topic_words_dict = {
    row['topic']: [vocab[index] for index in row['termIndices'][:k]]
    for row in topics.collect()
}

# One line per topic with its comma-separated words
for topic, words in topic_words_dict.items():
    print(f"Topic {topic}: {', '.join(words)}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Infer the topic distribution for each document: run the fitted LDA model over
# the count vectors it was trained on. Later code in this cell reads the result
# from the "topicDistribution" column of `document_topics`.
document_topics = lda_model.transform(count_vectors)

# Define a function to get the most likely topic
def get_most_likely_topic(topic_distribution):
    """Return the position of the largest entry of `topic_distribution`.

    `topic_distribution` must expose an `argmax()` method; its result is
    converted to a plain Python `int` before being returned.
    """
    best_index = topic_distribution.argmax()
    return int(best_index)

# Wrap the helper as a Spark UDF that yields an integer column
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

most_likely_topic_udf = udf(get_most_likely_topic, IntegerType())
document_topics_with_id = document_topics.withColumn(
    "most_likely_topic",
    most_likely_topic_udf(document_topics["topicDistribution"]),
)

# The two columns of interest: document id and its assigned topic
topic_columns = ["id", "most_likely_topic"]

# Preview the assignment in Spark
document_topics_with_id.select(*topic_columns).show(truncate=False)

# Bring the same two columns back to pandas for further processing or saving
pandas_df_with_topics = document_topics_with_id.select(*topic_columns).toPandas()

# Attach the topic column to the original pandas DataFrame, matching rows on 'id'
final_data = data.merge(pandas_df_with_topics, on='id')

## Assign topic to new articles

In [None]:
# Open the new dataframe with pandas.
# `new_data` was previously used without ever being created, so this cell
# failed with NameError on a fresh kernel. 'Nombre_documento_nuevo' is a
# placeholder (like the training file name above the model) -- replace it with
# the path of the CSV that holds the new articles.
new_data = pd.read_csv('Nombre_documento_nuevo')
new_data["cleaned"] = new_data["description"].apply(clean)

# Convert the new data to a Spark DataFrame
new_spark_df = spark.createDataFrame(new_data)

# Tokenize the new text
new_words_data = tokenize(new_spark_df, "cleaned", "key_words")

# Transform the text into the same vector space using the trained CountVectorizer model
new_count_vectors = cv_model.transform(new_words_data)

# Use the trained LDA model to infer topics for the new article
new_document_topic_distribution = lda_model.transform(new_count_vectors)

# Show the topic distribution for the new article
print("New Document-Topic Distribution:")
new_document_topic_distribution.select("id", "topicDistribution").show(truncate=False)

# Report the most likely topic of the first new article.
# `first()` fetches a single row instead of collecting every row to the driver,
# and returns None (rather than raising IndexError) when there are no rows.
first_row = new_document_topic_distribution.select("topicDistribution").first()
if first_row is None:
    print("No new articles to classify.")
else:
    topic_distribution = first_row["topicDistribution"]
    # Cast to a plain int so the key has the same type as in `topic_words_dict`
    # (and as the value returned by `get_most_likely_topic`).
    most_likely_topic = int(topic_distribution.argmax())

    print(f"The new article is most likely about Topic {most_likely_topic}, which has the top words: {', '.join(topic_words_dict[most_likely_topic])}")

Topic number: 0
mensen 0.008319326916815514
anderhalve 0.0062602265915572336
elkaar 0.005896582071747818
weer 0.0041825075002934675
houd 0.004121815009317227
gaan 0.0028879658252392282
blijf 0.0027956829993769435
moeten 0.0027135327847427207
thuis 0.0026636756851713393
alleen 0.002623671716643883
iedereen 0.0026234541293194058
buiten 0.0025252146255743475
goed 0.0024155903307100713
waar 0.002264940968694007
corona 0.0021445912007319197
Topic number: 1
vanaf 0.00028354046761189814
elkaar 0.000254256304623373
miljoen 0.0002514556022981041
maatschappij 0.0002496479282338025
apart 0.0002460248185941758
goed 0.00023863462194057008
euro 0.00021877171869036244
deelt 0.00021402801117738433
vijf 0.00021016247253925847
onderdrukt 0.0002076505284230324
mits 0.00020756021660438232
vrouw 0.00020705828805122345
zien 0.00020604245333889163
compensatie 0.00020293267794979677
vervoerders 0.000201908703458855
Topic number: 2
anderhalve 0.0008817454351102398
hond 0.0006011425200842562
maurice 0.000459616