In [3]:
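# Uncomment and run once if the dependencies are not installed yet.
# NOTE(review): `import preprocessor` is presumably provided by the PyPI
# package `tweet-preprocessor` rather than `preprocessor` — confirm before installing.

# %pip install spacy preprocessor twint nltk pandas scikit-learn
# !python -m spacy download es_core_news_md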


In [4]:
import spacy
import preprocessor as p
import re
from nltk.corpus import stopwords
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [5]:
# spaCy Spanish pipeline; used later for tokenization and lemmatization.
nlp = spacy.load('es_core_news_md')

# Fetch the NLTK stop-word corpus (a no-op when it is already up to date).
nltk.download('stopwords')

# Spanish stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words('spanish'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Extra stop words chosen by inspecting the LDA output; extend this set
# whenever new uninformative terms show up in the topics.
custom_stopwords = {
    'ser', 'haber', 'hacer', 'tener', 'poder', 'ir', 'q', 'si', 'solo',
    'saber', 'decir', 'dar', 'querer', 'ver', 'así', 'sos', 'maje',
    'dejar', 'op',
}
# In-place union: merges the custom entries into the shared stop-word set.
stop_words |= custom_stopwords

In [7]:
def preprocess_text(text):
    """Tokenize and lemmatize one comment with spaCy, dropping noise tokens.

    Args:
        text: The comment to process. Anything that is not a ``str`` (for
            example the float NaN pandas uses for empty CSV cells, or
            ``None``) is treated as an empty comment.

    Returns:
        list[str]: Lemmas of the tokens that are not punctuation, not
        whitespace, longer than two characters, and not in ``stop_words``.
    """
    # Missing values would otherwise reach nlp(), which only accepts text.
    if not isinstance(text, str):
        return []

    tokens = []
    for token in nlp(text):
        # Runs of spaces/newlines become their own tokens; they carry no meaning.
        if token.is_punct or token.is_space:
            continue
        lemma = token.lemma_
        if len(lemma) <= 2:
            continue
        # The custom stop words in this notebook are lowercase, so compare
        # case-insensitively to also catch capitalized lemmas.
        if lemma.lower() in stop_words:
            continue
        tokens.append(lemma)
    return tokens

In [8]:
# Load the merged CSV into a DataFrame (path is relative to the working directory).
df = pd.read_csv("merged_file.csv")

In [9]:
# Run every comment through preprocess_text and keep the resulting
# token lists in a new column next to the source text.
df['cmt_processed_text'] = (
    df['comment_processed_text']
    .apply(preprocess_text)
)

In [10]:
def _identity(tokens):
    """Return the already-tokenized document unchanged."""
    return tokens


# Build the document-term matrix from the pre-tokenized comments. Each document
# is already a list of lemmas, so the tokenizer and preprocessor hooks are
# pass-throughs (a named function instead of lambdas keeps the vectorizer
# picklable). token_pattern=None states that the default regex is intentionally
# unused, which avoids scikit-learn's "token_pattern will not be used" warning.
# min_df=5 drops terms that appear in fewer than five documents.
vectorizer = CountVectorizer(
    tokenizer=_identity,
    preprocessor=_identity,
    token_pattern=None,
    min_df=5,
)
dtm = vectorizer.fit_transform(df['cmt_processed_text'])

# Vocabulary terms, aligned with the columns of dtm.
terms = vectorizer.get_feature_names_out()



In [11]:
# Build and train the LDA model in one step (fit returns the estimator itself).
# Adjust n_components to change the number of topics; random_state is pinned
# so repeated runs give the same topics.
lda_model = LatentDirichletAllocation(n_components=5, random_state=42).fit(dtm)

# Topic-term weights: one row per topic, one column per vocabulary term.
topics = lda_model.components_

In [12]:
def print_topics(model, terms, n_top_words=10):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        terms: Sequence mapping a column index to its vocabulary term.
        n_top_words: How many of the top-weighted terms to show per topic.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        print(f"Topic {topic_number}:")
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [terms[term_idx] for term_idx in ranked]
        print(top_terms)

# Show the ten highest-weighted terms for each fitted topic.
print_topics(model=lda_model, terms=terms, n_top_words=10)

Topic 1:
['día', 'presidente', 'pueblo', 'bukele', 'gracias', 'bueno', 'cada', 'año', 'dios', 'hoy']
Topic 2:
['gobierno', 'deber', 'diputado', 'entender', 'creer', 'marciano', 'partido', 'ahora', 'quedar', 'arena']
Topic 3:
['pueblo', 'mas', 'salvador', 'poner', 'verdad', 'nunca', 'creer', 'deber', 'bien', 'pasar']
Topic 4:
['mismo', 'vos', 'gente', 'pobre', 'pandillero', 'país', 'loco', 'llorar', 'hablar', 'claudia']
Topic 5:
['trabajo', 'excelente', 'ministro', 'gracias', 'presidente', 'bien', 'buen', 'señor', 'seguir', 'mejor']
