In [4]:
# uncomment if needed

!pip install spacy preprocessor twint nltk
!python -m spacy download es_core_news_md


Collecting preprocessor
  Downloading preprocessor-1.1.3.tar.gz (4.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting twint
  Downloading twint-2.1.20.tar.gz (31 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting aiodns (from twint)
  Downloading aiodns-3.2.0-py3-none-any.whl (5.7 kB)
Collecting cchardet (from twint)
  Downloading cchardet-2.1.7.tar.gz (653 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m653.6/653.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting elasticsearch (from twint)
  Downloading elasticsearch-8.14.0-py3-none-any.whl (480 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.2/480.2 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp_socks (from twint)
  Downloading aiohttp_socks-0.8.4-py3-none-any.whl (9.6 kB)
Collecting schedule (from twint)
  Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Collecting f

In [5]:
import spacy
import preprocessor as p
import re
from nltk.corpus import stopwords
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [6]:
# Fetch the NLTK stop-word corpus; stopwords.words() below reads from it
nltk.download('stopwords')

# Start from NLTK's Spanish stop-word list, held as a set
stop_words = {*stopwords.words('spanish')}

# spaCy's medium Spanish pipeline, used further down to tokenize and lemmatize comments
nlp = spacy.load('es_core_news_md')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [56]:
# Extra stop words picked by inspecting earlier LDA output - extend as needed.
custom_stopwords = {
    'ser', 'haber', 'hacer', 'tener', 'poder', 'ir', 'q', 'si', 'solo',
    'saber', 'decir', 'dar', 'querer', 'ver', 'así', 'sos', 'maje', 'dejar',
    'op', 'vos', 'cada', 'mismo', 'usted', 'mas', 'pues', 'andar', 'ahora',
    'claro', 'nunca', 'quedar', 'pasar', 'venir', 'poner', 'dio', 'señora',
    'señor', 'ahí', 'asi', 'vez', 'jajaja',
}
# Merge into the shared stop-word set in place.
stop_words |= custom_stopwords

In [57]:
def preprocess_text(text):
    """Tokenize, lemmatize and filter a single comment.

    Runs ``text`` through the module-level spaCy pipeline ``nlp`` and keeps
    the lower-cased lemma of every token that is not punctuation, not
    whitespace, longer than two characters and not in ``stop_words``.

    Args:
        text: The comment to process. Non-string values (for example the NaN
            pandas produces for an empty CSV cell) and blank strings are
            treated as empty comments.

    Returns:
        list[str]: The surviving lemmas, in document order.
    """
    # Missing or blank comments have nothing to tokenize; skip the pipeline.
    if not isinstance(text, str) or not text.strip():
        return []

    tokens = []
    for token in nlp(text):
        # Whitespace runs are tokens too and may be longer than two characters,
        # so the length filter alone does not remove them.
        if token.is_punct or token.is_space:
            continue
        # The custom stop words are written in lower case, so normalise the
        # lemma before comparing and before it enters the vocabulary.
        lemma = token.lemma_.lower()
        if len(lemma) > 2 and lemma not in stop_words:
            tokens.append(lemma)
    return tokens

In [58]:
# Read the merged comments CSV from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='merged_file.csv')

In [59]:
# Run every comment through preprocess_text and store the resulting token lists
# in a new 'cmt_processed_text' column.
df['cmt_processed_text'] = df['comment_processed_text'].map(preprocess_text)

In [60]:
# Convert the pre-tokenized comments to a document-term matrix.
# Each entry of df['cmt_processed_text'] is already a list of lemmas, so the
# preprocessor and tokenizer are identity functions that pass it straight through.
vectorizer = CountVectorizer(
    tokenizer=lambda tokens: tokens,
    preprocessor=lambda tokens: tokens,
    token_pattern=None,  # unused with a custom tokenizer; None avoids the "will not be used" UserWarning
    lowercase=False,     # already bypassed by the custom preprocessor; stated explicitly
    min_df=5,            # drop terms that appear in fewer than 5 documents
)
dtm = vectorizer.fit_transform(df['cmt_processed_text'])

# Get feature names (vocabulary)
terms = vectorizer.get_feature_names_out()



In [61]:
# Build and fit a 5-topic LDA model on the document-term matrix.
# Change n_components for a different number of topics; random_state fixes the seed.
lda_model = LatentDirichletAllocation(random_state=42, n_components=5).fit(dtm)

# Topic-by-term weights of the fitted model
topics = lda_model.components_

In [62]:
def print_topics(model, terms, n_top_words=10):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, one row of term weights per topic.
        terms: Sequence mapping a column index of ``components_`` to its term.
        n_top_words: How many terms to list per topic, heaviest first.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [terms[position] for position in ranked]
        print(f"Topic {topic_number}:")
        print(top_terms)

# Show the ten heaviest terms of each fitted topic
print_topics(model=lda_model, terms=terms, n_top_words=10)

Topic 1:
['diputado', 'marciano', 'gente', 'loco', 'viejo', 'pobre', 'salir', 'ganar', 'día', 'ladrón']
Topic 2:
['presidente', 'gracias', 'mejor', 'seguir', 'bukele', 'salvador', 'salvadoreño', 'país', 'año', 'nuevo']
Topic 3:
['pueblo', 'dios', 'salvadoreño', 'día', 'presidente', 'siempre', 'creer', 'deber', 'país', 'gracia']
Topic 4:
['pagar', 'deber', 'salir', 'ley', 'padre', 'tipo', 'llorar', 'hijo', 'verdad', 'bien']
Topic 5:
['trabajo', 'excelente', 'bien', 'ministro', 'buen', 'gobierno', 'esperar', 'hoy', 'bueno', 'deber']
