# Entrenamiento modelo LDA 

## Librerías y carga de datos

In [28]:
import numpy as np
import pandas as pd
import csv
import glob
import json
import matplotlib.pyplot as plt
#Gensim

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel

#spacy Lemmatization processes

import nltk
import spacy
from nltk.corpus import stopwords

#Visualization


import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.gensim

In [29]:
def load_data(file):
    csv.field_size_limit(1000000)
    with open (file, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        data = [row for row in reader]
    return data

def write_data(file, data):
    with open (file, "w", encoding="utf-8") as f:
        json.dump(data,f,indent=4 )

In [30]:
data = load_data("clean_bulletins.csv")


In [31]:
words = []
for row in data:
    words.append(row["Clean_Report"])

print(words[0][0:100])

claudia sheinbaum pardo protesta jefa claudia sheinbaum pardo protesta jefa claudia sheinbaum mujer 


Convertimos el texto en una lista de tokens, preprocesando el texto e ignorando los tokens que sean muy cortos o muy largos.

In [32]:
# Convert a document into a list of tokens
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(words)

print (data_words[0][0:20])

['claudia', 'sheinbaum', 'pardo', 'protesta', 'jefa', 'claudia', 'sheinbaum', 'pardo', 'protesta', 'jefa', 'claudia', 'sheinbaum', 'mujer', 'mujer', 'mujer', 'honesta', 'materia', 'lunes', 'primera', 'jefa']


## Aplicando Bigramas y trigramas

Juntamos las palabras en bigramas y trigramas posibles para obtener un mejor resultado más específico en los términos más sobresalientes.

In [33]:
#Bigramas y trigramas
bigrams_phrases = gensim.models.Phrases(data, min_count = 5, threshold = 50)
trigram_phrases = gensim.models.Phrases(bigrams_phrases[data_words], threshold=50)

bigram = gensim.models.phrases.Phraser(bigrams_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    return(bigram[doc] for doc in texts)

def make_trigrams(texts):
    return(trigram[bigram[doc]] for doc in texts)

data_bigrams =  list(make_bigrams(data_words))
data_bigrams_trigrams = list(make_trigrams(data_bigrams))


In [34]:
print(data_bigrams_trigrams[0])
# Solo imprimir palabras que contengan "_"


['claudia_sheinbaum', 'pardo', 'protesta', 'jefa_claudia', 'sheinbaum_pardo', 'protesta', 'jefa_claudia', 'sheinbaum', 'mujer', 'mujer', 'mujer', 'honesta', 'materia', 'lunes', 'primera', 'jefa', 'reuniones', 'gabinete', 'lunes', 'atender', 'dias', 'republica', 'terna', 'ministro', 'scjn', 'republica', 'terna', 'ministro', 'justicia', 'scjn', 'articulos', 'politica', 'terna', 'enviar', 'hoy', 'honesta', 'primer', 'van', 'decidir', 'subrayo', 'egresada', 'libre', 'derecho', 'universidad', 'garcia', 'tribunal', 'superior', 'justicia', 'queretaro', 'egresada', 'universidad_autonoma', 'queretaro', 'derecho', 'inicio', 'poder', 'queretaro', 'gonzalez', 'derecho', 'universidad_autonoma', 'tribunal', 'superior', 'justicia', 'tuxpan', 'anuncia', 'cuatro', 'reconstruccion', 'reconstruccion', 'vivienda', 'credito', 'derecho', 'familias', 'recibiran', 'economico', 'continuar', 'limpieza', 'viviendas', 'desazolve', 'damnificados', 'recibiran', 'equivalentes', 'inicia', 'reconstruccion', 'urbana', 

## TF-IDF

Por medio de la técnica TF-IDF (Term Frecuency-Inverse Document Frecuency) sobre los bigramas y trigramas obtenidos, logramos eliminar palabras de baja frecuencia y/o poca relevancia, definiendo un valor mínimo para filtrar palabras que tengan menor puntuación a dicho valor.

In [35]:
print(corpus)

[[(0, 2), (1, 2), (2, 4), (3, 5), (5, 6), (9, 2), (10, 3), (12, 2), (13, 2), (15, 5), (16, 2), (17, 3), (18, 3), (19, 2), (27, 2), (29, 3), (33, 2), (34, 5), (35, 6), (36, 2), (37, 14), (38, 3), (39, 2), (41, 2), (42, 3), (44, 10), (48, 2), (49, 2), (57, 2), (59, 4), (63, 6), (64, 2), (69, 2), (70, 8), (71, 3), (73, 2), (74, 4), (76, 2), (77, 2), (84, 2), (86, 2), (87, 2), (89, 3), (91, 2), (93, 2), (94, 2), (95, 2), (97, 2), (100, 2), (102, 2), (104, 2), (106, 4), (112, 3), (113, 3), (114, 2), (115, 2), (116, 6), (117, 2), (118, 2), (120, 2), (121, 2), (122, 2), (124, 2), (128, 9), (129, 2), (130, 2), (131, 5), (132, 4), (135, 2), (136, 3), (138, 7), (139, 6), (140, 2), (142, 2), (143, 2), (144, 2), (145, 2), (146, 2), (148, 2), (149, 3), (150, 2), (153, 3), (154, 2), (155, 2), (156, 2), (157, 2), (160, 3), (161, 5), (162, 6), (164, 2), (168, 2), (169, 6), (170, 7), (171, 6), (172, 4), (173, 2), (175, 2), (176, 2), (177, 2), (178, 7), (179, 4), (180, 2), (181, 2), (182, 2), (183, 2), 

## Entrenamiento y visualización del modelo LDA

In [36]:
lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus, 
                                            id2word = id2word, 
                                            num_topics= 17, 
                                            random_state = 100, 
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha="auto")

In [37]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds = "mmds", R = 30)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


Guardamos el modelo para posteriormente utilizarlo.

In [38]:
lda_model.save('my_lda_model')

## Uso del modelo

Cargamos el modelo.

In [39]:
# Load the trained LDA model
lda_model = gensim.models.ldamodel.LdaModel.load('my_lda_model')

# Load the new data
new_data = pd.read_csv('clean_bulletins.csv')

# Load the SpaCy model for tokenization
nlp = spacy.load('es_core_news_sm')

In [40]:
from gensim.models import CoherenceModel
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_bigrams_trigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.39825955940289287


In [41]:
# Show graph
limit=30; start=10; step=4;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Passes")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

NameError: name 'coherence_values' is not defined

In [None]:
topics = lda_model.show_topics(num_topics=18, num_words=10)
for topic in topics:
    print(f"Topic {topic[0]}: {topic[1]}")

Topic 0: 0.032*"paises" + 0.025*"america" + 0.020*"primer_ministro" + 0.020*"relaciones_exteriores" + 0.019*"america_norte" + 0.017*"biden" + 0.017*"canada" + 0.017*"cooperacion" + 0.015*"naciones" + 0.011*"pueblos"
Topic 1: 0.007*"mujeres" + 0.006*"california" + 0.005*"violencia" + 0.005*"casos" + 0.005*"registro" + 0.005*"reduccion" + 0.004*"abril" + 0.004*"comparacion" + 0.004*"empresas" + 0.004*"economia"
Topic 2: 0.008*"marzo" + 0.008*"febrero" + 0.007*"democracia" + 0.007*"militar" + 0.005*"imss" + 0.005*"enero" + 0.005*"inflacion" + 0.005*"aeropuerto_internacional" + 0.004*"electoral" + 0.004*"mujeres"
Topic 3: 0.030*"justicia" + 0.022*"septiembre" + 0.016*"republica" + 0.011*"historia" + 0.011*"yaquis" + 0.010*"familias" + 0.010*"pueblos" + 0.010*"pueblos_yaquis" + 0.010*"acto" + 0.009*"consumacion"
Topic 4: 0.008*"hospitales" + 0.006*"justicia" + 0.006*"inflacion" + 0.005*"julio" + 0.005*"beneficio" + 0.005*"nunca" + 0.005*"acoto" + 0.005*"familiares" + 0.004*"permite" + 0.004

A partir de las palabras asociadas a un tópico, nombramos cada uno de los tópicos

In [None]:
# Crear una bolsa de representación de palabras del nuevo corpus preprocesado
new_corpus = []
for doc in new_data['Clean_Report']:
    new_corpus.append(doc.split())

new_bow_corpus = []
for doc in new_corpus:
    new_bow_corpus.append(lda_model.id2word.doc2bow(doc))

# Inferir temas para el nuevo corpus usando el modelo LDA entrenado
new_topics = []
for doc in new_bow_corpus:
    topic_dist = lda_model.get_document_topics(doc, minimum_probability=0.0)
    new_topics.append(topic_dist)

topics_dict = {
    1: "Extranjero",
    2: "Violencia contra la mujer",
    3: "Aeropuerto",
    4: "Pueblo Yaqui",
    5: "Salud y Medicina",
    6: "Hospitales",
    7: "Proyectos de infraestructura",
    8: "Vacunación",
    9: "Justicia penal",
    10: "Contexto histórico",
    11: "Problemas sociales",
    12: "Pandemia Covid-19",
    13: "Tren Maya",
    14: "Petróleo y minería",
    15: "Comisión de lectricidad",
    16: "Redes Sociales",
    17: "Inflación",
}

In [None]:
# Obtener probabilidades de los tópicos 
topic_probs = []
for topics in new_topics:
    probs = [topic[1] for topic in topics]
    topic_probs.append(probs)

# Crear un dataframe de probabilidades de los tópicos
df_prob = pd.DataFrame(topic_probs, columns=topics_dict.values())
df_prob.insert(0, "Week", new_data["Week"])

In [None]:
df_prob

Unnamed: 0,Week,Extranjero,Violencia contra la mujer,Aeropuerto,Pueblo Yaqui,Salud y Medicina,Hospitales,Proyectos de infraestructura,Vacunación,Justicia penal,Contexto histórico,Problemas sociales,Pandemia Covid-19,Tren Maya,Petróleo y minería,Comisión de lectricidad,Redes Sociales,Inflación
0,2018-49,0.000036,0.097470,0.046472,0.012362,0.000051,0.387669,0.033771,0.056075,0.000023,0.013608,0.022956,0.000019,0.000046,0.014473,0.233255,0.000020,0.081695
1,2018-50,0.000188,0.229851,0.000430,0.072680,0.000268,0.000416,0.000135,0.034114,0.000121,0.000141,0.286772,0.000098,0.284244,0.060037,0.000264,0.000106,0.030134
2,2018-51,0.000445,0.095713,0.001016,0.000260,0.000633,0.000985,0.895733,0.000755,0.000286,0.000334,0.000501,0.000232,0.000573,0.000587,0.000625,0.000250,0.001073
3,2018-52,0.000093,0.098377,0.047874,0.000054,0.000132,0.025606,0.033439,0.640857,0.006772,0.000070,0.048490,0.000048,0.025875,0.000123,0.035155,0.000052,0.036984
4,2019-1,0.000545,0.517971,0.180136,0.000318,0.000774,0.168322,0.000389,0.000924,0.000350,0.000408,0.098272,0.000284,0.028205,0.000718,0.000764,0.000305,0.001313
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,2023-11,0.010494,0.200740,0.209316,0.000011,0.032081,0.055367,0.040568,0.041870,0.002520,0.033017,0.029091,0.008535,0.035515,0.158030,0.024703,0.003577,0.114564
190,2023-12,0.034213,0.205655,0.138848,0.000019,0.011062,0.013374,0.052167,0.079517,0.005623,0.015453,0.037236,0.004415,0.218090,0.006903,0.019553,0.011952,0.145919
191,2023-13,0.000041,0.183429,0.231540,0.000024,0.032188,0.103204,0.036868,0.018911,0.009791,0.024000,0.006240,0.000021,0.106098,0.024678,0.000058,0.029553,0.193356
192,2023-14,0.025104,0.173012,0.135020,0.000030,0.010529,0.074031,0.024466,0.024294,0.003539,0.074075,0.000058,0.000027,0.057329,0.024445,0.044454,0.000029,0.329560


Guardamos el dataframe para su utilización

In [None]:
df_prob.to_csv('prob_topicos.csv', index=False)