In [26]:
import pandas as pd
import numpy as np
import os

Cargar todos los datos y juntarlos

In [27]:
def getAllPaths(data_directory):
    #Input: directorio de los archivos.
    #Return: Arreglo con las rutas de los archivos.
    file_name = [os.path.join(data_directory,f) 
                 for f in os.listdir(data_directory)
                 if f.endswith(".csv")]
    return file_name

In [28]:
all_files = getAllPaths('../data_sophia2/')
df = (pd.read_csv(f) for f in all_files)
df = pd.concat(df, ignore_index=True)

Tomamos una muestra del 75% de los datos, alrededor de 50000 noticias

In [29]:
df = df.sample(frac=0.75)

Eliminamos algunos datos basura

In [30]:
df = df['text'].str.lower().str.strip();
df = df.replace('', np.nan)
df = df.dropna();

In [31]:
df

65128    queda cerca de un mes para que comience el tor...
23866    el gobierno anunció cambios en comunas de todo...
14101    con un promedio de un 25% de desempleo, latino...
36205    ¿los aficionados al queso pueden seguir disfru...
34455    el ministerio de salud aseguró este jueves que...
                               ...                        
48727    durante la tarde de este martes, efectivos de ...
68373    el primer tribunal de juicio oral en lo penal ...
28039    este viernes las autoridades de salud entregar...
63657    el ministro de salud,  enrique paris,  reproch...
4071     el diputado y presidente de la federación regi...
Name: text, Length: 50470, dtype: object

Cargar SpaCy

In [32]:
import string
import spacy

spacy.prefer_gpu()
nlp = spacy.load("es_core_news_lg", disable=["tagger", "parser", "ner"])

Funcion para genererar tokens de los textos

In [33]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
stop_words = spacy.lang.es.stop_words.STOP_WORDS
nlp.Defaults.stop_words.update(['.', ',', ':', ';', 'y', 'a', '?', '¿', '...', 'instagrama', 'post', 'on', 'by', 'shared', 'tercero'])

# Creating our tokenizer function
def spacy_tokenizer(text):
    allowed_postags=['NOUN', 'ADJ', 'ADV', 'VERB']

    # Creating our token object, which is used to create documents with linguistic annotations.
    #tokens = nlp(text)
    tokens = text

    # Lemmatizing each token and converting each token into lowercase
    tokens = [ word.lemma_ for word in tokens if word.pos_ in allowed_postags ]

    # Removing stop words
    tokens = [ word for word in tokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return tokens

Procesar textos con SpaCy

In [34]:
tokens = [spacy_tokenizer(text) for text in nlp.pipe(df)]

Las siguientes celdas comentadas se utilizaron para saber cuales palabras eran demasiado frecuentes en los textos e influenciaban de manera exagerada los textos

In [35]:
#from collections import Counter
#from functools import reduce
#import operator

#combined_list = reduce(operator.add, tokens)
#count = Counter(combined_list)

In [36]:
#count.most_common()

In [37]:
docs = tokens

Calcular bigrams utilizando gensim

In [38]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

Preparar datos para utilizaros en LDA y filtrar aun mas algunos tokens

In [39]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [40]:
#assume the word 'b' is to be deleted, put its id in a variable
del_ids = [k for k,v in dictionary.items() if v in [u'año','comentario', 'acceder', 'sesión', 'persona', 'país', 'by', 'shared_by', 'exclusivo_suscriptor', 'comentario_sección', 'comentario_sección_exclusivo_suscriptor', 'favor_iniciar_sesión_acceder', 'sesión_acceder', 'favor_iniciar']]
dictionary.filter_tokens(bad_ids=del_ids)

In [41]:
del_ids

[5, 41, 164, 512, 529, 538, 570, 571, 693, 803]

In [42]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [43]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 20260
Number of documents: 50470


Entrenamiento de modelo LDA

In [45]:
# Train LDA model.
from gensim.models import LdaMulticore

num_topics = 17
chunksize = 1000
passes = 100
iterations = 200
eval_every = 10

temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaMulticore(corpus=corpus,
                           id2word=id2word,
                           workers=8,
                           num_topics=num_topics,
                           chunksize=chunksize,
                           alpha='symmetric',
                           eta='auto',
                           passes=passes,
                           iterations=iterations,
                           per_word_topics=True,
                           eval_every=eval_every)


Diagnostico de topicos y coherencia

In [46]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -1.7531.
[([(0.017032454, 'ministro'),
   (0.011705034, 'presidente'),
   (0.011571151, 'gobierno'),
   (0.010035354, 'diputado'),
   (0.0068519185, 'creer'),
   (0.006299358, 'querer'),
   (0.0061466126, 'tema'),
   (0.0058934563, 'parlamentario'),
   (0.005560741, 'asegurar'),
   (0.0055432934, 'sostener'),
   (0.005468731, 'constitucional'),
   (0.005338372, 'acusación'),
   (0.0052553206, 'decisión'),
   (0.004880685, 'agregar'),
   (0.0048755095, 'señalar'),
   (0.004491899, 'oposición'),
   (0.004259834, 'político'),
   (0.004137199, 'pedir'),
   (0.0040749316, 'cargo'),
   (0.0040589734, 'situación')],
  -1.3967230477439898),
 ([(0.029101565, 'región'),
   (0.022835629, 'caso'),
   (0.016512288, 'comuna'),
   (0.014987486, 'salud'),
   (0.011063113, 'sanitario'),
   (0.009081565, 'cuarentena'),
   (0.008592278, 'metropolitano'),
   (0.008290429, 'semana'),
   (0.008210213, 'activo'),
   (0.0074296286, 'permiso'),
   (0.0073567014, 'paso'),
   (0.00734105

In [47]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(


1 = Politica y Conflictos
2 = Politica y Conflictos
3 = Red Social?
4 = Crimen, Delitos y Justicia
5 = Mundo (Internacional)
6 = Salud (Educacion en pandemia)
7 = Economia
8 = Salud (Plan paso a paso, cuarentenas)
9 = Salud (Cifras Covid19)
10 = Crimen, Delitos y Justicia
11 = Ecologia y Planeta
12 = Deporte
13 = Crimes, Delitos y Justicia (Estallido Social)
14 = Salud (Vacunas)
15 = Cultura y Artes
16 = Politica y Conflictos
17 = Catastrofes y Accidentes

REF:
1- Mundo*, 2- Economía*, 3- Política y Conflictos*, 4- Ciencias y Tecnología, 5- Catástrofes y Accidentes*, 6- Cultura y Artes*, 7- Deporte*, 8- Ecología y Planeta*, 9- Crimen, delitos y Justicia*, 10- Salud*

Guardar modelo diccionario, corpora y modelo LDA

In [53]:
from gensim.corpora.mmcorpus import MmCorpus

dictionary.save("dictionary_23112021")
MmCorpus.serialize("corpus_23112021", corpus)
model.save("model_23112021")

Formatear topicos de LDA junto con los datos de textos iniciales

In [54]:
def format_topics_documents(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [55]:
df_topic_sents_keywords = format_topics_documents(ldamodel=model, corpus=corpus, texts=df.values)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,6.0,0.9859,"partido, equipo, jugar, jugador, club, chileno...",queda cerca de un mes para que comience el tor...
1,1,14.0,0.97,"región, caso, comuna, salud, sanitario, cuaren...",el gobierno anunció cambios en comunas de todo...
2,2,5.0,0.4317,"millón, empresa, económico, mes, mercado, dóla...","con un promedio de un 25% de desempleo, latino..."
3,3,10.0,0.3685,"agua, zona, proyecto, vehículo, ambiental, exp...",¿los aficionados al queso pueden seguir disfru...
4,4,15.0,0.938,"vacuna, salud, dosis, vacunación, médico, estu...",el ministerio de salud aseguró este jueves que...
5,5,7.0,0.737,"serie, película, juego, iniciar, historia, fav...",un teaser lo había prometido y durante este lu...
6,6,14.0,0.9838,"región, caso, comuna, salud, sanitario, cuaren...",en su 77° informe epidemiológico el ministerio...
7,7,13.0,0.3952,"ministro, presidente, gobierno, diputado, cree...",fue cuando vieron arder el céntrico edificio c...
8,8,1.0,0.7796,"carabinero, encontrar, vehículo, policía, homb...",un hombre atacó a su pareja en su domicilio en...
9,9,2.0,0.5855,"carabinero, violencia, derechos, humanos, soci...","el general director de carabineros, mario roza..."


Asociar datos con topicos relevantes definidos anterioremente

In [67]:
df_dominant_topic_name = df_dominant_topic

In [78]:
df_dominant_topic_name['Dominant_Topic_Name'] = ''
df_dominant_topic_name['Dominant_Topic_REF'] = 0

In [94]:
for index, row in df_dominant_topic_name.iterrows():
    if row["Dominant_Topic"] in [3.0, 12.0, 13.0]:
        df_dominant_topic_name["Dominant_Topic_Name"][index] = "Politica y Conflictos"
        df_dominant_topic_name["Dominant_Topic_REF"][index] = 3
    if row["Dominant_Topic"] in [1.0, 2.0, 16.0]:
        df_dominant_topic_name["Dominant_Topic_Name"][index] = "Crimen, Delitos y Justicia"
        df_dominant_topic_name["Dominant_Topic_REF"][index] = 9
    if row["Dominant_Topic"] in [4.0]:
        df_dominant_topic_name["Dominant_Topic_Name"][index] = "Mundo"
        df_dominant_topic_name["Dominant_Topic_REF"][index] = 1
    if row["Dominant_Topic"] in [8.0, 9.0, 14.0, 15.0]:
        df_dominant_topic_name["Dominant_Topic_Name"][index] = "Salud"
        df_dominant_topic_name["Dominant_Topic_REF"][index] = 10
    if row["Dominant_Topic"] in [5.0]:
        df_dominant_topic_name["Dominant_Topic_Name"][index] = "Economia"
        df_dominant_topic_name["Dominant_Topic_REF"][index] = 2
    if row["Dominant_Topic"] in [10.0]:
        df_dominant_topic_name["Dominant_Topic_Name"][index] = "Ecologia y Planeta"
        df_dominant_topic_name["Dominant_Topic_REF"][index] = 8
    if row["Dominant_Topic"] in [6.0]:
        df_dominant_topic_name["Dominant_Topic_Name"][index] = "Deporte"
        df_dominant_topic_name["Dominant_Topic_REF"][index] = 7
    if row["Dominant_Topic"] in [0.0, 7.0]:
        df_dominant_topic_name["Dominant_Topic_Name"][index] = "Cultura y Artes"
        df_dominant_topic_name["Dominant_Topic_REF"][index] = 6
    if row["Dominant_Topic"] in [11.0]:
        df_dominant_topic_name["Dominant_Topic_Name"][index] = "Catastrofes y Accidentes"
        df_dominant_topic_name["Dominant_Topic_REF"][index] = 5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dominant_topic_name["Dominant_Topic_Name"][index] = "Deporte"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dominant_topic_name["Dominant_Topic_REF"][index] = 7
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dominant_topic_name["Dominant_Topic_Name"][index] = "Salud"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

Guardar dataset para clasificacion

In [95]:
df_dominant_topic_name.to_csv('sophia2_data_23112021.csv')

***Algunos experimentos anteriores***

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Build the bigram and trigram models
bigram = gensim.models.Phrases(tokens, min_count=1, threshold=1) # higher threshold fewer phrases.
#trigram = gensim.models.Phrases(bigram[tokens], threshold=100)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
#trigram_mod = gensim.models.phrases.Phraser(trigram)

In [12]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
#def remove_stopwords(texts):
#    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

#def make_trigrams(texts):
#    return [trigram_mod[bigram_mod[doc]] for doc in texts]

#def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#    texts_out = []
#    for sent in texts:
#        doc = nlp(" ".join(sent)) 
#        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#    return texts_out

In [None]:
# Form Bigrams
data_words_bigrams = make_bigrams(tokens)

# Do lemmatization keeping only noun, adj, vb, adv
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [14]:
data_words_bigrams

[['pandemia_causa',
  'coronavirus_mascarilla',
  'volver',
  'elemento_esencial',
  'prevención_contagio',
  'contexto_viralizar',
  'mes_noviembre',
  'video',
  'abogado',
  'multado',
  'utilizar_cubrir',
  'boca',
  'centro',
  'cauquén_región',
  'maule_registro',
  'profesional_señalar',
  'fiscalizar',
  'trabajadora',
  'ministerio',
  'cursar_infracción',
  'apelar',
  'legalidad',
  'medida',
  'raíz',
  'alcance',
  'video_conversar',
  'abogado_organización',
  'abogado',
  'hombre',
  'mencionar',
  'video',
  'determinar_afirmación',
  'entregado',
  'persona',
  'falso_identidad',
  'protagonista_video',
  'instancia_mencionado',
  'aseguraró',
  'ciudadano_pertenecer',
  'institución_julio',
  'publicar_diario',
  'oficial_resolución',
  'establecer_medida',
  'prevención',
  'convivencia',
  'plan_paso',
  'paso',
  'número',
  'resolución',
  'detalla',
  'dispóngase',
  'obligatorio_mascarilla',
  'persona_encontrar',
  'vía_público',
  'zona_urbano',
  'poblado',
 

In [15]:
id2word_tokens = corpora.Dictionary(tokens)
id2word_bigrams = corpora.Dictionary(data_words_bigrams)

In [22]:
# Term Document Frequency
corpus = [id2word_bigrams.doc2bow(text) for text in texts]
[[(id2word_bigrams[id], freq) for id, freq in cp] for cp in corpus]

[[('abogado', 6),
  ('abril', 1),
  ('actual', 1),
  ('alcance', 1),
  ('apelar', 3),
  ('aplicar', 1),
  ('arbitrario', 2),
  ('aseguraró', 1),
  ('base', 2),
  ('boca', 1),
  ('caminar', 1),
  ('caso', 5),
  ('centro', 1),
  ('certificado', 1),
  ('comer', 1),
  ('completo', 1),
  ('conclusión', 1),
  ('contar', 1),
  ('convivencia', 1),
  ('demostrar', 1),
  ('desobediencia', 1),
  ('detalla', 1),
  ('dispóngase', 1),
  ('entregado', 1),
  ('experto', 1),
  ('expresar', 1),
  ('facultar', 1),
  ('fiscalizar', 1),
  ('hablar', 1),
  ('hombre', 3),
  ('ilegalidad', 1),
  ('infracción', 3),
  ('legalidad', 1),
  ('ley', 4),
  ('medida', 3),
  ('mencionar', 1),
  ('ministerio', 2),
  ('moral', 2),
  ('multa', 7),
  ('multado', 1),
  ('necesario', 1),
  ('necesitar', 1),
  ('número', 1),
  ('obstante', 1),
  ('paso', 2),
  ('penal', 2),
  ('perseguir', 1),
  ('persona', 3),
  ('pesos', 1),
  ('pic.twitter.com/gm4aa9qgbu', 1),
  ('poblado', 1),
  ('precisar', 1),
  ('presidente', 1),
  ('

In [25]:
# Build LDA model
#lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
#                                           id2word=id2word_bigrams,
#                                           num_topics=12, 
#                                           random_state=100,
#                                           chunksize=100,
#                                           passes=1,
#                                           per_word_topics=True)

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word_bigrams,
                                           num_topics=12, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)