# *Unsupervised learning: Latent Dirichlet allocation (LDA) topic modeling*

In [1]:
## Install Python package for LDA
# http://pythonhosted.org/lda/getting_started.html

!pip install lda
!pip install nltk
!pip install scikit-learn



In [23]:
## Importing basic packages

import os
import numpy as np



In [3]:
#os.chdir('/Volumes/benson/Mellon/cirma_related/Topic_Modeling_Workshop_Materials/cirma_texts/')

In [4]:
os.getcwd()

'/home/jovyan/gensim_pyldavis'

In [5]:
os.chdir('/home/jovyan/gensim_pyldavis/cirma_texts/')

In [6]:
document_list = []

for filename in [item for item in os.listdir('./') if '.txt' in item]:
    text_data = open(filename).read()
    document_list.append(text_data)

In [32]:
## Importing NLTK stop words
import nltk
import gensim
import pyLDAvis
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pyLDAvis.gensim
from gensim.corpora.mmcorpus import MmCorpus

stopword = '/home/jovyan/gensim_pyldavis/spanish.txt'
stopwords = open(stopword).read()
stop_words = stopwords.split('\\n') + ["'s", "'re", '”', '“', '’', '—'] + list(string.punctuation)
 
# = stopwords.words('spanish') + ["'s", "'re", '”', '“', '’', '—'] + list(string.punctuation)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
document_list

['EN EL PERIODO POST-ELECTORAL:  \n\n72 PERSONAS HAN SIDO TORTURADAS Y ASESINADAS A TIROS\n\nLAS VICTIMAS, A EXCEPCION DE 20, NO HAN PODIDO SER IDENTIFICADAS Y SU LAPIDA ESTA\nMARCADA CON "X"\n\nDel período electoral —5 de\nmarzo anterior— hasta hoy ha\nhabido una verdadera carnicería\nhumana en el país. La escalada de\nla violencia, principalmente\naquella de carácter político, ha\narrojado la escandalosa cifra de 72\npersonas asesinadas a tiros y\ntortura, siendo identificadas 20\nvíctimas.\nLos informes policíacos dados\na conocer en este lapso —muchos\ncasos no son reportados\noficialmente—, según las\npublicaciones periodísticas y\nnuestros archivos señalan el\nincremento de la violencia y la\nrepresión política, especialmente\nen el interior del país.\nPor las características de los\ncrímenes y las condiciones en\nque han sido localizadas las\nvíctimas\nidentificadas— se presume que es\n“obra” de grupos debidamente\narmados y entrenados para el\ncrimen y la tortura.\n\n\nUNA DE L

In [9]:
## Tokenizing and removing stop words from our list of documents

documents_filtered = []

for document in document_list:
    token_list = word_tokenize(document.lower())
    tokens_filtered = [item for item in token_list if (item not in stop_words)]
    documents_filtered.append(' '.join(tokens_filtered))

In [10]:
## Viewing a preprocessed document

documents_filtered[1]

'fuerzas mixtas buscan en zacapa rastros de quien dio muerte al g-2 se busca evitar que escape hacia algun pais vecino en la región de zacapa fuerzas mixtas del gobierno se encuentran rastreando para localizar al ex—detective melsiveo valladares godoy de 27 años de edad a quien se acusa de la muerte de un agente del servicio de inteligencia del ejército en la puerta de la emergencia del igss hasta ahora no ha sido posible la captura de melsiveo aun cuando las autoridades lo buscan desde la noche del jueves cuando se produjeron los hechos en el seguro social donde el perseguido disparó contra adelso guerra y guerra matándolo de un balazo aun cuando se había dicho que el responsable de la muerte de adelso guerra y guerra había sido apresado ayer la policía dijo que era buscado pues tras dar muerte al agente g-2 tuvo tiempo de subir a un vehículo y encaminarse a la zona 6. posteriormente según las investigaciones el responsable de aquella muerte buscó ganar el departamento de zacapa de do

In [11]:
## Vectorizing preprocessed essays

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(documents_filtered) 

In [12]:
## Creating a vocabulary list corresponding to the vectors we created above

vocabulary = vectorizer.get_feature_names()

vocabulary[1140:1160]

['civiles',
 'clama',
 'claman',
 'clandestina',
 'clandestinas',
 'clandestino',
 'clarificado',
 'clarifique',
 'claro',
 'clase',
 'clases',
 'claustro',
 'clave',
 'clemente',
 'clientes',
 'co',
 'coacción',
 'coatepeque',
 'cobarde',
 'cobraba']

In [13]:
## Initializing an LDA model: 10 topics and 1500 iterations

import lda

model = lda.LDA(n_topics=10, n_iter=1500, random_state=1)

In [14]:
## Fitting the model using our list of vectors

model.fit(X)

INFO:lda:n_documents: 99
INFO:lda:vocab_size: 5791
INFO:lda:n_words: 38728
INFO:lda:n_topics: 10
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -387986
INFO:lda:<10> log likelihood: -317228
INFO:lda:<20> log likelihood: -309445
INFO:lda:<30> log likelihood: -306055
INFO:lda:<40> log likelihood: -303817
INFO:lda:<50> log likelihood: -302496
INFO:lda:<60> log likelihood: -301870
INFO:lda:<70> log likelihood: -300086
INFO:lda:<80> log likelihood: -299151
INFO:lda:<90> log likelihood: -298824
INFO:lda:<100> log likelihood: -297820
INFO:lda:<110> log likelihood: -296612
INFO:lda:<120> log likelihood: -296236
INFO:lda:<130> log likelihood: -296025
INFO:lda:<140> log likelihood: -295192
INFO:lda:<150> log likelihood: -295611
INFO:lda:<160> log likelihood: -294957
INFO:lda:<170> log likelihood: -294733
INFO:lda:<180> log likelihood: -293949
INFO:lda:<190> log likelihood: -293232
INFO:lda:<200> log likelihood: -293025
INFO:lda:<210> log likelihood: -292720
INFO:lda:<220> log likelihood: -29

<lda.lda.LDA at 0x7f50131de6d8>

In [15]:
## Viewing the top 50 words in each 'topic'

topic_word = model.topic_word_

n_top_words = 10

for i, topic_distribution in enumerate(topic_word):
    topic_words = np.array(vocabulary)[np.argsort(topic_distribution)][:-(n_top_words+1):-1]
    print('Topic ' + str(i) + ':')
    print(' '.join(topic_words))
    print()

Topic 0:
que de los del la por para vecinos es han

Topic 1:
de los desconocidos aldea las hombres hasta grupo campesinos verde

Topic 2:
de del los alcalde iztapa policía bar fueron heridos cruz

Topic 3:
la de los policía del dos no que tres muerte

Topic 4:
los pick up vehículo camión del garcía placas estudiante ruta

Topic 5:
de del la fue hospital cuerpo atentado policía detectives balazos

Topic 6:
de los las han sido víctimas fueron escuintla unos cadáveres

Topic 7:
del alcalde electo ejército mixco maldonado esposa pudo atentado mi

Topic 8:
de el en la que se al por las un

Topic 9:
de los del colonia estudiante corado estudiantes policías cadáver fue



In [16]:
#Make stopwords in a single variable and make a class that 
#reprocesses the documents into a dictionary for gensim format with tokenization. 
import logging 
stops = '/home/jovyan/nltk_data/corpora/stopwords/spanish.txt'

stoplist = open(stops).read()
stoplist = stoplist.split('\n')
class MyCorpus(object):

    def __init__(self, topdir, stoplist):
        self.topdir = topdir
        self.stoplist = stoplist
        self.dictionary = gensim.corpora.Dictionary(iter_docs(topdir, stoplist))
        
    def __iter__(self):
        for tokens in iter_docs(self.topdir, self.stoplist):
            yield self.dictionary.doc2bow(tokens)


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

In [17]:
#separate function to iterate through the documents separately to tokenize and preprocess with lower and accent remove
def iter_docs(topdir, stoplist):
    for fn in os.listdir(topdir):
        fin = open(os.path.join(topdir, fn), 'rb')
        text = fin.read()
        fin.close()
        yield (x for x in 
            gensim.utils.tokenize(text, lowercase=True, deacc=True, 
                                  errors="ignore")
            if x not in stoplist)

In [18]:
#gensim doesn't like the texts directory to be two separate so they were combined
#try: os.mkdir('/sharedfolder/models/dir')
#except: pass
TEXTS_DIR = "/home/jovyan/gensim_pyldavis/cirma_texts"
MODELS_DIR = "/home/jovyan/gensim_pyldavis/models/dir"

In [19]:
#the fist is a bit of overkill since it reapplies stopwords, it also generates tokenization again
#the issue is that I can't bring the other variables over because it dislikes nltk's tokenization for utf-8
#the second creates a dictionary which is just a id per word list
#the third creates pairings between document, id, and word with a frequency count in a dense matrix
corpus = MyCorpus(TEXTS_DIR, stoplist)
corpus.dictionary.save(os.path.join(MODELS_DIR, 'cirma.dict'))
gensim.corpora.MmCorpus.serialize(os.path.join(MODELS_DIR, "cirma.mm"), 
                                  corpus)

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:built Dictionary(5368 unique tokens: ['abandonados', 'acribillados', 'actual', 'acudido', 'ahora']...) from 99 documents (total 21691 corpus positions)
INFO:gensim.utils:saving Dictionary object under /home/jovyan/gensim_pyldavis/models/dir/cirma.dict, separately None
INFO:gensim.utils:saved /home/jovyan/gensim_pyldavis/models/dir/cirma.dict
INFO:gensim.corpora.mmcorpus:storing corpus in Matrix Market format to /home/jovyan/gensim_pyldavis/models/dir/cirma.mm
INFO:gensim.matutils:saving sparse matrix to /home/jovyan/gensim_pyldavis/models/dir/cirma.mm
INFO:gensim.matutils:PROGRESS: saving document #0
INFO:gensim.matutils:saved 99x5368 matrix, density=2.899% (15404/531432)
INFO:gensim.corpora.indexedcorpus:saving MmCorpus index to /home/jovyan/gensim_pyldavis/models/dir/cirma.mm.index


In [51]:
#logging just to keep track of time for these, much slower than Mallet or lda
#dictionary gets loaded from dict file
#corpus is loaded from path and linked to matrix of values for frequency, id and doc
#50 topics seems to work best and keeping the passes(iterations) at the same number as Mallet (painfully slow)
numtopics = 5
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

dictionary = gensim.corpora.Dictionary.load(os.path.join(MODELS_DIR, 
                                            "cirma.dict"))
corpus = gensim.corpora.MmCorpus(os.path.join(MODELS_DIR, "cirma.mm"))

# Project to LDA space
polylingualmodel = gensim.models.LdaMulticore(corpus, id2word=dictionary, num_topics=numtopics, 
                                              workers=8, random_state=1, passes=5)

INFO:gensim.utils:loading Dictionary object from /home/jovyan/gensim_pyldavis/models/dir/cirma.dict
INFO:gensim.utils:loaded /home/jovyan/gensim_pyldavis/models/dir/cirma.dict
INFO:gensim.corpora.indexedcorpus:loaded corpus index from /home/jovyan/gensim_pyldavis/models/dir/cirma.mm.index
INFO:gensim.corpora._mmreader:initializing cython corpus reader from /home/jovyan/gensim_pyldavis/models/dir/cirma.mm
INFO:gensim.corpora._mmreader:accepted corpus with 99 documents, 5368 features, 15404 non-zero entries
INFO:gensim.models.ldamodel:using symmetric alpha at 0.2
INFO:gensim.models.ldamodel:using symmetric eta at 0.2
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamulticore:running online LDA training, 5 topics, 5 passes over the supplied corpus of 99 documents, updating every 16000 documents, evaluating every ~99 documents, iterating 50x with a convergence threshold of 0.001000
INFO:gensim.models.ldamulticore:training LDA model using 8 processes
I

In [53]:
#saving the model for persistency
polylingualmodel.save('polylingualmodel')

INFO:gensim.utils:saving LdaState object under polylingualmodel.state, separately None
INFO:gensim.utils:saved polylingualmodel.state
INFO:gensim.utils:saving LdaMulticore object under polylingualmodel, separately ['expElogbeta', 'sstats']
INFO:gensim.utils:storing np array 'expElogbeta' to polylingualmodel.expElogbeta.npy
INFO:gensim.utils:not storing attribute id2word
INFO:gensim.utils:not storing attribute state
INFO:gensim.utils:not storing attribute dispatcher
INFO:gensim.utils:saved polylingualmodel


In [None]:
os.listdir()

In [54]:
#creating a bag-of-words from the corpus with doc id, word, frequency count array
bow_corpus = MmCorpus("/home/jovyan/gensim_pyldavis/models/dir/cirma.mm")

INFO:gensim.corpora.indexedcorpus:loaded corpus index from /home/jovyan/gensim_pyldavis/models/dir/cirma.mm.index
INFO:gensim.corpora._mmreader:initializing cython corpus reader from /home/jovyan/gensim_pyldavis/models/dir/cirma.mm
INFO:gensim.corpora._mmreader:accepted corpus with 99 documents, 5368 features, 15404 non-zero entries


In [55]:
#This shows a random sorting of topics with the weight across the entire corpus 
#"topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples"
get_documents_topics = polylingualmodel.get_document_topics(bow_corpus, 
                                                            minimum_probability=.05, 
                                                            minimum_phi_value=0.01, 
                                                            per_word_topics=True)
get_documents_topics[1]

([(1, 0.73396295), (3, 0.26155916)],
 [(4, [1, 3]),
  (6, [1, 3]),
  (14, [1, 3]),
  (25, [1, 3]),
  (26, [1, 3]),
  (27, [1, 3]),
  (72, [1, 3]),
  (110, [1, 3]),
  (156, [1, 3]),
  (160, [1, 3]),
  (173, [1, 3]),
  (205, [1, 3]),
  (230, [1, 3]),
  (234, [1, 3]),
  (263, [1, 3]),
  (264, [1, 3]),
  (265, [1]),
  (266, [1]),
  (267, [1, 3]),
  (268, [1, 3]),
  (269, [1, 3]),
  (270, [1]),
  (271, [3, 1]),
  (272, [1, 3]),
  (273, [1]),
  (274, [1, 3]),
  (275, [1, 3]),
  (276, [1, 3]),
  (277, [1, 3]),
  (278, [1, 3]),
  (279, [1, 3]),
  (280, [1, 3]),
  (281, [1]),
  (282, [1, 3]),
  (283, [1, 3]),
  (284, [1, 3]),
  (285, [3, 1]),
  (286, [3, 1]),
  (287, [1, 3]),
  (288, [1, 3]),
  (289, [1, 3]),
  (290, [1, 3]),
  (291, [1, 3]),
  (292, [1, 3]),
  (293, [1, 3]),
  (294, [1, 3]),
  (295, [1, 3]),
  (296, [1]),
  (297, [1, 3]),
  (298, [1]),
  (299, [1, 3]),
  (300, [3, 1]),
  (301, [1, 3]),
  (302, [1]),
  (303, [3, 1]),
  (304, [1]),
  (305, [3, 1]),
  (306, [1, 3]),
  (307, [1, 3

In [56]:
#This prints a topic at a time and is useful for inspecting topics one by one. The one listed here
#bears some slight resemblance to 21 in Mallet
print_topics = polylingualmodel.print_topics(num_topics=1, num_words=35)

INFO:gensim.models.ldamodel:topic #4 (0.200): 0.009*"alcalde" + 0.004*"habia" + 0.004*"vecinos" + 0.004*"campesinos" + 0.004*"atentado" + 0.004*"policia" + 0.004*"mixco" + 0.003*"ayer" + 0.003*"horas" + 0.003*"dos" + 0.003*"senora" + 0.003*"varios" + 0.003*"hombres" + 0.003*"tarde" + 0.003*"personas" + 0.003*"puerta" + 0.003*"senor" + 0.003*"armados" + 0.003*"despues" + 0.002*"mas" + 0.002*"ramirez" + 0.002*"casa" + 0.002*"calle" + 0.002*"perez" + 0.002*"puede" + 0.002*"dijo" + 0.002*"joven" + 0.002*"hecho" + 0.002*"ser" + 0.002*"hospital" + 0.002*"familias" + 0.002*"terreno" + 0.002*"anos" + 0.002*"garcia" + 0.002*"lugar"


In [57]:
#From gensim site "tuples with (topic_repr, coherence_score), where topic_repr is a list of 
#representations of the topn terms for the topic. 
#The terms are represented as tuples of (membership_in_topic, token). 
#The coherence_score is a float".
top_topics = polylingualmodel.top_topics(corpus=corpus, texts=None, dictionary='cirma.dict', 
                                         window_size=None, coherence='u_mass', 
                                         topn=50, processes=-1)
top_topics

[([(0.017367586, 'policia'),
   (0.0075122332, 'nacional'),
   (0.0055703283, 'zona'),
   (0.0054931156, 'cuerpo'),
   (0.0053462735, 'anos'),
   (0.0050776261, 'dos'),
   (0.0049360297, 'senor'),
   (0.0045618187, 'hospital'),
   (0.0042891623, 'guerra'),
   (0.0042649368, 'detectives'),
   (0.0041650538, 'casa'),
   (0.003982774, 'general'),
   (0.0037777352, 'personas'),
   (0.0035501509, 'autoridades'),
   (0.0035471108, 'carlos'),
   (0.0035084856, 'horas'),
   (0.0034034618, 'jefe'),
   (0.0033633332, 'lugar'),
   (0.0033526265, 'sido'),
   (0.0032733972, 'edad'),
   (0.0032104515, 'bala'),
   (0.0032018574, 'agentes'),
   (0.0031210373, 'crimen'),
   (0.0030601805, 'desconocidos'),
   (0.0029923066, 'ayer'),
   (0.0029850851, 'segun'),
   (0.0028531577, 'varios'),
   (0.0028337161, 'calle'),
   (0.0028174117, 'centro'),
   (0.0027230412, 'balazos'),
   (0.0026889504, 'habia'),
   (0.0026601362, 'atentado'),
   (0.0026443715, 'despues'),
   (0.0026332105, 'dijo'),
   (0.002623127

In [58]:
#even with pprint this is quite difficult to sort through but there seems to be an issue with the weights
#all being too low at .00*
from pprint import pprint
pprint(top_topics)

[([(0.017367586, 'policia'),
   (0.0075122332, 'nacional'),
   (0.0055703283, 'zona'),
   (0.0054931156, 'cuerpo'),
   (0.0053462735, 'anos'),
   (0.0050776261, 'dos'),
   (0.0049360297, 'senor'),
   (0.0045618187, 'hospital'),
   (0.0042891623, 'guerra'),
   (0.0042649368, 'detectives'),
   (0.0041650538, 'casa'),
   (0.003982774, 'general'),
   (0.0037777352, 'personas'),
   (0.0035501509, 'autoridades'),
   (0.0035471108, 'carlos'),
   (0.0035084856, 'horas'),
   (0.0034034618, 'jefe'),
   (0.0033633332, 'lugar'),
   (0.0033526265, 'sido'),
   (0.0032733972, 'edad'),
   (0.0032104515, 'bala'),
   (0.0032018574, 'agentes'),
   (0.0031210373, 'crimen'),
   (0.0030601805, 'desconocidos'),
   (0.0029923066, 'ayer'),
   (0.0029850851, 'segun'),
   (0.0028531577, 'varios'),
   (0.0028337161, 'calle'),
   (0.0028174117, 'centro'),
   (0.0027230412, 'balazos'),
   (0.0026889504, 'habia'),
   (0.0026601362, 'atentado'),
   (0.0026443715, 'despues'),
   (0.0026332105, 'dijo'),
   (0.002623127

In [60]:
os.chdir('/home/jovyan/gensim_pyldavis/models/dir/')
d = gensim.corpora.Dictionary.load('cirma.dict')
c = gensim.corpora.MmCorpus('cirma.mm')
vizmod = gensim.models.LdaMulticore.load('/home/jovyan/gensim_pyldavis/models/dir/polylingualmodel')

INFO:gensim.utils:loading Dictionary object from cirma.dict
INFO:gensim.utils:loaded cirma.dict
INFO:gensim.corpora.indexedcorpus:loaded corpus index from cirma.mm.index
INFO:gensim.corpora._mmreader:initializing cython corpus reader from cirma.mm
INFO:gensim.corpora._mmreader:accepted corpus with 99 documents, 5368 features, 15404 non-zero entries
INFO:gensim.utils:loading LdaMulticore object from /home/jovyan/gensim_pyldavis/models/dir/polylingualmodel
INFO:gensim.utils:loading expElogbeta from /home/jovyan/gensim_pyldavis/models/dir/polylingualmodel.expElogbeta.npy with mmap=None
INFO:gensim.utils:setting ignored attribute id2word to None
INFO:gensim.utils:setting ignored attribute state to None
INFO:gensim.utils:setting ignored attribute dispatcher to None
INFO:gensim.utils:loaded /home/jovyan/gensim_pyldavis/models/dir/polylingualmodel
INFO:gensim.utils:loading LdaState object from /home/jovyan/gensim_pyldavis/models/dir/polylingualmodel.state
INFO:gensim.utils:loaded /home/jovyan

In [61]:
import pyLDAvis.gensim as gensimvis

In [62]:
pyLDAvis.enable_notebook()
data = pyLDAvis.gensim.prepare(vizmod, c, d)
data

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
