# *Unsupervised learning: Latent Dirichlet allocation (LDA) topic modeling*

In [14]:
## Install Python package for LDA
# http://pythonhosted.org/lda/getting_started.html

!pip install lda
!pip install nltk
!pip install scikit-learn

Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/cf/db/f6375ee4b604209d88447bffab074f236d5357a4f6fa38901362311ed18d/scikit_learn-0.19.2-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (7.1MB)
[K    100% |████████████████████████████████| 7.1MB 4.4MB/s eta 0:00:011
[?25hInstalling collected packages: scikit-learn
Successfully installed scikit-learn-0.19.2


In [2]:
## Importing basic packages

import os
import numpy as np


In [3]:
# Work from the folder that holds the .txt corpus: later cells list and open files
# relative to the current working directory.
# NOTE(review): machine-specific absolute path — change it to your local copy.
os.chdir('/Volumes/benson/Mellon/cirma_related/Topic_Modeling_Workshop_Materials/cirma_texts/')

In [4]:
# Read every .txt file in the working directory into memory, one string per document.
# The listing is sorted so document indices are the same on every run and machine
# (os.listdir returns names in arbitrary order).
document_list = []

for filename in sorted(item for item in os.listdir('./') if item.endswith('.txt')):
    # `with` closes each handle; explicit UTF-8 so the accented Spanish text does not
    # depend on the platform's default encoding (assumes the files are UTF-8).
    with open(filename, encoding='utf-8') as text_file:
        document_list.append(text_file.read())

In [9]:
## Importing NLTK stop words
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

 
# NLTK's Spanish stop words, a few quote/contraction tokens, and every ASCII
# punctuation character, concatenated in that order.
stop_words = [
    *stopwords.words('spanish'),
    "'s", "'re", '”', '“', '’', '—',
    *string.punctuation,
]

# Show which characters string.punctuation contributes
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
## Tokenizing and removing stop words from our list of documents

# Set membership is O(1); `stop_words` is a list, which would otherwise be
# rescanned for every single token.
stop_word_set = set(stop_words)

documents_filtered = []

for document in document_list:
    # language='spanish' selects the Spanish Punkt sentence model instead of the
    # English default, which matters for how sentence-final periods are split off.
    token_list = word_tokenize(document.lower(), language='spanish')
    tokens_filtered = [item for item in token_list if item not in stop_word_set]
    documents_filtered.append(' '.join(tokens_filtered))

In [11]:
## Viewing one preprocessed document (index 30) as a sanity check

documents_filtered[30]

"distintos lugares pais localizan ocho cadaveres baleados vejados cuatro víctimas podido ser identificadas gabinete identificación policía ocho cadáveres perforaciones bala vejámenes continuando carnicería humana sufre país algún tiempo localizados distintas partes siendo identificadas únicamente cuatro víctimas camino terracería aldea san miguel pajapa municipio pajapita conduce cantón ixcahuin municipio nuevo progreso san marcos localizado cadáver perforado tiros víctima supuestamente secuestrada identificada angel paulo orozco lópez 33 años edad siendo trasladada anfiteatro coatepeque parte carretera ciudad escuintla conduce colonia portales encontrado asesinado dos balazos tórax rostro señor andrés lópez 42 años edad autores crimen utilizaron armas calibre 38 milímetros informó mañana policía nacional realiza investigaciones terrenos finca agua caliente —propiedad señor porfirio orellana— ubicada jurisdicción monjas departamento jalapa descubierto cadáver varias perforaciones bala 

In [15]:
## Vectorizing the preprocessed documents into a document-term count matrix

from sklearn.feature_extraction.text import CountVectorizer

# Default CountVectorizer settings are used
vectorizer = CountVectorizer()

# Learn the vocabulary from the filtered documents and count term occurrences
X = vectorizer.fit_transform(documents_filtered) 

In [16]:
## Creating a vocabulary list corresponding to the vectors we created above

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); support both so the cell runs on old and new releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = vectorizer.get_feature_names()

vocabulary[1140:1160]

['citadino',
 'citado',
 'citados',
 'citar',
 'ciudad',
 'ciudadano',
 'ciudadanos',
 'ciudadanía',
 'civil',
 'civiles',
 'clama',
 'claman',
 'clandestina',
 'clandestinas',
 'clandestino',
 'clarificado',
 'clarifique',
 'claro',
 'clase',
 'clases']

In [17]:
## Set up the LDA model: 10 topics, 1500 iterations, fixed random seed

import lda

lda_settings = dict(n_topics=10, n_iter=1500, random_state=1)
model = lda.LDA(**lda_settings)

In [18]:
## Fitting the model using our list of vectors
# Progress is reported through the `lda` logger (the INFO lines in the cell output).

model.fit(X)

INFO:lda:n_documents: 100
INFO:lda:vocab_size: 5711
INFO:lda:n_words: 22174
INFO:lda:n_topics: 10
INFO:lda:n_iter: 1500
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -264743
INFO:lda:<10> log likelihood: -204742
INFO:lda:<20> log likelihood: -201040
INFO:lda:<30> log likelihood: -198871
INFO:lda:<40> log likelihood: -197530
INFO:lda:<50> log likelihood: -196882
INFO:lda:<60> log likelihood: -196205
INFO:lda:<70> log likelihood: -195863
INFO:lda:<80> log likelihood: -195257
INFO:lda:<90> log likelihood: -195037
INFO:lda:<100> log likelihood: -194634
INFO:lda:<110> log likelihood: -194547
INFO:lda:<120> log likelihood: -194444
INFO:lda:<130> log likelihood: -194068
INFO:lda:<140> log likelihood: -193933
INFO:lda:<150> log likelihood: -193910
INFO:lda:<160> log likelihood: -193912
INFO:lda:<170> log likelihood: -193952
INFO:lda:<180> log likelihood: -193653
INFO:lda:<190> log likelihood: -193713
INFO:lda:<200> log likelihood: -193534
INFO:lda:<210> l

<lda.lda.LDA at 0x118ece470>

In [20]:
## Viewing the top `n_top_words` words in each 'topic'

topic_word = model.topic_word_

n_top_words = 10

# Convert the vocabulary to an array once instead of once per topic.
vocabulary_array = np.array(vocabulary)

for i, topic_distribution in enumerate(topic_word):
    # argsort is ascending, so reverse it and keep the first n_top_words indices
    top_indices = np.argsort(topic_distribution)[::-1][:n_top_words]
    topic_words = vocabulary_array[top_indices]
    print(f'Topic {i}:')
    print(' '.join(topic_words))
    print()

Topic 0:
pick up camión vehículo ruta zona secuestradores estudiante edad placas

Topic 1:
años edad sido cadáver autoridades escuintla balazos cadáveres víctimas san

Topic 2:
policía colonia nacional agentes estudiante carlos tres corado cuerpo policías

Topic 3:
policía dos señor desconocidos según varios ayer lugar muerte autoridades

Topic 4:
hombres aldea campesinos armados verde grupo vecinos olivo departamento miembros

Topic 5:
alcalde iztapa bar cruz ortega puerto luis gonzález hospital lópez

Topic 6:
atentado alcalde detectives herido cuerpo casa zona guerra hospital nacional

Topic 7:
vecinos presidente terreno escuela empresa problema sector instalaciones diciendo comité

Topic 8:
ser puede problemas señor cantidad casos viene hacer oficina agua

Topic 9:
pues zona hombre ejército llegó familia hace evitar gobierno militares



In [23]:
#Load the Spanish stop-word list into a single variable; the MyCorpus class below
#re-tokenizes the documents and builds the dictionary/bag-of-words that gensim expects.
import logging
import gensim  # used by MyCorpus and by the gensim cells that follow

# NOTE(review): machine-specific absolute path — adjust to your own nltk_data location.
stops = '/Users/jgo384/nltk_data/corpora/stopwords/spanish.txt'

# `with` closes the file; explicit UTF-8 because the list contains accented words
# (assumes the file is UTF-8 encoded). Blank lines are dropped so that the empty
# string does not end up in the stop list.
with open(stops, encoding='utf-8') as stop_file:
    stoplist = [word.strip() for word in stop_file.read().split('\n') if word.strip()]
class MyCorpus(object):
    """Streamed bag-of-words corpus over the files found under ``topdir``.

    The gensim dictionary is built once in the constructor from ``iter_docs``;
    every iteration calls ``iter_docs`` again and yields one ``doc2bow`` result
    per document.
    """

    def __init__(self, topdir, stoplist):
        """Store the inputs and build the token dictionary from the documents."""
        self.topdir, self.stoplist = topdir, stoplist
        token_streams = iter_docs(topdir, stoplist)
        self.dictionary = gensim.corpora.Dictionary(token_streams)

    def __iter__(self):
        """Yield the bag-of-words form of each document, in ``iter_docs`` order."""
        yield from (
            self.dictionary.doc2bow(doc_tokens)
            for doc_tokens in iter_docs(self.topdir, self.stoplist)
        )


# Request timestamped INFO-level log output for the cells that follow.
# Per the Python logging docs, basicConfig() does nothing when the root logger already
# has handlers, so this format only takes effect if logging was not configured earlier.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

In [24]:
#separate function to iterate through the documents one by one, tokenizing each
#with lower-casing and accent removal
def iter_docs(topdir, stoplist):
    """Yield one lazy token stream per regular file directly inside ``topdir``.

    Args:
        topdir: directory whose files are the documents.
        stoplist: iterable of stop words to drop from every document.

    Yields:
        For each file, visited in sorted filename order, a generator of the
        lower-cased, de-accented tokens that are not stop words.
    """
    # Tokens are de-accented by the tokenizer (deacc=True), so an accented stop word
    # would never match; compare against de-accented forms as well. Using a set also
    # makes each membership test O(1) instead of a scan of the list.
    stop_set = set(stoplist)
    stop_set |= {gensim.utils.deaccent(word) for word in stop_set}

    # sorted() gives a stable document order; os.listdir order is arbitrary.
    for fn in sorted(os.listdir(topdir)):
        path = os.path.join(topdir, fn)
        if not os.path.isfile(path):
            continue  # sub-directories cannot be opened as documents
        # Read as bytes; the tokenizer decodes with errors="ignore".
        with open(path, 'rb') as fin:
            text = fin.read()
        yield (x for x in
               gensim.utils.tokenize(text, lowercase=True, deacc=True,
                                     errors="ignore")
               if x not in stop_set)

In [None]:
#Where the corpus texts are read from and where the gensim artefacts are written.
#The texts were combined into a single directory because gensim did not cope with
#them being split across two.
TEXTS_DIR = "/sharedfolder/tempdirtesting"
MODELS_DIR = "/sharedfolder/models/dir"

# makedirs also creates missing parent folders and, with exist_ok=True, is a no-op
# when the directory already exists; any other failure (e.g. permissions) surfaces
# here instead of being swallowed by a bare `except`.
os.makedirs(MODELS_DIR, exist_ok=True)

In [None]:
#Step 1 is a bit of overkill: it tokenizes again and re-applies the stop words,
#because the nltk-tokenized variables from earlier could not be reused here (utf-8 issues).
#Step 2 saves the dictionary, which is simply an id-per-word list.
#Step 3 writes the document / word id / frequency triples to a Matrix Market (.mm) file.
dict_path = os.path.join(MODELS_DIR, 'spaport.dict')
mm_path = os.path.join(MODELS_DIR, "spaport.mm")

corpus = MyCorpus(TEXTS_DIR, stoplist)
corpus.dictionary.save(dict_path)
gensim.corpora.MmCorpus.serialize(mm_path, corpus)

In [None]:
#Logging is enabled only to keep track of timing; this is much slower than Mallet or lda.
#The dictionary is loaded back from its .dict file and the corpus from the .mm file
#(frequency, word id and document id values).
#50 topics seemed to work best; passes (iterations) is kept at the same number as
#Mallet (painfully slow).
numtopics = 50
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')

dict_file = os.path.join(MODELS_DIR, "spaport.dict")
mm_file = os.path.join(MODELS_DIR, "spaport.mm")
dictionary = gensim.corpora.Dictionary.load(dict_file)
corpus = gensim.corpora.MmCorpus(mm_file)

# Train the multicore LDA model on the serialized corpus
lda_options = dict(
    id2word=dictionary,
    num_topics=numtopics,
    workers=8,
    random_state=1,
    passes=1000,
)
polylingualmodel = gensim.models.LdaMulticore(corpus, **lda_options)

In [None]:
#saving the model for persistency
# Save next to the dictionary and corpus in MODELS_DIR: a bare file name would land in
# whatever the current working directory happens to be, while the visualisation cell
# loads the model from '/sharedfolder/models/dir'.
polylingualmodel.save(os.path.join(MODELS_DIR, 'polylingualmodel2.model'))

In [None]:
#creating a bag-of-words from the corpus with doc id, word, frequency count array
# `MmCorpus` is not imported as a bare name in any visible cell, so use the
# fully qualified gensim.corpora.MmCorpus (same file as before, built from MODELS_DIR).
bow_corpus = gensim.corpora.MmCorpus(os.path.join(MODELS_DIR, "spaport.mm"))

In [None]:
#Per-document topic distributions for the documents in bow_corpus; the last line
#displays the entry for the document at index 1.
#gensim docs: "topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples"
#NOTE(review): with per_word_topics=True gensim is documented to return per-word topic
#information alongside that list — confirm the exact shape against your gensim version.
get_documents_topics = polylingualmodel.get_document_topics(bow_corpus, 
                                                            minimum_probability=.001, 
                                                            minimum_phi_value=0.01, 
                                                            per_word_topics=True)
get_documents_topics[1]

In [None]:
#Inspect topics one at a time (num_topics=1, 35 words per topic). The one listed here
#bears some slight resemblance to 21 in Mallet.
#The result is stored in the variable `print_topics` rather than being the cell's last
#expression, so the cell itself displays no value.
print_topics = polylingualmodel.print_topics(num_topics=1, num_words=35)

In [None]:
#From gensim site "tuples with (topic_repr, coherence_score), where topic_repr is a list of
#representations of the topn terms for the topic.
#The terms are represented as tuples of (membership_in_topic, token).
#The coherence_score is a float".
# `dictionary` must be the Dictionary object loaded in the training cell, not the
# file name 'spaport.dict' (a plain string has no token/id mapping to look up).
top_topics = polylingualmodel.top_topics(corpus=corpus, texts=None, dictionary=dictionary,
                                         window_size=None, coherence='u_mass',
                                         topn=50, processes=-1)
top_topics

In [None]:
#Pretty-print the result of top_topics. Even with pprint this is quite difficult to sort
#through, and there seems to be an issue with the weights all being too low, at .00*
from pprint import pprint
pprint(top_topics)

In [None]:
# Reload the saved dictionary, corpus and model for the visualisation cell below.
# The working directory is changed first because the three files are loaded by bare
# name; note that this also affects every later relative path in the notebook.
os.chdir('/sharedfolder/models/dir')
d = gensim.corpora.Dictionary.load('spaport.dict')
c = gensim.corpora.MmCorpus('spaport.mm')
vizmod = gensim.models.LdaMulticore.load('polylingualmodel2.model')

In [None]:
# pyLDAvis is not imported in any visible cell, so import it here. Its gensim helper
# module is named `gensim_models` from pyLDAvis 3.3 on and `gensim` in older releases.
import pyLDAvis
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Render the interactive visualisation inline from the reloaded model, corpus and dictionary
pyLDAvis.enable_notebook()
data = gensimvis.prepare(vizmod, c, d)
data