### clustering_lda_test_models
Tests several topic-modeling configurations on documents from the CPDOC_AAS database (read here from the SQLite file `cpdoc_as.sqlite`).

In [1]:
import nltk
import os
import codecs
import string
import matplotlib.pyplot as plt
import matplotlib as mpl
from gensim import corpora, models, similarities #Latent Dirichlet Allocation implementation with Gensim
import pyLDAvis
import pyLDAvis.gensim
from IPython.display import clear_output

import sqlite3
import pickle

In [2]:
# Input and output folders both sit one level above the notebook's working directory.
inputs, outputs = (os.path.join("..", folder) for folder in ("inputs", "outputs"))

### set user-specific variables
Checks which operating system is in use and sets user-specific variables accordingly. Renato = Linux; Marcelo = nt (Windows).

Also sets working folders

In [3]:
# os.name is 'nt' on Windows; every other platform gets ISO-8859-1.
encoding_type = 'utf-8' if os.name == 'nt' else 'ISO-8859-1'

### selects texts from sql database to start topic modeling

In [4]:
# Selects texts from the SQL database to start topic modeling.
#
# Result: `texts` is a list with one token list per selected row.
sql_db = os.path.join(inputs, 'cpdoc_as.sqlite')
conn = sqlite3.connect(sql_db)

count = 0
texts = []

try:
    cur = conn.cursor()
    # Filter: Portuguese documents whose readability is above 0.4 or equal to -1.
    cur.execute("SELECT * FROM docs WHERE main_language = 'pt' AND (readability > 0.4 OR readability = -1) ")
    data = cur.fetchall()
finally:
    # Every row is already in memory, so the database handle can be released
    # even if the query fails.
    conn.close()

numrows = len(data)
# Progress step of roughly one percent. Clamped to 1 so that result sets with
# fewer than 100 rows do not end up computing `count % 0`.
percentil = max(1, numrows // 100)

for count, row in enumerate(data, start=1):
    if count % percentil == 0 or count == numrows:
        clear_output()
        print(100 * count // numrows, '% done')

    # NOTE(review): column 4 of `SELECT *` is taken to be the document body --
    # confirm against the `docs` schema. A NULL value is treated as empty text
    # so that `texts` stays aligned with `data`.
    raw_text = row[4] or ''

    # Strip surrounding punctuation from each whitespace-separated token, then
    # drop tokens that are pure digits or at most one character long
    # (punctuation-only tokens become '' after stripping and are dropped too).
    stripped = (token.strip(string.punctuation) for token in raw_text.split())
    text = [token for token in stripped if len(token) > 1 and not token.isdigit()]
    texts.append(text)

100 % done


### makes list of stopwords

In [5]:
# Corpus-specific noise words (number words, month names, telegram jargon,
# OCR fragments) to be removed in addition to the NLTK stopword lists.
additional_words = ['mr','one', 'two', 'three', 'four', 
                    'five', 'um', 'dois', 'três', 'quatro', 
                    'cinco', 'janeiro', 'fevereiro', 'março', 
                    'abril', 'maio', 'junho', 'julho', 'agosto', 
                    'setembro', 'outubro', 'novembro', 'dezembro', 
                    'january', 'february', 'march', 'april', 'may', 
                    'june', 'july', 'august', 'september', 
                    'october', 'november', 'december', 'países', 
                    'ser', 'país', 'ainda', 'milhões', 'maior', 
                    'anos', 'grande', 'apenas', 'outros', 'pode', 
                    'parte', 'partes', 'item', 'vossa', 'nota', 
                    'havia', 'pt', 'vg', 'ptvg', 'eh', 'nr', 'hrs', 
                    'pais', 'parte', 'hoje', 'brasemb', 'ontem', 
                    'dia', 'countries', 'would', 'new', 'also', 
                    'must', 'draft', 'shall', 'item', 'page', 
                    'th', 'anos', 'ii', 'dias', 'poderá', 'caso', 
                    'casos', 'qualquer', 'ano', 'mil', 'pessoas', 
                    'único', 'única', 'únicos', 'únicas', 'índice', 
                    'expedido', 'co', 'mm', 'er', 'via', 'ww', 'ra', 
                    'ia', 'ca', 'nu', 'wa', 'aa', 'ms', 'dc', 'mmm', 'pa']

# Gather the NLTK lists language by language, append the custom words last,
# then remove duplicates by passing through a set.
stopwords = []
for language in ('english', 'portuguese', 'french', 'spanish', 'german'):
    stopwords.extend(nltk.corpus.stopwords.words(language))
stopwords.extend(additional_words)

stopwords = list(set(stopwords))

# Topic Modeling
Several tests with Latent Dirichlet Allocation (LDA), using 20, 30, 40, 45, 60 and 100 topics. All LDA cells below set `random_state` explicitly (a practice that started with the 60-topic tests) to avoid random variability between runs.

Also tests the Hierarchical Dirichlet Process (HDP).

In [6]:
%time texts = [[word for word in text if word not in stopwords] for text in texts]

Wall time: 2min 1s


In [7]:
# Sanity check: number of tokens in the first document at this point.
print(len(texts[0]))

47


In [8]:
%%time
# Build the token <-> id mapping from the tokenized documents.
dictionary = corpora.Dictionary(texts)
# no_above=0.8 drops tokens present in more than 80% of the documents;
# no_below=1 keeps every token regardless of how rare it is.
dictionary.filter_extremes(no_below=1, no_above=0.8)
# Removes words by their id (here: the token with id 0).
# NOTE(review): which word holds id 0 after filter_extremes is not visible here -- confirm this is intended.
dictionary.filter_tokens(bad_ids=[0,]) # removes words by id
# Convert every document to its bag-of-words representation.
corpus = [dictionary.doc2bow(text) for text in texts]

In [9]:
# Number of bag-of-words documents in the corpus (one per entry of `texts`).
len(corpus)

9054

#### saves pickle of dictionary and corpus

In [15]:
# `outer_outputs` is not defined anywhere in this notebook; the pickles belong
# in the same `outputs` folder used by the rest of the cells.
file_corpus = os.path.join(outputs, 'LDAcorpus.pkl')
file_dictionary = os.path.join(outputs, 'LDAdictionary.pkl')

# Persist both objects so that the preprocessing above can be skipped later.
with open(file_corpus, 'wb') as f:
    pickle.dump(corpus, f)
with open(file_dictionary, 'wb') as f:
    pickle.dump(dictionary, f)

In [5]:
# Optional: reload the corpus and dictionary from disk.
# NOTE: pickle.load executes arbitrary code from the file -- only load pickles
# produced by this project.
with open(file_corpus, 'rb') as f:
    corpus = pickle.load(f)
with open(file_dictionary, 'rb') as f:
    dictionary = pickle.load(f)

### tests with 20 topics

In [11]:
%time lda20 = models.LdaModel(corpus, num_topics=20, id2word=dictionary, passes=50, eval_every=1, random_state=0)
file = os.path.join(outputs, 'model_lda_20.pkl')
with open(os.path.join(outputs,'model_lda_20.pkl'), 'wb') as f:
    pickle.dump(lda20, f)

#### saves visualization of 20 topics

In [None]:
# Enable inline pyLDAvis rendering; the bare last expression displays the 20-topic panel.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda20, corpus, dictionary)

In [None]:
# Prepare the visualization data for lda20 and export it as an HTML file under `outputs`.
data_ldavis = pyLDAvis.gensim.prepare(lda20, corpus, dictionary)
pyLDAvis.save_html(data_ldavis, os.path.join(outputs,'pyldavis_output_20topics.html'))

### tests with 30 topics

In [None]:
%time lda30 = models.LdaModel(corpus, num_topics=30, id2word=dictionary, passes=50, eval_every=1, random_state=0)
file = os.path.join('..', outputs, 'model_lda_30.pkl')
pickle.dump(lda30, open(file, 'wb'))

In [None]:
# List all topics (-1) of the 30-topic model with their 5 highest-weighted words.
lda30.print_topics(-1, num_words=5)

#### saves visualization of 30 topics

In [None]:
# Enable inline pyLDAvis rendering; the bare last expression displays the 30-topic panel.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda30, corpus, dictionary)

In [None]:
# Prepare the visualization data for lda30 and export it as an HTML file under `outputs`.
data_ldavis = pyLDAvis.gensim.prepare(lda30, corpus, dictionary)
pyLDAvis.save_html(data_ldavis, os.path.join(outputs,'pyldavis_output_30topics.html'))

### tests with 40 topics

In [None]:
%time lda40 = models.LdaModel(corpus, num_topics=40, id2word=dictionary, passes=50, eval_every=1, random_state=0)
file = os.path.join('..', outputs, 'model_lda_40.pkl')
pickle.dump(lda40, open(file, 'wb'))

In [None]:
# List all topics (-1) of the 40-topic model with their 5 highest-weighted words.
lda40.print_topics(-1, num_words=5)

#### saves visualization of 40 topics

In [None]:
# Enable inline pyLDAvis rendering; the bare last expression displays the 40-topic panel.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda40, corpus, dictionary)

In [None]:
# Prepare the visualization data for lda40 and export it as an HTML file under `outputs`.
data_ldavis = pyLDAvis.gensim.prepare(lda40, corpus, dictionary)
pyLDAvis.save_html(data_ldavis, os.path.join(outputs,'pyldavis_output_40topics.html'))

### tests with 45 topics

In [6]:
%time lda45 = models.LdaModel(corpus, num_topics=45, id2word=dictionary, passes=50, eval_every=1, random_state=0)
file = os.path.join('..', outputs, 'model_lda_45.pkl')
pickle.dump(lda45, open(file, 'wb'))

CPU times: user 17h 8min 58s, sys: 3d 10h 13min 12s, total: 4d 3h 22min 11s
Wall time: 5h 19min 30s


In [7]:
# List all topics (-1) of the 45-topic model with their 5 highest-weighted words.
lda45.print_topics(-1, num_words=5)

[(0,
  '0.018*comissão + 0.016*projeto + 0.015*geral + 0.012*resolução + 0.010*assembleia'),
 (1, '0.043*made + 0.041*use + 0.032*end + 0.029*ambassador + 0.026*aaaaa'),
 (2,
  '0.092*venezuela + 0.052*guiana + 0.028*caracas + 0.022*trinidad + 0.021*cuba'),
 (3,
  '0.013*política + 0.012*desenvolvimento + 0.010*relações + 0.010*brasil + 0.009*internacional'),
 (4,
  '0.026*brasil + 0.020*cooperação + 0.013*acordo + 0.009*desenvolvimento + 0.007*relações'),
 (5,
  '0.035*excelência + 0.031*brasil + 0.031*senhor + 0.023*ministro + 0.020*presidente'),
 (6, '0.005*mw + 0.005*dn + 0.005*ar + 0.004*mu + 0.004*lu'),
 (7,
  '0.063*nuclear + 0.030*acordo + 0.022*energia + 0.022*nucleares + 0.021*brasil'),
 (8, '0.008*ro + 0.008*ue + 0.007*rr + 0.007*ão + 0.006*ar'),
 (9,
  '0.069*global + 0.026*ministro + 0.025*exteriores + 0.024*york + 0.023*retransmito'),
 (10,
  '0.019*brasil + 0.014*visto + 0.011*estrangeiro + 0.010*art + 0.009*brasileiro'),
 (11,
  '0.023*cuba + 0.021*uruguai + 0.019*gobie

#### saves visualization of 45 topics

In [8]:
# Enable inline pyLDAvis rendering; the bare last expression displays the 45-topic panel.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda45, corpus, dictionary)

In [9]:
# Prepare the visualization data for lda45 and export it as an HTML file under `outputs`.
data_ldavis = pyLDAvis.gensim.prepare(lda45, corpus, dictionary)
pyLDAvis.save_html(data_ldavis, os.path.join(outputs,'pyldavis_output_45topics.html'))

### tests with 60 topics and random_state 0

In [10]:
%time lda60_00 = models.LdaModel(corpus, num_topics=60, id2word=dictionary, passes=50, eval_every=1, random_state=0)
file = os.path.join('..', outputs, 'model_lda_60_rs_00.pkl')
pickle.dump(lda60_00, open(file, 'wb'))

CPU times: user 18h 39min 2s, sys: 3d 17h 48min 24s, total: 4d 12h 27min 27s
Wall time: 5h 43min 15s


#### saves visualization of 60 topics

In [11]:
# Enable inline pyLDAvis rendering; the bare last expression displays the 60-topic (seed 0) panel.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda60_00, corpus, dictionary)

In [12]:
# Prepare the visualization data for lda60_00 and export it as an HTML file under `outputs`.
data_ldavis = pyLDAvis.gensim.prepare(lda60_00, corpus, dictionary)
pyLDAvis.save_html(data_ldavis, os.path.join(outputs,'pyldavis_output_60topics_rs00.html'))

### tests with 60 topics and random_state 1

In [13]:
%time lda60_01 = models.LdaModel(corpus, num_topics=60, id2word=dictionary, passes=50, eval_every=1, random_state=1)
file = os.path.join('..', outputs, 'model_lda_60_rs_01.pkl')
pickle.dump(lda60_01, open(file, 'wb'))

CPU times: user 18h 44min 14s, sys: 3d 18h 1min 1s, total: 4d 12h 45min 15s
Wall time: 5h 43min 31s


#### saves visualization of 60 topics

In [14]:
# Enable inline pyLDAvis rendering; the bare last expression displays the 60-topic (seed 1) panel.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda60_01, corpus, dictionary)

In [15]:
# Prepare the visualization data for lda60_01 and export it as an HTML file under `outputs`.
data_ldavis = pyLDAvis.gensim.prepare(lda60_01, corpus, dictionary)
pyLDAvis.save_html(data_ldavis, os.path.join(outputs,'pyldavis_output_60topics_rs01.html'))

### tests with 100 topics and random_state 0

In [18]:
%time lda100_00 = models.LdaModel(corpus, num_topics=100, id2word=dictionary, passes=50, eval_every=1, random_state=0)
file = os.path.join('..', outputs, 'model_lda_100_rs_00..pkl')
pickle.dump(lda100_00, open(file, 'wb'))

Wall time: 57min 27s


#### saves visualization of 100 topics

In [None]:
# Enable inline pyLDAvis rendering; the bare last expression displays the 100-topic panel.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda100_00, corpus, dictionary)

In [None]:
# Prepare the visualization data for lda100_00 and export it as an HTML file.
data_ldavis = pyLDAvis.gensim.prepare(lda100_00, corpus, dictionary)
# `outer_outputs` is not defined in this notebook (NameError); write to the
# same `outputs` folder as every other HTML export.
pyLDAvis.save_html(data_ldavis, os.path.join(outputs,'pyldavis_output_100topics_rs00.html'))

### tests with Hierarchical Dirichlet Process (HDP)

In [19]:
# Fit an HDP model on the same corpus and dictionary. Unlike the LDA cells, no
# random_state is passed here.
# NOTE(review): if reproducible HDP topics are wanted, check whether the
# installed gensim HdpModel accepts a random_state argument.
%time hdp = models.HdpModel(corpus, id2word=dictionary)

CPU times: user 5min 19s, sys: 23min 55s, total: 29min 14s
Wall time: 2min


In [20]:
# Show the HDP topics using print_topics' default arguments.
hdp.print_topics()

['topic 0: 0.009*brasil + 0.004*relações + 0.004*governo + 0.004*presidente + 0.004*ministro + 0.004*senhor + 0.003*exteriores + 0.002*política + 0.002*acordo + 0.002*brasileiro + 0.002*telegrama + 0.002*excelência + 0.002*república + 0.002*desenvolvimento + 0.002*embaixador + 0.002*cooperação + 0.002*internacional + 0.002*rio + 0.002*brasileira + 0.002*visita',
 'topic 1: 0.006*brasil + 0.004*política + 0.003*governo + 0.003*ministro + 0.003*guerra + 0.003*senhor + 0.003*desenvolvimento + 0.003*mundo + 0.003*marinha + 0.003*unidos + 0.002*relações + 0.002*nacional + 0.002*poder + 0.002*internacional + 0.002*cooperação + 0.002*segurança + 0.002*militar + 0.001*estratégia + 0.001*cada + 0.001*silveira',
 'topic 2: 0.006*brasil + 0.002*política + 0.002*nacional + 0.002*itamaraty + 0.002*brasileira + 0.002*diplomacia + 0.002*governo + 0.001*exterior + 0.001*desenvolvimento + 0.001*presidente + 0.001*exteriores + 0.001*relações + 0.001*senhor + 0.001*externa + 0.001*nações + 0.001*mundo + 

In [21]:
# Enable inline pyLDAvis rendering; the bare last expression displays the HDP panel.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(hdp, corpus, dictionary)

In [22]:
# Prepare the visualization data for the HDP model and export it as an HTML file under `outputs`.
data_ldavis = pyLDAvis.gensim.prepare(hdp, corpus, dictionary)
pyLDAvis.save_html(data_ldavis, os.path.join(outputs,'pyldavis_output_hdp.html'))