
# Topic extraction with Latent Dirichlet Allocation


This is an example of applying :class:`sklearn.decomposition.LatentDirichletAllocation` to a corpus
of documents to extract an additive model of the topic structure of the
corpus.  The output is a list of topics, each represented as a list of
terms (weights are not shown).


The default parameters (n_samples / n_features / n_components) should make
the example runnable in a couple of tens of seconds. You can try to
increase the dimensions of the problem, but be aware that the time
complexity is proportional to (n_samples * iterations) in LDA.

* http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/
* https://stackoverflow.com/questions/20349958/understanding-lda-implementation-using-gensim




In [24]:
# Exemplo 1 (scikit-learn lda)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle

# Vocabulary size cap, number of LDA topics, and number of terms printed per topic.
n_features, n_topics, n_top_words = 10, 2, 20

# Print the n_top_words in order
def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted terms.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight rows (assumed to be NumPy arrays, since ``argsort`` is
            called on each row).
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to show for each topic.

    A single empty line is printed after the last topic.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort orders indices by ascending weight, so reverse it and
        # keep the leading n_top_words entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[term_id] for term_id in ranked]
        print("Topic #%d: " % topic_number + " ".join(top_terms))
    print()

# Training dataset: five short sentences about food and cute animals.
data_samples = [
    "I like to eat broccoli and bananas.",
    "I ate a banana and spinach smoothie for breakfast.",
    "Chinchillas and kittens are cute.",
    "My sister adopted a kitten yesterday.",
    "Look at this cute hamster munching on a piece of broccoli.",
]

# Extract features and vectorize the dataset as raw term counts, keeping
# at most n_features terms and dropping English stop words.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=1,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)

# Save the vocabulary (column index -> term).
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    dic = tf_vectorizer.get_feature_names_out()
else:
    dic = tf_vectorizer.get_feature_names()

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

# Train LDA on the term-count matrix.
p1 = lda.fit(tf)

# Data that would need to be saved for later prediction:
# model = (dic,lda.components_,lda.exp_dirichlet_component_,lda.doc_topic_prior_)

print_top_words(lda, dic, n_top_words)

Topic #0: cute adopted broccoli munching look piece smoothie sister kittens like
Topic #1: like broccoli adopted munching piece kittens sister look smoothie cute



In [25]:
# Exemplo 2 (scikit-learn lda)

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# Number of posts used, vocabulary size cap, number of LDA topics,
# and number of terms printed per topic.
n_samples, n_features, n_components, n_top_words = 2000, 1000, 10, 20


# Print the n_top_words in order
def print_top_words(model, feature_names, n_top_words):
    """Show the strongest terms of every topic in ``model``.

    For each row of ``model.components_`` the ``n_top_words`` entries with
    the largest weight are looked up in ``feature_names`` and printed,
    space-separated, after a ``Topic #<index>: `` prefix. An empty line is
    printed once all topics have been listed.

    Rows are assumed to be NumPy arrays, since ``argsort`` is called on them.
    """
    for number, row in enumerate(model.components_):
        # Indices ordered from heaviest to lightest weight.
        descending = row.argsort()[::-1]
        words = (feature_names[j] for j in descending[:n_top_words])
        print("Topic #{:d}: {}".format(number, " ".join(words)))
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
# Only the first n_samples posts are used to keep the example fast.
data_samples = dataset.data[:n_samples]


# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')

tf = tf_vectorizer.fit_transform(data_samples)
print()

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

lda.fit(tf)

print("\nTopics in LDA model:")
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
Extracting tf features for LDA...

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...

Topics in LDA model:
Topic #0: edu com mail send graphics ftp pub available contact university list faq ca information cs 1993 program sun uk mit
Topic #1: don like just know think ve way use right good going make sure ll point got need really time doesn
Topic #2: christian think atheism faith pittsburgh new bible radio games alt lot just religion like book read play time subject believe
Topic #3: drive disk windows thanks use card drives hard version pc software file using scsi help does new dos controller 16
Topic #4: hiv health aids disease april medical care research 1993 light information study national service test led 10 page new drug
Topic #5: god people does just good don jesus say israel way life know true fact time law want believe make think
Topic #6: 55 10 11 18 15 team game 19 period play 23 12 13 flyers 20 25 22 17 24 16
Topic #7: car year jus

## Tentativa frustrada de utilizar os dados do QGS.txt como entrada

In [93]:
# Tentativa frustrada de transformar os dados do QGS.txt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle

# Vocabulary size cap, number of LDA topics, and number of terms printed per topic.
n_features, n_topics, n_top_words = 10, 2, 20

# Print the n_top_words in order
def print_top_words(model, feature_names, n_top_words):
    """List the ``n_top_words`` heaviest terms of each topic, one topic per line.

    ``model.components_`` supplies one weight row per topic (assumed to be
    NumPy arrays, since ``argsort`` is called on them) and ``feature_names``
    translates column indices into terms. A trailing empty line is printed
    after the final topic.
    """
    for idx, topic_weights in enumerate(model.components_):
        # Walk the ascending argsort backwards from its end to get the
        # n_top_words largest weights first.
        top_ids = topic_weights.argsort()[:-n_top_words - 1:-1]
        terms = [feature_names[term_id] for term_id in top_ids]
        print("Topic #%d: %s" % (idx, " ".join(terms)))
    print()

# Training dataset: five short sentences about food and cute animals.
data_samples = [
    "I like to eat broccoli and bananas.",
    "I ate a banana and spinach smoothie for breakfast.",
    "Chinchillas and kittens are cute.",
    "My sister adopted a kitten yesterday.",
    "Look at this cute hamster munching on a piece of broccoli.",
]

# Extract features and vectorize the dataset as raw term counts, keeping
# at most n_features terms and dropping English stop words.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=1,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)

# Save the vocabulary (column index -> term).
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    dic = tf_vectorizer.get_feature_names_out()
else:
    dic = tf_vectorizer.get_feature_names()

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

# Train LDA on the term-count matrix.
p1 = lda.fit(tf)

# Data that would need to be saved for later prediction:
# model = (dic,lda.components_,lda.exp_dirichlet_component_,lda.doc_topic_prior_)

print_top_words(lda, dic, n_top_words)

Topic #0: cute adopted broccoli munching look piece smoothie sister kittens like
Topic #1: like broccoli adopted munching piece kittens sister look smoothie cute



## Tentativa mais próxima do satisfatório (Com um exemplo pequeno para entender o contexto)


Suponha que temos as seguintes sentenças: 

    I like to eat broccoli and bananas.
    I ate a banana and spinach smoothie for breakfast.
    Chinchillas and kittens are cute.
    My sister adopted a kitten yesterday.
    Look at this cute hamster munching on a piece of broccoli.

Após passá-las por um pré-processamento no PreText2, teremos:

    eat broccoli banana | 
    at banana spinach smoothi breakfast | 
    chinchilla kitten cute |
    sister adopt kitten yesterdai | 
    cute hamster munch piec broccoli |
    
Desta forma, podemos rodar o LDA para descobrir a relação de 2 tópicos, um relacionado a alimentos e outro a animais fofos.

In [92]:
# Tentativa mais próxima do satisfatório (Com um exemplo pequeno para entender o contexto)

import numpy
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

# Tokenizer that keeps runs of word characters only, so the "|" separators
# present in the pre-processed files are discarded.
tokenizer = RegexpTokenizer(r'\w+')

# Directory holding the five pre-processed example documents (1.txt .. 5.txt).
example_dir = '/home/fuchs/Documentos/Jupyter/Pre-Processing/Files/text-example_Maid'
doc_paths = ['%s/%d.txt' % (example_dir, number) for number in range(1, 6)]

# Read every file into a list where each element is one document.
# The context manager closes each handle even if reading raises, instead of
# keeping all handles open until the end of the cell.
docSet = []
for path in doc_paths:
    with open(path, 'r') as handle:
        docSet.append(handle.read())

# Split each document into tokens (one token list per document).
texts = [tokenizer.tokenize(document) for document in docSet]

# Build the id <-> term dictionary from the tokenized documents.
dictionary = corpora.Dictionary(texts)

# Convert the tokenized documents into a document/term matrix.
corpus = [dictionary.doc2bow(text) for text in texts]

# Build the LDA model.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=200)

# Print the most prominent topics.
print(ldamodel.print_topics(num_topics=2, num_words=3))


[(0, u'0.125*"kitten" + 0.076*"banana" + 0.075*"spinach"'), (1, u'0.155*"broccoli" + 0.098*"cute" + 0.093*"banana"')]


## Tentativa mais próxima do satisfatório (Com o QGS pré-processado)


Agora iremos generalizar o problema acima para o QGS da Revisão do Francisco.

Vale notar que a string de busca utilizada por ele foi:

    (("software process improvement") AND 
     ("business  goal" OR "strategic" OR "goal oriented" OR "business oriented" OR "business strategy")  AND
     ("alignment" OR "in line with" OR "geared to" OR "aligned with" OR "linking")  AND
     ("method" OR "approach" OR "framework" OR "methodology"))
 

In [98]:
# Tentativa mais próxima do satisfatório (Com o QGS pré-processado)

import numpy
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

# Tokenizer that keeps runs of word characters only, so the "|" separators
# present in the pre-processed files are discarded.
tokenizer = RegexpTokenizer(r'\w+')

# Text files to load (per the original note: texts_Maid = metadata,
# text_Maid = full text). The ten QGS files are numbered with Roman numerals.
qgs_dir = '/home/fuchs/Documentos/Jupyter/Pre-Processing/Files/texts_Maid'
qgs_numerals = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X']
doc_paths = ['%s/QGS %s.txt' % (qgs_dir, numeral) for numeral in qgs_numerals]

# Read every file into a list where each element is one document.
# The context manager closes each handle even if reading raises, instead of
# keeping all handles open until the end of the cell.
docSet = []
for path in doc_paths:
    with open(path, 'r') as handle:
        docSet.append(handle.read())

# Split each document into tokens (one token list per document).
texts = [tokenizer.tokenize(document) for document in docSet]

# Build the id <-> term dictionary from the tokenized documents.
dictionary = corpora.Dictionary(texts)

# Convert the tokenized documents into a document/term matrix.
# doc2bow() simply counts the occurrences of each distinct word, converts
# the word to its integer word id and returns the result as a sparse vector.
corpus = [dictionary.doc2bow(text) for text in texts]

# Build the LDA model (https://radimrehurek.com/gensim/models/ldamodel.html)

# gensim.models.ldamodel.LdaModel(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, 
#                                              update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, 
#                                              eval_every=10, iterations=50, gamma_threshold=0.001, 
#                                              minimum_probability=0.01, random_state=None, ns_conf=None, 
#                                              minimum_phi_value=0.01, per_word_topics=False, callbacks=None, 
#                                              dtype=<type 'numpy.float32'>)

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=1, id2word=dictionary, passes=5000)

# Print the most prominent topics, first as formatted strings and then as
# (term, weight) pairs.
print("Imprimindo resultados: ['Porcentagem*'Termo']")
print(ldamodel.print_topics(num_topics=1, num_words=10))
print("\n")
print("Imprimindo resultados: ['Termo', Porcentagem]")
print(ldamodel.show_topics(num_topics=1, num_words=10, log=False, formatted=False))

Imprimindo resultados: ['Porcentagem*'Termo']
[(0, u'0.028*"goal" + 0.027*"process" + 0.022*"improv" + 0.017*"softwar" + 0.013*"measur" + 0.013*"level" + 0.013*"busi" + 0.010*"organ" + 0.009*"requir" + 0.009*"defin"')]


Imprimindo resultados: ['Termo', Porcentagem]
[(0, [(u'goal', 0.028240252), (u'process', 0.027116254), (u'improv', 0.022233928), (u'softwar', 0.017140849), (u'measur', 0.013382507), (u'level', 0.013101509), (u'busi', 0.013066383), (u'organ', 0.010326658), (u'requir', 0.009272919), (u'defin', 0.009272919)])]
