<a href="https://colab.research.google.com/github/Luis-Alves2/LDA-Tutorial/blob/main/LDA_Model_Example_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Imports


In [12]:
!pip install smart_open nltk gensim
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
import io
import os.path
import re
import tarfile
import smart_open
from nltk.tokenize import RegexpTokenizer
from gensim.models import Phrases
from gensim.corpora import Dictionary


In [14]:
# Download NLTK's 'wordnet' package, the data behind the WordNetLemmatizer
# that the lemmatization step of this notebook relies on.
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
# Configure the root logger to show INFO-level messages formatted as
# "timestamp : level : message" (presumably so gensim's progress logs are visible).
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# Definindo o corpus, com documentos extraídos de um link (tutorial)

In [16]:
def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    """Yield the text of each paper stored in the tar archive at ``url``.

    The archive is opened through ``smart_open`` in binary mode and read with
    ``tarfile``. Only regular files whose member name matches
    ``nipstxt/nips<digits>/<digits>.txt`` are yielded; every other member is
    skipped.

    Args:
        url: Location of the tar archive, in any form ``smart_open.open``
            accepts. Defaults to the NIPS papers archive.

    Yields:
        str: The content of one matching member decoded as UTF-8. Bytes that
        are not valid UTF-8 are replaced rather than raising an error.
    """
    # Compile once instead of re-resolving the pattern for every member.
    paper_path = re.compile(r'nipstxt/nips\d+/\d+\.txt')
    with smart_open.open(url, "rb") as file:
        with tarfile.open(fileobj=file) as tar:
            for member in tar.getmembers():
                if not (member.isfile() and paper_path.search(member.name)):
                    continue
                # extractfile() returns a file object; close it as soon as the
                # bytes are read instead of leaving one open handle per paper.
                with tar.extractfile(member) as member_file:
                    member_bytes = member_file.read()
                yield member_bytes.decode('utf-8', errors='replace')

In [17]:
# Materialize the generator into a list so the documents can be counted,
# indexed and rewritten by the following cells.
docs = list(extract_documents())

In [18]:
# Report the number of documents, then preview the first 500 characters of the first one.
print(len(docs), docs[0][:500], sep='\n')

1740
387 
Neural Net and Traditional Classifiers  
William Y. Huang and Richard P. Lippmann 
MIT Lincoln Laboratory 
Lexington, MA 02173, USA 
Abstract
Previous work on nets with continuous-valued inputs led to generative 
procedures to construct convex decision regions with two-layer percepttons (one hidden 
layer) and arbitrary decision regions with three-layer percepttons (two hidden layers). 
Here we demonstrate that two-layer perceptton classifiers trained with back propagation 
can form both c


# Vectorize and pre-process


In [19]:
# Tokenize every document in a single pass:
#   * lowercase the text and split it into runs of word characters;
#   * drop tokens that are purely numeric (tokens that merely contain digits stay);
#   * drop tokens that are only one character long.
tokenizer = RegexpTokenizer(r'\w+')
docs = [
    [
        token
        for token in tokenizer.tokenize(document.lower())
        if not token.isnumeric() and len(token) > 1
    ]
    for document in docs
]

In [20]:
# Replace every token with the lemma returned by NLTK's WordNet lemmatizer.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

In [21]:
# Detect frequent bigrams and add them to the documents.
from gensim.models import Phrases

# Train the phrase model (min_count=20), then append to each document every
# token of its phrase-transformed version that contains '_' (the marker used
# here to recognise a bigram). The original single-word tokens are kept.
bigram = Phrases(docs, min_count=20)
for document in docs:
    # The list is built completely before extend() starts mutating the document.
    document.extend([token for token in bigram[document] if '_' in token])

In [22]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

# Bag of Words

In [23]:
# Convert every tokenized document into its bag-of-words form using the filtered dictionary.
corpus = list(map(dictionary.doc2bow, docs))

In [24]:
# Report the vocabulary size left after filtering and the number of documents in the corpus.
print(
    'Number of unique tokens: %d' % len(dictionary),
    'Number of documents: %d' % len(corpus),
    sep='\n',
)

Number of unique tokens: 8644
Number of documents: 1740


# Model Training

In [25]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000  # Larger than the corpus printed above (1740 documents).
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
# LDA training is randomised; fixing the seed lets repeated runs of the
# notebook start from the same random state instead of drifting between runs.
random_state = 42

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

In [26]:
from pprint import pprint

# Each entry of top_topics pairs a topic's weighted words with its coherence score.
top_topics = model.top_topics(corpus)

# Average topic coherence: the per-topic coherence scores summed, then divided by the number of topics.
avg_topic_coherence = sum(score for _, score in top_topics) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -1.1600.
[([(0.012045064, 'stimulus'),
   (0.010159217, 'visual'),
   (0.010046919, 'response'),
   (0.008278891, 'cell'),
   (0.007922317, 'orientation'),
   (0.007790254, 'field'),
   (0.00776493, 'activity'),
   (0.006778454, 'cortex'),
   (0.005776343, 'receptive'),
   (0.005659936, 'cortical'),
   (0.005468552, 'frequency'),
   (0.0054076924, 'correlation'),
   (0.0053951223, 'spatial'),
   (0.005347881, 'receptive_field'),
   (0.0046996027, 'signal'),
   (0.004194944, 'connection'),
   (0.0040912535, 'map'),
   (0.004084548, 'neuron'),
   (0.0038993047, 'noise'),
   (0.0038398877, 'center')],
  -0.7962247016213089),
 ([(0.009568345, 'memory'),
   (0.0087028025, 'neuron'),
   (0.0071405377, 'net'),
   (0.0065950584, 'layer'),
   (0.005421945, 'node'),
   (0.0053603724, 'dynamic'),
   (0.005356251, 'recurrent'),
   (0.0046124146, 'connection'),
   (0.0037603115, 'activation'),
   (0.0036677395, 'sequence'),
   (0.0036129758, 'architecture'),
   (0.003601215