In [18]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel

import pyLDAvis.gensim

# Open every file under the corpus directory as a plain-text document.
# NOTE(review): the '.*' pattern is very broad and presumably also picks up
# hidden or non-text files in that directory -- confirm this is intended.
corpus_path = 'data/corpus'
corpus = PlaintextCorpusReader(corpus_path, '.*')

# Tokenizer that splits on runs of word characters, plus the English stop-word
# set used to discard tokens that carry no topical signal.
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

# One raw string per corpus file.
raw_documents = [corpus.raw(fileid) for fileid in corpus.fileids()]

# Lower-case, tokenize, and drop stop words in a single pass per document.
documents = [
    [token for token in tokenizer.tokenize(text.lower()) if token not in stop_words]
    for text in raw_documents
]

#print('Number of documents:', len(corpus.fileids()))

# Build the token <-> integer-id vocabulary from the tokenized documents, then
# encode each document as a bag-of-words vector with that vocabulary.
dictionary = Dictionary(documents)
dic_corpus = list(map(dictionary.doc2bow, documents))

# Report the vocabulary size and the number of encoded documents.
vocabulary_size = len(dictionary)
document_count = len(dic_corpus)
print('Number of unique tokens: %d' % vocabulary_size)
print('Number of documents: %d' % document_count)

# Training hyperparameters for the LDA model (see the gensim LdaModel docs
# for the exact meaning of each one).
num_topics = 10     # number of latent topics to extract
chunksize = 2000    # number of documents handed to each training chunk
passes = 20         # number of passes over the whole corpus
iterations = 400    # upper bound on inference iterations per document
eval_every = None   # None skips the periodic perplexity estimate
# Fixed seed: LDA training is randomized, so without it the learned topics
# (and anything derived from them) change on every run of this cell.
random_state = 42

# Train the model
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=dic_corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',  # learn the document-topic prior from the data
    eta='auto',    # learn the topic-word prior from the data
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

# Prepare the pyLDAvis view of the trained topics. Leaving the result as the
# cell's final expression hands it to the notebook for display.
# NOTE(review): inline rendering presumably relies on
# pyLDAvis.enable_notebook() having been called in an earlier cell -- confirm.
lda_vis = pyLDAvis.gensim.prepare(model, dic_corpus, dictionary)
lda_vis


Number of unique tokens: 7987
Number of documents: 11
