In [1]:
## I'm keeping this all in gensim, but scikit-learn, scipy+numpy, and nltk all 
##   have nice helper functions for some of this stuff...
from collections import Counter

from gensim import corpora, models, similarities

In [2]:
### Working directory: set this to wherever you unpacked this example.
### Later cells build every file path as wd + "/<file name>".
wd = "./"

In [3]:
## Read in a corpus. In this example, the file should be a line-delimited set of
##   documents (one document per line).

# There are better (in terms of RAM) ways to load the corpus, e.g. encoding each
#   document as it is read rather than holding all of the raw text in memory.
# The file is opened in text mode so every line is a str: under "rb", Python 3
#   yields bytes and rstrip("\n") raises TypeError. The `with` block closes the
#   file even if reading fails part-way through.
with open(wd + "/tasa.docs-line-delimited-20k", "r") as fin:
    documents = [line.rstrip("\n") for line in fin]

In [4]:
## Pre-processing: utterly important...can take a while (~1 minute on my desktop)

# 1) load and remove stop-words
# One stop-word per line; the trailing newline is stripped before the word is stored.
# Text mode keeps the entries as str (under "rb", Python 3 yields bytes and
#   rstrip("\n") raises TypeError); `with` guarantees the file is closed.
with open(wd + "/stopwords.txt", "r") as fin: # A Gerow special
    stoplist = set(w.rstrip("\n") for w in fin)

# Note the lowercasing: each document is lowercased once, split on whitespace,
#   and any token found in the stop list is dropped.
texts = []
for document in documents:
    kept = []
    for word in document.lower().split():
        if word in stoplist:
            continue
        kept.append(word)  # already lowercase, no second .lower() needed
    texts.append(kept)

# 2) Remove words that appear only once in the whole corpus.
#    The tokens are flattened and counted in a single pass each; the previous
#    sum(texts, []) and per-word list.count() both scaled quadratically.
all_tokens   = [word for text in texts for word in text]
token_counts = Counter(all_tokens)
tokens_once  = set(word for word, freq in token_counts.items() if freq == 1)
texts        = [[word for word in text if word not in tokens_once] for text in texts]

## Other important things people tend to do here, in order of importance:
##    1) Deal with punctuation.
##    2) Remove words with a low and high document frequency.
##    3) Remove words below a minimum average TF*IDF value.
##    4) Stem words (NLTK has a couple good stemmers).
##    5) Remove words not found in a dictionary (I have a great / huge dictionary; can take a while).
##    6) Compute and include bigrams that have a collocational strength above some threshold (I use a top n sort of thing).
##       scikit-learn has a great, but convoluted collocation extraction implementation.

## The best rule of thumb I've heard for 2-4 is that the resulting dictionary should be between 15k and 25k words.

In [5]:
## Build the dictionary (also called the vocabulary) from the tokenised texts
dictionary = corpora.Dictionary(texts)

## Convert every document to the gensim sparse representation.
## This is the great part of the gensim implementations
corpus = []
for text in texts:
    corpus.append(dictionary.doc2bow(text))

In [7]:
#### Optional -- this cell is not needed for this example ####

## Reaching this point can be slow on large corpora. Luckily, both the sparse
##   corpus and the dictionary can be written to disk and reused later.
mm_path   = wd + '/tasa.mm'
dict_path = wd + '/tasa.dict'

# Store to disk, for later use
corpora.MmCorpus.serialize(mm_path, corpus)
dictionary.save(dict_path)

## Read them back in: `corpus` and `dictionary` are rebound to the objects
##   loaded from those files.
corpus     = corpora.MmCorpus(mm_path)
dictionary = corpora.Dictionary.load(dict_path)

In [9]:
## And now we do some topic modeling. Here, I'm using a Hierarchical Dirichlet Process (HDP),
##   which is just LDA that fits its own parameter for the number of topics. Gensim has
##   a handful of other models, but HDP is one of the more advanced.
## You may get some warnings for the first few iterations here.
hdpmodel = models.HdpModel(corpus, id2word=dictionary)

# Print the topics (a bit ugly, but you get the picture...)
# Also notice how harmful punctuation can be.
# The arguments are positional (number of topics, words per topic) because the
#   keyword names differ between gensim releases (topics/topn in older ones,
#   num_topics/num_words in newer ones); the positional order is the same in both.
# NOTE(review): this is not the last expression in the cell, so its return value is
#   not displayed; what you see presumably comes from gensim's logging -- confirm.
hdpmodel.print_topics(-1, 15) # -1 prints all topics

## These topics, to me, don't seem great -- probably because of the small corpus and negligent pre-processing.

## At this point, there's lots you can do with the topics...
## You can also serialize the fit model for later use:
hdpmodel.save(wd + "/tasa-20k.hdpmodel.gensim")