Latent Dirichlet Allocation (LDA) is an algorithm used to discover the topics that are present in a corpus.

A few open source libraries exist, but if you are using Python then the main contender is Gensim. Gensim is an awesome library and scales really well to large text corpora. Gensim, however, does not include Non-negative Matrix Factorization (NMF), which can also be used to find topics in text. The mathematical basis underpinning NMF is quite different from LDA. I have found it interesting to compare the results of both of the algorithms and have found that NMF sometimes produces more meaningful topics for smaller datasets. 

NMF has been included in Scikit Learn for quite a while but LDA has only recently (late 2015) been included. The great thing about using Scikit Learn is that it brings API consistency which makes it almost trivial to perform Topic Modeling using both LDA and NMF. Scikit Learn also includes seeding options for NMF which greatly helps with algorithm convergence and offers both online and batch variants of LDA.

In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# English stop-word list from the NLTK stopwords corpus; reused by both the
# scikit-learn and the gensim sections below
en_stop = stopwords.words('english')

# Porter stemmer instance shared by every tokenization step in this notebook
p_stemmer = PorterStemmer()
   
# five short sample documents that form the toy corpus
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health." 

# compile sample documents into a list (index 0 = doc_a ... index 4 = doc_e)
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

### Use sklearn.decomposition.NMF to run topic modeling

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Baseline: a TfidfVectorizer built with no arguments, i.e. library defaults
tfidf=TfidfVectorizer()
# Fit on the whole corpus and show the dense TF-IDF row of doc_b (index 1)
tfidf.fit_transform(doc_set).toarray()[1]

array([ 0.        ,  0.30442406,  0.        ,  0.30442406,  0.        ,
        0.        ,  0.        ,  0.20387634,  0.        ,  0.        ,
        0.        ,  0.        ,  0.24560742,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.30442406,  0.        ,  0.20387634,
        0.40775268,  0.        ,  0.        ,  0.30442406,  0.        ,
        0.        ,  0.30442406,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.30442406,  0.        ,
        0.        ,  0.        ,  0.30442406,  0.20387634,  0.        ,  0.        ])

In [4]:
def tokenize(text):
    """Split *text* on whitespace and return the Porter stem of each piece.

    Only whitespace separates tokens, so punctuation stays attached to the
    neighbouring word (the vocabulary printed further down contains entries
    such as 'brocolli,' and 'eat.' for that reason).
    """
    stems = []
    for word in text.split():
        stems.append(p_stemmer.stem(word))
    return stems

# Rebuild the vectorizer with the NLTK stop-word list and the stemming tokenize() function
tfidf=TfidfVectorizer(stop_words=en_stop, tokenizer=tokenize)
# Keep the fitted TF-IDF matrix; it is the input to NMF further down
tfidf_data=tfidf.fit_transform(doc_set)
# Dense TF-IDF row of doc_b (index 1), for comparison with the baseline output
tfidf_data.toarray()[1]

array([ 0.36398684,  0.36398684,  0.        ,  0.        ,  0.        ,
        0.        ,  0.24376623,  0.        ,  0.24376623,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.36398684,  0.        ,
        0.29366229,  0.        ,  0.        ,  0.        ,  0.        ,
        0.36398684,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.36398684,  0.        ,  0.        ,
        0.36398684,  0.        ])

In [5]:
# Vocabulary learned by the vectorizer (stemmed tokens; punctuation is still attached)
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version
tfidf.get_feature_names()

['around',
 'basebal',
 'better.',
 'blood',
 'brocolli',
 'brocolli,',
 'brother',
 'caus',
 'drive',
 'eat',
 'eat.',
 'expert',
 'feel',
 'good',
 'health',
 'health.',
 'increas',
 'like',
 'lot',
 'may',
 'mother',
 'mother.',
 'never',
 'often',
 'perform',
 'practice.',
 'pressur',
 'pressure.',
 'profession',
 'say',
 'school,',
 'seem',
 'spend',
 'suggest',
 'tension',
 'time',
 'well']

In [6]:
from sklearn.decomposition import NMF
# Factorize the TF-IDF matrix into 3 components (topics); every other setting is left at its default
nmf=NMF(n_components=3)
nmf.fit(tfidf_data)

NMF(alpha=0.0, beta=1, eta=0.1, init=None, l1_ratio=0.0, max_iter=200,
  n_components=3, nls_max_iter=2000, random_state=None, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [7]:
# Learned topic matrix: one row per topic, one weight per vocabulary term
nmf.components_

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   4.75311216e-01,   2.55432534e-01,
          1.62195511e-01,   0.00000000e+00,   0.00000000e+00,
          2.55432534e-01,   2.55432534e-01,   0.00000000e+00,
          0.00000000e+00,   6.81583674e-01,   2.56015822e-01,
          3.33018812e-01,   0.00000000e+00,   2.55432534e-01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          2.55432534e-01,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   3.33018812e-01,   3.33018812e-01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00],
       [  2.30447562e-01,   2.30447562e-01,   1.95220476e-01,
          0.00000000e+00,   0.00000000e+00,   5.53711222e-03,
          2.88983416e-01,   0.00000000e+00,   2.84307136e-01,
          5.53711222e-03,   5.53711222e-03,

In [8]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic of a fitted model.

    Args:
        model: object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: sequence mapping a column index to its term.
        no_top_words: how many terms to print for each topic.

    For each topic a "Topic <index>:" header line is printed, followed by
    one line with the terms joined by single spaces, heaviest first.
    """
    for idx, weights in enumerate(model.components_):
        # argsort() is ascending, so reverse it and keep the leading entries
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in ranked]
        print("Topic %d:" % idx)
        print(" ".join(top_terms))

In [9]:
# Print the 3 highest-weighted (stemmed) terms of each NMF topic
display_topics(nmf, tfidf.get_feature_names(), 3)

Topic 0:
good brocolli say
Topic 1:
mother brother drive
Topic 2:
tension suggest increas


### Use sklearn.decomposition.LatentDirichletAllocation to run topic modeling

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Raw term counts for LDA. Unlike the NMF section, no stop-word list or stemming
# tokenizer is passed here, so common words remain in the vocabulary.
count=CountVectorizer()
ct_data=count.fit_transform(doc_set)
# Dense count row of doc_b (index 1)
ct_data.toarray()[1]

array([0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 2, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0], dtype=int64)

In [11]:
from sklearn.decomposition import LatentDirichletAllocation
# Fit a 3-topic LDA model on the count matrix; every other setting is left at its default
# NOTE(review): newer scikit-learn releases renamed n_topics to n_components --
# confirm against the installed version before re-running
lda=LatentDirichletAllocation(n_topics=3)
lda.fit(ct_data)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=3, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [12]:
# Learned topic matrix: one row per topic, one weight per vocabulary term
lda.components_

array([[ 0.47877873,  1.26048566,  1.27281186,  1.25058509,  1.27062406,
         0.47061127,  0.47368621,  2.09892108,  1.28707031,  0.46967821,
         1.25348414,  1.27636214,  1.25589913,  0.42678886,  0.43129356,
         1.25745904,  0.46417913,  0.48903754,  0.45514668,  0.46225367,
         0.4624507 ,  0.4587893 ,  1.28097558,  0.44374677,  2.05333976,
         3.68004993,  1.2782428 ,  0.47698271,  1.27108281,  1.27609436,
         1.28449295,  1.27622247,  1.24648183,  0.45634715,  0.42442582,
         1.28790386,  1.28043624,  0.46580038,  1.26620852,  0.46277578,
         0.4667647 ,  0.4504949 ,  1.27565799,  3.69024574,  1.27252745,
         0.51128832],
       [ 0.46627016,  0.46432638,  0.46692767,  0.46288833,  0.48365718,
         0.46076456,  2.07468826,  1.27388109,  1.24424305,  0.49900327,
         0.4392863 ,  0.49984064,  0.43360291,  2.1020979 ,  0.43680005,
         0.49487926,  0.45019329,  2.06885764,  0.49033502,  0.45825183,
         1.2488794 ,  1.27549

In [13]:
# Print the 3 highest-weighted terms of each LDA topic (vocabulary is unstemmed and includes stop words)
display_topics(lda, count.get_feature_names(), 3)

Topic 0:
to my brother
Topic 1:
eat my brocolli
Topic 2:
health that for


### Use gensim to run topic modeling

In [15]:
from gensim import corpora
import gensim

In [16]:
# Tokenize on runs of word characters, which drops punctuation from the tokens
tokenizer = RegexpTokenizer(r'\w+')

# Build the stop-word lookup once; a set gives constant-time membership tests
# (stopwords.words() is expected to return a plain list -- a set works either way)
stop_set = set(en_stop)

# list for tokenized documents in loop
texts = []

# loop through document list
for doc in doc_set:

    # clean and tokenize document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [tok for tok in tokens if tok not in stop_set]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(tok) for tok in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

# display the tokenized, stop-word-filtered, stemmed documents
texts

[['brocolli',
  'good',
  'eat',
  'brother',
  'like',
  'eat',
  'good',
  'brocolli',
  'mother'],
 ['mother',
  'spend',
  'lot',
  'time',
  'drive',
  'brother',
  'around',
  'basebal',
  'practic'],
 ['health',
  'expert',
  'suggest',
  'drive',
  'may',
  'caus',
  'increas',
  'tension',
  'blood',
  'pressur'],
 ['often',
  'feel',
  'pressur',
  'perform',
  'well',
  'school',
  'mother',
  'never',
  'seem',
  'drive',
  'brother',
  'better'],
 ['health', 'profession', 'say', 'brocolli', 'good', 'health']]

In [17]:

# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(texts)
    
# convert each tokenized document into its bag-of-words form via doc2bow
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model: 3 topics, 20 passes, using the dictionary to map ids back to terms
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary, passes=20)
# show the 3 top words of each of the 3 topics
ldamodel.print_topics(num_topics=3, num_words=3)

[(0, '0.065*"drive" + 0.065*"health" + 0.065*"pressur"'),
 (1, '0.082*"good" + 0.082*"brocolli" + 0.081*"mother"'),
 (2, '0.081*"health" + 0.047*"feel" + 0.047*"perform"')]

In [None]:
# Inspect the bag-of-words corpus built with dictionary.doc2bow
corpus