In [60]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import string
import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
# Toy corpus: five short sentences used as the documents for topic modelling.
doc_complete = [
    "Sugar is bad to consume. My sister likes to have sugar, but not my father.",
    "My father spends a lot of time driving my sister around to dance practice.",
    "Doctors suggest that driving may cause increased stress and blood pressure.",
    "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.",
    "Health experts say that Sugar is not good for your lifestyle.",
]

# Keep the individual documents addressable by name as well.
doc1, doc2, doc3, doc4, doc5 = doc_complete
doc_complete

['Sugar is bad to consume. My sister likes to have sugar, but not my father.',
 'My father spends a lot of time driving my sister around to dance practice.',
 'Doctors suggest that driving may cause increased stress and blood pressure.',
 'Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.',
 'Health experts say that Sugar is not good for your lifestyle.']

In [4]:
def clean(doc):
    """Normalize one raw document into a space-separated string of lemmas.

    Each whitespace-separated, lower-cased token goes through three steps:
    stopword removal, punctuation stripping, and lemmatization.

    Relies on three module-level names that must be defined before the
    first call: ``stop`` (collection of stopwords), ``exclude`` (collection
    of punctuation characters) and ``lemma`` (object with ``lemmatize``).

    Args:
        doc: Raw document text.

    Returns:
        The cleaned tokens joined by single spaces (``""`` if none survive).
    """
    kept = []
    for token in doc.lower().split():
        # First pass on the raw token so contractions that appear in the
        # stopword list with their apostrophe are still matched.
        if token in stop:
            continue
        bare = "".join(ch for ch in token if ch not in exclude)
        # Second pass after stripping punctuation: a stopword glued to
        # punctuation ("it.", "not,") is missed by the first check and would
        # otherwise leak into the output. Also drops punctuation-only tokens.
        if not bare or bare in stop:
            continue
        kept.append(lemma.lemmatize(bare))
    return " ".join(kept)

In [7]:
# Shared resources read by clean(): English stopwords, ASCII punctuation
# characters, and a WordNet lemmatizer.
stop = set(stopwords.words("english"))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

# Clean every document, then split the cleaned string into a token list.
doc_clean = [cleaned.split() for cleaned in map(clean, doc_complete)]
doc_clean

[['sugar', 'bad', 'consume', 'sister', 'like', 'sugar', 'father'],
 ['father',
  'spends',
  'lot',
  'time',
  'driving',
  'sister',
  'around',
  'dance',
  'practice'],
 ['doctor',
  'suggest',
  'driving',
  'may',
  'cause',
  'increased',
  'stress',
  'blood',
  'pressure'],
 ['sometimes',
  'feel',
  'pressure',
  'perform',
  'well',
  'school',
  'father',
  'never',
  'seems',
  'drive',
  'sister',
  'better'],
 ['health', 'expert', 'say', 'sugar', 'good', 'lifestyle']]

## Create Document-Term Matrix
A document-term matrix is the bag-of-words representation of a corpus: each row is a document, each column is a term, and each cell holds the number of times that term occurs in that document.  
```text
   W1  W2  W3 ...
D1 1   99  
D2
D3
```

In [37]:
# Build the token <-> integer-id mapping from the tokenised documents.
dictionary = corpora.Dictionary(doc_clean)

# Convert each document into its sparse bag-of-words form:
# a list of (token_id, count) pairs.
doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))
doc_term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)],
 [(2, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(8, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1)],
 [(2, 1),
  (4, 1),
  (18, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)],
 [(5, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]]

In [42]:
# Human-readable view of the bag-of-words corpus: replace each term id with
# the term it stands for, keeping the per-document counts.
[
    [(dictionary[term_id], count) for term_id, count in bow]
    for bow in doc_term_matrix[:5]
]

[[('bad', 1),
  ('consume', 1),
  ('father', 1),
  ('like', 1),
  ('sister', 1),
  ('sugar', 2)],
 [('father', 1),
  ('sister', 1),
  ('around', 1),
  ('dance', 1),
  ('driving', 1),
  ('lot', 1),
  ('practice', 1),
  ('spends', 1),
  ('time', 1)],
 [('driving', 1),
  ('blood', 1),
  ('cause', 1),
  ('doctor', 1),
  ('increased', 1),
  ('may', 1),
  ('pressure', 1),
  ('stress', 1),
  ('suggest', 1)],
 [('father', 1),
  ('sister', 1),
  ('pressure', 1),
  ('better', 1),
  ('drive', 1),
  ('feel', 1),
  ('never', 1),
  ('perform', 1),
  ('school', 1),
  ('seems', 1),
  ('sometimes', 1),
  ('well', 1)],
 [('sugar', 1),
  ('expert', 1),
  ('good', 1),
  ('health', 1),
  ('lifestyle', 1),
  ('say', 1)]]

## LDA (Latent Dirichlet Allocation)
LDA uses the document-term matrix to model topics. It decomposes this matrix into two matrices, **M1** and **M2**, where:  
M1 = document-topic matrix  
M2 = topic-term matrix

In [43]:
# Short alias for gensim's LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Train a 3-topic LDA model on the bag-of-words corpus.
# random_state is fixed so repeated runs give the same topics.
lda_model = Lda(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=3,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
    per_word_topics=True,
)

In [51]:
# Show the top words of every topic. The topic count is read from the model
# itself, so this cell does not depend on a separately defined variable
# (the previous `num_topic` name was never assigned in this notebook).
pprint.pprint(lda_model.print_topics(num_topics=lda_model.num_topics))

# Indexing the model with the corpus yields a TransformedCorpus wrapper
# rather than a plain list; iterate over it to get per-document results.
doc_lda = lda_model[doc_term_matrix]
doc_lda

[(0,
  '0.079*"driving" + 0.045*"suggest" + 0.045*"stress" + 0.045*"blood" + '
  '0.045*"may" + 0.045*"doctor" + 0.045*"increased" + 0.045*"time" + '
  '0.045*"cause" + 0.045*"around"'),
 (1,
  '0.091*"sugar" + 0.064*"sister" + 0.064*"father" + 0.036*"health" + '
  '0.036*"bad" + 0.036*"consume" + 0.036*"expert" + 0.036*"lifestyle" + '
  '0.036*"good" + 0.036*"like"'),
 (2,
  '0.029*"father" + 0.029*"pressure" + 0.029*"drive" + 0.029*"sometimes" + '
  '0.029*"school" + 0.029*"perform" + 0.029*"well" + 0.029*"never" + '
  '0.029*"better" + 0.029*"sister"')]


<gensim.interfaces.TransformedCorpus at 0x7f29b6e549e8>

In [64]:
# "Perplexity" line: per the gensim docs, log_perplexity() returns the
# per-word likelihood bound (a log-scale value, hence negative), not the
# perplexity itself; perplexity is 2 ** (-bound).
print("\nPerplexity: ", lda_model.log_perplexity(doc_term_matrix))

# Topic coherence (c_v measure), computed from the tokenised texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=doc_clean,
    dictionary=dictionary,
    coherence="c_v",
)
coherence_lda = coherence_model_lda.get_coherence()
print("\nCoherence Score: ", coherence_lda)


Perplexity:  -4.11920134341994

Coherence Score:  0.31970581848144614


In [65]:
# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Build the interactive topic visualisation from the trained model,
# the bag-of-words corpus and the dictionary, then display it.
vis = pyLDAvis.gensim.prepare(
    lda_model,
    doc_term_matrix,
    dictionary,
)
vis

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
