## Latent Dirichlet Allocation

* Topic modeling 指的是一組從文集中抽取隱藏「主題」thematic structures 的技術方法。在許多的應用，我們都想自動抽取一篇文章或一段話所表達的「中心思想」。

![title](img/lda.png)
![title](img/lda2.png)

### 特別該注意的地方
1. LDA 是 bag-of-words 的 model（只看每個詞出現的次數，不考慮詞的順序）
2. 最基本的 LDA model 在產生每個詞時，都需要重新擲 doc-topic 和 topic-word 的骰子


reference from http://emma.memect.com/t/9756da9a47744de993d8df13a26e04e38286c9bc1c5a0d2b259c4564c6613298/LDA

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Data: the 20 Newsgroups corpus (http://qwone.com/~jason/20Newsgroups/),
# restricted to four science newsgroups.
dataset = fetch_20newsgroups(
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
    categories=['sci.electronics', 'sci.crypt', 'sci.med', 'sci.space']
    # alternative category set:
    # categories=['rec.sport.baseball', 'sci.crypt', 'comp.graphics', 'talk.politics.guns']
)
documents = dataset.data
labels = dataset.target  # not used in this cell

# LDA is a probabilistic graphical model over word occurrences, so it is fed
# raw term counts (CountVectorizer) rather than TF-IDF weights.
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=1000,
    stop_words='english',
    strip_accents='unicode',
    lowercase=True,
    # keep purely alphabetic tokens of at least three letters
    token_pattern=r'\b[a-zA-Z]{3,}\b'
)
tf = tf_vectorizer.fit_transform(documents)
feature_names = tf_vectorizer.get_feature_names()

# Run LDA with one topic per selected newsgroup.
# NOTE: `n_topics` was renamed to `n_components` in scikit-learn 0.19; keep
# the spelling that matches the installed version.
model = LDA(
    n_topics=4,
    max_iter=5,
    learning_method='batch',
    learning_offset=50.,
    random_state=0
).fit(tf)

# Show keywords: for every topic, list the highest-weighted words.
# print(...) with a single argument produces the same output under Python 2
# and Python 3, and keeps working once `print_function` has been imported
# from __future__ elsewhere in the session.
no_top_words = 10
for topic_idx, topic in enumerate(model.components_):
    print("Topic %d:" % topic_idx)
    # argsort is ascending; the reversed slice takes the last `no_top_words`
    # indices, i.e. the largest weights first.
    top_word_ids = topic.argsort()[:-no_top_words - 1:-1]
    print(" ".join(feature_names[i] for i in top_word_ids))

Topic 0:
edu people don like just know com think time good
Topic 1:
space nasa launch earth program data orbit shuttle satellite lunar
Topic 2:
encryption government key use chip clipper privacy information law security
Topic 3:
key use like don bit just used know time good


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups

categories = ['rec.sport.hockey', 'talk.religion.misc', 'soc.religion.christian']

# training navie bayes
trained_dataset = fetch_20newsgroups(
    random_state=1, 
    remove=('headers', 'footers', 'quotes'),
    subset='train',
    categories=categories
)
X0 = trained_dataset.data
y0 = trained_dataset.target
h_pipeline = Pipeline([
    ('vec', TfidfVectorizer(stop_words='english')),
    ('clf', MultinomialNB(alpha=.1)),
])
%timeit h_pipeline.fit(X0, y0)

# prediction
test_dataset = fetch_20newsgroups(
    random_state=1, 
    remove=('headers', 'footers', 'quotes'),
    subset='test',
    categories=categories
)
X1 = test_dataset.data
y1 = test_dataset.target
test_label_names = test_dataset.target_names
h_pipeline.score(X1, y1)
predicted_y1 = h_pipeline.predict(X1)
print(metrics.classification_report(
    y1, 
    predicted_y1, 
    target_names=test_label_names
))

1 loop, best of 3: 394 ms per loop
                        precision    recall  f1-score   support

      rec.sport.hockey       0.93      0.97      0.95       399
soc.religion.christian       0.69      0.95      0.80       398
    talk.religion.misc       0.93      0.33      0.49       251

           avg / total       0.84      0.81      0.78      1048



In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation as LDA

categories = ['rec.sport.hockey', 'talk.religion.misc', 'soc.religion.christian']

# prepare lda model
all_dataset = fetch_20newsgroups(
    random_state=1, 
    remove=('headers', 'footers', 'quotes'),
    categories=categories
)
X = all_dataset.data
tf_vectorizer = CountVectorizer(
    max_df=0.95, 
    min_df=2, 
    max_features=1000, 
    stop_words='english',
    strip_accents = 'unicode',
    lowercase = True,
    token_pattern = r'\b[a-zA-Z]{3,}\b'
)
tf = tf_vectorizer.fit_transform(X)
lda = LDA(
    n_topics=3, 
    max_iter=5, 
    learning_method='batch', 
    learning_offset=50.,
    random_state=0
).fit(tf)
no_top_words = 10
feature_names = tf_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print "Topic %d:" % (topic_idx)
    print " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])

# training navie bayes
trained_dataset = fetch_20newsgroups(
    random_state=1, 
    remove=('headers', 'footers', 'quotes'),
    subset='train',
    categories=categories
)
X0 = trained_dataset.data
y0 = trained_dataset.target
h_pipeline = Pipeline([
    ('vec', TfidfVectorizer(stop_words='english')),
    ('lda', lda),
    ('clf', MultinomialNB(alpha=.1)),
])
%timeit h_pipeline.fit(X0, y0)

# prediction
test_dataset = fetch_20newsgroups(
    random_state=1, 
    remove=('headers', 'footers', 'quotes'),
    subset='test',
    categories=categories
)
X1 = test_dataset.data
y1 = test_dataset.target
test_label_names = test_dataset.target_names
h_pipeline.score(X1, y1)
predicted_y1 = h_pipeline.predict(X1)
print(metrics.classification_report(
    y1, 
    predicted_y1, 
    target_names=test_label_names
))

Topic 0:
people god think don know just say like does christian
Topic 1:
team game hockey play season period games year nhl pts
Topic 2:
god jesus christ church law bible john father son spirit
1 loop, best of 3: 2.71 s per loop
                        precision    recall  f1-score   support

      rec.sport.hockey       0.75      0.32      0.45       399
soc.religion.christian       0.43      0.95      0.59       398
    talk.religion.misc       0.00      0.00      0.00       251

           avg / total       0.45      0.48      0.39      1048



  'precision', 'predicted', average, warn_for)


In [4]:
# https://github.com/bmabey/pyLDAvis/blob/master/notebooks/pyLDAvis_overview.ipynb
from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Set pyLDAvis up for use inside the notebook
# (presumably enables inline rendering; see the pyLDAvis docs).
pyLDAvis.enable_notebook()

categories = ['rec.sport.hockey', 'talk.religion.misc', 'soc.religion.christian']
newsgroups = fetch_20newsgroups(
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    random_state=1
)
docs_raw = newsgroups.data
print(len(docs_raw))

# Document-term matrix of raw counts, built from purely alphabetic tokens of
# at least three letters.
tf_vectorizer = CountVectorizer(
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    min_df=10,
    max_df=0.5
)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)

# Three topics, one per selected newsgroup; fit() returns the fitted model.
lda_tf = LDA(
    n_topics=3,
    max_iter=5,
    learning_method='batch',
    learning_offset=50.,
    random_state=0
).fit(dtm_tf)

# Kept as the last expression of the cell so the notebook displays the result.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
# "miscellaneous" means various, assorted, mixed

1576
