## NLP Topic Modeling Exercise

In [1]:
# import TfidfVectorizer and CountVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import fetch_20newsgroups from sklearn.datasets
from sklearn.datasets import fetch_20newsgroups

# import NMF and LatentDirichletAllocation from sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

* create a variable called `'no_features'` and set its value to 100.

In [3]:
no_features = 100

* create a variable `'no_topics'` and set its value to 100

In [4]:
no_topics = 100

## NMF

* instantiate a TfidfVectorizer with the following parameters:


    * max_df = 0.95
    * min_df = 2
    * max_features = no_features
    * stop_words = 'english'

In [5]:
vec = TfidfVectorizer(max_df = 0.95, min_df = 2, max_features = no_features, stop_words = 'english')

* use fit_transform method of TfidfVectorizer to transform the documents

In [6]:
X = vec.fit_transform(documents)

* get the features names from TfidfVectorizer

In [10]:
print(vec.get_feature_names())

['00', '10', '12', '14', '15', '16', '20', '25', 'a86', 'available', 'ax', 'b8f', 'believe', 'best', 'better', 'bit', 'case', 'com', 'come', 'course', 'data', 'day', 'did', 'didn', 'different', 'does', 'doesn', 'don', 'drive', 'edu', 'fact', 'far', 'file', 'g9v', 'god', 'going', 'good', 'got', 'government', 'help', 'information', 'jesus', 'just', 'key', 'know', 'law', 'let', 'like', 'line', 'list', 'little', 'll', 'long', 'look', 'lot', 'mail', 'make', 'max', 'mr', 'need', 'new', 'number', 'people', 'point', 'power', 'probably', 'problem', 'program', 'question', 'read', 'really', 'right', 'run', 'said', 'say', 'second', 'set', 'software', 'space', 'state', 'sure', 'tell', 'thanks', 'thing', 'things', 'think', 'time', 'true', 'try', 'use', 'used', 'using', 've', 'want', 'way', 'windows', 'work', 'world', 'year', 'years']


* instantiate NMF and fit transformed data

In [19]:
nmf = NMF()
nmf_fitted = nmf.fit_transform(X)



In [20]:
print(nmf_fitted.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(11314, 100)


## LDA w/ Sklearn

* instantiate a CountVectorizer with following parameters:


    * max_df = 0.95
    * min_df = 2
    * max_features = no_features
    * stop_words = 'english'

In [28]:
count_vec = CountVectorizer(max_df = 0.95, min_df = 2, max_features = no_features, stop_words = 'english')

* use fit_transform method of CountVectorizer to transform documents

In [30]:
X2 = count_vec.fit_transform(documents)

* get the features names from TfidfVectorizer

In [31]:
print(count_vec.get_feature_names())

['00', '10', '12', '14', '15', '16', '20', '25', 'a86', 'available', 'ax', 'b8f', 'believe', 'best', 'better', 'bit', 'case', 'com', 'come', 'course', 'data', 'day', 'did', 'didn', 'different', 'does', 'doesn', 'don', 'drive', 'edu', 'fact', 'far', 'file', 'g9v', 'god', 'going', 'good', 'got', 'government', 'help', 'information', 'jesus', 'just', 'key', 'know', 'law', 'let', 'like', 'line', 'list', 'little', 'll', 'long', 'look', 'lot', 'mail', 'make', 'max', 'mr', 'need', 'new', 'number', 'people', 'point', 'power', 'probably', 'problem', 'program', 'question', 'read', 'really', 'right', 'run', 'said', 'say', 'second', 'set', 'software', 'space', 'state', 'sure', 'tell', 'thanks', 'thing', 'things', 'think', 'time', 'true', 'try', 'use', 'used', 'using', 've', 'want', 'way', 'windows', 'work', 'world', 'year', 'years']


* instantiate LatentDirichletAllocation and fit transformed data 

In [32]:
lda = LatentDirichletAllocation()
lda_fitted = lda.fit_transform(X2)

* create a function `display_topics` that is able to display the top words in a topic for different models

In [43]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
#         if idx == 9:
#             break

* display top 10 words from each topic from NMF model

In [44]:
print("NMF Model:")
print_topics(nmf, vec)

NMF Model:
Topic 0:
[('did', 16.40089167562295), ('just', 2.7125032748952746e-07), ('ll', 6.068010410205508e-09), ('data', 3.4930937935321226e-10), ('years', 0.0), ('going', 0.0), ('don', 0.0), ('drive', 0.0), ('edu', 0.0), ('fact', 0.0)]
Topic 1:
[('thanks', 10.587911306174842), ('14', 6.19577186560542e-06), ('file', 1.261614559601937e-07), ('data', 3.2150618082632774e-10), ('years', 0.0), ('going', 0.0), ('don', 0.0), ('drive', 0.0), ('edu', 0.0), ('fact', 0.0)]
Topic 2:
[('does', 4.436783869191332), ('know', 8.159441789334389e-05), ('just', 2.3191431546221394e-07), ('ll', 4.134329650471942e-09), ('data', 2.8295448876162086e-10), ('years', 0.0), ('god', 0.0), ('don', 0.0), ('drive', 0.0), ('edu', 0.0)]
Topic 3:
[('edu', 5.083671527080077), ('14', 1.5876118805757653e-06), ('file', 1.0610127259158247e-06), ('just', 8.851142897319139e-08), ('ll', 5.013211101913869e-09), ('data', 2.0520599540926969e-10), ('line', 1.907036395945729e-12), ('going', 0.0), ('don', 0.0), ('drive', 0.0)]
Topic

* display top 10 words from each topic from LDA model

In [45]:
print("LDA Model:")
print_topics(lda, count_vec)

LDA Model:
Topic 0:
[('people', 2076.0369855399726), ('god', 1961.0182740158004), ('government', 1167.3700394675188), ('law', 1036.1825508006202), ('believe', 965.0523957513936), ('does', 931.4622051041653), ('jesus', 887.0999468312318), ('state', 720.939754114056), ('question', 694.4593507322932), ('right', 663.0098920462898)]
Topic 1:
[('ax', 62387.099998807396), ('max', 4531.217620738386), ('g9v', 1166.0999981563643), ('b8f', 1111.0999957688675), ('a86', 916.0999960650672), ('14', 115.42083266659765), ('mr', 100.71429057192557), ('25', 49.94675397250037), ('ll', 8.95500425744777), ('12', 4.204068808161783)]
Topic 2:
[('edu', 2403.98081529731), ('com', 1375.4846686185003), ('mail', 937.2485830801152), ('list', 688.586686506407), ('available', 507.54609960953184), ('information', 337.5703252439141), ('use', 145.1295022407829), ('like', 143.5294515042493), ('thanks', 140.0768825923925), ('new', 136.75367239208163)]
Topic 3:
[('space', 1279.909893757134), ('data', 1044.2060773103772), (

### Stretch: Use LDA w/ Gensim to do the same thing.