## NLP Topic Modeling Exercise

In [1]:
# import TfidfVectorizer and CountVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import fetch_20newsgroups from sklearn.datasets
from sklearn.datasets import fetch_20newsgroups

# import NMF and LatentDirichletAllocation from sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

In [57]:
len(documents)

11314

* create a variable called `'no_features'` and set its value to 100.

In [3]:
no_features = 100

* create a variable `'no_topics'` and set its value to 100

In [4]:
no_topics = 100

## NMF

* instantiate a TfidfVectorizer with the following parameters:


    * max_df = 0.95
    * min_df = 2
    * max_features = no_features
    * stop_words = 'english'

In [5]:
tfidf = TfidfVectorizer(
    max_df = 0.95,
    min_df = 2,
    max_features = no_features,
    stop_words = 'english'
)

* use fit_transform method of TfidfVectorizer to transform the documents

In [19]:
tfidf_docs = tfidf.fit_transform(documents)

* get the features names from TfidfVectorizer

In [69]:
tfidf_feature_names = tfidf.get_feature_names_out()

* instantiate NMF and fit transformed data

In [21]:
nmf = NMF()

In [22]:
nmf.fit_transform(tfidf_docs)



array([[0.0186895 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04587933, 0.        ,
        0.02482667],
       [0.        , 0.        , 0.        , ..., 0.        , 0.07081967,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01037582, 0.        , 0.        , ..., 0.        , 0.03395286,
        0.        ]])

## LDA w/ Sklearn

* instantiate a CountVectorizer with following parameters:


    * max_df = 0.95
    * min_df = 2
    * max_features = no_features
    * stop_words = 'english'

In [23]:
count = CountVectorizer(
    max_df = 0.95,
    min_df = 2,
    max_features = no_features,
    stop_words='english'
)

* use fit_transform method of CountVectorizer to transform documents

In [24]:
cv = count.fit_transform(documents)

* get the features names from TfidfVectorizer

In [70]:
tf_feature_mames = count.get_feature_names_out()

In [71]:
count.vocabulary_

{'sure': 80,
 'did': 22,
 'world': 97,
 'try': 88,
 'think': 85,
 'government': 38,
 'look': 53,
 'got': 37,
 'power': 64,
 'people': 62,
 'read': 69,
 'need': 59,
 'little': 50,
 'just': 42,
 'new': 60,
 'don': 27,
 'like': 47,
 'know': 44,
 'question': 68,
 'want': 93,
 'work': 96,
 'key': 43,
 'drive': 28,
 'use': 89,
 'going': 35,
 'probably': 65,
 'doesn': 26,
 'year': 98,
 'line': 48,
 'time': 86,
 'right': 71,
 'does': 25,
 'll': 51,
 'good': 36,
 'let': 46,
 'mail': 55,
 'edu': 29,
 'jesus': 41,
 'day': 21,
 'lot': 54,
 '20': 6,
 '25': 7,
 'better': 14,
 'file': 32,
 'mr': 58,
 'say': 74,
 'make': 56,
 'way': 94,
 '10': 1,
 'years': 99,
 'used': 90,
 'tell': 81,
 'thanks': 82,
 'list': 49,
 'available': 9,
 'help': 39,
 'information': 40,
 'software': 77,
 'data': 20,
 '12': 2,
 '14': 3,
 '15': 4,
 'space': 78,
 '16': 5,
 'com': 17,
 'number': 61,
 'things': 84,
 'run': 72,
 'program': 67,
 'set': 76,
 'windows': 95,
 'bit': 15,
 'best': 13,
 'state': 79,
 'course': 19,
 'diffe

* instantiate LatentDirichletAllocation and fit transformed data 

In [72]:
lda = LatentDirichletAllocation()

In [73]:
lda_docs = lda.fit_transform(cv)

In [74]:
lda_docs

array([[0.01      , 0.01000171, 0.01000244, ..., 0.01000133, 0.0100006 ,
        0.01000094],
       [0.01111111, 0.01111215, 0.01111294, ..., 0.01111269, 0.01111411,
        0.01111166],
       [0.01111111, 0.01111119, 0.01111122, ..., 0.01111227, 0.0111123 ,
        0.01111157],
       ...,
       [0.03333333, 0.03333348, 0.03333405, ..., 0.0333348 , 0.03333477,
        0.0333343 ],
       [0.02      , 0.02000013, 0.02000119, ..., 0.81998184, 0.02000439,
        0.02000021],
       [0.004     , 0.00400093, 0.00400079, ..., 0.00400101, 0.00400078,
        0.00400013]])

* create a function `display_topics` that is able to display the top words in a topic for different models

In [75]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f'Topic {topic_idx}:')
        print(' '.join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10

Topic 0:
did just ll data years going don drive edu fact
Topic 1:
thanks 14 file data years going don drive edu fact
Topic 2:
does know just ll data years god don drive edu
Topic 3:
edu 14 file just ll data line going don drive
Topic 4:
just don like tell ll ve years god doesn drive
Topic 5:
like 14 just file data years going don drive edu
Topic 6:
just a86 g9v does think doesn don drive edu fact
Topic 7:
use don ve data years going doesn drive edu fact
Topic 8:
people don just mr ll ve years going drive edu
Topic 9:
good 14 ll just data years going don drive edu
Topic 10:
think don years going doesn drive edu fact far file
Topic 11:
god don tell jesus 14 just ll 25 years going
Topic 12:
time just ll ve did want key doesn don drive
Topic 13:
windows tell data years going doesn don drive edu fact
Topic 14:
drive problem power file data years does don edu fact
Topic 15:
tell ll power mail program software data doesn did years
Topic 16:
don ll data want did years going drive edu fact
Topi

* display top 10 words from each topic from NMF model

In [76]:
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
did just ll data years going don drive edu fact
Topic 1:
thanks 14 file data years going don drive edu fact
Topic 2:
does know just ll data years god don drive edu
Topic 3:
edu 14 file just ll data line going don drive
Topic 4:
just don like tell ll ve years god doesn drive
Topic 5:
like 14 just file data years going don drive edu
Topic 6:
just a86 g9v does think doesn don drive edu fact
Topic 7:
use don ve data years going doesn drive edu fact
Topic 8:
people don just mr ll ve years going drive edu
Topic 9:
good 14 ll just data years going don drive edu
Topic 10:
think don years going doesn drive edu fact far file
Topic 11:
god don tell jesus 14 just ll 25 years going
Topic 12:
time just ll ve did want key doesn don drive
Topic 13:
windows tell data years going doesn don drive edu fact
Topic 14:
drive problem power file data years does don edu fact
Topic 15:
tell ll power mail program software data doesn did years
Topic 16:
don ll data want did years going drive edu fact
Topi

* display top 10 words from each topic from LDA model

In [77]:
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
ax max g9v b8f a86 14 mr 25 ll 12
Topic 1:
10 drive 20 12 16 15 25 14 year power
Topic 2:
00 list mail new day got ve 15 20 thanks
Topic 3:
god good jesus believe does time think true people say
Topic 4:
file windows thanks program does know use help set like
Topic 5:
like just don people want make think really know way
Topic 6:
people said don did know think say right mr going
Topic 7:
problem use bit using used law time try work run
Topic 8:
key data information use used new number software available law
Topic 9:
edu com space available mail information program list set new


### Stretch: Use LDA w/ Gensim to do the same thing.