## NLP Topic Modeling Exercise

In [1]:
# import TfidfVectorizer and CountVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import fetch_20newsgroups from sklearn.datasets
from sklearn.datasets import fetch_20newsgroups

# import NMF and LatentDirichletAllocation from sklearn
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

* create a variable called `'no_features'` and set its value to 100.

In [10]:
no_features = 100

* create a variable `'no_topics'` and set its value to 100

In [11]:
no_topics = 100

## NMF

* instantiate a TfidfVectorizer with the following parameters:


    * max_df = 0.95
    * min_df = 2
    * max_features = no_features
    * stop_words = 'english'

In [12]:
tfidf_vect = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features,stop_words='english')

* use fit_transform method of TfidfVectorizer to transform the documents

In [6]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
 
data = []
 
for fileid in brown.fileids():
    document = ' '.join(brown.words(fileid))
    data.append(document)
 
NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[:5])

ainly interested in setting up an international inspection system which will prevent Laos from being used as a base for Communist attacks on neighboring Thailand and South Viet Nam . They count on the aid of the neutral countries attending the Geneva conference to achieve this . The United States hopes that any future Lao Cabinet would not become Communist dominated . But it is apparent that no acceptable formula has been found to prevent such a possibility . Policies modified The inclination here is to accept a de facto cease-fire in Laos , rather than continue to insist on a verification of the cease-fire by the international control commission before participating in the Geneva conference . This is another of the modifications of policy on Laos that the Kennedy administration has felt compelled to make . It excuses these actions as being the chain reaction to basic errors made in the previous administration . Its spokesmen insist that there has not been time enough to institute refo

In [13]:
data_vectorized = tfidf_vect.fit_transform(data)

* get the features names from TfidfVectorizer

In [16]:
tfidf_vect.get_feature_names()

['000',
 'af',
 'american',
 'asked',
 'away',
 'better',
 'business',
 'called',
 'came',
 'children',
 'city',
 'come',
 'course',
 'day',
 'days',
 'did',
 'didn',
 'does',
 'don',
 'end',
 'eyes',
 'face',
 'fact',
 'far',
 'form',
 'general',
 'given',
 'going',
 'good',
 'got',
 'government',
 'great',
 'group',
 'half',
 'hand',
 'head',
 'high',
 'home',
 'house',
 'john',
 'just',
 'knew',
 'know',
 'large',
 'later',
 'left',
 'let',
 'life',
 'like',
 'little',
 'll',
 'long',
 'look',
 'make',
 'man',
 'men',
 'mr',
 'mrs',
 'national',
 'new',
 'night',
 'number',
 'old',
 'order',
 'people',
 'place',
 'point',
 'present',
 'president',
 'program',
 'public',
 'right',
 'room',
 'said',
 'say',
 'school',
 'second',
 'set',
 'small',
 'social',
 'state',
 'states',
 'think',
 'thought',
 'time',
 'told',
 'took',
 'united',
 'use',
 'used',
 'war',
 'water',
 'way',
 'went',
 'white',
 'work',
 'world',
 'year',
 'years',
 'young']

* instantiate NMF and fit transformed data

In [17]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

In [19]:
nmf_model = NMF(n_components=no_topics)
nmf_Z = nmf_model.fit_transform(data_vectorized)



In [20]:
nmf_Z

array([[0.00000000e+00, 3.97212869e-07, 0.00000000e+00, ...,
        3.49901632e-02, 0.00000000e+00, 1.06844184e-02],
       [0.00000000e+00, 1.03124961e-01, 0.00000000e+00, ...,
        2.27706511e-02, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.62684485e-02, 0.00000000e+00, ...,
        8.70571863e-03, 0.00000000e+00, 3.98756538e-02],
       ...,
       [1.61628268e-02, 1.74134483e-07, 0.00000000e+00, ...,
        1.85050949e-02, 0.00000000e+00, 8.47494443e-03],
       [2.93318565e-02, 2.78107774e-06, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 2.30701228e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        8.88185003e-02, 3.61754918e-02, 0.00000000e+00]])

## LDA w/ Sklearn

* instantiate a CountVectorizer with following parameters:


    * max_df = 0.95
    * min_df = 2
    * max_features = no_features
    * stop_words = 'english'

In [22]:
count_vect = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')

* use fit_transform method of CountVectorizer to transform documents

In [23]:
data_vectorized = count_vect.fit_transform(data)

* get the features names from TfidfVectorizer

In [24]:
count_vect.get_feature_names()

['000',
 'af',
 'american',
 'asked',
 'away',
 'better',
 'business',
 'called',
 'came',
 'children',
 'city',
 'come',
 'course',
 'day',
 'days',
 'did',
 'didn',
 'does',
 'don',
 'end',
 'eyes',
 'face',
 'fact',
 'far',
 'form',
 'general',
 'given',
 'going',
 'good',
 'got',
 'government',
 'great',
 'group',
 'half',
 'hand',
 'head',
 'high',
 'home',
 'house',
 'john',
 'just',
 'knew',
 'know',
 'large',
 'later',
 'left',
 'let',
 'life',
 'like',
 'little',
 'll',
 'long',
 'look',
 'make',
 'man',
 'men',
 'mr',
 'mrs',
 'national',
 'new',
 'night',
 'number',
 'old',
 'order',
 'people',
 'place',
 'point',
 'present',
 'president',
 'program',
 'public',
 'right',
 'room',
 'said',
 'say',
 'school',
 'second',
 'set',
 'small',
 'social',
 'state',
 'states',
 'think',
 'thought',
 'time',
 'told',
 'took',
 'united',
 'use',
 'used',
 'war',
 'water',
 'way',
 'went',
 'white',
 'work',
 'world',
 'year',
 'years',
 'young']

* instantiate LatentDirichletAllocation and fit transformed data 

In [26]:
lsi_model = TruncatedSVD(n_components=10)
lsi_Z = lsi_model.fit_transform(data_vectorized)

* create a function `display_topics` that is able to display the top words in a topic for different models

In [27]:

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 

In [28]:
print_topics(lsi_model, count_vect)

Topic 0:
[('said', 0.3692391634567834), ('time', 0.2397710480393271), ('new', 0.22421821776565362), ('like', 0.21414103558839429), ('man', 0.20979180327698285), ('af', 0.16734595948876485), ('did', 0.16112111937015908), ('mr', 0.14552097303578335), ('just', 0.14001793392085932), ('years', 0.13403180471426757)]
Topic 1:
[('af', 0.972320440304515), ('form', 0.055875141435542126), ('point', 0.051344425089258496), ('number', 0.04469496764026157), ('used', 0.033388669175006165), ('order', 0.03127891993951869), ('let', 0.027838807715546622), ('general', 0.02564973731376213), ('state', 0.025324135276413084), ('second', 0.017302794035884795)]
Topic 2:
[('state', 0.4531420740692523), ('new', 0.30527974023965826), ('states', 0.2746613016449504), ('year', 0.20706654220740833), ('united', 0.1905624276424762), ('government', 0.16172136561070521), ('program', 0.13264913388717584), ('years', 0.12479587513383822), ('world', 0.11851255873907648), ('000', 0.11612989392645844)]
Topic 3:
[('mrs', 0.836402

* display top 10 words from each topic from NMF model

In [29]:
print_topics(nmf_model, tfidf_vect)

, ('good', 0.0), ('got', 0.0), ('government', 0.0)]
Topic 20:
[('house', 4.374534224768016), ('come', 1.5443589374104446e-05), ('face', 2.500553133447294e-06), ('year', 1.9586942427613948e-10), ('school', 1.560602591867271e-12), ('head', 0.0), ('going', 0.0), ('good', 0.0), ('got', 0.0), ('government', 0.0)]
Topic 21:
[('city', 3.315418817693784), ('home', 0.00010090732880566193), ('night', 7.619446502312185e-07), ('world', 2.2036341369946655e-08), ('old', 3.601067763737721e-16), ('young', 0.0), ('head', 0.0), ('going', 0.0), ('good', 0.0), ('got', 0.0)]
Topic 22:
[('american', 3.689096824620324), ('point', 0.09075373705377267), ('states', 0.003870624993479135), ('years', 1.96911209271172e-05), ('world', 1.307172581272092e-07), ('men', 2.2620312870806886e-08), ('group', 1.2106335823845674e-10), ('john', 3.662718240408694e-11), ('half', 1.3896186070033515e-11), ('young', 0.0)]
Topic 23:
[('public', 2.9657294565354824), ('home', 1.4669385036012732e-05), ('come', 6.412678464917038e-07), (

* display top 10 words from each topic from LDA model

### Stretch: Use LDA w/ Gensim to do the same thing.