In [1]:
# Load npr.csv -- the same dataset used previously. Its 'Article' column holds
# the raw text that gets vectorized further down.
import pandas as pd
npr = pd.read_csv('npr.csv')

### Preprocessing <br>
Here we use TfidfVectorizer because NMF works on coefficient values (further explanation in the notebook 😁) <br>
TfidfVectorizer already performs the CountVectorizer step beforehand, so no separate count step is needed.

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [3]:
# Configure the TF-IDF vectorizer; each keyword is explained inline.
tfidf = TfidfVectorizer(
    max_df=0.95,           # ignore terms that appear in more than 95% of the documents
    min_df=2,              # ignore terms that appear in fewer than 2 documents
    stop_words='english',  # drop common English stop words
)

In [4]:
# Build the document-term matrix: fit the vectorizer on the 'Article' column
# and transform every article into its TF-IDF feature vector in one step.
dtm = tfidf.fit_transform(npr['Article'])

In [5]:
dtm
# Sparse matrix: 11992 articles (rows) x 54777 vocabulary terms (columns)

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

### Non-Negative Matrix Factorization

In [6]:
from sklearn.decomposition import NMF

In [7]:
# Ask for 7 topics; the fixed seed keeps the factorization reproducible between runs.
nmf_model = NMF(n_components = 7, random_state = 42)

n_components -> how many topics we want <br>
random_state -> seed for the random number generator, so the results are reproducible on every run

In [8]:
# Fit the factorization on the TF-IDF document-term matrix.
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [9]:
# Spot-check: look up the vocabulary term stored at feature index 2300.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- switch if this raises AttributeError.
tfidf.get_feature_names()[2300]

'albala'

Previously, with LDA, we were dealing with words that have the highest probabilities of belonging to a topic. <br>
Now, with NMF, we are dealing with words that have the highest coefficient values in the model's components matrix. <br>

In [10]:
# Resolve the vocabulary once, outside the loops: calling the accessor inside
# the comprehension rebuilt the full feature list for every single word lookup.
# Newer scikit-learn releases expose get_feature_names_out() instead of
# get_feature_names(), so prefer the new accessor when it exists.
# .tolist() turns the returned array into a plain list of Python strings so
# the printed output looks the same on either code path.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out().tolist()
else:
    feature_names = tfidf.get_feature_names()

for index, topic in enumerate(nmf_model.components_):
    print(f"THE TOP 15 WORDS FOR TOPIC # {index}")
    # argsort() is ascending, so the last 15 indices are the 15 largest
    # coefficients of this topic (strongest word printed last).
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC # 0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


THE TOP 15 WORDS FOR TOPIC # 1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


THE TOP 15 WORDS FOR TOPIC # 2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


THE TOP 15 WORDS FOR TOPIC # 3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


THE TOP 15 WORDS FOR TOPIC # 4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


THE TOP 15 WORDS FOR TOPIC # 5
['love', 've', 'don

#### Attach Label to the Original Results

In [11]:
# Per-article topic coefficients: one row per article, one value per topic.
topic_results = nmf_model.transform(dtm)

Each row holds one coefficient per topic; the larger the coefficient, the more representative that topic is of the article. <br>
What we want is the index position of the most representative topic. <br>

In [12]:
# Topic coefficients for the first article.
topic_results[0]

array([0.        , 0.12075603, 0.00140297, 0.05919954, 0.01518909,
       0.        , 0.        ])

In [13]:
# Index of the largest coefficient, i.e. the dominant topic of the first article.
topic_results[0].argmax()

1

In [14]:
topic_results.argmax(axis = 1) # axis=1: argmax across the topic columns of each row -> one topic index per article

array([1, 1, 1, ..., 0, 4, 3], dtype=int64)

In [15]:
# Store each article's dominant topic index in a new 'Topic' column (modifies npr in place).
npr['Topic'] = topic_results.argmax(axis = 1)

In [16]:
# Preview the first rows to confirm the 'Topic' column was added.
npr.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [17]:
# Human-readable label for each of the 7 topic indices.
# NOTE(review): topics 1 and 4 both map to 'election' -- confirm this is intended.
mytopic_dict = {
    0: 'health',
    1: 'election',
    2: 'legis',
    3: 'poli',
    4: 'election',
    5: 'music',
    6: 'education',
}
# Translate every numeric topic index into its label.
npr['Topic Label'] = npr['Topic'].map(mytopic_dict)

In [19]:
# Show a bounded preview (first 10 rows) instead of echoing the whole
# DataFrame, which keeps the saved notebook output small and readable.
npr.head(10)

Unnamed: 0,Article,Topic,Topic Label
0,"In the Washington of 2016, even when the polic...",1,election
1,Donald Trump has used Twitter — his prefe...,1,election
2,Donald Trump is unabashedly praising Russian...,1,election
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,poli
4,"From photography, illustration and video, to d...",6,education
5,I did not want to join yoga class. I hated tho...,5,music
6,With a who has publicly supported the debunk...,0,health
7,"I was standing by the airport exit, debating w...",0,health
8,"If movies were trying to be more realistic, pe...",0,health
9,"Eighteen years ago, on New Year’s Eve, David F...",5,music
