In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Load the articles CSV into a DataFrame.
df = pd.read_csv('../input/articles/articles.csv')

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser configuration:
#   max_df=0.95 -> drop terms that appear in more than 95% of the documents
#   min_df=2    -> drop terms that appear in fewer than 2 documents
#   stop_words  -> remove scikit-learn's built-in English stop-word list
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

In [5]:
# Document-term matrix: learn the TF-IDF vocabulary from the article text and
# encode every article as one sparse row of term weights.
dtm = tfidf.fit_transform(df['Article'])

In [6]:
# Display the sparse-matrix summary (shape, dtype and number of stored values).
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

## Performing NMF using scikit-learn

In [7]:
from sklearn.decomposition import NMF

In [8]:
# Factorise the corpus into 7 topics; the seed is fixed so repeated runs
# produce the same factorisation.
nmf_model = NMF(
    n_components=7,
    random_state=42,
)

In [9]:
# Fit the NMF model on the TF-IDF matrix; the learned topic-term weights are
# then available as nmf_model.components_.
nmf_model.fit(dtm)

NMF(n_components=7, random_state=42)

In [10]:
# Spot-check one vocabulary entry by its column index in the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older versions.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

feature_names[1029]

'756'

In [11]:
# Show the 15 highest-weighted terms for each NMF topic. components_ holds
# non-negative term weights (not probabilities), one row per topic.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older versions.
# The vocabulary is fetched once here instead of once per printed word.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

for i, topic in enumerate(nmf_model.components_):
    print(f'TOP 15 WORDS FOR TOPIC #{i}')
    # argsort is ascending, so the last 15 indices carry the largest weights
    # (the strongest term is printed last).
    print([feature_names[index] for index in topic.argsort()[-15:]])
    print('\n\n')

TOP 15 WORDS FOR TOPIC #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']



TOP 15 WORDS FOR TOPIC #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']



TOP 15 WORDS FOR TOPIC #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']



TOP 15 WORDS FOR TOPIC #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']



TOP 15 WORDS FOR TOPIC #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']



TOP 15 WORDS FOR TOPIC #5
['love', 've', 'don', 'album', 'way', 'time'

In [12]:
# Project every article onto the fitted topics: one row per article, one
# column of topic weight per NMF component.
topic_results = nmf_model.transform(dtm)

In [13]:
# Topic weights of the first article (one value per topic).
topic_results[0]

array([0.        , 0.12075603, 0.00140297, 0.05919954, 0.01518909,
       0.        , 0.        ])

In [14]:
# Sanity check: expect (number of articles, number of topics).
topic_results.shape

(11992, 7)

In [15]:
# Topic weights of the first article. These are non-negative NMF loadings,
# not probabilities: they are not normalised to sum to 1.
first_doc_weights = topic_results[0]
print(first_doc_weights)

print('-------------------------------------------------------------')
# The same weights rounded to two decimal places for readability
print(first_doc_weights.round(2))

[0.         0.12075603 0.00140297 0.05919954 0.01518909 0.
 0.        ]
-------------------------------------------------------------
[0.   0.12 0.   0.06 0.02 0.   0.  ]


In [16]:
# Label each article with the index of its highest-weighted topic.
df['Topics'] = topic_results.argmax(axis=1)

In [17]:
# Inspect the first 20 articles together with their assigned topic index.
df.head(20)

Unnamed: 0,Article,Topics
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6
5,I did not want to join yoga class. I hated tho...,5
6,With a who has publicly supported the debunk...,0
7,"I was standing by the airport exit, debating w...",0
8,"If movies were trying to be more realistic, pe...",0
9,"Eighteen years ago, on New Year’s Eve, David F...",5
