# Let’s start coding by loading the data and the required libraries!

In [9]:
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
 
documents = pd.read_csv('news-data.csv', error_bad_lines=False)
#documents=pd.read_json ('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
documents.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


# Build the TF-IDF Matrix

In [11]:
# use tfidf by removing tokens that don't appear in at least 50 documents
vect = TfidfVectorizer(min_df=50, stop_words='english')
 
# Fit and transform
X = vect.fit_transform(documents.headline_text)
print(X)

  (0, 5904)	0.38150132624074345
  (0, 1502)	0.5178795390810941
  (0, 2273)	0.30728041415686325
  (0, 2810)	0.47191378105745035
  (0, 227)	0.5187809281395711
  (1, 2829)	0.5424693206334905
  (1, 897)	0.5779035083899079
  (1, 11056)	0.4819332869088385
  (1, 307)	0.373490131236848
  (2, 9757)	0.5401089335427871
  (2, 7838)	0.5101597079716474
  (2, 5199)	0.5381649807792245
  (2, 1685)	0.39799229355912596
  (3, 8516)	0.3512948314643252
  (3, 7297)	0.36109283322747493
  (3, 9663)	0.3786195698706414
  (3, 858)	0.38303698552018856
  (3, 9497)	0.3947242417206409
  (3, 6924)	0.39004274743128264
  (3, 446)	0.38495873075838305
  (4, 10331)	0.5179119163914456
  (4, 862)	0.3105805038106875
  (4, 383)	0.46294420318743096
  (4, 9663)	0.3688112555316992
  (4, 6924)	0.3799385104164402
  :	:
  (1103657, 5084)	0.49968507562830095
  (1103657, 9696)	0.5422650925294712
  (1103657, 9884)	0.30309149758976567
  (1103657, 4880)	0.3806255668821827
  (1103657, 11159)	0.4685329407232252
  (1103658, 9283)	0.50806236

# Build the NMF Model

In [None]:
# Create an NMF instance: model
# the 10 components will be the topics
model = NMF(n_components=10, random_state=5) #'random': non-negative random matrices, scaled with: sqrt(X.mean() / n_components)
 
# Fit the model to TF-IDF
model.fit(X)
 
# Transform the TF-IDF: nmf_features
nmf_features = model.transform(X)
print(nmf_features)

# It important to check the dimensions of the 3 tables:
# 1- TF-IDF Dimensions:

In [None]:
X.shape

# 2- Features Dimensions:

In [11]:
nmf_features.shape

(1103663, 10)

# 3- Components Dimensions:

In [12]:
model.components_.shape

(10, 11213)

In [19]:
# Create a DataFrame: components_df
components_df = pd.DataFrame(model.components_, columns=vect.get_feature_names())
components_df

Unnamed: 0,000,01,02,03,04,05,06,07,08,09,...,zimbabwean,zimbabwes,zinc,zinifex,zoe,zone,zones,zoning,zoo,zuma
0,0.000515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000243,0.0,0.0,0.0,0.001377,0.001534,0.0,0.0,0.006219,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9e-05,0.0,0.0,...,0.0,9e-06,0.000601,0.0,0.001317,9.6e-05,1.4e-05,0.0,1.1e-05,2e-06
2,0.000639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001682,0.000187,0.0,0.0,0.000146,0.005309,0.002583,0.0,0.0,0.0
3,5e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000337,0.000537,4.2e-05,0.0,0.00866,0.009708,0.001328,0.024154,0.000637
4,0.000603,0.002029,0.01103,0.009208,0.01011,0.004755,0.004782,0.004933,0.005841,0.011264,...,0.0,0.000763,0.0,0.0,0.000127,0.002611,0.005015,0.000556,0.006629,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00024,0.000548,3e-05,0.000367,0.001205,0.0,0.0,0.000503,0.0
6,0.000158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000112,0.000331,0.0,0.0,0.006822,0.0,0.0,0.007919,0.0
7,0.001223,0.0,0.0,0.0,0.000521,0.000105,8.7e-05,0.00079,0.0,0.0,...,0.004573,0.000989,0.0,0.000786,0.001699,0.002618,0.00024,0.0,0.002192,0.005424
8,2.2e-05,0.028027,0.053097,0.042028,0.036621,0.038969,0.034321,0.025582,0.03336,0.037268,...,0.0,0.0,0.00032,0.0,0.000125,0.010972,0.001403,0.0,0.000322,0.0
9,0.002293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002326,0.002482,0.00214,0.00168,0.001404,0.030537,0.01305,0.005512,0.00975,0.001527


# Get the Words of the Highest Value for each Topic

In [25]:
for topic in range(components_df.shape[0]):
    tmp = components_df.iloc[topic]
    print(f'For topic {topic+1} the words with the highest value are:')
    print(tmp.nlargest(10)) # Return the first n rows ordered byAnd columns in descending order. 
    # And This method is equivalent to df.sort_values(columns, ascending=False).head(n) , but more performant.
    print('\n')

For topic 1 the words with the highest value are:
man         8.396817
charged     3.117248
murder      1.367349
jailed      0.891809
missing     0.880894
stabbing    0.727232
guilty      0.637090
arrested    0.600157
death       0.587411
sydney      0.532504
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
interview    7.471284
extended     0.393083
michael      0.383856
david        0.226665
john         0.222362
james        0.211161
nrl          0.202279
smith        0.179707
ben          0.172380
andrew       0.169546
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
police         6.886678
probe          0.814901
investigate    0.795326
missing        0.679711
search         0.637454
death          0.497555
hunt           0.420141
officer        0.329780
seek           0.313185
shooting       0.300862
Name: 2, dtype: float64


For topic 4 the words with the highest value are:
new         8.436840
zealand     0.571164
laws      

# Get the Topic of a Document

In [26]:
my_document = documents.headline_text[57]
my_document

'gilchrist backs rest policy'

# We will need to work with the Features matrix. So let’s get the 55th row:

In [31]:
pd.DataFrame(nmf_features).loc[55]

0    0.000000
1    0.000000
2    0.001271
3    0.000000
4    0.000000
5    0.000000
6    0.000000
7    0.000000
8    0.000000
9    0.011652
Name: 55, dtype: float64

In [32]:
# If we see the most important words of Topic 10 we will see that it contains the “funding“!
pd.DataFrame(nmf_features).loc[55].idxmax()

9

In [33]:
# Finally, if we want to see the number of documents for each topic we can easily get it by typing:
pd.DataFrame(nmf_features).idxmax()

0    773063
1    186097
2    305808
3     27576
4    648375
5    374427
6    142751
7    209596
8    621162
9    397901
dtype: int64

# How to Predict the Topic of a New Document

In [34]:
my_news = """15-year-old girl stabbed to death in grocery store during fight with 4 younger girls
Authorities said they gathered lots of evidence from videos on social media"""
 
# Transform the TF-IDF
X = vect.transform([my_news])
# Transform the TF-IDF: nmf_features
nmf_features = model.transform(X)
 
pd.DataFrame(nmf_features)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.002345,4.2e-05,0.002354,0.001428,3.1e-05,0.000179,0.000599,0.001379,0.000886,0.00416


In [36]:
# And if we want to get the index of the topic with the highest score:
pd.DataFrame(nmf_features).idxmax(axis=1)
#As expected, this document was classified as Topic 10 (with index 9).

0    9
dtype: int64