In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Load the articles dataset into a DataFrame.
# NOTE(review): the relative path looks like a Kaggle-style ../input layout — confirm before running elsewhere.
df = pd.read_csv('../input/articles/articles.csv')

In [3]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
df.isnull().sum() ## Count missing (null/NaN) values per column

Article    0
dtype: int64

In [5]:
# Read one full article by row label and column name in a single lookup
df.loc[1029, 'Article']

'Amnesty International released a report this week that may make you wonder how much of what we conscientiously report as important news truly is by comparison. The human rights group, which received the Nobel Peace Prize in 1977, says as many as 13, 000 opponents of Bashar Assad have been hanged in the Saydnaya prison on the outskirts of Damascus. It is worth repeating that number: as many as 13, 000 people, hanged to death. The researchers interviewed 84 people, including former guards, a military judge, and 31 people who were held in two buildings of the prison. The witnesses say once or twice a week, 20 to 50 prisoners were taken from their cells and told they will be transferred elsewhere. Instead, they were taken to a basement to appear before a military judge for two or three minutes  —   just enough time to be condemned to death. The death sentence was signed by the minister of defense, who was deputized to sign by President Assad, says the report. The men and women were hanged

In [6]:
# Number of rows (articles) in the dataset
len(df)

11992

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cvr = CountVectorizer(max_df=0.9, min_df=2, stop_words='english') 
# max_df=0.9: ignore terms that appear in more than 90% of the documents (too common to be informative)
# min_df=2: ignore terms that appear in fewer than 2 documents (too rare to be informative)
# stop_words='english': drop scikit-learn's built-in list of English stop words

> `Applying unsupervised learning`

In [8]:
# Learn the vocabulary and build the document-term matrix (one row per article, one column per vocabulary term)
dtm = cvr.fit_transform(df['Article'])

In [9]:
# Show the sparse matrix summary (shape, dtype, number of stored elements)
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

`Let's perform Latent Dirichlet Allocation using Scikit-Learn`

In [10]:
from sklearn.decomposition import LatentDirichletAllocation
import torch
# Reports whether PyTorch can use a CUDA GPU in this session
torch.cuda.is_available()

True

In [11]:
# Pick the torch device string: the first CUDA GPU when one is available, otherwise the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)

In [17]:
LDA = LatentDirichletAllocation(n_components=7, random_state=42) # n_components = number of topics; random_state fixes the seed so results are reproducible

In [None]:
# Display the estimator and its parameters
LDA

In [None]:
# Fit the topic model on the document-term matrix (this is the slow step).
# fit() returns the scikit-learn estimator itself, which is not a torch module
# and has no .to() method; the former `.to(device)` call therefore raised
# AttributeError right after the fit had finished. scikit-learn's LDA runs on
# the CPU, so no device transfer is needed.
LDA.fit(dtm)

In [20]:
# Marker that the long-running fit cell has completed
print("DONE FINALLY!!!!!!")

DONE FINALLY!!!!!!


In [24]:
# Size of the vocabulary learned by the vectorizer (one entry per column of dtm)
# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions use get_feature_names_out() instead.
len(cvr.get_feature_names())

54777

In [25]:
# Check the container type returned for the vocabulary
type(cvr.get_feature_names())

list

In [28]:
# Look up the vocabulary term stored at an arbitrary position
cvr.get_feature_names()[1029]

'756'

In [36]:
## Pick one random word from the learned vocabulary
import random

feature_names = cvr.get_feature_names()  # fetch the vocabulary once and reuse it
# randrange() excludes its upper bound, so the id is always a valid index.
# The previous random.randint(0, 54777) includes 54777 itself, which is one past
# the last position of a 54777-item list and would raise IndexError when drawn.
random_word_id = random.randrange(len(feature_names))
feature_names[random_word_id]

'inception'

In [40]:
# Grab the topics
len(LDA.components_)## number of rows in components_, i.e. the number of topics

7

In [42]:
type(LDA.components_) ## A numpy array of per-topic word weights (unnormalized pseudo-counts); a row is a probability distribution only after dividing by its sum

numpy.ndarray

In [43]:
# Grab a single topic out of the 7 components
single_topic = LDA.components_[0] # row 0 = the very first topic

In [45]:
single_topic.argsort() # returns the index positions that would sort the array from lowest to highest value

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993])

In [50]:
## Small demonstration of argsort() on a hand-made array
import numpy as np

arr = np.array([5, 100, 23, 1])
ascending_positions = arr.argsort()  # indices of arr's elements, ordered from smallest to largest value
print(f'Simple array:- {arr}')
print(f'Argsort:- {ascending_positions}')

Simple array:- [  5 100  23   1]
Argsort:- [3 0 2 1]


In [53]:
# Grab the indices of the 10 largest values in single_topic using argsort()

single_topic.argsort()[-10:] ## argsort() orders indices by ascending value, so the last 10 indices point at the 10 largest values

array([33390, 36310, 21228, 10425, 31464,  8149, 36283, 22673, 42561,
       42993])

In [54]:
# Keep the indices of the 10 highest-weighted words of the first topic
top_10_words = single_topic.argsort()[-10:]

In [55]:
# Translate the top-10 word indices of the first topic into vocabulary terms.
# The vocabulary is fetched once up front instead of once per loop iteration.
feature_names = cvr.get_feature_names()
for index in top_10_words:
    print(feature_names[index])

new
percent
government
company
million
care
people
health
said
says


In [57]:
# The above was for the first topic. Repeat for the 3rd topic (row index 2) and grab its top 20 words.
third_topic = LDA.components_[2]
# argsort() orders indices by ascending value, so the last 20 are the highest-weighted words.
top_20_words_in_3rd = third_topic.argsort()[-20:]

# Fetch the vocabulary once rather than on every loop iteration.
feature_names = cvr.get_feature_names()
for i in top_20_words_in_3rd:
    print(feature_names[i])

little
know
don
year
make
way
world
family
home
day
time
water
city
new
years
food
just
people
like
says


In [62]:
# Print the 15 highest-weighted words of every topic.
# The vocabulary is fetched once here; previously get_feature_names() was called
# again for every single word of every topic (7 x 15 calls).
# NOTE: get_feature_names() was removed in scikit-learn 1.2; on newer versions
# use get_feature_names_out() instead.
feature_names = cvr.get_feature_names()
for i, topic in enumerate(LDA.components_):
    print(f'TOP 15 WORDS FOR TOPIC #{i}')
    print([feature_names[index] for index in topic.argsort()[-15:]])
    print('\n\n')

TOP 15 WORDS FOR TOPIC #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']



TOP 15 WORDS FOR TOPIC #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']



TOP 15 WORDS FOR TOPIC #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']



TOP 15 WORDS FOR TOPIC #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']



TOP 15 WORDS FOR TOPIC #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']



TOP 15 WORDS FOR TOPIC #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think', 'people', 'just', 

In [63]:
# Topic weights per article: one row per document in dtm, one column per topic
topic_results = LDA.transform(dtm)

In [64]:
# Inspect the document-topic array
topic_results

array([[1.61040465e-02, 6.83341493e-01, 2.25376318e-04, ...,
        2.99652737e-01, 2.25479379e-04, 2.25497980e-04],
       [3.63424997e-02, 8.86130697e-01, 4.40751747e-04, ...,
        7.57636804e-02, 4.40866779e-04, 4.40835574e-04],
       [3.28569485e-04, 6.96344889e-01, 3.28302105e-04, ...,
        3.02012902e-01, 3.28724083e-04, 3.28352652e-04],
       ...,
       [1.44467964e-02, 1.60696622e-01, 1.73678310e-01, ...,
        2.24636569e-02, 3.98728349e-04, 3.98359730e-04],
       [4.33560738e-04, 3.53196803e-02, 4.33022554e-04, ...,
        9.62512640e-01, 4.33971991e-04, 4.33490254e-04],
       [3.98777533e-01, 2.54376049e-04, 3.59290659e-01, ...,
        2.40914375e-01, 2.54445555e-04, 2.54253739e-04]])

In [65]:
# (number of articles, number of topics)
topic_results.shape

(11992, 7)

In [73]:
# Topic weights of the first article: one value per topic
first_doc_topics = topic_results[0]
print(first_doc_topics)

print('-------------------------------------------------------------')
# The same weights rounded to two decimals for readability
print(first_doc_topics.round(2))

[1.61040465e-02 6.83341493e-01 2.25376318e-04 2.25369288e-04
 2.99652737e-01 2.25479379e-04 2.25497980e-04]
-------------------------------------------------------------
[0.02 0.68 0.   0.   0.3  0.   0.  ]


In [75]:
# Index of the largest topic weight for the first article, i.e. its dominant topic
topic_results[0].argmax()

1

In [76]:
# Label every article with its dominant topic: the column index of the largest value in its row of topic_results
df['Topics'] = topic_results.argmax(axis=1)

In [77]:
df ## Show each article alongside the topic index it was assigned to

Unnamed: 0,Article,Topics
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
...,...,...
11987,The number of law enforcement officers shot an...,1
11988,"Trump is busy these days with victory tours,...",4
11989,It’s always interesting for the Goats and Soda...,3
11990,The election of Donald Trump was a surprise to...,4
