In [1]:
import pandas as pd
import time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

### Importing dataset

Reading in the dataset from our `data` folder.

In [2]:
# Path to the NPR articles CSV, relative to the notebook's working directory
npr_csv_path = '../../../data/npr.csv'
df = pd.read_csv(npr_csv_path)

In [3]:
# Preview the first rows of the freshly loaded DataFrame
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
# Number of rows (one per article) in the DataFrame
df.shape[0]

11992

Checking to make sure there are no null rows

In [5]:
# Count missing values per column; a count of 0 means the column has no nulls
df.isna().sum()

Article    0
dtype: int64

In [6]:
# Check that no article is "blank": a string with no visible text.
# `not article.strip()` catches whitespace-only strings AND the empty string
# (`''.isspace()` is False, so `isspace()` alone would miss empty articles).
# Non-string entries (e.g. NaN) are skipped here; missing values are counted
# separately with `isna()`.
# blank_list holds the positional index of every blank article found.
blank_list = [
    idx
    for idx, article in enumerate(df['Article'])
    if isinstance(article, str) and not article.strip()
]
print(blank_list)

[]


## LDA

### Setting up the Count Vectorizer

In [7]:
# Bag-of-words vectorizer that feeds the LDA model
count_vect = CountVectorizer(
    max_df=0.85,           # ignore terms found in more than 85% of the documents
    min_df=2,              # ignore terms found in fewer than 2 documents
    stop_words='english',  # ignore scikit-learn's built-in English stop words
)

In [8]:
# Learn the vocabulary from the articles and build the document-term matrix
# (one row per article, one column per term)
dtm = count_vect.fit_transform(df['Article'])

In [9]:
# (number of articles, number of distinct terms) in the document-term matrix.
# `.shape` is used instead of `get_shape()` because it is available on every
# SciPy sparse container, whereas `get_shape()` exists only on the legacy
# `spmatrix` classes.
dtm.shape

(11992, 54777)

### Setting up LDA model

In [10]:
# LDA model with 7 topics; random_state is fixed so repeated runs give the same topics
lda_model = LatentDirichletAllocation(n_components=7, random_state=42)

In [11]:
# Fit the LDA model on the document-term matrix.
# fit() returns the estimator itself, which is why its repr is shown as the cell output.
lda_model.fit(dtm)

LatentDirichletAllocation(n_components=7, random_state=42)

In [12]:
# components_ has one row per topic, so its first dimension should equal
# the n_components value the model was created with
lda_model.components_.shape[0]

7

#### Viewing the first topic

In [13]:
# Row 0 of components_: the per-term weights for the first topic
first_topic = lda_model.components_[0]

In [14]:
# argsort() returns term indices ordered from the lowest to the highest weight
# in this topic, so the terms most strongly associated with the topic come last.
# NOTE(review): components_ values look like unnormalised weights rather than
# probabilities — confirm against the scikit-learn documentation.
first_topic.argsort()

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993], dtype=int64)

In [15]:
# Indices of the ten highest-weighted terms in the first topic
# (ascending order, so the strongest term is last). These are vocabulary
# indices, not the words themselves.
first_topic.argsort()[-10:]

array([33390, 36310, 21228, 10425, 31464,  8149, 36283, 22673, 42561,
       42993], dtype=int64)

### Getting the top N words for each topic in our LDA Model

In [16]:
# Number of top words to display per topic
n = 15

# Build the index -> term lookup ONCE instead of once per looked-up index.
# Prefer get_feature_names_out(); get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so it is only a fallback for old versions.
feature_names = (
    count_vect.get_feature_names_out()
    if hasattr(count_vect, 'get_feature_names_out')
    else count_vect.get_feature_names()
)

for topic_number, topic in enumerate(lda_model.components_, start=1):
    print(f"Top {n} words in Topic #{topic_number}:")
    # argsort() is ascending, so the last n indices are the n highest-weighted terms
    print([feature_names[idx] for idx in topic.argsort()[-n:]])
    print('\n')

Top 15 words in Topic #1:
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']


Top 15 words in Topic #2:
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']


Top 15 words in Topic #3:
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']


Top 15 words in Topic #4:
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']


Top 15 words in Topic #5:
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']


Top 15 words in Topic #6:
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think', 'people', 'just', 'like']

### Creating a new column in df for the Topic Number of each Article

In [17]:
# Reminder of the DataFrame's current layout before a topic column is added
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [18]:
# Topic distribution for every article: one row per article, one column per topic
topic_results = lda_model.transform(dtm)

In [28]:
# topic_results holds, for each article, the probability that the article
# belongs to each topic. Shown here for the first article, rounded to
# two decimals for readability.
topic_results[0].round(2)

array([0.02, 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  ])

In [29]:
# Index of the topic with the highest probability for the first article.
# The index is 0-based, whereas the printed topic list is labelled from
# "Topic #1", so index k corresponds to "Topic #k+1".
topic_results[0].argmax()

1

In [30]:
# Label every article with its most probable topic: argmax along axis=1 picks,
# per row, the column (topic index, 0-based) with the highest probability
df['LDA_topic'] = topic_results.argmax(axis=1)

In [31]:
# Confirm the new LDA_topic column was added
df.head()

Unnamed: 0,Article,LDA_topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
