In [1]:
import pandas as pd
# Load the NPR corpus; later cells read its 'Article' text column.
npr = pd.read_csv("npr.csv")

In [2]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer

In [3]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords


lemmatizer = WordNetLemmatizer()

def lemm_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

npr['pretext']=npr['Article'].apply(lambda text: lemm_words(text))

npr.head()

Unnamed: 0,Article,pretext
0,"In the Washington of 2016, even when the polic...","In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...,Donald Trump ha used Twitter — his preferred m...
2,Donald Trump is unabashedly praising Russian...,Donald Trump is unabashedly praising Russian P...
3,"Updated at 2:50 p. m. ET, Russian President Vl...","Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d...","From photography, illustration and video, to d..."


In [4]:
# Vectorizer settings: an upper document-frequency cut-off of 0.95, a lower
# cut-off of 2, and the built-in English stop-word list.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

In [5]:
# Fit the vectorizer on the lemmatized text and build the document-term matrix.
dtm = tfidf.fit_transform(npr['pretext'])

In [6]:
# Display the sparse matrix summary (shape, dtype, stored-element count).
dtm

<11992x53334 sparse matrix of type '<class 'numpy.float64'>'
	with 3003305 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.decomposition import NMF

In [8]:
# Seven-component NMF; random_state is pinned so reruns use the same seed.
nmf_model = NMF(
    random_state=42,
    n_components=7,
)

In [9]:
# Fit the NMF model on the TF-IDF document-term matrix.
nmf_model.fit(dtm)

In [10]:
# Number of highest-weighted terms shown per topic. The header text is built
# from this value so the label cannot disagree with the slice (it previously
# said "Top 20" while 30 terms were printed).
n_top_words = 30

# Fetch the vocabulary once instead of rebuilding it for every printed term.
feature_names = tfidf.get_feature_names_out()

for index, topic in enumerate(nmf_model.components_):
    print(f"The Top {n_top_words} words for Topic {index}")
    # argsort is ascending, so the last n_top_words indices carry the largest weights.
    print([feature_names[i] for i in topic.argsort()[-n_top_words:]])
    print("\n")

The Top 20 words for Topic 0
['new', 'kind', 'work', 'family', 've', 'want', 'year', 'story', 'make', 'lot', 'world', 'food', 'going', 'book', 'ha', 'don', 'way', 'time', 'life', 'thing', 'woman', 'really', 'know', 'think', 'just', 'say', 'like', 'people', 'says', 'wa']


The Top 20 words for Topic 1
['washington', 'speech', 'policy', 'news', 'intelligence', 'office', 'senate', 'tax', 'committee', 'business', 'gop', 'pence', 'nominee', 'comey', 'republicans', 'presidential', 'russia', 'election', 'administration', 'wa', 'republican', 'ha', 'obama', 'white', 'donald', 'house', 'campaign', 'said', 'president', 'trump']


The Top 20 words for Topic 2
['house', 'insurer', 'company', 'doctor', 'says', 'repeal', 'federal', 'program', 'hospital', 'say', 'zika', 'act', 'republicans', 'cost', 'state', 'medical', 'drug', 'law', 'affordable', 'obamacare', 'percent', 'people', 'patient', 'tax', 'plan', 'coverage', 'medicaid', 'insurance', 'care', 'health']


The Top 20 words for Topic 3
['accordin

In [11]:
# Project the documents onto the fitted topics to get per-document topic weights.
topic_result = nmf_model.transform(dtm)

In [12]:
# Assign each article the index of its largest weight along axis 1.
npr["Topic"] = topic_result.argmax(axis=1)

In [13]:
# Human-readable label for each topic index produced by the NMF model.
# NOTE(review): the printed top words for topic 2 (health, care, insurance,
# medicaid, ...) read like a health-care topic, while topic 0's words are
# general; the labels for 0 and 2 may be stale - confirm against the current
# topic output. "election" is also used for both topics 1 and 4.
mytopicdict = {
    0: "Health",
    1: "election",
    2: "legis",
    3: "politics",
    4: "election",
    5: "Music",
    6: "education",
}

In [14]:
# Remove the helper column now that the model is fitted. errors='ignore'
# keeps this cell re-runnable: on a second run 'pretext' is already gone and
# a plain drop would raise KeyError.
npr = npr.drop(columns='pretext', errors='ignore')

# Translate each numeric topic index into its label from mytopicdict.
npr["Topic Label"] = npr['Topic'].map(mytopicdict)
npr.head()

Unnamed: 0,Article,Topic,Topic Label
0,"In the Washington of 2016, even when the polic...",1,election
1,Donald Trump has used Twitter — his prefe...,1,election
2,Donald Trump is unabashedly praising Russian...,1,election
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,politics
4,"From photography, illustration and video, to d...",3,politics
