In [1]:
from sklearn.datasets import load_files
import pandas as pd
import spacy
# Load the small English spaCy pipeline. The dependency parser and the NER
# component are disabled; the code below only reads token.pos_ and token.lemma_.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

In [None]:
# Preprocessing step: only run this cell when the processed CSV needs to be
# (re)built. It will take a long time to run.
# Each raw file is read with its first column as the index.
df1, df2, df3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'datasets/articles1.csv',
        'datasets/articles2.csv',
        'datasets/articles3.csv',
    )
)

def remove_nouns(texts):
    """Reduce each text to the lemmas of its nouns.

    Note: despite the name, this function *keeps* only the tokens tagged
    ``NOUN`` and discards everything else.

    Parameters
    ----------
    texts : iterable of str
        Documents to run through the module-level ``nlp`` pipeline.

    Returns
    -------
    list of str
        One space-joined string of noun lemmas per input document, in input
        order. A document without any ``NOUN`` token yields ``""``.
    """
    return [
        " ".join(tok.lemma_ for tok in parsed if tok.pos_ == 'NOUN')
        for parsed in nlp.pipe(texts)
    ]

# Stack the three raw frames and keep only the rows that have article text.
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignment below does not raise pandas' SettingWithCopyWarning.
total_df = pd.concat([df1, df2, df3])
total_df = total_df[total_df['content'].notna()].copy()
# Despite its name, remove_nouns() returns the noun lemmas of each article.
total_df['content'] = remove_nouns(total_df['content'])
# Persist the result so the slow preprocessing does not have to be repeated.
total_df.to_csv("removed_nouns_all_data.csv")

In [2]:
## Only run this block if you have the processed csv:
total_df = pd.read_csv('removed_nouns_all_data.csv', index_col=0)
# An article whose noun-only text is the empty string is written to the CSV as
# an empty field, and read_csv loads that back as NaN. Restore the empty
# string so the text vectorizer does not reject the value as an invalid
# document, and so this path yields the same 'content' values as running the
# preprocessing cell directly.
total_df['content'] = total_df['content'].fillna('')

In [16]:
# Train the topic model: TF-IDF features of the article text, factorised by NMF.
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

n_topics = 10

# Vocabulary is capped at 5000 terms; English stop words, terms in more than
# 95% of the documents and terms in fewer than 2 documents are dropped.
vec = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
    max_df=0.95,
    min_df=2,
)
features = vec.fit_transform(total_df['content'])

# Fixed random_state keeps the factorisation reproducible between runs.
cls = NMF(n_components=n_topics, random_state=1)
cls.fit(features)

NMF(n_components=10, random_state=1)
[[0.00491156 0.01576551 0.00697774 ... 0.         0.19426374 0.        ]
 [0.         0.         0.         ... 0.00039211 0.01146575 0.00576775]
 [0.00036625 0.00414164 0.00069257 ... 0.         0.02325821 0.        ]
 ...
 [0.         0.         0.         ... 0.01114921 0.07043442 0.0005973 ]
 [0.00089279 0.00119511 0.00085951 ... 0.         0.00928997 0.00228052]
 [0.         0.         0.00303111 ... 0.         0.         0.        ]]


In [47]:
#Print top 15 words for each category
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
n_top_words = 15
for i, topic_vec in enumerate(cls.components_):
    # argsort() is ascending: reverse it and keep the n_top_words largest weights.
    top_ids = topic_vec.argsort()[::-1][:n_top_words]
    # One line per topic: "<topic index> <word> <word> ... " (trailing space kept).
    print(i, *(feature_names[fid] for fid in top_ids), end=' \n')

0 attack country government force war group official leader city missile security refugee administration policy people 
1 police officer shooting gun man attack video suspect car city death incident victim authority crime 
2 percent company market share price stock rate investor year business bank growth sale deal economy 
3 people film life time thing year story way family movie book world friend day child 
4 voter campaign candidate party election poll vote state nominee race delegate percent primary debate nomination 
5 email investigation campaign intelligence information official president report election administration news committee statement document press 
6 health law state court insurance care tax plan people case government coverage judge order legislation 
7 game team season player coach league year fan football ball time sport playoff point quarterback 
8 student school college campus teacher university education child parent kid class program district family community 
9

In [5]:
def label_article(row):
    """Return the index of the highest-weighted topic for one article.

    Parameters
    ----------
    row : str
        Text of a single article (one value of the 'content' column).

    Returns
    -------
    Integer index of the component with the largest weight after passing the
    text through the module-level ``vec`` and ``cls``.
    """
    doc_features = vec.transform([row])           # single-document batch
    topic_weights = cls.transform(doc_features)   # one row of topic weights
    ranking = topic_weights.argsort(axis=1)       # ascending by weight
    return ranking[0, -1]

In [6]:
# Classify/label our documents: store each article's dominant topic index in a
# new 'label' column. label_article is called once per article.
total_df['label'] = total_df['content'].map(label_article)

In [7]:
#Save the labeled data
# Writes total_df (index included) to labeled_articles.csv in the working directory.
total_df.to_csv("labeled_articles.csv")

In [46]:
#Make CSV's for each group, for word cloud visualizations
# Rows are collected in a plain list and the frame is built once:
# DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# re-copied the whole frame on every iteration.
n_cloud_words = 10  # number of top-weighted words kept per topic
group_rows = []
for i, topic_vec in enumerate(cls.components_):
    # argsort() is ascending: reverse it and keep the largest weights.
    for fid in topic_vec.argsort()[::-1][:n_cloud_words]:
        group_rows.append({
            'Group': i,
            'Frequency': topic_vec[fid],
            'Word': feature_names[fid],
        })
# Passing columns= fixes the column order and keeps the header for empty input.
group_df = pd.DataFrame(group_rows, columns=['Group', 'Frequency', 'Word'])
print(group_df)

group_df.to_csv("group_word_frequency.csv")


   Group  Frequency        Word
0      0   2.140287      attack
1      0   1.983907     country
2      0   1.654136  government
3      0   1.524959       force
4      0   1.368958         war
..   ...        ...         ...
95     9   0.523352      gender
96     9   0.383038       right
97     9   0.375441     assault
98     9   0.369654       child
99     9   0.358651        rape

[100 rows x 3 columns]
