In [52]:
#import data
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

#limit retrieval
categories = ['talk.politics.guns','talk.politics.mideast','talk.politics.misc','talk.religion.misc']

#reformat to dataframe
dataset = fetch_20newsgroups(shuffle=True, categories=categories, random_state=1, remove=('headers','footers','quotes'))
df = pd.DataFrame(dataset.data)

df.head

<bound method NDFrame.head of                                                       0
0     com\n\n\n\nOn a DA revolver, you get another t...
1     \nI don't, though when I was in Israel I did m...
2     **********************************************...
3     What happened in Waco is not the fault of the ...
4     To my fellow Columbian, I must ask, why do you...
5     \n#Rick Anderson replied to my letter with...\...
6     \nSome people pay shares that are more "fair" ...
7     Hey Serdar,\n           What nationality are y...
8     \n\n\n\nBecause there are about 40 homicides t...
9     \nRight now, I'm just going to address this po...
10    \n\nFreedom of speech does not mean that other...
11    \n\n\n\n\nYes, I am pro-gun, and yes, I do dis...
12    \nThe letter implies that both warrants were i...
13    \nAviation Week March 15 1993 p.48\n\n"the CBO...
14    \nExcellently put!\n\nEven as a libertarian, I...
15    04/19/1993 0000  Lezghis Astir\n\nBy NEJLA SAM...
16    \n\n\nAs I r

In [2]:
#Stemming, Lemmatization and Stopword processing
import nltk
import string
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet
from nltk import wordpunct_tokenize, WordNetLemmatizer, sent_tokenize, pos_tag

# initialize constants, lematizer, punctuation and stopwords
lemmatizer = WordNetLemmatizer()
punct = set(string.punctuation)

#define stopwords
custom_stop_words = ['–', '\u2019', 'u', '\u201d', '\u201d.',
                     '\u201c', 'say', 'saying', 'sayings',
                     'says', 'us', 'un', '.\"', 'would',
                     'let', '.”', 'said', ',”', 'ax','max',
                     'b8f','g8v','a86','pl','145','ld9','0t',
                     '34u']
                     
stopwords = set(sw.words('english') + custom_stop_words)

def lemmatize(token, tag):
    # collapse word inflections into single representation
    tag = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ
    }.get(tag[0], wordnet.NOUN)

    return lemmatizer.lemmatize(token, tag)

def cab_tokenizer(document):
    # tokenize the corpus
    tokens = []

    # split the document into sentences
    for sent in sent_tokenize(document):
        # tokenize each sentence
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # preprocess and remove unnecessary characters
            token = token.lower()
            token = token.strip()
            token = token.strip('_')
            token = token.strip('*')

            # If punctuation, ignore token and continue
            if all(char in punct for char in token):
                continue

            # If stopword, ignore token and continue
            if token in stopwords:
                continue

            # Lemmatize the token and add back to the token
            lemma = lemmatize(token, tag)

            # Append lemmatized token to list
            tokens.append(lemma)
    return tokens

In [3]:
%%time
#Preprocessing and Vector Fitting
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#NMF requires TFIDF vectorizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=cab_tokenizer,ngram_range=(1,2),
                                   min_df=0.1, max_df=0.90)
tfidf = tfidf_vectorizer.fit_transform(df[0])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

#LDA requires Count Vectorizer
tf_vectorizer = CountVectorizer(tokenizer=cab_tokenizer,ngram_range=(1,2),
                                   min_df=0.1, max_df=0.90)
tf = tf_vectorizer.fit_transform(df[0])
tf_feature_names = tf_vectorizer.get_feature_names()

print("Vectorized corpus")

Vectorized corpus
CPU times: user 1min 31s, sys: 1.12 s, total: 1min 33s
Wall time: 1min 37s


In [34]:
%%time
#Model Generation
from sklearn.decomposition import NMF, LatentDirichletAllocation
topics = 10

#Non-Negative Matrix Factorization - fit model using tfidf vector
nmf = NMF(n_components=topics,random_state=1,alpha=0.1,l1_ratio=0.5,init='nndsvd').fit(tfidf)

#Latent Dirilicht Analysis - fit the model using term frequency vector
lda = LatentDirichletAllocation(n_components=topics,max_iter=5,learning_method='online',learning_offset=50,random_state=0).fit(tf)

print("Models fitted")

Models fitted
CPU times: user 4.71 s, sys: 12 ms, total: 4.72 s
Wall time: 3.86 s


In [51]:
corpus = {'document': ['this is a test',
         'this is another test']}

df = pd.DataFrame(data=corpus)

df.head(2)

Unnamed: 0,document
0,this is a test
1,this is another test


In [47]:
#10 topics, each comprised of 82 wordsprint, probabilties not normalized however
#1952 articles
# print(lda.components_.shape)

# print(lda.shape)
# print(type(lda))

print(tf.shape)

test = lda.transform(tf)
print(test.shape)
print(test[:,0].shape)
print(test[0])
print(sum(test[0]))

(1952, 82)
(1952, 10)
(1952,)


In [5]:
#display results
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in (-topic).argsort()[:no_top_words]]))
        
no_top_words = 10

print("NMF Topics:")
display_topics(nmf, tfidf_feature_names, no_top_words)
print("\nLDA Topics:")
display_topics(lda, tf_feature_names, no_top_words)

NMF Topics:
Topic 0:
get go see time well take use good give want
Topic 1:
gun use get law number like year problem time point
Topic 2:
state law right use case may 1 also since force
Topic 3:
people many kill first force like tell problem live country
Topic 4:
one kill child two another many consider come number seem
Topic 5:
post point get could number part question want new good
Topic 6:
make see much child look life case well problem start
Topic 7:
think like case really might want take question see something
Topic 8:
know like something tell even go believe look thing come
Topic 9:
government need case 2 time right year also force fact

LDA Topics:
Topic 0:
state people post government make want right time get ask
Topic 1:
1 2 kill people state year one government number two
Topic 2:
think go make work know get want well look time
Topic 3:
people right like government good life think even case much
Topic 4:
child show good like really make question without last first
Topic 5:
gun 