In [3]:
#import data
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Display option: show at most 40 characters of each text cell when printing,
# so long posts are truncated in the preview below.
pd.set_option('display.max_colwidth', 40)

# Restrict retrieval to the politics / religion discussion groups.
categories = [
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# Fetch the shuffled posts (fixed seed) with headers, footers and quoted
# replies removed, then reformat to a dataframe.
dataset = fetch_20newsgroups(
    shuffle=True,
    categories=categories,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

# One post per row in column 0; keep only the first 20 posts.
df = pd.DataFrame(dataset.data).iloc[:20]

print(df)

                                          0
0   com\n\n\n\nOn a DA revolver, you get...
1   \nI don't, though when I was in Isra...
2   ************************************...
3   What happened in Waco is not the fau...
4   To my fellow Columbian, I must ask, ...
5   \n#Rick Anderson replied to my lette...
6   \nSome people pay shares that are mo...
7   Hey Serdar,\n           What nationa...
8   \n\n\n\nBecause there are about 40 h...
9   \nRight now, I'm just going to addre...
10  \n\nFreedom of speech does not mean ...
11  \n\n\n\n\nYes, I am pro-gun, and yes...
12  \nThe letter implies that both warra...
13  \nAviation Week March 15 1993 p.48\n...
14  \nExcellently put!\n\nEven as a libe...
15  04/19/1993 0000  Lezghis Astir\n\nBy...
16  \n\n\nAs I recall, in the 60's the K...
17  \nSo it was a complete non-sequitur,...
18  RE: Red, wwhite, and black, the colo...
19  THE WHITE HOUSE\n\n                 ...


In [4]:
#Stemming, Lemmatization and Stopword processing
import nltk
import string
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet
from nltk import wordpunct_tokenize, WordNetLemmatizer, sent_tokenize, pos_tag

# Shared preprocessing state: the lemmatizer and the set of ASCII punctuation characters
lemmatizer = WordNetLemmatizer()
punct = set(string.punctuation)

# Extra tokens to drop in addition to NLTK's English stopword list:
# typographic quotes/dashes (alone or glued to '.' / ','), forms of "say",
# a few filler words, and some short alphanumeric tokens
# (presumably corpus noise -- verify against the data).
custom_stop_words = [
    '–', '\u2019', 'u', '\u201d', '\u201d.',
    '\u201c', 'say', 'saying', 'sayings',
    'says', 'us', 'un', '.\"', 'would',
    'let', '.”', 'said', ',”', 'ax', 'max',
    'b8f', 'g8v', 'a86', 'pl', '145', 'ld9', '0t',
    '34u',
]

# Final lookup set used by the tokenizer: NLTK English stopwords plus the custom list
stopwords = set(sw.words('english')).union(custom_stop_words)

def lemmatize(token, tag):
    """Collapse an inflected word into its WordNet lemma.

    Only the first character of ``tag`` is inspected: 'N', 'V', 'R' and 'J'
    select the WordNet noun, verb, adverb and adjective categories
    respectively; any other leading character is treated as a noun.

    Args:
        token: The word to lemmatize.
        tag: A part-of-speech tag string (must be non-empty).

    Returns:
        The result of ``lemmatizer.lemmatize`` for the token and the
        selected WordNet category.
    """
    leading = tag[0]

    if leading == 'V':
        wordnet_pos = wordnet.VERB
    elif leading == 'R':
        wordnet_pos = wordnet.ADV
    elif leading == 'J':
        wordnet_pos = wordnet.ADJ
    else:
        # 'N' and every unrecognised tag fall back to noun
        wordnet_pos = wordnet.NOUN

    return lemmatizer.lemmatize(token, wordnet_pos)

def cab_tokenizer(document):
    """Turn a raw document into a list of lemmatized, filtered tokens.

    The document is split into sentences, each sentence is split with
    ``wordpunct_tokenize`` and POS-tagged, and every token is then:

    1. lower-cased and stripped of surrounding whitespace, then of
       leading/trailing ``_`` and finally of leading/trailing ``*``;
    2. dropped if it consists only of punctuation characters (this also
       drops tokens that became empty after stripping);
    3. dropped if it is in the module-level ``stopwords`` set;
    4. otherwise lemmatized according to its POS tag.

    Args:
        document: The text to tokenize.

    Returns:
        list: The lemmas of the surviving tokens, in document order.
    """
    lemmas = []

    for sentence in sent_tokenize(document):
        # Tagging is done on the original-case tokens, before normalisation
        tagged_words = pos_tag(wordpunct_tokenize(sentence))

        for raw_word, pos in tagged_words:
            # Normalise case and trim decoration characters
            word = raw_word.lower().strip().strip('_').strip('*')

            # Skip pure-punctuation (or empty) tokens, then stopwords
            only_punctuation = all(ch in punct for ch in word)
            if only_punctuation or word in stopwords:
                continue

            lemmas.append(lemmatize(word, pos))

    return lemmas

In [5]:
%%time
#Preprocessing and Vector Fitting
from sklearn.feature_extraction.text import CountVectorizer

# LDA is fitted on raw term counts, so vectorize with CountVectorizer.
# Unigrams and bigrams are produced by the custom tokenizer; with fractional
# min_df / max_df, terms appearing in fewer than 10% or more than 90% of the
# documents are discarded.
tf_vectorizer = CountVectorizer(tokenizer=cab_tokenizer, ngram_range=(1, 2),
                                min_df=0.1, max_df=0.90)
tf = tf_vectorizer.fit_transform(df[0])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when available and fall back on old versions.
# The result is converted to a list so downstream code sees the same type
# either way.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

print("Vectorized corpus")

Vectorized corpus
CPU times: user 2.55 s, sys: 104 ms, total: 2.66 s
Wall time: 2.69 s


In [6]:
%%time
#Model Generation
from sklearn.decomposition import NMF, LatentDirichletAllocation
# Number of latent topics to extract
topics = 5

# Latent Dirichlet Allocation - configure the model, then fit it on the
# term-count matrix produced by the vectorizer (fit() returns the estimator
# itself, so `lda` is the fitted model).
lda = LatentDirichletAllocation(
    n_components=topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50,
    random_state=0,
)
lda.fit(tf)

print("Models fitted")

Models fitted
CPU times: user 196 ms, sys: 24 ms, total: 220 ms
Wall time: 178 ms


In [41]:
# Retrieve topic-word distributions, as well as document-topic distributions.

# Normalise each row of components_ so every topic's word weights sum to 1.
# keepdims=True keeps the row sums as a column vector so the division
# broadcasts per row, without needing a separate numpy import in this cell.
normTWDist = lda.components_ / lda.components_.sum(axis=1, keepdims=True)  # topic-word distribution
# Use the variable defined above; the previous `normTW` only existed as
# leftover kernel state and raised NameError on a fresh run.
print(normTWDist.shape)
print(sum(normTWDist[0]))

# One row per document, one column per topic
normDTDist = lda.transform(tf)
print(normDTDist.shape)
print(normDTDist[1].sum())

(5, 368)
1.0
(20, 5)
1.0


In [11]:
#display results
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a header line ``Topic <index>:`` is
    printed, followed by one line with the terms whose weights are largest,
    in descending order of weight, separated by single spaces.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: Maximum number of terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic {}:".format(topic_idx))
        # Sorting the negated weights ascending yields descending weight order
        ranked_columns = (-weights).argsort()[:no_top_words]
        print(" ".join(feature_names[col] for col in ranked_columns))
        
# Number of terms to list per topic. 368 equals the vocabulary size shown by
# the (5, 368) topic-word matrix, so every term is printed for each topic.
no_top_words = 368

print("\nLDA Topics:")
display_topics(model=lda, feature_names=tf_feature_names, no_top_words=no_top_words)


LDA Topics:
Topic 0:
000 rally war russia draft want two come 3 russian 600 000 include right must half center 1993 order sunday could 600 former people 1 weapon support pull live state go fact l child jackson government official disagree palestinian 10 participation 5 able one rule life heavy set separate soon pistol range ask p belong pay form early old find hard information next tomorrow muslim class individual good press nothing back create country general call national year security line living play authority hope highly high town presence large others libertarian strong work fire listen deny force people die 150 time face 500 newspaper base different office question happen reject agree move radio first control job die leader approach lot announce many remain freedom speech die war discuss small conclusion april everyone also see deliberate gun march obviously past rather jump real hill 70 address refer try comment bottom line week may think failure quote speech bit account keep 