In [1]:
import os
CORPUS_PATH = os.path.join('./documents/state_of_union')
filenames = sorted([os.path.join(CORPUS_PATH, fn) for fn in os.listdir(CORPUS_PATH)])

# Function to display the words in each topic
def print_topic_words(ldamodel,n_topics,n_words):
    for i in ldamodel.print_topics(num_topics=n_topics, num_words=n_words):
        print(i[1])

In [2]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')
en_stop.append(['will','can','must','s'])

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
    
#Our Document
filenames

# list for tokenized documents in loop
texts = []

# Non-tokenized texts
texts_nt = []

# loop through document list
for k in filenames:
    
    file = open(k,'r') # open file
    
    # clean and tokenize document text
    raw_text = file.read().lower()
    
    tokens = tokenizer.tokenize(raw_text)
    

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)
    texts_nt.append(raw_text)
    
    file.close() # close file

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)



## Sentiment Analysis

In [21]:
from textblob import TextBlob
import pandas as pd
sentiments = [TextBlob(text).sentiment for text in texts_nt ] 

polarity = [ i[0] for i in sentiments]
subjectivity = [ i[1] for i in sentiments]

In [27]:
sentiments[0]

Sentiment(polarity=0.10376431364334592, subjectivity=0.5562584369035982)

In [29]:
SOU_sentiments = pd.DataFrame(
    {'Filename': filenames,
     'Polarity': polarity,
     'Subjectivity': subjectivity
    })

In [30]:
SOU_sentiments

Unnamed: 0,Filename,Polarity,Subjectivity
0,./documents/state_of_union\1945-Truman.txt,0.103764,0.556258
1,./documents/state_of_union\1946-Truman.txt,0.111545,0.422838
2,./documents/state_of_union\1947-Truman.txt,0.136731,0.409495
3,./documents/state_of_union\1948-Truman.txt,0.165403,0.432409
4,./documents/state_of_union\1949-Truman.txt,0.155075,0.456053
5,./documents/state_of_union\1950-Truman.txt,0.181496,0.442650
6,./documents/state_of_union\1951-Truman.txt,0.199358,0.498749
7,./documents/state_of_union\1953-Eisenhower.txt,0.147536,0.454934
8,./documents/state_of_union\1954-Eisenhower.txt,0.147351,0.412097
9,./documents/state_of_union\1955-Eisenhower.txt,0.156330,0.421521


## Topic Modeling

In [4]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=50)
print_topic_words(ldamodel,10,6)

0.000*"must" + 0.000*"year" + 0.000*"peopl" + 0.000*"american" + 0.000*"america" + 0.000*"world"
0.000*"year" + 0.000*"nation" + 0.000*"must" + 0.000*"world" + 0.000*"congress" + 0.000*"new"
0.013*"world" + 0.009*"american" + 0.008*"us" + 0.008*"nation" + 0.007*"let" + 0.007*"must"
0.000*"year" + 0.000*"nation" + 0.000*"state" + 0.000*"dollar" + 0.000*"program" + 0.000*"govern"
0.014*"nation" + 0.009*"must" + 0.009*"world" + 0.008*"year" + 0.007*"program" + 0.007*"congress"
0.017*"program" + 0.011*"tax" + 0.009*"year" + 0.009*"billion" + 0.009*"govern" + 0.007*"now"
0.000*"nation" + 0.000*"year" + 0.000*"peopl" + 0.000*"must" + 0.000*"world" + 0.000*"govern"
0.011*"year" + 0.011*"american" + 0.010*"america" + 0.009*"peopl" + 0.009*"must" + 0.007*"work"
0.014*"year" + 0.012*"war" + 0.011*"dollar" + 0.009*"million" + 0.009*"nation" + 0.008*"program"
0.011*"nation" + 0.010*"us" + 0.007*"unit" + 0.006*"let" + 0.006*"american" + 0.006*"one"


Using sklearn version of LDA instead: https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py 

In [19]:
#texts[0]

In [53]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')

tf = tf_vectorizer.fit_transform(texts_nt) # feed untransformed text to vectorizer

sklda = LatentDirichletAllocation(n_components=3,max_iter=50,
                                learning_method='online',
                               # learning_offset=50.,
                              #  random_state=0
                                 )
sklda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_components=3, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [56]:
n_top_words = 15

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(sklda, tf_feature_names, n_top_words)

Topic #0: applause americans tonight let children ask ve budget care just tax health right want economy
Topic #1: federal program war economic free billion dollars programs shall million defense military economy tax strength
Topic #2: americans program tax federal economic war applause free long defense right economy budget children programs



In [58]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=4, id2word = dictionary, passes=30)
print_topic_words(ldamodel,3,7)

0.010*"american" + 0.010*"year" + 0.010*"america" + 0.009*"must" + 0.009*"peopl" + 0.008*"work" + 0.007*"nation"
0.013*"nation" + 0.009*"must" + 0.008*"world" + 0.007*"year" + 0.006*"peopl" + 0.006*"govern" + 0.006*"program"
0.013*"year" + 0.011*"war" + 0.011*"dollar" + 0.009*"million" + 0.009*"govern" + 0.008*"nation" + 0.008*"program"
