In [2]:
import pandas as pd
import re
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from pyLDAvis import sklearn as sklearn_lda
import pyLDAvis
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

In [4]:
# load datasets and perform some cleaning
def get_corpus(addr):
    """Load a transcript CSV and add a normalized copy of its text column.

    Parameters
    ----------
    addr : str or path-like
        Location of a CSV file readable by ``pd.read_csv``. The file must
        contain a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with an extra ``text_processed`` column holding
        ``text`` with the characters ``, . ! ?`` removed and everything
        lower-cased. Missing ``text`` cells become empty strings.

    Raises
    ------
    KeyError
        If the CSV has no ``text`` column.
    """
    text = pd.read_csv(addr)

    # Empty CSV cells are loaded as NaN (a float), which has no string
    # methods; treat them as empty documents instead of crashing mid-clean.
    raw_text = text['text'].fillna('').astype(str)

    # Raw string pattern: inside a character class '.' is already literal,
    # so the invalid '\.' escape of a plain string literal is not needed.
    text['text_processed'] = (
        raw_text
        .str.replace(r'[,.!?]', '', regex=True)
        .str.lower()
    )

    return text

# build the bag-of-words count matrix (English stop words removed) that LDA takes as input
def get_ct(dat, col):
    """Turn one text column into bag-of-words counts.

    A ``CountVectorizer`` configured with ``stop_words='english'`` is fitted
    on ``dat[col]`` and used to transform that same column.

    Parameters
    ----------
    dat : mapping of column name to documents (e.g. a DataFrame)
        Container indexed as ``dat[col]``.
    col : hashable
        Key of the column holding the documents.

    Returns
    -------
    tuple
        ``(fitted vectorizer, count data)`` in that order.
    """
    vectorizer = CountVectorizer(stop_words='english')
    term_counts = vectorizer.fit_transform(dat[col])
    return vectorizer, term_counts

# perform LDA on col and return the model with specified number of topics
def get_lda(dat, col, k, random_state=None):
    """Fit an LDA topic model with ``k`` topics on the text in ``dat[col]``.

    Parameters
    ----------
    dat : DataFrame-like
        Container indexed as ``dat[col]``; passed straight to ``get_ct``.
    col : hashable
        Key of the column holding the documents.
    k : int
        Number of topics (``n_components`` of the LDA estimator).
    random_state : int, RandomState instance or None, default None
        Forwarded to the LDA estimator. The fit is randomly initialised, so
        pass a fixed integer to get the same topics on every run. The
        default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(lda, count_data, count_vectorizer)``: the fitted model, the count
        data it was fitted on, and the fitted vectorizer.
    """
    count_vectorizer, count_data = get_ct(dat, col)

    # n_jobs=-1 asks scikit-learn to use every available processor.
    lda = LDA(n_components=k, n_jobs=-1, random_state=random_state)
    lda.fit(count_data)
    return lda, count_data, count_vectorizer

# print the topics for a given lda model and count_vectorizer
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterating ``components_`` must yield one array of per-word weights
        per topic (each array needs ``argsort``).
    count_vectorizer : fitted vectorizer
        Supplies the vocabulary through ``get_feature_names_out()`` or, on
        older scikit-learn releases, ``get_feature_names()``.
    n_top_words : int
        How many words to print for each topic.

    Returns
    -------
    None
        Output goes to stdout: a blank line, ``Topic #<i>:``, then the
        space-separated words ordered from heaviest to lightest.
    """
    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new name and fall back for old installs.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so slice backwards from the end to take the
        # n_top_words largest weights, heaviest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print('\nTopic #%d:' % topic_idx)
        print(' '.join(words[i] for i in top_indices))

# get the lda visual in html as specified in filepath
def get_vis_html(filepath, lda, count_data, count_vecorizer):
    """Save a pyLDAvis view of a fitted LDA model as an HTML file.

    Parameters
    ----------
    filepath : str
        Destination handed to ``pyLDAvis.save_html``.
    lda : fitted LDA model
        First argument of ``sklearn_lda.prepare``.
    count_data : count data the model was fitted on
        Second argument of ``sklearn_lda.prepare``.
    count_vecorizer : fitted vectorizer
        Third argument of ``sklearn_lda.prepare``. The misspelled name is
        kept so existing keyword callers keep working.

    Returns
    -------
    None
    """
    pyLDAvis.save_html(
        sklearn_lda.prepare(lda, count_data, count_vecorizer),
        filepath,
    )


In [7]:
# Location of the January debate transcript CSV.
# NOTE(review): absolute, machine-specific path; this cell only runs where
# this exact file exists. Consider a path relative to a configurable data dir.
addr = 'D:/OneDrive/File/College/Umich/DataTeam/Debate-bingo/data/january_transcript.csv'
# Load the transcript; get_corpus adds the cleaned 'text_processed' column.
jan = get_corpus(addr)
# Fit a 5-topic LDA model. cd is the count data and cv the fitted vectorizer.
# No random seed is passed here, so the topics printed below may differ
# between runs.
lda, cd, cv = get_lda(jan, 'text_processed', 5)
# Print the 10 highest-weighted words of each topic.
print_topics(lda, cv, 10)
# Optional: uncomment to also save the interactive pyLDAvis page to jan_lda.html.
#get_vis_html('jan_lda.html', lda, cd, cv)


Topic #0:
going need trade iran think got agreement military look know

Topic #1:
ve don healthcare said think medicare going people country troops

Topic #2:
going people president nuclear got trump say climate ve make

Topic #3:
senator warren thank sanders klobuchar people care ve right need

Topic #4:
president people going trump war ve vice biden mr right
