In [44]:
# This is a notebook to utilize LDA topic modeling for social media data
# and then visualizing it utilizing pyLDAvis

# A lot of this is from websites such as:
# https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/
# http://hojunhao.github.io/sgparliament/LDA.html

# Conda does not include gensim and pyLDAvis, so you must install those
# via pip install from bash

In [45]:
# Bring in social media posts from a CSV file
import pandas as pd

# You need to change this file name below to whichever CSV file that you are using
csv_file = 'user_timeline_Illinois_Alma.csv'

# Give the name of the column with the social data in your CSV file
dataheader = 'Tweet'

# Assign the number of topics
topicsnumber = 4

In [46]:
# Initializing document list for topic modeling
doc_complete = []

smdata = pd.read_csv(csv_file)
for row in smdata[dataheader]:
    doc_complete.append(row)

print("\n Now you have a list called doc_complete that has stored all of your social media posts! \n")


 Now you have a list called doc_complete that has stored all of your social media posts! 



In [47]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
# Add Twitter stop words
stop.add('rt')
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]

In [48]:
# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [49]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=topicsnumber, id2word = dictionary, passes=50)

In [50]:
# Now import pyLDAvis to visualize
import pyLDAvis.gensim

In [51]:
# We need to create a corpus in market matrix format
corpora.MmCorpus.serialize('temp.mm', doc_term_matrix)
corpus = corpora.MmCorpus('temp.mm')
print(corpus)

MmCorpus(3174 documents, 11019 features, 36576 non-zero entries)


In [52]:
# Visualize!
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
