In [9]:
# This is a notebook to utilize LDA topic modeling for social media data
# and then visualizing it utilizing pyLDAvis

# A lot of this is from websites such as:
# https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/
# http://hojunhao.github.io/sgparliament/LDA.html

# Conda does not include gensim and pyLDAvis, so you must install those
# via pip install from bash

In [31]:
# Bring in social media posts from a CSV file
import pandas as pd
import string
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

# You need to change this file name below to whichever CSV file that you are using
# You also need to change below what the column name in your CSV file is that holds your social media data
# IF YOU ARE PULLING IN FROM NODEXL, MAKE SURE THE FIRST ROW IS THE HEADER NAME!!!
csv_file = 'trumptweets.csv'
data_header = 'Tweet'
twitteryes = 'TRUE'

# Initializing document list for topic modeling
doc_complete = []

# Working with tokenizer specially made for Tweets
tweet_tokenizer = TweetTokenizer()
punct = list(string.punctuation)
stopword_list = stopwords.words('english') + punct + ['rt', 'via', '...'] 

# From Bonzanini
def process(text, tokenizer=TweetTokenizer(), stopwords=[]): 
    text = text.lower() 
    tokens = tokenizer.tokenize(text) 
    return [tok for tok in tokens if tok not in stopwords and not tok.isdigit()]

# If we are dealing with Twitter data
if twitteryes:
    smdata = pd.read_csv(csv_file)
    for line in smdata:
        tokens = process(line, tokenizer=tweet_tokenizer, stopwords=stopword_list) 
        doc_complete.append(tokens)

print("\n Now you have a list called doc_complete that has stored all of your social media posts! \n")

print(smdata)


 Now you have a list called doc_complete that has stored all of your social media posts! 

                                                  ﻿Tweet
0      #FreeSouthernCameroons\r#FreeALLarrested\r#sto...
1      @jlzorzi @mirthalegrand y como fue a la asunci...
2      @NewssTrump @Dinkiedow -Yea, DO IT! Miracles d...
3      I want .@realDonaldTrump &amp; his incompetent...
4      I bet @realDonaldTrump wouldn't want us to kno...
5      @realDonaldTrump Why do you always lie? @Barac...
6      @Old_Bern_Kenobi @realDonaldTrump - I've been ...
7      Estoy totalmente de acuerdo @realDonaldTrump \...
8      .@realDonaldTrump vacation last weekend cost t...
9      RT @sydneyrachel: lol at anyone who thought a ...
10     @realDonaldTrump let's make America grey again...
11     RT @realDonaldTrump: Heading to Joint Base And...
12     No #WMD &amp; Many 100,000s #Iraqi Civilians K...
13     RT @X123Alpha: No #WMD &amp; Many 100,000s #Ir...
14     RT @X123Alpha: No #WMD &amp; Many 100,000s #Ir

In [22]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]

In [23]:
# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [24]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)

In [68]:
# Now import pyLDAvis to visualize
import pyLDAvis.gensim

In [25]:
# We need to create a corpus in market matrix format
corpora.MmCorpus.serialize('temp.mm', doc_term_matrix)
corpus = corpora.MmCorpus('temp.mm')
print(corpus)

MmCorpus(1 documents, 1 features, 1 non-zero entries)


In [70]:
# Visualize!
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
