#Presidential Candidate Topic Analysis of Tweets
#By Joshua E. Jodesty

In [29]:
# CREDITS
import graphlab as gl
from pymongo import MongoClient
import pickle
from requests_oauthlib import OAuth1
import cnfg
import csv
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
import nltk
import string
from nltk.tag import pos_tag
from gensim import corpora, models, similarities
import re
from gensim.models import Word2Vec as w2v
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import graphlab as gl
import pyLDAvis
import pyLDAvis.graphlab
from scipy.stats import mode

# Data Extraction with Twitter API via TwitterSearch

# LDA: Topic Analysis & Modeling with GraphLab

### Step 1: Connect to MongoDB database containing tables of tweets concerning Hillary Clinton & Donald Trump

In [30]:
client = MongoClient('54.69.199.34', 27017)
dbh = client.election2016
hillary_tweets = dbh.hillary_tweets
dbt = client.election2016
trump_tweets = dbt.trump_tweets

###2: Convert collections of tweet statuses into an SArray of tokenized and stemmed documents

In [31]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [32]:
# h_statuses = []
# for tweet in list(hillary_tweets.find()):
#         h_statuses.append(re.sub('(http)[^ ]+', '',tweet['text'].encode('utf-8')))

In [33]:
h_statuses_tok_stem = []
for tweet in list(hillary_tweets.find()):
        h_statuses_tok_stem.append(tokenize_and_stem(re.sub('(http)[^ ]+', '',tweet['text'])))

h_statuses = []
for i in range(len(h_statuses_tok_stem)):
    h_statuses.append(' '.join(word for word in h_statuses_tok_stem[i]).encode('utf-8'))

In [195]:
# t_statuses = []
# for tweet in list(trump_tweets.find()):
#         t_statuses.append(re.sub('(http)[^ ]+', '',tweet['text'].encode('utf-8')))

In [34]:
t_statuses_tok_stem = []
for tweet in list(trump_tweets.find()):
        t_statuses_tok_stem.append(tokenize_and_stem(re.sub('(http)[^ ]+', '',tweet['text'])))
        
t_statuses = []
for i in range(len(h_statuses_tok_stem)):
    t_statuses.append(' '.join(word for word in h_statuses_tok_stem[i]).encode('utf-8'))

###3: Test/Train split bag of words
- a. Convert collections of tweets to unigram, bigram, & trigram bag of words / document vector 
- b. Remove Stop/Irrelevant-words
- c. Test/Train split 

In [35]:
def getdocs(statuses):
    docs1 = gl.text_analytics.count_words(gl.SArray(statuses))
    docs1 = docs1.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)
    docs1 = docs1.dict_trim_by_keys(['key', "they're", "#hillaryclinton", '…',
                                    "don't", 'rt', '&amp;', '-', "clinton's", '|',
                                    'clinton', 'hillary',"'s",'hillari', 'hillaryclinton',
                                    "n't", 'news', 'tri', 'point', 'hillary2016',
                                    "they'r", 'countryboyiif', 'someon'], exclude=True)
    docs2 = gl.text_analytics.count_ngrams(gl.SArray(statuses), 2)
    docs2 = docs2.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)
    # docs2 = docs2.dict_trim_by_keys([], 
    #                               exclude=True)
    docs3 = gl.text_analytics.count_ngrams(gl.SArray(statuses), 3)
    docs3 = docs3.dict_trim_by_keys(gl.text_analytics.stopwords(), exclude=True)
    # docs3 = docs3.dict_trim_by_keys([], 
    #                               exclude=True)
    return docs1, docs2, docs3

#fffffffffffffffffffffffffffffffff
def splits_and_docs(statuses):
    docs1, docs2, docs3 = getdocs(statuses)
    train1, test1 = gl.text_analytics.random_split(docs1, prob=0.75)
    train2, test2 = gl.text_analytics.random_split(docs2, prob=0.75)
    train3, test3 = gl.text_analytics.random_split(docs3, prob=0.75)
    return train1, test1, train2, test2, train3, test3, docs1, docs2, docs3

In [36]:
h_train1, h_test1, h_train2, h_test2, h_train3, h_test3, h_docs1, h_docs2, h_docs3 = splits_and_docs(h_statuses)

In [37]:
t_train1, t_test1, t_train2, t_test2, t_train3, t_test3, t_docs1, t_docs2, t_docs3 = splits_and_docs(t_statuses)

###4: Fit N_Gram Models for Hillary & Trump on respective training sets

In [178]:
mh1 = gl.topic_model.create(h_train1, num_topics=10, alpha=0.1)
mh2 = gl.topic_model.create(h_train2, num_topics=10, alpha=0.1)
mh3 = gl.topic_model.create(h_train3, num_topics=10, alpha=0.1)

PROGRESS: Learning a topic model
PROGRESS:        Number of documents     10163
PROGRESS:            Vocabulary size      3340
PROGRESS:    Running collapsed Gibbs sampling
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | 10        | 192.039ms     | 1.0885e+06     | 0               |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: Learning a topic model
PROGRESS:        Number of documents     10163
PROGRESS:            Vocabulary size     10133
PROGRESS:    Running collapsed Gibbs sampling
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | 10        | 280.483ms     | 1.39135e+06   

In [179]:
mt1 = gl.topic_model.create(t_train1, num_topics=10, alpha=0.1)
mt2 = gl.topic_model.create(t_train2, num_topics=10, alpha=0.1)
mt3 = gl.topic_model.create(t_train3, num_topics=10, alpha=0.1)

PROGRESS: Learning a topic model
PROGRESS:        Number of documents     10163
PROGRESS:            Vocabulary size      3238
PROGRESS:    Running collapsed Gibbs sampling
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | 10        | 217.46ms      | 674222         | 0               |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: Learning a topic model
PROGRESS:        Number of documents     10163
PROGRESS:            Vocabulary size     10437
PROGRESS:    Running collapsed Gibbs sampling
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | 10        | 420.4ms       | 1.23649e+06   

###5: 10 Topics 

#### Hillary

In [180]:
print "Hillary Clinton"
print "Unigrams:", mode(mh1.predict(h_test1, output_type='assignment'))
print mh1.get_topics().print_rows(num_rows=100)

Hillary Clinton
Unigrams: (array([1]), array([ 1453.]))
+-------+---------------+-----------------+
| topic |      word     |      score      |
+-------+---------------+-----------------+
|   0   |     state     | 0.0218898385565 |
|   0   |     weigh     |  0.020465337132 |
|   0   |  controversi  |  0.020465337132 |
|   0   |     begin     | 0.0185660018993 |
|   0   |     polit     | 0.0185660018993 |
|   1   |     votin     | 0.0853773584906 |
|   1   |     merica    | 0.0699399656947 |
|   1   |     trump     | 0.0373499142367 |
|   1   |     obama     | 0.0347770154374 |
|   1   |      whi      | 0.0339193825043 |
|   2   |     presid    | 0.0453762466002 |
|   2   |     black     | 0.0263372620127 |
|   2   |    support    | 0.0227107887579 |
|   2   |     peopl     | 0.0186310063463 |
|   2   |      bad      | 0.0154578422484 |
|   3   |     email     | 0.0484892086331 |
|   3   |     berni     | 0.0350599520384 |
|   3   |     sander    | 0.0293045563549 |
|   3   |     server

#### Trump

In [181]:
print "Donald Trump"
print "Unigrams:", mode(mt1.predict(h_test1, output_type='assignment'))
print mt1.get_topics().print_rows(num_rows=100)

Donald Trump
Unigrams: (array([9]), array([ 1238.]))
+-------+-------------+-----------------+
| topic |     word    |      score      |
+-------+-------------+-----------------+
|   0   |    trump    | 0.0995322657365 |
|   0   |     bush    |  0.097153955922 |
|   0   |     tie     | 0.0781274774061 |
|   0   |     team    | 0.0674250832408 |
|   0   |     babi    | 0.0642540034882 |
|   1   |    email    | 0.0604079019572 |
|   1   |    sander   | 0.0256539235412 |
|   1   |   democrat  | 0.0219956100238 |
|   1   | controversi | 0.0215383208341 |
|   1   |    berni    | 0.0192518748857 |
|   2   |   benghazi  | 0.0283029117775 |
|   2   |     amp     | 0.0158083441982 |
|   2   |    spark    | 0.0147218600608 |
|   2   |   concern   | 0.0141786179922 |
|   2   |    attack   | 0.0141786179922 |
|   3   |    trump    | 0.0652486734419 |
|   3   |    donald   | 0.0544563360014 |
|   3   |     poll    | 0.0364691069341 |
|   3   |   florida   | 0.0234283658602 |
|   3   |    state    |

###6: Model Evaluation: Perplexity - Ridiculously High

In [185]:
print "Hillary", mh1.evaluate(h_train1, h_test1)

Hillary {'perplexity': 531.2560270747849}


In [183]:
print "Trump", mt1.evaluate(t_train1, t_test1)

Trump {'perplexity': 548.403099751884}


###7: Visualization in D3.js via pyLDAvis Python wrapper

In [38]:
# turn on automatic rendering of visualizations
pyLDAvis.enable_notebook()

In [39]:
mh = gl.topic_model.create(h_docs1, num_topics=10, alpha=0.1)

PROGRESS: Learning a topic model
PROGRESS:        Number of documents     10163
PROGRESS:            Vocabulary size      6365
PROGRESS:    Running collapsed Gibbs sampling
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | 10        | 468.987ms     | 1.66091e+06    | 0               |
PROGRESS: +-----------+---------------+----------------+-----------------+


In [40]:
mt = gl.topic_model.create(t_docs1, num_topics=10, alpha=0.1)

PROGRESS: Learning a topic model
PROGRESS:        Number of documents     10163
PROGRESS:            Vocabulary size      6365
PROGRESS:    Running collapsed Gibbs sampling
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | 10        | 513.35ms      | 2.00611e+06    | 0               |
PROGRESS: +-----------+---------------+----------------+-----------------+


#### Hillary

In [41]:
print "Hillary Clinton"
pyLDAvis.graphlab.prepare(mh, h_docs1)

Hillary Clinton


#### Trump

In [42]:
print "Donald Trump"
pyLDAvis.graphlab.prepare(mt, t_docs1)

Donald Trump


# K-Means Clustering

In [61]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")

In [105]:
def cluster(statuses, true_k=10):
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(statuses)
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
    model.fit(X)
    print("Top terms per cluster:")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    print ""
    for i in range(true_k):
        words=[]
        print("Cluster %d:" % i)
        for ind in order_centroids[i, :10]:
            words.append(terms[ind])
        for word in words:
            print word,
        print ""
        print ""
    return model, X, 

In [196]:
cluster(h_statuses,true_k=15)


Top terms per cluster:

Cluster 0:
privat server use regret email hillari clintonemail texasshebandit wakeupamerica john 

Cluster 1:
spokesperson benshapiro discuss hire break wipe new campaign server rt 

Cluster 2:
campaign upend yarmuth compar bradi scandal deflateg tom email hillari 

Cluster 3:
classifi server email say clinton wipe wsj campaign hillari lawyer 

Cluster 4:
tie bush trump team anchor babi seek clinton jeb campa 

Cluster 5:
nytim black sacrifici lamb treat cow sacr nytopinion peopl like 

Cluster 6:
poll trump donald florida hillari clinton state rt deez pennsylvania 

Cluster 7:
torpedo did obama whi washtim republican alreadi hillari jail clinton 

Cluster 8:
nail birthright citizenship levin mark noamnesti drmartyfox illeg interview hanniti 

Cluster 9:
votin merica someon say cloydriv countryboyiif rt hillari clinton trump 

Cluster 10:
spark concern benghazi giant mock email clever tweet gun remington 

Cluster 11:
hillari clinton rt just hillaryclinton email

(KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=15, n_init=1,
     n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
     verbose=0), <10163x5985 sparse matrix of type '<type 'numpy.float64'>'
 	with 97876 stored elements in Compressed Sparse Row format>)

In [197]:
cluster(t_statuses,true_k=15)

Top terms per cluster:

Cluster 0:
torpedo obama whi washtim hillari clinton monicacrowley crowley monica mi 

Cluster 1:
server campaign spokesperson benshapiro discuss hire break wipe new email 

Cluster 2:
sander berni lost point week just wi mulawpol hillari mu 

Cluster 3:
tie bush trump team anchor babi seek clinton jeb campa 

Cluster 4:
votin merica someon say cloydriv countryboyiif rt hillari clinton trump 

Cluster 5:
ohio pennsylvania florida match holi crap numbersmunch trustworthi frontrunn frankluntz 

Cluster 6:
black live polit matter lamb sacrifici treat cow sacr nytopinion 

Cluster 7:
nail birthright citizenship levin mark noamnesti drmartyfox illeg interview hanniti 

Cluster 8:
donald trump deez nut poll hillari clinton surg florida slip 

Cluster 9:
mock giant tweet gun clever remington theblaz hillaryclinton remingtonarm suit 

Cluster 10:
trustworthi honest yes thefix think problem voter 3rds fl pa 

Cluster 11:
did republican alreadi jail reel washtim trump car

(KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=15, n_init=1,
     n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
     verbose=0), <10163x5985 sparse matrix of type '<type 'numpy.float64'>'
 	with 97876 stored elements in Compressed Sparse Row format>)

# Word2Vec

In [43]:
h_statuses_ascii = []
for tweet in list(hillary_tweets.find()):
        h_statuses_ascii.append(re.sub('(http)[^ ]+', '',tweet['text']))

In [44]:
t_statuses_ascii = []
for tweet in list(trump_tweets.find()):
        t_statuses_ascii.append(re.sub('(http)[^ ]+', '',tweet['text']))

In [None]:
#remove proper names
%time h_preprocess = [strip_proppers(status) for status in h_statuses_ascii]
%time t_preprocess = [strip_proppers(status) for status in t_statuses_ascii]

#tokenize
%time h_tokenized_text = [tokenize_and_stem(text) for text in h_preprocess]
%time t_tokenized_text = [tokenize_and_stem(text) for text in t_preprocess]

#remove stop words
%time h_texts = [[word for word in text if word not in stopwords.words()] for text in h_tokenized_text]
%time t_texts = [[word for word in text if word not in stopwords.words()] for text in t_tokenized_text]

CPU times: user 12.2 s, sys: 11.1 s, total: 23.3 s
Wall time: 16.2 s
CPU times: user 11.6 s, sys: 9.83 s, total: 21.5 s
Wall time: 15.2 s
CPU times: user 10.9 s, sys: 9.01 s, total: 19.9 s
Wall time: 14 s
CPU times: user 12.2 s, sys: 8.8 s, total: 21 s
Wall time: 15.2 s
CPU times: user 5min 35s, sys: 1min 45s, total: 7min 20s

In [299]:
h_model = w2v(h_texts, size=100, window=5, min_count=1, workers=4,sg=1)
t_model = w2v(t_texts, size=100, window=5, min_count=1, workers=4,sg=1)

In [359]:
h_model.most_similar(positive=['email', 'fbi'])

[(u'secret', 0.5495145320892334),
 (u'+video', 0.4835871160030365),
 (u'materi', 0.48173990845680237),
 (u'govt', 0.47390615940093994),
 (u'dhrxsol1234', 0.4716964662075043),
 (u'aid', 0.4661208391189575),
 (u'privat', 0.45924410223960876),
 (u'unprotect', 0.45504775643348694),
 (u'retroact', 0.45098018646240234),
 (u'clean', 0.4288298785686493)]

In [330]:
h_model.most_similar(positive=['public','opinion'])

[(u'handl', 0.7293180227279663),
 (u'racism', 0.709782063961029),
 (u'data', 0.6904987692832947),
 (u'way', 0.6818188428878784),
 (u'doe', 0.6672757863998413),
 (u"she'wip", 0.6624436974525452),
 (u'neither', 0.6459596753120422),
 (u'remain', 0.638897180557251),
 (u'info', 0.638538122177124),
 (u'aggronat', 0.6348254680633545)]

In [400]:
h_model.most_similar(positive=['vpotus'])

KeyError: "word 'vpotus' not in vocabulary"

In [328]:
t_model.most_similar(positive=['public','opinion'])

[(u'cost', 0.8708800077438354),
 (u'posit', 0.8399043083190918),
 (u'stupid', 0.8359530568122864),
 (u'ufc', 0.8357139229774475),
 (u'president*', 0.8266575336456299),
 (u"intellect'for", 0.8219687938690186),
 (u'sooner', 0.8177927732467651),
 (u'walahi', 0.8078489899635315),
 (u"wall'round", 0.8000162839889526),
 (u'strength', 0.7981834411621094)]

In [349]:
t_model.most_similar(positive=['deez', 'nutz'])

[(u'everybodi', 0.6605314612388611),
 (u'norm', 0.6603173017501831),
 (u'remov', 0.6553205847740173),
 (u'campaig', 0.6405559182167053),
 (u'bro', 0.6391571760177612),
 (u'throw', 0.6381931900978088),
 (u'tbh', 0.6375591158866882),
 (u'today', 0.6374085545539856),
 (u'id', 0.6340110301971436),
 (u'market', 0.6328635215759277)]

In [369]:
t_model.most_similar(positive=['money'])

[(u'alway', 0.8137528896331787),
 (u'could', 0.7805806994438171),
 (u'shelbychong', 0.7560003995895386),
 (u'consequ', 0.7216663360595703),
 (u'elle_emm_aitch', 0.6869857311248779),
 (u'sent', 0.6844447255134583),
 (u'option', 0.683176577091217),
 (u'stood', 0.6818699836730957),
 (u'efelsenth', 0.674308180809021),
 (u'give', 0.6727308034896851)]

In [394]:
t_model.most_similar(positive=['potus'])

[(u'*in', 0.5780456066131592),
 (u'dailyko', 0.5230739712715149),
 (u'cnn', 0.5207012295722961),
 (u'dynamictunez', 0.5005640387535095),
 (u'aldotcom', 0.47949233651161194),
 (u'comp\u2026', 0.46756237745285034),
 (u'doom', 0.46511128544807434),
 (u'bagelhusband', 0.4650976061820984),
 (u'beritaterkini', 0.461683988571167),
 (u'stoolpresident', 0.4475025236606598)]

# Conclusions/Future:
### 1. Need documents with more than 144 characters or more tweets to extract more insightful topics 
### 2. Gensim LDA model take a long time to train