In [1]:
# Mathematical/data cleaning tools
import numpy as np
import pandas as pd

#sklearn
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix

#gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
# nltk.download('wordnet')

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

## Twitter Data

In [2]:
twitter = pd.read_csv('data/full-corpus.csv')
# twitter2 = pd.read_csv('data/twitter_en.txt', sep='\n')

In [3]:
twitter.head()

Unnamed: 0,Topic,Sentiment,TweetId,TweetDate,TweetText
0,apple,positive,126415614616154112,Tue Oct 18 21:53:25 +0000 2011,Now all @Apple has to do is get swype on the i...
1,apple,positive,126404574230740992,Tue Oct 18 21:09:33 +0000 2011,@Apple will be adding more carrier support to ...
2,apple,positive,126402758403305474,Tue Oct 18 21:02:20 +0000 2011,Hilarious @youtube video - guy does a duet wit...
3,apple,positive,126397179614068736,Tue Oct 18 20:40:10 +0000 2011,@RIM you made it too easy for me to switch to ...
4,apple,positive,126395626979196928,Tue Oct 18 20:34:00 +0000 2011,I just realized that the reason I got into twi...


In [4]:
twitter.Topic.unique()

array(['apple', 'google', 'microsoft', 'twitter'], dtype=object)

In [5]:
arr = twitter['TweetText'].to_numpy()
type(arr[0])

str

In [12]:
type(twitter['TweetText'][0])

str

In [7]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text, min_len=0):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > min_len:
            result.append(lemmatize_stemming(token))
    return result

In [8]:
stemmer = SnowballStemmer('english')
processed_docs = twitter['TweetText'].map(preprocess)

In [9]:
type(processed_docs)

pandas.core.series.Series

In [10]:
#create dictionary
id2word = gensim.corpora.Dictionary(processed_docs)

#create corpus
texts = processed_docs

#Term Document Frequency
bow_corpus = [id2word.doc2bow(text) for text in texts]

In [11]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in bow_corpus]

[[('appl', 1), ('crack', 1), ('iphon', 2), ('swype', 1)],
 [('appl', 1),
  ('iphon', 1),
  ('add', 1),
  ('announc', 1),
  ('carrier', 1),
  ('support', 1)],
 [('appl', 1),
  ('affair', 1),
  ('duet', 1),
  ('exbnqji', 1),
  ('guy', 1),
  ('hilari', 1),
  ('http', 1),
  ('love', 1),
  ('pretti', 1),
  ('siri', 1),
  ('sum', 1),
  ('video', 1),
  ('youtub', 1)],
 [('appl', 1),
  ('iphon', 1),
  ('easi', 1),
  ('rim', 1),
  ('switch', 1),
  ('ya', 1)],
 [('appl', 1),
  ('get', 1),
  ('io', 1),
  ('realiz', 1),
  ('reason', 1),
  ('thank', 1),
  ('twitter', 1)],
 [('appl', 1),
  ('iphon', 1),
  ('android', 1),
  ('bite', 1),
  ('blackberri', 1),
  ('current', 1),
  ('disappoint', 1),
  ('littl', 1),
  ('user', 1)],
 [('appl', 1),
  ('http', 1),
  ('siri', 2),
  ('far', 1),
  ('give', 1),
  ('glad', 1),
  ('happyplac', 1),
  ('humor', 1),
  ('say', 1),
  ('sens', 1),
  ('sooo', 1),
  ('strangest', 1),
  ('thing', 1),
  ('twaeudbp', 1)],
 [('appl', 1),
  ('close', 1),
  ('event', 1),
  ('gr

In [12]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=4, id2word=id2word, passes=5, workers=2)

In [13]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[bow_corpus]

[(0,
  '0.049*"twitter" + 0.033*"http" + 0.026*"appl" + 0.015*"rt" + 0.013*"googl" '
  '+ 0.011*"microsoft" + 0.009*"android" + 0.009*"iphon" + 0.006*"new" + '
  '0.005*"que"'),
 (1,
  '0.031*"googl" + 0.028*"http" + 0.024*"microsoft" + 0.022*"twitter" + '
  '0.013*"android" + 0.009*"appl" + 0.009*"rt" + 0.008*"el" + 0.008*"samsung" '
  '+ 0.007*"nexus"'),
 (2,
  '0.056*"twitter" + 0.025*"http" + 0.017*"googl" + 0.015*"microsoft" + '
  '0.014*"en" + 0.013*"que" + 0.012*"rt" + 0.008*"facebook" + 0.007*"la" + '
  '0.006*"android"'),
 (3,
  '0.026*"twitter" + 0.019*"microsoft" + 0.014*"http" + 0.010*"appl" + '
  '0.009*"rt" + 0.004*"app" + 0.004*"window" + 0.004*"use" + 0.004*"facebook" '
  '+ 0.004*"build"')]


In [14]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.266558393865026

Coherence Score:  0.2845474989171778


In [16]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word, mds='mmds')
pyLDAvis.save_html(vis, 'media/LDA_4_topics.html')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
