In [127]:
import pandas as pd
import numpy as np

import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaMulticore
from gensim import corpora

import spacy
spacy.load('en', disable=['parser', 'ner'])

!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim

from google.colab import drive



In [128]:
# Mount Google Drive at /content/gdrive/ so the dataset under "My Drive" can be read.
drive.mount("/content/gdrive/")

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [129]:
# Read the tweets CSV from the mounted Drive into a DataFrame
data = pd.read_csv(
    filepath_or_buffer='/content/gdrive/My Drive/Datasets/Tweets.csv',
)

In [124]:
def preprocess_text(tweets, allowed_pos_tags=('NOUN', 'VERB', 'ADV', 'ADJ')):
  """Lemmatize tweets and keep only tokens with an allowed part of speech.

  Each tweet is normalized with gensim's ``simple_preprocess`` (lower-casing,
  de-accenting, punctuation removal), re-joined into a single string, and
  tagged by spaCy's ``en_core_web_sm`` pipeline.

  Args:
    tweets: Iterable of tweet texts. Entries that are not ``str`` (for
      example NaN from an empty CSV cell) are treated as empty documents so
      the result stays index-aligned with the input.
    allowed_pos_tags: Collection of coarse spaCy POS tags (``token.pos_``)
      to keep.

  Returns:
    A list with one entry per input tweet; each entry is the list of lemmas
    of the tokens whose POS tag is in ``allowed_pos_tags``.
  """
  allowed = frozenset(allowed_pos_tags)
  # Only ``pos_`` and ``lemma_`` are read below, so skip the parser and NER.
  nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

  normalized = (
      ' '.join(simple_preprocess(tweet, deacc=True)) if isinstance(tweet, str) else ''
      for tweet in tweets
  )
  # nlp.pipe processes the texts in batches instead of one pipeline call per tweet.
  return [
      [token.lemma_ for token in doc if token.pos_ in allowed]
      for doc in nlp.pipe(normalized)
  ]

In [106]:
# Lemmatized, POS-filtered token lists, one per tweet
texts = preprocess_text(data['text'])

# Dictionary built from the tokenized tweets
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: each tweet converted with the dictionary's doc2bow
corpus = list(map(id2word.doc2bow, texts))

In [126]:
# Peek at the first three documents to confirm the corpus is in bag-of-words format
corpus[:3]

[[(0, 1)],
 [(1, 1), (2, 1), (3, 1), (4, 1)],
 [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]]

**Train LDA Model**

In [144]:
# Fit an 8-topic LDA model on the bag-of-words corpus.
# NOTE(review): the captured output of this cell is thousands of repeated
# warning lines; consider clearing it before sharing the notebook.
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=8,
    alpha=0.4,
    passes=10,
    workers=4,
    random_state=1,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad 

In [145]:
# Score the fitted model's topic coherence against the tokenized tweets
coherence_model = CoherenceModel(
    model=lda_model,
    texts=texts,
    corpus=corpus,
    dictionary=id2word,
)
coherence_model.get_coherence()

0.3123169394386769

In [146]:
# Render the interactive pyLDAvis topic view inline in the notebook
pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)
vis