# LDA analysis

In [1]:
# TODO: POS, NER, BI/TRI-GRAMS ???

In [2]:
# Import libraries
import pandas as pd
import csv, html, re
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
# Load data
data = pd.read_csv("drive/MyDrive/data_train_balanced_7-6.csv").drop(['usefulCount', 'Unnamed: 0'], axis = 1)
data['review'] = data['review'].apply(html.unescape)

In [5]:
# View data head
data.head()

Unnamed: 0,drugName,condition,review,rating,date,positiveness
0,Daytrana,ADHD,"""Hi all, My son who is 12 was diagnosed when h...",10.0,"January 12, 2017",1
1,Vyvanse,ADHD,"""Today was my first day taking Vyvanse and wow...",9.0,"August 31, 2011",1
2,Lisdexamfetamine,ADHD,"""I have been on Vyvanse for almost 2 years and...",10.0,"March 10, 2014",1
3,Atomoxetine,ADHD,"""28 years old, 165 pounds. Severe paranoia fro...",9.0,"November 27, 2011",1
4,Concerta,ADHD,"""Some of these experiences are quite alarming!...",10.0,"February 7, 2012",1


In [6]:
data.shape

(102752, 6)

## Preprocessing

In [7]:
# Import Text Mining libraries
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from textblob import TextBlob
import gensim
from gensim.models.coherencemodel import CoherenceModel

In [8]:
# Define useful classes
%%capture
wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer = PorterStemmer()
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
def correct (text):
  textBlb = TextBlob(text)        
  textCorrected = textBlb.correct()
  return textCorrected

In [10]:
def preprocessing(text):

  # Tokenization
  tokens = word_tokenize(text.lower())

  # Keep only alphabet strings
  tokens = [t for t in tokens if t.isalpha()]

  # Correct text
  # tokens = [correct(t) for t in tokens]

  # Stop words removal
  tokens = [t for t in tokens if t not in stop_words]

  # Lemmatization
  # tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]

  # Stemming (Porter stemmer)
  tokens = [porter_stemmer.stem(t) for t in tokens]

  # Short words removal
  # tokens = [t for t in tokens if len(t) > 2]

  return tokens

In [11]:
# Preprocess data
reviews = data['review'].apply(preprocessing)

In [12]:
# Create dictionary
dictionary = gensim.corpora.Dictionary(reviews)

# Filter words that occurs in less than 'less'% documents and words that occurs in more than 'more'% of total documents. Keep top 'keep_n' frequent words
less = 10
more = 65
keep_n = 100000
dictionary.filter_extremes(no_below = less/100, no_above = more/100, keep_n = keep_n)

# Create BoW
bow_corpus = [dictionary.doc2bow(doc) for doc in reviews]

## LDA

In [13]:
# Get best LDA according to highest coherence based on number of topics
def get_best_LDA():
  best_coherence = 0
  best_lda_model = 0
  for num_topics in [2,4,6,8]:
    lda_model = gensim.models.LdaModel(corpus = bow_corpus,
                                        id2word = dictionary,
                                        num_topics = num_topics,
                                        offset = 2,
                                        random_state = 123,
                                        update_every = 1,
                                        passes = 10,
                                        alpha = 'auto',
                                        eta = "auto",
                                        per_word_topics = True)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=reviews, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    if coherence_lda > best_coherence:
      best_coherence = coherence_lda
      best_lda_model = lda_model
  return best_lda_model

In [14]:
lda_model = get_best_LDA()



In [15]:
from pprint import pprint
pprint(lda_model.print_topics())

[(0,
  '0.028*"depress" + 0.023*"effect" + 0.023*"anxieti" + 0.020*"feel" + '
  '0.019*"side" + 0.014*"drug" + 0.013*"sleep" + 0.010*"medic" + '
  '0.010*"thought" + 0.010*"life"'),
 (1,
  '0.098*"pain" + 0.025*"infect" + 0.023*"effect" + 0.022*"side" + '
  '0.017*"sever" + 0.015*"relief" + 0.012*"blood" + 0.011*"drug" + 0.010*"leg" '
  '+ 0.010*"muscl"'),
 (2,
  '0.055*"day" + 0.028*"take" + 0.021*"feel" + 0.017*"took" + 0.014*"like" + '
  '0.014*"first" + 0.013*"night" + 0.013*"hour" + 0.011*"start" + '
  '0.011*"time"'),
 (3,
  '0.065*"read" + 0.054*"review" + 0.022*"yi" + 0.021*"psoriasi" + '
  '0.019*"peopl" + 0.015*"experi" + 0.012*"everyon" + 0.011*"comment" + '
  '0.011*"im" + 0.010*"say"'),
 (4,
  '0.030*"period" + 0.027*"month" + 0.022*"cymbalta" + 0.021*"pill" + '
  '0.017*"shot" + 0.016*"got" + 0.016*"week" + 0.016*"get" + 0.016*"cramp" + '
  '0.014*"insert"'),
 (5,
  '0.077*"weight" + 0.043*"lost" + 0.041*"gain" + 0.033*"pound" + 0.033*"side" '
  '+ 0.032*"effect" + 0.028*

In [16]:
# Compute Perplexity (lower is better)
print('Perplexity: ', lda_model.log_perplexity(bow_corpus))  

# Compute Coherence Score (higher is better)
coherence_model_lda = CoherenceModel(model=lda_model, texts=reviews, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

Perplexity:  -6.969130695678051

Coherence Score:  0.5203136326333659


In [17]:
%%capture
!pip install pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
import os

In [18]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
vis

In [24]:
# Find stemmed words
word = "tri\w+"
sorted(dict(zip(*np.unique([itm[0] for itm in data['review'].str.findall(r'{}'.format(word)) if len(itm)>0], return_counts = True))).items(), key= lambda x: x[1], reverse = True)[:5]

  word = "tri\w+"


[('tried', 7724),
 ('trin', 2280),
 ('trist', 601),
 ('trial', 520),
 ('triosis', 480)]