<a href="https://colab.research.google.com/github/NUELBUNDI/NLP-using-Python/blob/master/Topic_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Topic Modelling techniques

1. Latent Semantic Analysis (LSA)
2. Probabilistic Latent Semantic Analysis (PLSA)
3. Latent Dirichlet Allocation (LDA)
4. Correlated Topic Model (CTM)



LDA assumes that documents are generated by a statistical generative process in which each document is a mixture of topics and each topic is a mixture of words.

In [None]:
# pip install pyLDAvis

In [None]:
import gensim
import nltk
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from gensim.models import CoherenceModel

import pandas as pd
import numpy as np

import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()


# package to print
from pprint import pprint

In [None]:
# Load the review data set; latin-1 is passed explicitly as the file encoding.
# NOTE(review): '/content/...' is an absolute path (presumably a Colab upload location) — confirm it exists before running elsewhere.
df = pd.read_csv('/content/Market review.csv',encoding='latin-1')

## Cleaning Methods

1. Data Cleaning Methods

    - lowercase
    - tokenize
    - remove stop words
    - remove punctuation
    - remove numbers
    - remove white space
    - remove special characters
    - remove URLS
    - remove Non-ASCII
    - remove html tags
    - lemmatize
    - stem
    - correct spelling
    - remove diacritics
    - remove emojis
    - expand contractions
    - case folding
    - remove inconsistent whitespace
    - spell checks and correction
    - word filtering
    - remove mark up languages
    - detection of language

In [None]:
import spacy
from nltk.corpus import stopwords
# nltk.download("stopwords")

# Load the English language model in spaCy
nlp = spacy.load("en_core_web_sm")

import re

In [None]:
# Fetch the WordNet corpus; WordNetLemmatizer (used by lemmatize/clean_text) presumably needs it at call time.
nltk.download('wordnet')

In [None]:
# lowercasing
def lowercase(text):
  """Return ``text`` converted to lower case."""
  lowered = text.lower()
  return lowered

# Tokenize
def tokenize(text):
  """Split ``text`` on runs of whitespace and return the resulting list of tokens."""
  tokens = text.split()
  return tokens

#Remove stopwords
def remove_stopwords(text):
  """Drop English stop words from ``text``.

  The text is tokenized with the module-level spaCy pipeline ``nlp``; every
  token whose lower-cased form is in NLTK's English stop-word list is removed
  and the remaining token texts are joined with single spaces.
  """
  # Build the lookup set once per call instead of once per token.
  english_stopwords = set(stopwords.words('english'))
  kept_tokens = [
      token.text for token in nlp(text)
      if token.text.lower() not in english_stopwords
  ]
  return ' '.join(kept_tokens)

#Punctuation removal
def removal_punctuation(text):
  """Strip every character that is neither a word character nor whitespace."""
  not_word_or_space = re.compile(r'[^\w\s]')
  return not_word_or_space.sub('', text)

# Number Removal
def remove_numbers(text):
    """Delete every run of digits from ``text``; surrounding characters are kept."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

#Special Character Removal
def remove_special_characters(text):
    """Keep only ASCII letters, digits and whitespace; drop every other character."""
    disallowed = re.compile(r'[^A-Za-z0-9\s]')
    return disallowed.sub('', text)

#Whitespace Removal
def remove_whitespace(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    pieces = text.split()
    return ' '.join(pieces)

# 11. Lemmatization (requires NLTK or spaCy)
from nltk.stem import WordNetLemmatizer

def lemmatize(text, lemmatizer):
    """Lemmatize each whitespace-separated word of ``text``.

    ``lemmatizer`` must expose a ``lemmatize(word)`` method; the results are
    joined back together with single spaces.
    """
    lemmas = []
    for word in text.split():
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)

# 12. Stemming (requires NLTK or spaCy)
from nltk.stem import PorterStemmer

def stem(text, stemmer):
    """Stem each whitespace-separated word of ``text``.

    ``stemmer`` must expose a ``stem(word)`` method; the results are joined
    back together with single spaces.
    """
    stems = []
    for word in text.split():
        stems.append(stemmer.stem(word))
    return ' '.join(stems)

#Word Filtering
def filter_words(text, min_length=3, stopwords=[]):
    """Keep only words of at least ``min_length`` characters that are not in ``stopwords``.

    Words are taken by splitting ``text`` on whitespace and the survivors are
    joined with single spaces. The ``stopwords`` default is only read, never
    mutated.
    """
    kept = []
    for word in text.split():
        if len(word) < min_length or word in stopwords:
            continue
        kept.append(word)
    return ' '.join(kept)



In [None]:

stemmer = PorterStemmer()
lemmer  = WordNetLemmatizer()

def clean_text(text):
  """Run the cleaning pipeline over one string and return the cleaned string.

  Steps, in order: lower-case, stop-word removal, punctuation removal, digit
  removal, special-character removal, whitespace normalisation, then
  lemmatization with the module-level ``lemmer``. ``tokenize`` and ``stem``
  are not applied here.
  """
  single_argument_steps = (
      lowercase,
      remove_stopwords,
      removal_punctuation,
      remove_numbers,
      remove_special_characters,
      remove_whitespace,
  )
  for step in single_argument_steps:
    text = step(text)
  return lemmatize(text, lemmer)


In [None]:
import nltk
# Fetch NLTK's stop-word corpus, which stopwords.words('english') reads in remove_stopwords and in the cells that follow.
nltk.download('stopwords')

## Approach One

##### Clean Data

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim's ``simple_preprocess``.

    Every item is coerced to ``str`` first and ``deacc=True`` is passed through
    to ``simple_preprocess``. Yields one list of tokens per input item.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )

# Tokenize every entry of the Market_Review column and store the token lists next to the raw text.
df['data_words'] = list(sent_to_words(df.Market_Review))

# After tokenization, remove stop words and apply lemmatization
from nltk.corpus import stopwords
# Start from NLTK's English stop-word list ...
stop_words = stopwords.words('english')
# ... and extend it in place with extra words to exclude from the topics.
stop_words.extend(['gikomba', 'however', 'still','various','also','increase', 'year','new','time'
                    ])  # adding my own stop words

def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ','ADV']):
    """Remove stop words, lemmatize, and keep only selected parts of speech.

    Args:
        texts: iterable of documents; each one is passed through ``str()`` and
            gensim's ``simple_preprocess`` before filtering.
        stop_words: words to discard, both before and after lemmatization.
            The default is bound to the notebook-level ``stop_words`` list at
            the time this cell is executed.
        allowed_postags: spaCy ``token.pos_`` values whose lemmas are kept.
            Only read, never mutated.

    Returns:
        A list holding one list of lemmas per input document.
    """
    # Set membership is O(1); scanning the list for every token was accidental O(n).
    stop_set = set(stop_words)

    # remove stop words
    texts = [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

    # Lemmatization with a local pipeline that has the parser and NER disabled
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])

    # remove stopwords once more after lemmatization (a lemma may itself be a stop word)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts_out
    ]

# Run the stop-word / lemmatization / POS filter over the tokenized reviews and keep the result per row.
df['data_ready'] = process_words(df.data_words)  # processed Text Data!

In [None]:
# Plain list-of-token-lists copy of the processed column; in the visible notebook only the commented-out phrase models reference it.
data_words = df['data_ready'].values.tolist()

In [None]:
# # Build the bigram and trigram models
# bigram     = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# trigram    = gensim.models.Phrases(bigram[data_words], threshold=100)
# # Faster way to get a sentence clubbed as a trigram/bigram
# bigram_mod  = gensim.models.phrases.Phraser(bigram)
# trigram_mod = gensim.models.phrases.Phraser(trigram)
# # See trigram example
# print(trigram_mod[bigram_mod[data_words[1]]])

##### Model Building

In [None]:
# Vocabulary built from the processed reviews (token <-> integer id)
id2word = corpora.Dictionary(df.data_ready)

# Bag-of-words corpus: one doc2bow() result per processed review
corpus = list(map(id2word.doc2bow, df.data_ready))


#### Coherence

##### What is the optimal number of topics?

Calculate the coherence score using u_mass:

In [None]:
import matplotlib.pyplot as plt

from gensim.models import LdaMulticore
# Train one LDA model per candidate topic count (1..19) and record its u_mass coherence.
topics = list(range(1, 20))
score  = []
for num_topics in topics:
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        iterations=50,
        passes=10,
        workers=4,
        random_state=100,
    )
    cm = CoherenceModel(model=lda_model, corpus=corpus, dictionary=id2word, coherence='u_mass')
    score.append(cm.get_coherence())

# Coherence as a function of the number of topics
plt.plot(topics, score)
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.show()

Calculating the coherence score using C_v:

In [None]:
# Same sweep as above, scored with c_v coherence (which also needs the tokenized texts).
topics = list(range(1, 20))
score  = []
for num_topics in topics:
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        iterations=10,
        passes=10,
        workers=4,
        random_state=100,
    )
    cm = CoherenceModel(
        model=lda_model,
        texts=df['data_ready'],
        corpus=corpus,
        dictionary=id2word,
        coherence='c_v',
    )
    score.append(cm.get_coherence())

# Coherence as a function of the number of topics
plt.plot(topics, score)
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.show()

The optimal number of topics is 3 or 10.

In [None]:
# Final model: 10 topics, trained online in chunks of 20 documents with a learned alpha.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=20,
    passes=100,
    alpha='auto',
    iterations=100,
    per_word_topics=True,
)

# Show each topic with its key words
pprint(lda_model.print_topics())

In [None]:
# Compute Perplexity
# NOTE(review): log_perplexity() returns a per-word likelihood bound, not the perplexity itself (gensim documents perplexity as 2**(-bound)); confirm how this number should be read.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # value printed is the log-scale bound; a lower *perplexity* is better

# Compute Coherence Score
# c_v coherence of the final model, computed from the tokenized texts and the dictionary
coherence_model_lda = CoherenceModel(model=lda_model, texts=df.data_ready, dictionary=id2word, coherence='c_v')
coherence_lda      = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# df['Environ_Review'][0]

In [None]:
# lda_model[corpus][0]


In [None]:
# One line per topic returned by show_topics: its id and up to 15 words joined by '/'.
for idx, topic in lda_model.show_topics(formatted=False, num_words= 15):
    top_words = '/'.join(word for word, _weight in topic)
    print('Topic: {} --> Words: {}'.format(idx, top_words))

In [None]:
# Calculate the topic distribution

from matplotlib.ticker import FuncFormatter

# Dominant topic in each review
def topics_per_review(model, corpus, start=0, end=1):
    """Find the dominant topic and the full topic mixture for a slice of the corpus.

    Args:
        model: object indexable by a bag-of-words document (``model[bow]``).
            When its ``per_word_topics`` attribute is truthy (or absent) the
            lookup is expected to return a tuple whose first element is the
            list of ``(topic_id, weight)`` pairs; otherwise the lookup result
            is used as that list directly.
        corpus: sequence of bag-of-words documents.
        start: slice start, applied as ``corpus[start:end]``.
        end: slice end, applied as ``corpus[start:end]``.

    Returns:
        ``(dominant_topics, topic_percentages)``: a list of
        ``(position_in_slice, topic_id)`` pairs and, per document, the list of
        ``(topic_id, weight)`` pairs.
    """
    # Mirror format_topics_sentences: only unwrap the tuple when the model
    # actually returns per-word topics. Defaulting to True keeps the original
    # behaviour for objects that do not expose the attribute.
    unwrap_first = getattr(model, 'per_word_topics', True)

    dominant_topics = []
    topic_percentages = []
    for i, bow in enumerate(corpus[start:end]):
        result = model[bow]
        topic_percs = result[0] if unwrap_first else result
        # Highest weight first; ties keep the model's original order.
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return (dominant_topics, topic_percentages)

# Cover every document: end=-1 sliced corpus[0:-1] and silently dropped the last review.
dominant_topics, topic_percentages = topics_per_review(model=lda_model, corpus=corpus, end=len(corpus))

In [None]:
# Distribution of Dominant Topics in Each review
# One row per processed document: its position and its dominant topic id
ndf                           = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
# Number of documents for which each topic is the dominant one
dominant_topic_in_each_rev    = ndf.groupby('Dominant_Topic').size()
# Same counts as a two-column frame (Dominant_Topic, count); read by the plotting cell
df_dominant_topic_in_each_rev = dominant_topic_in_each_rev.to_frame(name='count').reset_index()
display(df_dominant_topic_in_each_rev)

In [None]:
# Total Topic Distribution by actual weight
# One row per document, one column per topic id that appears in any document's mixture
topic_weightage_by_rev     = pd.DataFrame([dict(t) for t in topic_percentages])
# Column-wise sum of the weights; the column is named 'count' but holds summed weights, not counts.
# reset_index() moves the topic ids into a column called 'index', read by the plotting cell.
df_topic_weightage_by_rev  = topic_weightage_by_rev.sum().to_frame(name='count').reset_index()

display(df_topic_weightage_by_rev)

In [None]:
from matplotlib.ticker import FuncFormatter
import matplotlib.pyplot as plt

# Top  Keywords for each Topic
topic_top_n_words = [(i, topic) for i, topics in lda_model.show_topics(formatted=False)
                                 for j, (topic, wt) in enumerate(topics) if j < 5]  # for 5 key words

df_top_n_words_stacked = pd.DataFrame(topic_top_n_words, columns=['topic_id', 'words'])
df_top_n_words = df_top_n_words_stacked.groupby('topic_id').agg(', \n'.join)
df_top_n_words.reset_index(level=0,inplace=True)

# Plot
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(6, 4), dpi=120, sharey=True)

# Topic Distribution by Dominant Topics
ax1.bar(x='Dominant_Topic', height='count', data=df_dominant_topic_in_each_rev, width=.5, color='#9ECBEA')
ax1.set_xticks(range(df_dominant_topic_in_each_rev.Dominant_Topic.unique().__len__()))
tick_formatter = FuncFormatter(lambda x, pos: 'Topic ' + str(x)+ '\n' + df_top_n_words.loc[df_top_n_words.topic_id==x, 'words'].values[0])
# ax1.xaxis.set_major_formatter(tick_formatter)
ax1.tick_params(labelsize=5)
ax1.set_title('Number of Reviews by Dominant Topic', fontdict=dict(size=8))
ax1.set_ylabel('Number of Reviews', fontsize = 8)
ax1.set_ylim(0, 300)


# Topic Distribution by Topic Weights
ax2.bar(x='index', height='count', data=df_topic_weightage_by_rev, width=.5, color='#EADA9E')
ax2.set_xticks(range(df_topic_weightage_by_rev.index.unique().__len__()))
ax2.xaxis.set_major_formatter(tick_formatter)
ax2.tick_params(labelsize=5)
ax2.set_title('Number of Reviews by Topic Weightage', fontdict=dict(size=8))
ax2.set_ylabel('Number of Review', fontsize = 8)
plt.show()


In [None]:
import warnings
warnings.filterwarnings('ignore')

def format_topics_sentences(ldamodel=None, corpus=corpus, texts= df.Environ_Review):
    """Build a table of the dominant topic for every document in ``corpus``.

    Args:
        ldamodel: trained topic model; must support ``ldamodel[corpus]``,
            ``show_topic(topic_id)`` and expose ``per_word_topics``.
        corpus: bag-of-words corpus to score.
        texts: original texts, concatenated column-wise after the topic columns.

    Returns:
        DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution`` and
        ``Topic_Keywords`` followed by the texts (index reset so rows line up
        by position).

    NOTE(review): the ``corpus`` and ``texts`` defaults are evaluated once,
    when this cell is executed, from the notebook globals.
    """
    per_word_topics = ldamodel.per_word_topics

    # Collect rows in a list and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for row_list in ldamodel[corpus]:
        # With per_word_topics=True the model returns a tuple whose first
        # element is the document's topic distribution.
        row = row_list[0] if per_word_topics else row_list
        if not row:
            # No topic assigned to this document: nothing is recorded for it.
            continue
        # Dominant topic = highest weight; ties keep the model's original order.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([sent_topics_df, contents], axis=1)


# NOTE(review): corpus was built from df.data_ready, which derives from the Market_Review column, yet the
# texts attached here come from df.Environ_Review — confirm this is the intended column.
df_topic_sents_keywords   = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=df.Environ_Review)

# Format
# reset_index() exposes the row position as a column, renamed to Document_No below
df_dominant_topic         = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'review']
display(df_dominant_topic.head(5))

In [None]:
# (rows, columns) of the per-document dominant-topic table
df_dominant_topic.shape

In [None]:
# Index-aligned join of the original reviews with their dominant-topic rows
final_df = df.merge(df_dominant_topic, left_index=True, right_index=True)

In [None]:
# final_df.to_csv('final_data.csv')

In [None]:
# pip install pyLDAVis

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

In [None]:
# pip install --upgrade pyLDAvis joblib pandas


### Reference

1. https://github.com/rollyjohn/Topic-Modelling/blob/main/topic_model_V3.ipynb

2. https://towardsdatascience.com/topic-modelling-in-python-with-spacy-and-gensim-dc8f7748bdbf

In [None]:
### refe