# Topic Modelling

In [1]:
import pandas as pd
import re

# load the data
df = pd.read_csv("no_rare_words_mail_text.csv")
df.head()

Unnamed: 0,no_rare_words_mail_text
0,curve shift
1,ahead learned tragedies adversity teaches less...
2,phillip automated notification availability in...
3,phillip spin win spin iwon prize machine chanc...
4,resource acceptance forms admin permanent reso...


In [2]:
print(df.shape)
# drop the reocrds with null values
df = df.dropna()

print(df.shape)

(37117, 1)
(35911, 1)


In [4]:
stopwords = ["edi", "cpr", "basic", "size", 'men', 'woman', 'trtd', 'alignleftfont', 'intercontinentalexchange', 'bfonttdtd', 'sizeb', 'sizebb',
            'rusgopsgeotoolsaprs', 'facedarial', 'sansserif', 'widthdfont', 'colorbfont', 'sizefont', 'alignmiddlefont',
            'tdnbsptd', 'mimeversion', 'contenttype', 'licensed', 'kcfs', 'nyisotechexchange', 'alignleft',
             'classemailtextblack', 'arial', 'fontsize', 'fontfamily', 'colspan', 'valigntop', 'fontstyle', 'contain', 'confidential',
            'oncall', 'sitara', 'ossuatarp', 'thur', 'eps', 'mseb', 'ampm', 'llc', 'ubsw', 'textdecoration', 'none', 'women', 'man',
            'solely', 'responsible', 'ecn', 'kevin', 'presto', 'fontweight', 'bold', 'xms','registered', 'trademark', 'intended',
            'recipient', 'strictly', 'prohibited', 'ddddddddddddddddddddddddd', 'published', 'behalf', 'represent',
            'carr', 'nwn', 'midcolumbia', 'corner', 'rockies', 'tds', 'hereby', 'notified', 'restriction', 'attachment',
            'follows', 'pt', 'throughout', 'restriction', 'representation', 'southwest', 'mid', 'subscribe', 'stop', 
            'ets', 'nothing', 'considered', 'author', 'differ', 'yahoo', 'betweeneveryone', 'terminate', 'subscription',
            'pdt', 'widthtr', 'trtable', 'colspancash', 'span', 'span', 'classspecialredspannbspnbspnbspnbspnbspspan',
             'tdtrtr', 'colspancash', 'adobe', 'acrobat', 'palo', 'drive', 'eol', 'opinion', 'expressed', 'respect',
            'url', 'browser', 'paste', 'xxxx', 'anyone', 'else']

# removing html content from the text
df['html_cleaned_mail_text'] = df['no_rare_words_mail_text'].apply(lambda x: re.sub('html.*.html', "", x))
df = df.dropna()

# remooving the stopwords as above
df['cleaned_mail_text'] = df['html_cleaned_mail_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
df = df.dropna()
print(df.shape)

(35911, 3)


In [5]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
#from stop_words import get_stop_words
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
import pandas as pd
import gensim
import re
import pyLDAvis#.gensim
import pyLDAvis.gensim_models


texts = []
lemmatizer = WordNetLemmatizer()

# we create bigrams from the cleaned text
for i in df['cleaned_mail_text']:
    tokens = i.split()
    # lemmatize tokens
    lemma_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # remove word containing only single char
    new_lemma_tokens = [raw for raw in lemma_tokens if not len(raw) == 1]
    lemmatized_string = " ".join(token for token in new_lemma_tokens)
    
    import nltk
    #word_data = "The best performance can bring in sky high success."
    nltk_tokens = nltk.word_tokenize(lemmatized_string)
    #print(list(nltk.bigrams(nltk_tokens)))
    texts.append([ele[0]+ " " + ele[1] for ele in list(nltk.bigrams(nltk_tokens))])


In [6]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [7]:
# lda model with 10 topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=20)
import pprint

# printing the topics to analyse the terms that contribute to that topic
pprint.pprint(ldamodel.top_topics(corpus,topn=10))

[([(0.0017032448, 'solicitation instrument'),
   (0.0016340485, 'reliable accurate'),
   (0.0016188252, 'material respect'),
   (0.0016169606, 'accurate solicitation'),
   (0.0016169606, 'instrument opinion'),
   (0.0016169606, 'officer affiliate'),
   (0.0016169606, 'opinion material'),
   (0.0016169606, 'respect officer'),
   (0.0013402642, 'hot link'),
   (0.0013402642, 'clicking hot')],
  -0.07509681347423333),
 ([(0.0029388594, 'stephanie taylor'),
   (0.0029354566, 'esource present'),
   (0.0025061967, 'custom clip'),
   (0.002456986, 'basic clinic'),
   (0.0020929791, 'fast seat'),
   (0.0020918015, 'seat stephanie'),
   (0.0020918015, 'fill fast'),
   (0.0020918015, 'seat fill'),
   (0.001676022, 'noshows charged'),
   (0.0016663461, 'clinic pmeb')],
  -0.19882183250570434),
 ([(0.0024026139, 'henry hub'),
   (0.0021867852, 'los angeles'),
   (0.0016785123, 'advance reservation'),
   (0.0015706796, 'dallasft dfw'),
   (0.0014474891, 'alamo rent'),
   (0.001411202, 'embassy suit

In [8]:
from gensim.models import CoherenceModel

# Compute Perplexity
print('\nPerplexity: ', ldamodel.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=ldamodel, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -18.707536268377158

Coherence Score:  0.47771410229462513
