In [1]:
import pandas as pd

data = pd.read_csv('comments.csv', error_bad_lines=False);
data_text = data[['content']]
data_text = data_text[data_text.content.notnull()]
data_text['index'] = data_text.index
documents = data_text

In [2]:
len(documents)

246525

In [3]:
documents[:5]

Unnamed: 0,content,index
0,"I recently activated my home phone, so no one ...",0
1,My caller ID shows this number when I am late ...,1
2,Every few days I get calls with this number sh...,2
3,I had this one on my caller ID awhile back. L...,3
4,I get four or five calls from this number almo...,4


### Data Preprocessing

In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Lemmatize example

In [6]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


#### Stemmer Example

In [7]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [8]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [9]:
doc_sample = documents[documents['index'] == 4310].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Said', 'he', 'was', 'from', 'Craig', 'Research', 'a', 'survey', 'firm.', 'Asked', 'if', 'I', 'had', 'a', 'few', 'minutes', 'for', 'a', 'survey?I', 'asked', 'less', 'than', '5', 'minutes?', 'He', 'said', 'no', '7', 'mins,', 'I', 'hung', 'up.']


 tokenized and lemmatized document: 
['say', 'craig', 'research', 'survey', 'firm', 'ask', 'minut', 'survey', 'ask', 'minut', 'say', 'min', 'hang']


In [10]:
doc_sample = documents[documents['index'] == 6].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['who', 'is', 'this', 'number', 'belong', 'too?', 'I', 'thought', 'blockbuster', 'was', 'all', "9's..."]


 tokenized and lemmatized document: 
['number', 'belong', 'think', 'blockbust']


In [11]:
processed_docs = documents['content'].map(preprocess)

In [12]:
processed_docs[:10]

0    [recent, activ, home, phone, know, phone, comp...
1    [caller, show, number, late, return, movi, blo...
2    [day, call, number, show, answer, time, collec...
3    [caller, awhil, year, montel, william, address...
4    [call, number, blocker, work, number, caller, ...
5    [reicev, today, leav, messag, htink, larg, tel...
6                   [number, belong, think, blockbust]
7    [get, call, caller, tri, answer, twice, record...
8    [feel, number, concoct, telemarket, kind, voip...
9    [caller, say, cruis, websit, site, show, emb, ...
Name: content, dtype: object

### Bag of words on the dataset

In [13]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [14]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 activ
1 angri
2 answer
3 apolog
4 ask
5 call
6 caller
7 collect
8 compani
9 decid
10 final


In [15]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [16]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(4, 2),
 (12, 1),
 (17, 2),
 (25, 1),
 (26, 2),
 (401, 2),
 (721, 1),
 (1813, 1),
 (2785, 1)]

In [17]:
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 4 ("ask") appears 2 time.
Word 12 ("hang") appears 1 time.
Word 17 ("minut") appears 2 time.
Word 25 ("research") appears 1 time.
Word 26 ("say") appears 2 time.
Word 401 ("survey") appears 2 time.
Word 721 ("min") appears 1 time.
Word 1813 ("firm") appears 1 time.
Word 2785 ("craig") appears 1 time.


### TF-IDF

In [18]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [19]:
corpus_tfidf = tfidf[bow_corpus]

In [20]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.21131193524613445),
 (1, 0.282624169336468),
 (2, 0.0851622615053595),
 (3, 0.2912945639852048),
 (4, 0.11146579609742409),
 (5, 0.03467520908867233),
 (6, 0.10995840839913462),
 (7, 0.1408288056193443),
 (8, 0.2144532810352479),
 (9, 0.2309991525013733),
 (10, 0.1892040237592669),
 (11, 0.2711831996185862),
 (12, 0.10037069738361608),
 (13, 0.15399584469591843),
 (14, 0.10041381603150235),
 (15, 0.11551451474702014),
 (16, 0.19249810113412646),
 (17, 0.1787413029940316),
 (18, 0.0452858046298924),
 (19, 0.2770897592752591),
 (20, 0.22851786375536318),
 (21, 0.16150087467865076),
 (22, 0.2493310058636861),
 (23, 0.21633132459052803),
 (24, 0.17220339988773378),
 (25, 0.21961606161059294),
 (26, 0.07667442988017567),
 (27, 0.13787689891206767),
 (28, 0.160268526121491),
 (29, 0.09136950515864309)]


### Running LDA using Bag of Words

In [21]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [22]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.032*"number" + 0.024*"scam" + 0.023*"state" + 0.021*"receiv" + 0.021*"say" + 0.020*"legal" + 0.019*"call" + 0.016*"offic" + 0.016*"case" + 0.016*"file"
Topic: 1 
Words: 0.056*"card" + 0.043*"credit" + 0.038*"account" + 0.036*"bank" + 0.032*"scam" + 0.026*"number" + 0.021*"charg" + 0.021*"check" + 0.018*"money" + 0.016*"send"
Topic: 2 
Words: 0.152*"number" + 0.091*"call" + 0.070*"phone" + 0.051*"list" + 0.027*"cell" + 0.019*"remov" + 0.017*"receiv" + 0.015*"answer" + 0.013*"line" + 0.013*"area"
Topic: 3 
Words: 0.134*"call" + 0.101*"messag" + 0.077*"leav" + 0.065*"number" + 0.061*"answer" + 0.043*"time" + 0.026*"phone" + 0.024*"hang" + 0.017*"stop" + 0.016*"caller"
Topic: 4 
Words: 0.068*"scam" + 0.047*"scammer" + 0.045*"block" + 0.039*"caller" + 0.038*"number" + 0.023*"post" + 0.020*"thank" + 0.017*"spam" + 0.012*"warranti" + 0.011*"telemarket"
Topic: 5 
Words: 0.071*"debt" + 0.039*"collect" + 0.026*"collector" + 0.017*"inform" + 0.016*"agenc" + 0.015*"state" + 0.01

Cool! Can you distinguish different topics using the words in each topic and their corresponding weights?

### Running LDA using TF-IDF

In [23]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=2, workers=4)

In [24]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.094*"messag" + 0.087*"leav" + 0.053*"call" + 0.051*"answer" + 0.041*"number" + 0.027*"cell" + 0.026*"block" + 0.026*"phone" + 0.026*"time" + 0.024*"hang"
Topic: 1 Word: 0.011*"debt" + 0.008*"collect" + 0.007*"scam" + 0.007*"compani" + 0.006*"inform" + 0.006*"money" + 0.006*"bank" + 0.006*"account" + 0.006*"number" + 0.005*"state"
Topic: 2 Word: 0.015*"say" + 0.011*"tell" + 0.011*"number" + 0.011*"call" + 0.008*"ask" + 0.008*"know" + 0.007*"scam" + 0.007*"loan" + 0.007*"card" + 0.007*"receiv"
Topic: 3 Word: 0.018*"number" + 0.017*"stop" + 0.017*"call" + 0.017*"list" + 0.014*"block" + 0.011*"unknown" + 0.011*"phone" + 0.010*"thank" + 0.009*"spam" + 0.009*"answer"
Topic: 4 Word: 0.012*"call" + 0.011*"hang" + 0.011*"say" + 0.011*"time" + 0.010*"ask" + 0.009*"number" + 0.009*"answer" + 0.008*"tell" + 0.007*"phone" + 0.007*"speak"


### Classification of the topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [25]:
processed_docs[4310]

['say',
 'craig',
 'research',
 'survey',
 'firm',
 'ask',
 'minut',
 'survey',
 'ask',
 'minut',
 'say',
 'min',
 'hang']

In [26]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.6127671003341675	 
Topic: 0.062*"call" + 0.057*"say" + 0.053*"tell" + 0.037*"ask" + 0.023*"number" + 0.019*"hang" + 0.017*"know" + 0.016*"compani" + 0.016*"person" + 0.016*"loan"

Score: 0.33007094264030457	 
Topic: 0.033*"peopl" + 0.016*"compani" + 0.015*"like" + 0.015*"time" + 0.014*"call" + 0.013*"know" + 0.011*"good" + 0.011*"think" + 0.011*"work" + 0.009*"money"


Our test document has the highest probability to be part of the topic on the top.

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [27]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8187533020973206	 
Topic: 0.012*"call" + 0.011*"hang" + 0.011*"say" + 0.011*"time" + 0.010*"ask" + 0.009*"number" + 0.009*"answer" + 0.008*"tell" + 0.007*"phone" + 0.007*"speak"

Score: 0.13697345554828644	 
Topic: 0.018*"number" + 0.017*"stop" + 0.017*"call" + 0.017*"list" + 0.014*"block" + 0.011*"unknown" + 0.011*"phone" + 0.010*"thank" + 0.009*"spam" + 0.009*"answer"

Score: 0.014835461974143982	 
Topic: 0.094*"messag" + 0.087*"leav" + 0.053*"call" + 0.051*"answer" + 0.041*"number" + 0.027*"cell" + 0.026*"block" + 0.026*"phone" + 0.026*"time" + 0.024*"hang"

Score: 0.014797520823776722	 
Topic: 0.015*"say" + 0.011*"tell" + 0.011*"number" + 0.011*"call" + 0.008*"ask" + 0.008*"know" + 0.007*"scam" + 0.007*"loan" + 0.007*"card" + 0.007*"receiv"

Score: 0.014640218578279018	 
Topic: 0.011*"debt" + 0.008*"collect" + 0.007*"scam" + 0.007*"compani" + 0.006*"inform" + 0.006*"money" + 0.006*"bank" + 0.006*"account" + 0.006*"number" + 0.005*"state"


Our test document has the highest probability to be part of the topic on the top.

### Testing model on unseen document

In [30]:
unseen_document = "Why I keep receiving calls from this damn debet colletcting? I never loan anything from them!"
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 10)))

Score: 0.45515045523643494	 Topic: 0.134*"call" + 0.101*"messag" + 0.077*"leav" + 0.065*"number" + 0.061*"answer" + 0.043*"time" + 0.026*"phone" + 0.024*"hang" + 0.017*"stop" + 0.016*"caller"
Score: 0.3848022222518921	 Topic: 0.062*"call" + 0.057*"say" + 0.053*"tell" + 0.037*"ask" + 0.023*"number" + 0.019*"hang" + 0.017*"know" + 0.016*"compani" + 0.016*"person" + 0.016*"loan"
Score: 0.020009079948067665	 Topic: 0.033*"peopl" + 0.016*"compani" + 0.015*"like" + 0.015*"time" + 0.014*"call" + 0.013*"know" + 0.011*"good" + 0.011*"think" + 0.011*"work" + 0.009*"money"
Score: 0.02000764012336731	 Topic: 0.152*"number" + 0.091*"call" + 0.070*"phone" + 0.051*"list" + 0.027*"cell" + 0.019*"remov" + 0.017*"receiv" + 0.015*"answer" + 0.013*"line" + 0.013*"area"
Score: 0.020007435232400894	 Topic: 0.032*"number" + 0.024*"scam" + 0.023*"state" + 0.021*"receiv" + 0.021*"say" + 0.020*"legal" + 0.019*"call" + 0.016*"offic" + 0.016*"case" + 0.016*"file"
Score: 0.020006395876407623	 Topic: 0.056*"card" +

In [29]:
unseen_document = "I am a Canadian living in Canada and they were trying to tell me that this was a once in a life time chance for me a Canadian citizen to get money from the U.S. government. I asked so many questions that they finally just got mad and hung up on me."
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 10)))

Score: 0.37906453013420105	 Topic: 0.020*"say" + 0.019*"servic" + 0.018*"compani" + 0.012*"inform" + 0.010*"scam" + 0.010*"like" + 0.009*"sound" + 0.009*"number" + 0.009*"secur" + 0.009*"ask"
Score: 0.3217141032218933	 Topic: 0.062*"call" + 0.057*"say" + 0.053*"tell" + 0.037*"ask" + 0.023*"number" + 0.019*"hang" + 0.017*"know" + 0.016*"compani" + 0.016*"person" + 0.016*"loan"
Score: 0.2580339014530182	 Topic: 0.033*"peopl" + 0.016*"compani" + 0.015*"like" + 0.015*"time" + 0.014*"call" + 0.013*"know" + 0.011*"good" + 0.011*"think" + 0.011*"work" + 0.009*"money"
