# Imports


In [96]:
import sys
from __future__ import division
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
import string
from matplotlib import rcParams
from itertools import chain
from nltk import everygrams, word_tokenize
import pprint
import enchant
stemmer = SnowballStemmer("english")


# Load Data

In [97]:
#df_train = pd.read_csv('../kaggle_data_exploration/all/train.csv', encoding = "utf_8")
df_test = pd.read_csv('../kaggle_data_exploration/all/test.csv', encoding = 'utf_8')
data_t = df_test[['question_text']]
data_t['qid'] = df_test.qid
documents = data_t

print('documents length {} \n{}'.format(len(documents), documents[:5]))

documents length 56370 
                                       question_text                   qid
0  My voice range is A2-C5. My chest voice goes u...  00014894849d00ba98a9
1           How much does a tutor earn in Bangalore?  000156468431f09b3cae
2  What are the best made pocket knives under $20...  000227734433360e1aae
3  Why would they add a hypothetical scenario tha...  0005e06fbe3045bd2a92
4   What is the dresscode for Techmahindra freshers?  00068a0f7f41f50fc399


# Lemmatise and stemming functions

In [98]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result


# Select a document to preview after preprocessing.

In [145]:
#print(len(documents))
#print(documents[1:2])
doc_sample = documents[documents['qid']=='000156468431f09b3cae']
print(doc_sample)
doc_sample = doc_sample.values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))




                              question_text                   qid
1  How much does a tutor earn in Bangalore?  000156468431f09b3cae
original document: 
['How', 'much', 'does', 'a', 'tutor', 'earn', 'in', 'Bangalore?']


 tokenized and lemmatized document: 
['tutor', 'earn', 'bangalor']


# Preprocess the headline text, saving the results as ‘processed_docs’

In [146]:
processed_docs = documents['question_text'].map(preprocess)

In [147]:
processed_docs[:10]

0    [voic, rang, chest, voic, go, includ, sampl, h...
1                              [tutor, earn, bangalor]
2                                [best, pocket, knive]
3    [hypothet, scenario, imposs, happen, link, sho...
4                    [dresscod, techmahindra, fresher]
5                                       [adapt, trump]
6                                 [thing, peopl, life]
7    [receiv, condit, offer, master, intern, busi, ...
8                      [appareil, photo, mean, french]
9                              [public, litig, canada]
Name: question_text, dtype: object

# Bag of Words on the Data set
Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [113]:
print(len(processed_docs))

56370


In [112]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 chest
1 go
2 higher
3 includ
4 rang
5 sampl
6 type
7 voic
8 bangalor
9 earn
10 tutor


# Filter out tokens that appear in less than 15 documents (absolute number) or more than 0.5 documents (fraction of total corpus size, not absolute number)


Filter out tokens that appear in

less than no_below documents (absolute number) or
more than no_above documents (fraction of total corpus size, not absolute number).
after (1) and (2), keep only the first keep_n most frequent tokens (or keep all if None).
After the pruning, shrink resulting gaps in word ids.

Note: Due to the gap shrinking, the same word may have a different word id before and after the call to this function!



In [114]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Gensim doc2bow

For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [125]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[1]

[(8, 1), (9, 1), (10, 1)]

# Preview Bag Of Words for our sample preprocessed document.

In [128]:
bow_doc_1 = bow_corpus[1]
for i in range(len(bow_doc_1)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_1[i][0], 
                                               dictionary[bow_doc_1[i][0]], 
bow_doc_1[i][1]))

Word 8 ("bangalor") appears 1 time.
Word 9 ("earn") appears 1 time.
Word 10 ("tutor") appears 1 time.


# TF-IDF
Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.

In [129]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.47464595144957933),
 (1, 0.15245407189012472),
 (2, 0.19815202840396937),
 (3, 0.196991567476024),
 (4, 0.43503012520630324),
 (5, 0.2416911528186504),
 (6, 0.16206519730077684),
 (7, 0.6320349051717128)]


# Running LDA using Bag of Words
Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [132]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

## For each topic, we will explore the words occuring in that topic and its relative weight.



In [133]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.065*"best" + 0.019*"learn" + 0.015*"human" + 0.012*"onlin" + 0.012*"north" + 0.012*"cours" + 0.010*"train" + 0.009*"electr" + 0.009*"develop" + 0.008*"join"
Topic: 1 
Words: 0.045*"mean" + 0.026*"work" + 0.015*"thing" + 0.012*"best" + 0.012*"possibl" + 0.010*"music" + 0.010*"say" + 0.009*"invest" + 0.009*"see" + 0.009*"bank"
Topic: 2 
Words: 0.054*"like" + 0.025*"feel" + 0.023*"good" + 0.020*"friend" + 0.018*"happen" + 0.014*"love" + 0.013*"girl" + 0.013*"better" + 0.012*"look" + 0.012*"form"
Topic: 3 
Words: 0.050*"peopl" + 0.017*"live" + 0.012*"feel" + 0.012*"anim" + 0.011*"american" + 0.011*"like" + 0.010*"english" + 0.010*"success" + 0.009*"chines" + 0.009*"high"
Topic: 4 
Words: 0.035*"quora" + 0.025*"know" + 0.022*"question" + 0.014*"answer" + 0.012*"play" + 0.012*"game" + 0.012*"websit" + 0.011*"time" + 0.011*"video" + 0.011*"peopl"
Topic: 5 
Words: 0.016*"prepar" + 0.015*"busi" + 0.013*"good" + 0.013*"main" + 0.011*"rank" + 0.011*"year" + 0.011*"presid" + 0.0

# Running LDA using TF-IDF


In [134]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.008*"effect" + 0.007*"best" + 0.006*"famous" + 0.006*"function" + 0.006*"year" + 0.005*"time" + 0.005*"power" + 0.005*"peopl" + 0.005*"today" + 0.005*"teacher"
Topic: 1 Word: 0.012*"book" + 0.010*"best" + 0.009*"cours" + 0.007*"form" + 0.007*"travel" + 0.007*"hair" + 0.006*"happen" + 0.006*"number" + 0.006*"choos" + 0.006*"cost"
Topic: 2 Word: 0.008*"black" + 0.006*"know" + 0.006*"peopl" + 0.006*"date" + 0.006*"affect" + 0.005*"think" + 0.005*"best" + 0.005*"world" + 0.005*"characterist" + 0.005*"india"
Topic: 3 Word: 0.009*"india" + 0.008*"caus" + 0.007*"fight" + 0.007*"indian" + 0.007*"think" + 0.006*"peopl" + 0.006*"best" + 0.006*"china" + 0.006*"class" + 0.005*"experi"
Topic: 4 Word: 0.014*"good" + 0.011*"life" + 0.008*"exampl" + 0.007*"colleg" + 0.007*"univers" + 0.006*"best" + 0.006*"role" + 0.006*"time" + 0.006*"women" + 0.006*"anim"
Topic: 5 Word: 0.019*"best" + 0.013*"like" + 0.012*"work" + 0.010*"quora" + 0.009*"learn" + 0.009*"start" + 0.009*"question" + 0.0

# Performance evaluation by classifying sample document using LDA Bag of Words model

In [139]:
processed_docs[1]

['tutor', 'earn', 'bangalor']

In [140]:
for index, score in sorted(lda_model[bow_corpus[1]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.33928558230400085	 
Topic: 0.065*"best" + 0.019*"learn" + 0.015*"human" + 0.012*"onlin" + 0.012*"north" + 0.012*"cours" + 0.010*"train" + 0.009*"electr" + 0.009*"develop" + 0.008*"join"

Score: 0.26851847767829895	 
Topic: 0.045*"mean" + 0.026*"work" + 0.015*"thing" + 0.012*"best" + 0.012*"possibl" + 0.010*"music" + 0.010*"say" + 0.009*"invest" + 0.009*"see" + 0.009*"bank"

Score: 0.21719369292259216	 
Topic: 0.025*"engin" + 0.021*"studi" + 0.021*"want" + 0.015*"year" + 0.015*"best" + 0.013*"colleg" + 0.012*"career" + 0.009*"work" + 0.009*"doctor" + 0.008*"program"

Score: 0.025001665577292442	 
Topic: 0.016*"prepar" + 0.015*"busi" + 0.013*"good" + 0.013*"main" + 0.011*"rank" + 0.011*"year" + 0.011*"presid" + 0.011*"exam" + 0.011*"start" + 0.010*"best"

Score: 0.025000322610139847	 
Topic: 0.041*"differ" + 0.022*"chang" + 0.020*"caus" + 0.018*"countri" + 0.017*"consid" + 0.013*"person" + 0.011*"social" + 0.011*"free" + 0.010*"state" + 0.009*"legal"

Score: 0.02500024810433387

# Performance evaluation by classifying sample document using LDA TF-IDF model.

In [141]:
for index, score in sorted(lda_model_tfidf[bow_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.6268593072891235	 
Topic: 0.008*"effect" + 0.007*"best" + 0.006*"famous" + 0.006*"function" + 0.006*"year" + 0.005*"time" + 0.005*"power" + 0.005*"peopl" + 0.005*"today" + 0.005*"teacher"

Score: 0.3115895092487335	 
Topic: 0.008*"black" + 0.006*"know" + 0.006*"peopl" + 0.006*"date" + 0.006*"affect" + 0.005*"think" + 0.005*"best" + 0.005*"world" + 0.005*"characterist" + 0.005*"india"


# Testing model on unseen document

In [142]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.42642053961753845	 Topic: 0.035*"quora" + 0.025*"know" + 0.022*"question" + 0.014*"answer" + 0.012*"play"
Score: 0.41357311606407166	 Topic: 0.054*"like" + 0.025*"feel" + 0.023*"good" + 0.020*"friend" + 0.018*"happen"
Score: 0.020003680139780045	 Topic: 0.016*"prepar" + 0.015*"busi" + 0.013*"good" + 0.013*"main" + 0.011*"rank"
Score: 0.020001044496893883	 Topic: 0.041*"differ" + 0.022*"chang" + 0.020*"caus" + 0.018*"countri" + 0.017*"consid"
Score: 0.020000869408249855	 Topic: 0.050*"peopl" + 0.017*"live" + 0.012*"feel" + 0.012*"anim" + 0.011*"american"
Score: 0.020000237971544266	 Topic: 0.065*"best" + 0.019*"learn" + 0.015*"human" + 0.012*"onlin" + 0.012*"north"
Score: 0.02000022865831852	 Topic: 0.041*"think" + 0.016*"long" + 0.016*"trump" + 0.015*"year" + 0.013*"way"
Score: 0.02000017836689949	 Topic: 0.035*"indian" + 0.017*"write" + 0.016*"india" + 0.013*"import" + 0.013*"peopl"
Score: 0.020000074058771133	 Topic: 0.045*"mean" + 0.026*"work" + 0.015*"thing" + 0.012*"best"