In [1]:
# https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6
# import all the necessary libraries
# import all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.cluster import DBSCAN
import string
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

In [2]:
# loading the dataset
train=pd.read_csv("quora.csv")

In [3]:
train.head()

Unnamed: 0,question,Unnamed: 1
0,What is the step by step guide to invest in sh...,
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,
2,How can I increase the speed of my internet co...,
3,Why am I mentally very lonely? How can I solve...,
4,"Which one dissolve in water quikly sugar, salt...",


In [4]:
train["question"].head()


0    What is the step by step guide to invest in sh...
1    What is the story of Kohinoor (Koh-i-Noor) Dia...
2    How can I increase the speed of my internet co...
3    Why am I mentally very lonely? How can I solve...
4    Which one dissolve in water quikly sugar, salt...
Name: question, dtype: object

In [5]:
#convert each question to a list of string
data = pd.Series(train["question"].tolist()).astype(str)

In [6]:
data = data[:500]


## Text Preprocessing

In [7]:
# Text Preprocessing
nlp = spacy.load('en_core_web_sm')
stop_list = ['best','different',"won\'t", "couldn\'t", "mustn\'t", "didn\'t", "dtype object"]
for word in stop_list:
    spacy.lang.en.stop_words.STOP_WORDS.add(word)
    nlp.vocab[word].is_stop = True

In [8]:
# Preprocess the text data
def normalize(data):
    """Run all the functions for preprocessing in a pipeline"""
    clean_data = re.sub(re.compile('<.*?>'), '', data)
    cleaned_list = [ unicodedata.normalize('NFKD', word.text).encode('ascii', 'ignore').decode('utf-8', 'ignore') for word in  nlp(clean_data)]
    cleaned_list = " ".join(cleaned_list)
    cleaned_list = [word.text.rstrip('0123456789').lower() for word in nlp(cleaned_list) if word.pos_  in ['NOUN','PROPN', 'VERB'] and not word.is_digit and not word.is_punct and not word.is_stop ]
    return cleaned_list

In [9]:
# Preprocess the text data
normalized_data = []
for i, batch in data.groupby(np.arange(len(data)) // 10):
    for batch_data in batch:
        normalized_data.append(normalize(batch_data))

In [10]:
normalized_data

[['step', 'step', 'guide', 'invest', 'share', 'market', 'india'],
 ['story', 'kohinoor', 'koh', 'noor', 'diamond'],
 ['increase', 'speed', 'internet', 'connection', 'vpn'],
 ['solve'],
 ['dissolve', 'water', 'sugar', 'salt', 'methane', 'carbon', 'oxide'],
 ['astrology', 'capricorn', 'sun', 'cap', 'moon', 'cap', 'rising'],
 ['buy', 'tiago'],
 ['geologist'],
 ['use'],
 ['motorola', 'company', 'hack', 'charter', 'motorolla', 'dcx'],
 ['method', 'find', 'separation', 'slits', 'fresnel', 'biprism'],
 ['read', 'find', 'youtube', 'comments'],
 ['physics', 'learn'],
 ['experience'],
 ['laws',
  'change',
  'status',
  'student',
  'visa',
  'card',
  'compare',
  'immigration',
  'laws',
  'canada'],
 ['trump', 'presidency', 'mean', 'master', 'students', 'f', 'visa'],
 ['manipulation', 'mean'],
 ['girls', 'want', 'friends', 'guy', 'reject'],
 ['quora', 'users', 'posting', 'questions', 'answered', 'google'],
 ['marketing', 'institution', 'banglore'],
 ['rockets', 'look'],
 ['causing'],
 ['quest

In [11]:
sentence = []
sentences = []


def token_2_sentence(normalized_data):
    """Join the tokens in each list with space to form a sentence"""
    for i in normalized_data:
      sentence = " ".join(i)
      sentences.append(sentence)
      sentence = [] 
    return sentences
    
sentences_list = token_2_sentence(normalized_data)

In [12]:

# Display cleaned questions
sentences_list

['step step guide invest share market india',
 'story kohinoor koh noor diamond',
 'increase speed internet connection vpn',
 'solve',
 'dissolve water sugar salt methane carbon oxide',
 'astrology capricorn sun cap moon cap rising',
 'buy tiago',
 'geologist',
 'use',
 'motorola company hack charter motorolla dcx',
 'method find separation slits fresnel biprism',
 'read find youtube comments',
 'physics learn',
 'experience',
 'laws change status student visa card compare immigration laws canada',
 'trump presidency mean master students f visa',
 'manipulation mean',
 'girls want friends guy reject',
 'quora users posting questions answered google',
 'marketing institution banglore',
 'rockets look',
 'causing',
 'questions ask quora',
 'kv hp',
 'mean time look clock numbers',
 'tips making job interview process medicines',
 'web application',
 'society place importance sports',
 'way money',
 'prepare law',
 'thing like',
 'cares nose gets night',
 'game thrones villain mercy',
 'un

## LDA with term frequency ## 

In [13]:
#https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print( "Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))


documents = sentences

no_features = 1000


# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.98, min_df=3, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 20


# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10
print()
print("LDA topics")
display_topics(lda, tf_feature_names, no_top_words)


LDA topics
Topic 0:
porn come gmail question months password change compare stop star
Topic 1:
google increase account ways play youtube internet language books way
Topic 2:
day quora life look website movies technology increase country death
Topic 3:
mean friends trump sugar want taking girls guy ve books
Topic 4:
life technology internet watch stop experience death access ve android
Topic 5:
india years job pm taking hair change time trump modi
Topic 6:
know review hair causes change people exist book modi notes
Topic 7:
friend guy email want girlfriend recover love access password choose
Topic 8:
data universe modi career win salary experience notes android causes
Topic 9:
compare travel effects exist buy notes website girls want work
Topic 10:
difference use year college cost love gain girl company come
Topic 11:
eat country world death number day mean work love play
Topic 12:
start business book happen movies trump sugar notes salary taking
Topic 13:
learn books read book languag

In [14]:
lda_output = lda.fit_transform(tf)

In [15]:
print(lda)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=5,
                          mean_change_tol=0.001, n_components=20, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


In [16]:
print("Log Likelihood: ", lda.score(tf))

Log Likelihood:  -4937.334991806099


In [17]:
print("Perplexity: ", lda.perplexity(tf))

Perplexity:  21907.958389461783


In [18]:
print(lda.get_params())

{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.7, 'learning_method': 'online', 'learning_offset': 50.0, 'max_doc_update_iter': 100, 'max_iter': 5, 'mean_change_tol': 0.001, 'n_components': 20, 'n_jobs': None, 'perp_tol': 0.1, 'random_state': 0, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


In [19]:
# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

In [20]:
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)

In [21]:
from sklearn.model_selection import GridSearchCV
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

In [22]:
model.fit(tf)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='online',
                                                 learning_offset=50.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=5,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1, random_state=0,
                                                 topic_word_prior=None,
                                                 total_samples=1000000.0,
             

In [23]:
best_lda_model = model.best_estimator_

In [24]:
# Model Parameters
print("Best Model's Params: ", model.best_params_)

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}


In [25]:
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

Best Log Likelihood Score:  -1393.205642463917


In [26]:
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(tf))

Model Perplexity:  282.63649075496323


In [27]:
# Create Document — Topic Matrix
lda_output = best_lda_model.transform(tf)

In [28]:
# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]


In [29]:
# index names
docnames = ["Doc" + str(i) for i in range(len(sentences))]
print((np.round(lda_output, 2)).shape)

(500, 10)


In [30]:
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

In [31]:
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [32]:
# Styling
def color_green(val):
 color = 'green' if val > .1 else 'black'
 return 'color: {col}'.format(col=color)

In [33]:

def make_bold(val):
 weight = 700 if val > .1 else 400
 return "font-weight: {weight}".format(weight=weight)

In [34]:
# Apply Style
df_document_topics = df_document_topic.head(100).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.05,0.05,0.05,0.05,0.05,0.55,0.05,0.05,0.05,0.05,5
Doc1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc2,0.03,0.03,0.7,0.03,0.03,0.03,0.03,0.03,0.03,0.03,2
Doc3,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc4,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.7,9
Doc5,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc6,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.55,9
Doc7,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0
Doc8,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.55,0.05,0.05,7
Doc9,0.05,0.05,0.55,0.05,0.05,0.05,0.05,0.05,0.05,0.05,2


In [35]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)

In [36]:
# Assign Column and Index
df_topic_keywords.columns = tf_vectorizer.get_feature_names()
df_topic_keywords.index = topicnames

In [37]:
# View
df_topic_keywords.head(100)

Unnamed: 0,access,account,android,answer,book,books,business,buy,career,causes,...,way,ways,website,win,word,work,world,year,years,youtube
Topic0,0.973612,2.726112,5.971225,0.150628,0.155944,0.159215,0.161505,0.164894,0.159427,0.158714,...,0.15793,0.154382,0.898765,0.16061,0.147024,0.149885,0.156916,0.148789,0.158897,1.011378
Topic1,0.201663,1.022593,0.264526,0.154289,0.155096,0.216758,0.160806,0.156435,0.152052,0.158369,...,0.519034,0.158753,0.15649,0.157608,3.177229,0.149747,0.16204,0.15503,0.200854,0.147202
Topic2,0.146485,0.159058,0.156831,0.146715,0.20769,0.157633,0.154386,0.146943,3.364662,3.326178,...,0.146419,0.152104,1.008746,0.156643,0.159967,0.156824,0.16155,0.151648,0.149802,0.145074
Topic3,0.152522,0.149675,0.161663,0.155229,0.159441,0.160078,0.159394,0.158831,0.159009,0.153673,...,1.864067,0.156742,0.15595,0.15671,0.153496,0.156725,0.150972,0.157811,0.15622,0.154608
Topic4,1.372334,0.159928,0.15964,0.157527,0.148407,0.209307,0.151932,0.156273,0.152298,0.165226,...,0.155863,2.973156,0.154038,0.162188,0.151652,0.154315,0.162685,0.153515,0.156118,1.126064
Topic5,1.013132,0.158018,0.165751,1.119043,0.151303,0.158884,0.15568,0.150862,0.159357,0.154065,...,7.486398,0.155379,1.122881,2.330885,0.156935,0.589658,0.167555,0.149159,8.606199,0.160959
Topic6,0.161983,0.15867,0.158111,3.179994,0.161575,0.155774,0.168778,0.156113,0.152729,0.163049,...,0.15744,0.152805,0.152099,0.150658,0.165457,1.011975,0.152693,4.529206,0.150683,0.161973
Topic7,0.159713,0.160051,0.188378,0.154024,0.171925,0.158159,3.799645,0.151843,0.162311,0.153748,...,0.159906,0.165601,0.147124,0.16206,0.154826,0.14957,2.735106,0.154757,0.162275,0.145765
Topic8,0.155566,0.156043,0.15848,0.156864,4.411312,4.396168,0.156207,0.158356,0.167132,0.152622,...,0.150196,0.151836,0.148355,1.018358,0.150847,0.152523,0.157938,0.159537,0.154095,0.924041
Topic9,0.167892,0.158854,0.153628,0.150594,0.1511,0.160476,0.146868,2.842799,0.15536,0.16266,...,0.159533,0.158367,0.155617,0.157005,0.154478,4.604152,0.160177,1.341787,0.161108,0.165189


In [38]:
# Show top n keywords for each topic
def show_topics(vectorizer=tf_vectorizer, lda_model=lda, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [39]:
topic_keywords = show_topics(vectorizer=tf_vectorizer, lda_model=best_lda_model, n_words=15)

In [40]:
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,android,phone,cost,change,hair,months,travel,google,account,gain,play,youtube,come,access,gmail
Topic 1,life,money,examples,word,hillary,clinton,language,choose,account,learn,way,stop,android,books,company
Topic 2,love,movies,career,causes,increase,internet,company,like,quora,english,data,death,guy,look,website
Topic 3,stop,mean,look,trump,friends,taking,girls,series,prepare,way,sugar,feel,want,guy,time
Topic 4,learn,email,ways,recover,experience,gmail,password,death,star,access,youtube,ve,looking,stop,books
Topic 5,india,years,way,time,job,pakistan,pm,girl,win,war,modi,website,answer,friend,access
Topic 6,quora,year,college,war,play,happen,answer,review,questions,number,technology,question,feel,work,google
Topic 7,start,difference,people,use,business,think,eat,friend,want,girlfriend,data,world,government,guy,day
Topic 8,know,book,books,read,looking,universe,exist,universities,modi,ve,majors,grads,salary,question,college
Topic 9,porn,work,come,compare,notes,buy,watch,water,effects,sugar,country,feel,star,year,day


In [41]:
Topics = ["Topic 1","Topic 2","Topic 3","Topic 4","Topic 5", 
          "Topic 6", "Topic 7", "Topic 8", "Topic 9", "Topic 10"]
df_topic_keywords["Topics"]=Topics
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Topics
Topic 0,android,phone,cost,change,hair,months,travel,google,account,gain,play,youtube,come,access,gmail,Topic 1
Topic 1,life,money,examples,word,hillary,clinton,language,choose,account,learn,way,stop,android,books,company,Topic 2
Topic 2,love,movies,career,causes,increase,internet,company,like,quora,english,data,death,guy,look,website,Topic 3
Topic 3,stop,mean,look,trump,friends,taking,girls,series,prepare,way,sugar,feel,want,guy,time,Topic 4
Topic 4,learn,email,ways,recover,experience,gmail,password,death,star,access,youtube,ve,looking,stop,books,Topic 5
Topic 5,india,years,way,time,job,pakistan,pm,girl,win,war,modi,website,answer,friend,access,Topic 6
Topic 6,quora,year,college,war,play,happen,answer,review,questions,number,technology,question,feel,work,google,Topic 7
Topic 7,start,difference,people,use,business,think,eat,friend,want,girlfriend,data,world,government,guy,day,Topic 8
Topic 8,know,book,books,read,looking,universe,exist,universities,modi,ve,majors,grads,salary,question,college,Topic 9
Topic 9,porn,work,come,compare,notes,buy,watch,water,effects,sugar,country,feel,star,year,day,Topic 10


In [48]:
test_question = "What is the salary of an Indian Engineer?"

In [51]:
normalize(test_question)

['salary', 'engineer']

In [52]:
test_question

'What is the salary of an Indian Engineer?'

In [53]:
test_sentence = token_2_sentence(test_question)

In [54]:
mytext_4 = tf_vectorizer.transform(test_sentence)

In [55]:
topic_probability_scores = best_lda_model.transform(mytext_4)

In [56]:
topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores)%topic_probability_scores.shape[1],1:14].values.tolist()
print(topic)

['difference', 'people', 'use', 'business', 'think', 'eat', 'friend', 'want', 'girlfriend', 'data', 'world', 'government', 'guy']


In [57]:
infer_topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores)%topic_probability_scores.shape[1], -1]
print(infer_topic)

Topic 8
