In [1]:
import pandas as pd

dataset_train=pd.read_csv('/kaggle/input/yelp-csv/train.csv')

In [2]:
data_text=dataset_train['text']
data_text['index']=data_text.index
data_text.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text['index']=data_text.index


0    dr. goldberg offers everything i look for in a...
1    Unfortunately, the frustration of being Dr. Go...
2    Been going to Dr. Goldberg for over 10 years. ...
3    Got a letter in the mail last week that said D...
4    I don't know what Dr. Goldberg was like before...
Name: text, dtype: object

In [3]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [4]:
#Generate corpus for lda, preprocessed and lemmatized
corpus=[]
for i in range(0,len(data_text)-1):
    text=re.sub('[^a-zA-z]',' ',data_text[i])
    text=text.lower()
    text=text.split()
    text=[PorterStemmer().stem(word) for word in text if not word in stopwords.words('english')]
    text=' '.join(text)
    corpus.append(text)

In [5]:
#Save corpus, since it is the part which needs more time for being executed. Next time we can directly load the data
import json

with open("corpus.json", 'w') as f:
    # indent=2 is not needed but makes the file human-readable 
    # if the data is nested
    json.dump(corpus, f, indent=2) 

with open("corpus.json", 'r') as f:
    corpus = json.load(f) 

In [6]:
new_data_text=data_text
for i in range(0,len(corpus)-1):
    new_data_text[i]=corpus[i]

In [7]:
new_data_text.head()

0    dr goldberg offer everyth look gener practitio...
1    unfortun frustrat dr goldberg patient repeat e...
2    go dr goldberg year think one st patient start...
3    got letter mail last week said dr goldberg mov...
4    know dr goldberg like move arizona let tell st...
Name: text, dtype: object

In [8]:
processed_docs=[]
for i in range(0,len(data_text)-1):
    text = data_text[i].split()
    processed_docs.append(text)
    

In [9]:
import gensim
dictionary=gensim.corpora.Dictionary(processed_docs)


In [10]:
count=0
for k,v in dictionary.iteritems():
    print(k,v)
    count+=1
    if count>20:
        break

0 affili
1 alway
2 blank
3 case
4 complaint
5 dr
6 draw
7 easi
8 everyth
9 explain
10 first
11 gener
12 get
13 goldberg
14 happen
15 hospit
16 import
17 look
18 need
19 nice
20 notch


In [11]:
dictionary.filter_extremes(no_below=15,no_above=0.1,keep_n=1000)

In [12]:
bow_corpus=[dictionary.doc2bow(doc) for doc in processed_docs]

In [13]:
# Prepare corpus for a tfidf lda (term frequency–inverse document frequency)- More attention
#to terms which appear in the document but are less frequent.
from gensim import corpora,models
tfidf=models.TfidfModel(bow_corpus)

In [14]:
corpus_tfidf=tfidf[bow_corpus]

In [15]:
lda_model=gensim.models.LdaMulticore(bow_corpus,
                                    num_topics=20,
                                    id2word=dictionary,
                                    passes=2,
                                    random_state=11
                                    )
lda_model.save('lda_model.model')

In [16]:
for idx,topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.059*"coffe" + 0.029*"tea" + 0.027*"lunch" + 0.025*"breakfast" + 0.020*"fresh" + 0.017*"cup" + 0.016*"cafe" + 0.015*"juic" + 0.014*"morn" + 0.014*"quick"


Topic: 1 
Words: 0.019*"wine" + 0.019*"delici" + 0.016*"dinner" + 0.016*"dish" + 0.014*"amaz" + 0.014*"meal" + 0.011*"enjoy" + 0.011*"appet" + 0.011*"dessert" + 0.010*"excel"


Topic: 2 
Words: 0.026*"show" + 0.018*"music" + 0.017*"club" + 0.014*"fun" + 0.013*"friend" + 0.011*"guy" + 0.011*"cool" + 0.010*"girl" + 0.010*"line" + 0.009*"danc"


Topic: 3 
Words: 0.036*"chees" + 0.031*"salad" + 0.031*"sauc" + 0.026*"chicken" + 0.025*"meat" + 0.016*"side" + 0.015*"flavor" + 0.015*"fri" + 0.014*"bread" + 0.012*"onion"


Topic: 4 
Words: 0.098*"room" + 0.049*"hotel" + 0.048*"stay" + 0.020*"pool" + 0.017*"check" + 0.017*"clean" + 0.014*"floor" + 0.014*"bed" + 0.013*"bathroom" + 0.012*"strip"


Topic: 5 
Words: 0.310*"e" + 0.093*"de" + 0.077*"steak" + 0.073*"la" + 0.038*"f" + 0.037*"c" + 0.035*"die" + 0.033*"n" + 0.032*"pou

In [17]:
# lda with tfidf
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                             num_topics=20, 
                                             id2word = dictionary, 
                                             passes = 2, 
                                             workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")
    
    

Topic: 0 Word: 0.017*"show" + 0.017*"music" + 0.016*"club" + 0.013*"danc" + 0.009*"fun" + 0.009*"bartend" + 0.008*"crowd" + 0.007*"play" + 0.007*"girl" + 0.006*"loud"


Topic: 1 Word: 0.045*"taco" + 0.034*"e" + 0.021*"salsa" + 0.021*"mexican" + 0.020*"burrito" + 0.015*"chip" + 0.014*"de" + 0.010*"bean" + 0.010*"margarita" + 0.009*"die"


Topic: 2 Word: 0.013*"wine" + 0.009*"salad" + 0.008*"dish" + 0.007*"pasta" + 0.007*"steak" + 0.007*"appet" + 0.007*"dinner" + 0.006*"meal" + 0.006*"entre" + 0.005*"bread"


Topic: 3 Word: 0.015*"chocol" + 0.012*"cream" + 0.012*"cake" + 0.011*"egg" + 0.010*"ice" + 0.009*"pancak" + 0.009*"flavor" + 0.009*"breakfast" + 0.008*"bacon" + 0.008*"chees"


Topic: 4 Word: 0.032*"sandwich" + 0.010*"salad" + 0.010*"sub" + 0.009*"chicken" + 0.009*"bread" + 0.008*"terribl" + 0.007*"turkey" + 0.007*"meat" + 0.007*"worst" + 0.007*"horribl"


Topic: 5 Word: 0.059*"sushi" + 0.035*"roll" + 0.016*"fish" + 0.012*"tuna" + 0.011*"happi" + 0.011*"chef" + 0.010*"fresh" + 0.009

In [18]:
print(dataset_train['text'][11])
for index, score in sorted(lda_model[bow_corpus[11]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

This place is absolute garbage...  Half of the tees are not available, including all the grass tees.  It is cash only, and they sell the last bucket at 8, despite having lights.  And if you finish even a minute after 8, don't plan on getting a drink.  The vending machines are sold out (of course) and they sell drinks inside, but close the drawers at 8 on the dot.  There are weeds grown all over the place.  I noticed some sort of batting cage, but it looks like those are out of order as well.  Someone should buy this place and turn it into what it should be.

Score: 0.439907968044281	 
Topic: 0.036*"custom" + 0.018*"walk" + 0.018*"employe" + 0.015*"line" + 0.015*"help" + 0.012*"counter" + 0.012*"manag" + 0.012*"busi" + 0.011*"guy" + 0.011*"rude"

Score: 0.268155038356781	 
Topic: 0.104*"pizza" + 0.051*"store" + 0.037*"shop" + 0.018*"buy" + 0.018*"item" + 0.014*"find" + 0.012*"crust" + 0.012*"product" + 0.010*"new" + 0.010*"select"

Score: 0.23716723918914795	 
Topic: 0.019*"wine" + 0.01