In [2]:
from gensim.models import Phrases
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# English stop-word list from NLTK; tokens found in it are dropped when the
# input file is tokenized further down.
stop = stopwords.words("english")

In [9]:
# Read the corpus (one document per line) and turn every line into a list of
# cleaned tokens: lower-cased, purely alphabetic, and not an English stop word.
path = 'categories/indian.txt'
sent_stream = []
doc_stream = []  # not filled in this cell; kept in case other cells rely on it
stop_words = set(stop)  # set membership is O(1); `stop` itself is a list
with open(path, 'rt', encoding='UTF8') as rf:
    for line in rf:
        tokens = []
        for word in line.split():
            word = word.lower()
            if word.isalpha() and word not in stop_words:
                tokens.append(word)
        # Append the token list directly. The previous join + split(' ')
        # round-trip turned a line with no surviving tokens into [''], which
        # injected an empty-string "word" into every downstream model.
        # Empty documents are kept as [] so line/document indices still align.
        sent_stream.append(tokens)

In [10]:
# Fit the gensim phrase (collocation) model on the tokenized documents
phrases = Phrases(sent_stream, min_count=1, threshold=1)
# Run every document through the fitted model to get the phrase-merged tokens
texts = [phrases[tokens] for tokens in sent_stream]

In [11]:
# Re-assemble each phrase-merged token list into one space-separated string
new_corpus = [' '.join(doc) for doc in texts]

In [12]:
# Build a TF-IDF document-term matrix from the space-joined documents.
# NOTE(review): min_df=1 looks like scikit-learn's default (no term is dropped
# for being rare) — confirm before tuning.
vectorizer = TfidfVectorizer(min_df=1)
# X is summed column-wise per vocabulary term in the ranking cell below.
X = vectorizer.fit_transform(new_corpus)

In [13]:
# Rank every vocabulary term by its TF-IDF weight summed over all documents.

# Vocabulary terms, in the column order of X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps `terms` a plain Python list, as it was before.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()

# sum the tf-idf weight of each term across documents (a single row)
sums = X.sum(axis=0)

# pair each term with its summed weight
data = [(term, sums[0, col]) for col, term in enumerate(terms)]

ranking = pd.DataFrame(data, columns=['term', 'rank'])
# highest summed weight first
ranked_list = ranking.sort_values(by=['rank'], ascending=False)

In [14]:
# Show the 500 highest-ranked [term, rank] pairs as a plain list
ranked_list.head(500).values.tolist()

[['food', 423.1447201710221],
 ['good', 291.9226699804169],
 ['place', 275.0935513890877],
 ['indian', 244.39774677209417],
 ['like', 214.27074823686664],
 ['buffet', 206.71108987474324],
 ['great', 197.51790627110472],
 ['one', 179.21853780939534],
 ['restaurant', 173.43486856560492],
 ['indian_food', 164.14997557462152],
 ['go', 160.6871807823421],
 ['service', 159.74077805559887],
 ['lunch_buffet', 153.81819476695347],
 ['get', 151.28508720306675],
 ['lunch', 144.84054746858217],
 ['also', 140.483831111547],
 ['really', 138.12602898979395],
 ['chicken', 137.190181287967],
 ['chicken_tikka', 130.1810812441697],
 ['naan', 128.70516648482504],
 ['back', 122.08365806948268],
 ['try', 121.54671457140996],
 ['little', 119.70761762499957],
 ['love', 118.47120343383607],
 ['nice', 112.52799800194586],
 ['even', 112.00902423854626],
 ['dishes', 109.41928926011214],
 ['best_indian', 103.60905130705686],
 ['dinner', 101.06179396049083],
 ['best', 100.285008892439],
 ['would', 97.72808933680238

LDA Model

In [13]:
# Token <-> integer-id mapping built from the phrase-merged documents
id2word = corpora.Dictionary(texts)
# Bag-of-words corpus: one doc2bow() result per document
corpus = [id2word.doc2bow(tokens) for tokens in texts]
# Preview the first document with ids translated back to their tokens
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('bit', 1),
  ('buffet', 1),
  ('buffet_dollars', 1),
  ('crazy_weekend', 1),
  ('dinner', 1),
  ('food_still', 1),
  ('good', 1),
  ('great', 1),
  ('happy', 1),
  ('happy_belly', 1),
  ('location', 1),
  ('lunch_buffet', 1),
  ('odana_rd', 1),
  ('price', 1),
  ('recently_went', 1),
  ('stick_lunch', 1),
  ('take_leave', 1),
  ('try_little', 1),
  ('west_side', 1)]]

In [14]:
# Build LDA model
# random_state is pinned so that a Restart & Run All reproduces the same
# topics; without a seed every run yields a different topic listing.
# The remaining LdaModel options (update_every, chunksize, passes, alpha,
# per_word_topics) are not passed and therefore stay at the library defaults.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,
)

In [15]:
from pprint import pprint
# Print the top 30 keywords of every topic in the model. The topic count is
# read from the model instead of being repeated as a literal, so this listing
# cannot drift from the num_topics the model was built with.
pprint(lda_model.print_topics(num_topics=lda_model.num_topics, num_words=30))
# Apply the trained model to the bag-of-words corpus
doc_lda = lda_model[corpus]

[(0,
  '0.025*"food" + 0.021*"chicken_tikka" + 0.019*"one" + 0.016*"masala" + '
  '0.016*"got" + 0.015*"like" + 0.012*"came" + 0.012*"try" + 0.011*"chicken" + '
  '0.010*"place" + 0.010*"think" + 0.010*"dish" + 0.009*"good" + 0.009*"sauce" '
  '+ 0.009*"really" + 0.009*"tried" + 0.009*"also" + 0.009*"amazing" + '
  '0.008*"much" + 0.008*"dishes" + 0.008*"know" + 0.007*"ordered" + '
  '0.007*"first" + 0.007*"curry" + 0.006*"indian" + 0.006*"buffet" + '
  '0.006*"back" + 0.006*"little" + 0.006*"great" + 0.006*"meat"'),
 (1,
  '0.026*"items" + 0.024*"stars" + 0.020*"owner" + 0.019*"friendly" + '
  '0.018*"offer" + 0.016*"oh" + 0.015*"wish" + 0.015*"already" + 0.014*"time" '
  '+ 0.014*"prices" + 0.011*"much_better" + 0.010*"family" + 0.010*"ok" + '
  '0.010*"reasonable" + 0.010*"level" + 0.010*"generous" + 0.009*"tastes" + '
  '0.009*"prepared" + 0.008*"opening" + 0.008*"indian_food" + '
  '0.008*"service_great" + 0.008*"knowing" + 0.008*"compared" + '
  '0.007*"horrible" + 0.007*"shared"

Phrase extraction (using n-grams)

In [16]:
import nltk
from nltk.util import ngrams
from collections import Counter
# One independent, initially empty frequency table per n-gram order (1 to 5)
(
    unigram_counter,
    bigram_counter,
    trigram_counter,
    four_gram_counter,
    five_gram_counter,
) = (Counter() for _ in range(5))

In [17]:
# Count every unigram in the corpus. The counter is rebuilt from scratch so
# that re-running this cell cannot double-count; the earlier `+= 1` loop kept
# adding to a counter that was created in a different cell.
# Keys are the tuples yielded by nltk.util.ngrams.
unigram_counter = Counter(
    gram for sent in sent_stream for gram in ngrams(sent, 1)
)
#unigram_counter.most_common(300)

In [18]:
# Count every bigram in the corpus. The counter is rebuilt from scratch so
# that re-running this cell cannot double-count; the earlier `+= 1` loop kept
# adding to a counter that was created in a different cell.
bigram_counter = Counter(
    gram for sent in sent_stream for gram in ngrams(sent, 2)
)
# 300 most frequent bigrams
bigram_counter.most_common(300)

[(('indian', 'food'), 1970),
 (('lunch', 'buffet'), 1001),
 (('chicken', 'tikka'), 981),
 (('tikka', 'masala'), 674),
 (('indian', 'restaurant'), 641),
 (('best', 'indian'), 622),
 (('garlic', 'naan'), 606),
 (('indian', 'restaurants'), 535),
 (('tandoori', 'chicken'), 365),
 (('go', 'back'), 332),
 (('first', 'time'), 320),
 (('good', 'indian'), 314),
 (('mango', 'lassi'), 230),
 (('food', 'good'), 220),
 (('really', 'good'), 211),
 (('naan', 'bread'), 208),
 (('south', 'indian'), 207),
 (('favorite', 'indian'), 200),
 (('come', 'back'), 200),
 (('indian', 'buffet'), 194),
 (('one', 'best'), 191),
 (('butter', 'chicken'), 189),
 (('ordered', 'chicken'), 184),
 (('even', 'though'), 180),
 (('indian', 'cuisine'), 180),
 (('indian', 'place'), 179),
 (('highly', 'recommend'), 179),
 (('next', 'time'), 177),
 (('good', 'food'), 173),
 (('love', 'indian'), 170),
 (('chicken', 'curry'), 168),
 (('like', 'indian'), 167),
 (('food', 'service'), 165),
 (('food', 'really'), 160),
 (('great', 'fo

In [22]:
# Count every trigram in the corpus. The counter is rebuilt from scratch so
# that re-running this cell cannot double-count; the earlier `+= 1` loop kept
# adding to a counter that was created in a different cell, which inflates
# every count by a whole multiple each time the cell is executed again.
trigram_counter = Counter(
    gram for sent in sent_stream for gram in ngrams(sent, 3)
)
# 100 most frequent trigrams
trigram_counter.most_common(100)

[(('chicken', 'tikka', 'masala'), 968),
 (('best', 'indian', 'food'), 600),
 (('good', 'indian', 'food'), 188),
 (('indian', 'food', 'ever'), 188),
 (('best', 'indian', 'restaurant'), 172),
 (('ordered', 'chicken', 'tikka'), 154),
 (('one', 'best', 'indian'), 140),
 (('love', 'indian', 'food'), 134),
 (('favorite', 'indian', 'restaurant'), 126),
 (('went', 'lunch', 'buffet'), 118),
 (('indian', 'lunch', 'buffet'), 100),
 (('craving', 'indian', 'food'), 100),
 (('indian', 'food', 'place'), 100),
 (('many', 'indian', 'restaurants'), 96),
 (('like', 'indian', 'food'), 90),
 (('go', 'back', 'try'), 82),
 (('indian', 'restaurant', 'las'), 80),
 (('best', 'indian', 'restaurants'), 76),
 (('wait', 'go', 'back'), 76),
 (('definitely', 'go', 'back'), 76),
 (('chicken', 'tikki', 'masala'), 74),
 (('south', 'indian', 'food'), 74),
 (('best', 'indian', 'buffet'), 72),
 (('definitely', 'come', 'back'), 70),
 (('chicken', 'tikka', 'chicken'), 70),
 (('indian', 'food', 'las'), 70),
 (('would', 'highl