In [1]:
import pandas as pd
from collections import defaultdict
from gensim import corpora, models
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string

In [2]:
# import CSV
df = pd.read_csv('../data/review_1819.csv')

In [15]:
# https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html#sphx-glr-auto-examples-core-run-topics-and-transformations-py
# https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
documents = []
for document in df.iloc[0:10].text:
    documents.append(document)

stoplist = stopwords.words('english')

texts = [
    [word for word in document.translate(str.maketrans('', '', string.punctuation)).lower().split() if word not in stoplist]
    for document in documents
]

# freq = defaultdict(int)
# for text in texts:
#     for token in text:
#         freq[token] += 1
# 
# texts = [
#     [token for token in text if freq[token] > 1]
#     for text in texts
# ]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [16]:
tfidf = models.TfidfModel(corpus)

In [17]:
corpus_tfidf = tfidf[corpus]

In [18]:
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
corpus_lsi = lsi_model[corpus_tfidf]

In [19]:
lsi_model.print_topics(20)

[(0,
  '0.146*"street" + 0.145*"sushi" + 0.135*"food" + 0.124*"good" + 0.121*"want" + 0.119*"long" + 0.118*"cannoli" + 0.116*"usually" + 0.111*"like" + 0.109*"96th"'),
 (1,
  '0.210*"breakfast" + 0.198*"amazing" + 0.184*"great" + 0.161*"extremely" + 0.157*"biscuits" + 0.157*"recommended" + 0.157*"fill" + 0.157*"busy" + 0.157*"cocktails" + 0.157*"blank"'),
 (2,
  '-0.179*"9" + 0.156*"bit" + 0.152*"biscuits" + 0.152*"recommended" + 0.152*"fill" + 0.152*"blank" + 0.152*"highly" + 0.152*"busy" + 0.152*"cocktails" + -0.140*"choose"'),
 (3,
  '-0.239*"cannoli" + 0.152*"long" + 0.151*"choose" + 0.151*"deli" + -0.150*"9" + 0.142*"usually" + -0.120*"awesome" + -0.120*"freshly" + -0.120*"termini" + -0.120*"sweet"'),
 (4,
  '0.322*"parmesan" + 0.186*"breakfast" + 0.177*"got" + 0.161*"wreck" + 0.161*"drinks" + 0.161*"cluelesslike" + 0.161*"bone" + 0.161*"train" + 0.161*"skip" + 0.161*"thought"'),
 (5,
  '0.290*"cannoli" + -0.257*"9" + 0.173*"like" + -0.150*"wait" + 0.145*"awesome" + 0.145*"cannoli

In [20]:
lda_model = models.LdaMulticore(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lda = lda_model[corpus]

In [21]:
lda_model.print_topics()

[(0,
  '0.018*"cannoli" + 0.011*"like" + 0.010*"freshly" + 0.010*"ive" + 0.010*"perfect" + 0.010*"bros" + 0.010*"sweet" + 0.010*"filled" + 0.010*"theyre" + 0.010*"termini"'),
 (1,
  '0.004*"amazing" + 0.004*"busy" + 0.004*"recommended" + 0.004*"blank" + 0.004*"bit" + 0.004*"cocktails" + 0.004*"fill" + 0.004*"cannoli" + 0.004*"parmesan" + 0.004*"u"'),
 (2,
  '0.012*"choose" + 0.012*"deli" + 0.012*"biscuits" + 0.012*"highly" + 0.012*"cocktails" + 0.012*"busy" + 0.012*"fill" + 0.012*"recommended" + 0.012*"blank" + 0.009*"though"'),
 (3,
  '0.012*"9" + 0.010*"usually" + 0.009*"quieter" + 0.009*"sidewalk" + 0.009*"pricey" + 0.009*"outside" + 0.009*"side" + 0.009*"french" + 0.009*"u" + 0.009*"enjoyed"'),
 (4,
  '0.004*"cannoli" + 0.004*"fill" + 0.004*"recommended" + 0.004*"blank" + 0.004*"amazing" + 0.004*"busy" + 0.004*"great" + 0.004*"9" + 0.004*"quarter" + 0.004*"u"'),
 (5,
  '0.004*"blank" + 0.004*"amazing" + 0.004*"fill" + 0.004*"cannoli" + 0.004*"recommended" + 0.004*"great" + 0.004*"b