In [28]:
import random
import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
from datasets import load_dataset, Dataset, load_from_disk

## LDA

In [2]:
# Load the Yelp full-review dataset by its Hugging Face identifier
dataset = load_dataset(path="yelp_review_full")

In [4]:
# Gather the review texts of every split into one flat list
data_text = [text for split in dataset.keys() for text in dataset[split]['text']]

# Wrap the combined list in a single-column Hugging Face Dataset
data_text_dataset = Dataset.from_dict({"text": data_text})

In [17]:
# Generate the corpus for LDA: tokenized, stopword-filtered and Porter-stemmed
# (this is stemming, not lemmatization).
# The stopword collection gets its own name so the imported `stopwords` corpus
# reader is not shadowed; rebinding it made this cell fail when run a second time.
english_stopwords = frozenset(stopwords.words('english'))
stemmer = PorterStemmer()  # one shared stemmer instead of a new instance per word
non_alpha = re.compile('[^a-zA-Z]')

def generate_corpus(doc):
    """Replace a review's raw text with its list of stemmed tokens.

    Args:
        doc: a dataset row (mapping) whose 'text' entry is the raw review string.

    Returns:
        The same row, with 'text' replaced by a list of lowercase, Porter-stemmed
        tokens from which English stopwords have been removed.
    """
    # Non-alphabetic characters become spaces, so e.g. "don't" splits into "don" and "t"
    words = non_alpha.sub(' ', doc['text']).lower().split()
    # Stopwords are dropped before stemming, so the check runs on the unstemmed word
    doc['text'] = [stemmer.stem(word) for word in words if word not in english_stopwords]
    return doc

corpus = data_text_dataset.map(generate_corpus)

Map: 100%|██████████| 700000/700000 [25:11<00:00, 463.18 examples/s]  


In [19]:
# Persist the preprocessed corpus so a later session can reload it instead of recomputing it.
# NOTE(review): the '.h5' suffix is only a name chosen here; presumably the output is not
# an HDF5 file -- confirm before opening it with HDF5 tooling.
corpus.save_to_disk(dataset_path='corpus_lemmatized.h5')

Saving the dataset (0/1 shards):   0%|          | 0/700000 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 700000/700000 [00:00<00:00, 1322630.76 examples/s]


In [24]:
# Reload the preprocessed corpus that was previously written to disk
corpus = load_from_disk(dataset_path='corpus_lemmatized.h5')

In [31]:
# Build the gensim dictionary (the mapping between tokens and their integer ids)
# from the tokenized reviews
dictionary = corpora.Dictionary(documents=corpus['text'])

print(dictionary)

Dictionary<181582 unique tokens: ['affili', 'alway', 'blank', 'case', 'complaint']...>


In [51]:
# Prune the vocabulary: drop tokens found in fewer than 15 documents or in more than
# 10% of all documents, then keep at most the 1000 most frequent remaining tokens.
dictionary.filter_extremes(keep_n=1000, no_above=0.1, no_below=15)

# Order the (id, token) pairs alphabetically by token and display the resulting mapping
sorted_dict_values = {
    token_id: token
    for token_id, token in sorted(dictionary.items(), key=lambda pair: pair[1])
}
sorted_dict_values

{96: 'abl',
 187: 'absolut',
 855: 'accept',
 610: 'accommod',
 562: 'across',
 845: 'act',
 43: 'actual',
 563: 'ad',
 278: 'add',
 106: 'addit',
 464: 'admit',
 212: 'afford',
 213: 'afternoon',
 242: 'age',
 243: 'ago',
 661: 'agre',
 214: 'ahead',
 465: 'air',
 358: 'airport',
 882: 'alcohol',
 625: 'allow',
 44: 'almost',
 862: 'alon',
 582: 'along',
 718: 'alreadi',
 603: 'alright',
 160: 'although',
 142: 'amaz',
 244: 'ambianc',
 690: 'american',
 682: 'amount',
 865: 'annoy',
 89: 'anoth',
 15: 'answer',
 803: 'anymor',
 16: 'anyon',
 107: 'anyth',
 631: 'anyway',
 406: 'anywher',
 419: 'apart',
 411: 'apolog',
 420: 'appar',
 199: 'appear',
 466: 'appet',
 90: 'appoint',
 539: 'appreci',
 467: 'area',
 161: 'arriv',
 953: 'asian',
 970: 'ass',
 790: 'assum',
 279: 'ate',
 390: 'atmospher',
 646: 'attempt',
 468: 'attend',
 604: 'attent',
 55: 'attitud',
 969: 'authent',
 188: 'avail',
 359: 'averag',
 611: 'avoid',
 45: 'aw',
 56: 'away',
 245: 'awesom',
 626: 'b',
 831: 'bab

In [52]:
# Convert every tokenized review into its bag-of-words representation.
# Iterating the Dataset directly yields row dicts ({'text': [...]}), so doc2bow would
# be handed the dict and only see its key; iterate the token lists of the 'text'
# column instead.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in corpus['text']]

# Fit a TF-IDF model on the bag-of-words corpus, then re-weight that same corpus with it
tfidf_model = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf_model[bow_corpus]

In [None]:
# Train the LDA model for clustering the reviews in 20 topics.
# Only `corpora` and `models` are imported from gensim, so the class is reached through
# `models`; the previous `gensim.models...` spelling raised a NameError.
lda_model = models.LdaMulticore(bow_corpus,
                                num_topics=20,
                                id2word=dictionary,
                                passes=2,
                                random_state=11  # fixed seed so the topics are reproducible
                                )
# Persist the trained model so it can be reused without retraining
lda_model.save('lda_model.model')

In [None]:
# Show every topic together with its top words and their weights
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1):
    print("Topic: {} \nWords: {}".format(topic_id, topic_terms))
    print("\n")

Topic: 0 
Words: 0.059*"coffe" + 0.047*"breakfast" + 0.023*"tea" + 0.018*"lunch" + 0.016*"cafe" + 0.016*"cup" + 0.016*"morn" + 0.016*"juic" + 0.015*"fresh" + 0.013*"open"


Topic: 1 
Words: 0.020*"wine" + 0.019*"delici" + 0.017*"dinner" + 0.015*"dish" + 0.015*"meal" + 0.015*"amaz" + 0.011*"excel" + 0.011*"dessert" + 0.011*"enjoy" + 0.011*"appet"


Topic: 2 
Words: 0.027*"show" + 0.017*"club" + 0.013*"friend" + 0.012*"guy" + 0.012*"fun" + 0.012*"line" + 0.012*"girl" + 0.011*"music" + 0.010*"cool" + 0.010*"danc"


Topic: 3 
Words: 0.034*"chees" + 0.034*"salad" + 0.033*"sauc" + 0.028*"meat" + 0.019*"chicken" + 0.017*"side" + 0.016*"flavor" + 0.014*"bread" + 0.014*"bbq" + 0.013*"potato"


Topic: 4 
Words: 0.098*"room" + 0.050*"hotel" + 0.049*"stay" + 0.020*"pool" + 0.017*"check" + 0.017*"clean" + 0.015*"floor" + 0.014*"bed" + 0.013*"bathroom" + 0.012*"strip"


Topic: 5 
Words: 0.355*"e" + 0.107*"de" + 0.092*"la" + 0.047*"f" + 0.047*"c" + 0.043*"die" + 0.038*"pour" + 0.037*"n" + 0.020*"b" +

In [None]:
# Train the LDA model for clustering the reviews in 20 topics using the TF-IDF representation.
# Only `corpora` and `models` are imported from gensim, so the class is reached through
# `models`; the previous `gensim.models...` spelling raised a NameError.
# NOTE(review): this feeds TF-IDF weights rather than raw counts to LDA -- confirm that
# this is the intended comparison.
lda_model_tfidf = models.LdaMulticore(corpus_tfidf,
                                      num_topics=20,
                                      id2word=dictionary,
                                      passes=2,
                                      workers=4,
                                      random_state=11  # same seed as the bag-of-words model, for reproducibility
                                      )

# Print the topics with the words that belong to them and their weights
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

Topic: 0 Word: 0.035*"casino" + 0.025*"scottsdal" + 0.019*"locat" + 0.013*"downtown" + 0.013*"close" + 0.009*"town" + 0.009*"street" + 0.008*"strip" + 0.008*"open" + 0.007*"cheap"


Topic: 1 Word: 0.035*"breakfast" + 0.022*"egg" + 0.017*"pancak" + 0.014*"brunch" + 0.012*"bacon" + 0.012*"toast" + 0.012*"coffe" + 0.009*"juic" + 0.008*"french" + 0.008*"morn"


Topic: 2 Word: 0.020*"call" + 0.011*"phone" + 0.010*"told" + 0.009*"compani" + 0.008*"custom" + 0.007*"card" + 0.007*"appoint" + 0.007*"manag" + 0.007*"offic" + 0.007*"busi"


Topic: 3 Word: 0.073*"pizza" + 0.020*"wing" + 0.015*"crust" + 0.014*"dog" + 0.010*"slice" + 0.010*"chees" + 0.009*"pie" + 0.009*"sauc" + 0.008*"hot" + 0.007*"top"


Topic: 4 Word: 0.057*"e" + 0.040*"buffet" + 0.021*"de" + 0.015*"die" + 0.012*"crab" + 0.011*"f" + 0.010*"la" + 0.009*"seafood" + 0.009*"averag" + 0.008*"dessert"


Topic: 5 Word: 0.013*"steak" + 0.013*"bbq" + 0.012*"rib" + 0.011*"chees" + 0.010*"mac" + 0.010*"fri" + 0.010*"pork" + 0.009*"potato" + 

In [None]:
# Now check which topics are present in a random review.
# `dataset` is indexed by split name (see the loop that builds `data_text`), so it has no
# 'text' entry; use the flattened `data_text` list, which `bow_corpus` was derived from
# in the same order.
# randrange excludes its upper bound, whereas randint(0, len(...)) could return an index
# one past the last review.
review = random.randrange(len(data_text))  # index of the sampled review

print(data_text[review])
# List the topics assigned to this review, highest score first
for index, score in sorted(lda_model[bow_corpus[review]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10))) # print first 10 words of the topic

This place is absolute garbage...  Half of the tees are not available, including all the grass tees.  It is cash only, and they sell the last bucket at 8, despite having lights.  And if you finish even a minute after 8, don't plan on getting a drink.  The vending machines are sold out (of course) and they sell drinks inside, but close the drawers at 8 on the dot.  There are weeds grown all over the place.  I noticed some sort of batting cage, but it looks like those are out of order as well.  Someone should buy this place and turn it into what it should be.

Score: 0.42214149236679077	 
Topic: 0.038*"custom" + 0.018*"employe" + 0.018*"walk" + 0.015*"help" + 0.014*"line" + 0.013*"manag" + 0.013*"counter" + 0.012*"busi" + 0.012*"card" + 0.011*"rude"

Score: 0.27001717686653137	 
Topic: 0.105*"pizza" + 0.051*"store" + 0.037*"shop" + 0.018*"buy" + 0.018*"item" + 0.014*"find" + 0.013*"crust" + 0.012*"product" + 0.010*"select" + 0.010*"new"

Score: 0.21490934491157532	 
Topic: 0.020*"wine" +