Testing topic modeling on a Tripadvisor reviews dataset

1. Load the dataset and preprocess the reviews

In [35]:
from typing import List
import os

import pandas as pd
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import nltk
from nltk import pos_tag, word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords

# preprocess() relies on more than the stopword list: word_tokenize needs the
# punkt tokenizer data, WordNetLemmatizer needs wordnet and pos_tag needs the
# averaged perceptron tagger. Fetch them all so the notebook runs on a fresh
# environment. The '*_tab' / '*_eng' ids are the names used by newer NLTK
# releases; an id unknown to the installed version is reported and skipped.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# if you have incompatibility problems between gensim and scipy:
# - uninstall current version of scipy
# - run `pip install scipy==1.10.1`

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amato\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1. Load the dataset and preprocess the reviews

In [36]:
# Single WordNet lemmatizer instance, created once and reused by preprocess() for every token
lemmatizer = WordNetLemmatizer()

def preprocess(review: str) -> List[str]:
    """Turn a raw review into a list of lowercase, lemmatized nouns.

    Steps, in order: lowercase and tokenize, drop tokens that are not purely
    alphabetic (punctuation, numbers), drop English stopwords, lemmatize,
    and finally keep only tokens whose POS tag starts with 'N'.

    Args:
        review: The raw review text.

    Returns:
        The remaining noun tokens, in their original order (possibly empty).
    """
    # Build the stopword set once per call: the original evaluated
    # stopwords.words('english') for every single token and then did a
    # linear search through the resulting list.
    stop_words = set(stopwords.words('english'))

    tokens = word_tokenize(review.lower())
    # remove punctuation / non-alphabetic tokens and stopwords
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # keep only nouns (POS tags starting with 'N')
    tokens = [word for word, pos in pos_tag(tokens) if pos.startswith('N')]
    return tokens

In [37]:
if 'reviews_preprocessed.txt' not in os.listdir('resources'):
    # No cached result yet: preprocess the first 1000 reviews from the raw CSV
    df = pd.read_csv('resources/reviews.csv', nrows=1000)
    # A missing review becomes an empty string (and later an empty token list)
    # instead of crashing .strip() on NaN; this keeps reviews aligned with df rows.
    reviews = df['Review'].fillna('')
    reviews = [review.strip() for review in reviews] # remove newline characters from each review
    # remove punctuation, stopwords, lemmatize and keep only nouns
    reviews = [preprocess(review) for review in reviews]
else:
    # load the preprocessed reviews (one review per line, comma-separated tokens)
    reviews = []
    with open('resources/reviews_preprocessed.txt', 'r') as f:
        for line in f:
            line = line.strip()
            # A review without any noun is stored as an empty line; ''.split(',')
            # would give [''] and add a bogus empty token, so map it back to [].
            reviews.append(line.split(',') if line else [])

In [38]:
# Let's see what the first review looks like after preprocessing
reviews[0]

['hotel',
 'parking',
 'deal',
 'hotel',
 'evening',
 'review',
 'valet',
 'check',
 'view',
 'room',
 'room',
 'size',
 'woke',
 'pillow',
 'soundproof',
 'heard',
 'music',
 'room',
 'night',
 'morning',
 'loud',
 'bang',
 'door',
 'closing',
 'people',
 'neighbor',
 'bath',
 'product',
 'stay',
 'advantage',
 'location',
 'distance',
 'experience',
 'pay',
 'parking',
 'night']

In [39]:
# Cache the preprocessed reviews on disk, but only if no cache file exists yet
if 'reviews_preprocessed.txt' not in os.listdir('resources'):
    with open('resources/reviews_preprocessed.txt', 'w') as f:
        # one review per line, its tokens joined by commas
        f.writelines(','.join(review) + '\n' for review in reviews)

In [40]:
# Step 2: Dictionary and Corpus

# Vocabulary: gensim assigns every distinct token (a noun, after preprocessing)
# its own integer id and keeps the id <-> token mapping in the dictionary.
dictionary = corpora.Dictionary(reviews)

# Bag-of-words corpus: each review becomes a list of (token id, count) tuples,
# produced by running the dictionary's doc2bow over its token list.
corpus = list(map(dictionary.doc2bow, reviews))

In [41]:
# Print a sample of the dictionary: the first 10 (id, word) pairs
print("Dictionary Sample:")
for i, (word_id, word) in enumerate(dictionary.iteritems()):
    print(f"ID {word_id}: {word}")
    if i == 9:  # limit to the first 10 items
        break
        
print("\n")

# Print corpus sample
# Print the BoW representation of the first document in the corpus (first 10 entries only).
print("Corpus Sample:")
# Replace each word id with the word itself so the (word, count) pairs are readable
formatted_doc = [(dictionary[word_id], count) for word_id, count in corpus[0]]
print(formatted_doc[:10])

Dictionary Sample:
ID 0: advantage
ID 1: bang
ID 2: bath
ID 3: check
ID 4: closing
ID 5: deal
ID 6: distance
ID 7: door
ID 8: evening
ID 9: experience


Corpus Sample:
[('advantage', 1), ('bang', 1), ('bath', 1), ('check', 1), ('closing', 1), ('deal', 1), ('distance', 1), ('door', 1), ('evening', 1), ('experience', 1)]


In [42]:
# Step 3: LDA Model
def run_lda(corpus, dictionary, num_topics, random_state=100, chunksize=100, passes=10):
    """Train a gensim LDA model on a bag-of-words corpus.

    Args:
        corpus: Bag-of-words documents (lists of (token id, count) tuples).
        dictionary: The gensim Dictionary used to build ``corpus``; passed as ``id2word``.
        num_topics: Number of topics to fit.
        random_state: Seed forwarded to LdaModel (default 100, the value
            previously hard-coded here).
        chunksize: Forwarded to LdaModel's ``chunksize`` (default 100).
        passes: Forwarded to LdaModel's ``passes`` (default 10).

    Returns:
        The trained ``LdaModel``.
    """
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=random_state,
                                                update_every=1,
                                                chunksize=chunksize,
                                                passes=passes,
                                                alpha='auto',
                                                per_word_topics=True)
    return lda_model

In [43]:
# Step 4: Evaluate Models
def evaluate_models(corpus, dictionary, texts, start, limit, step):
    """Train one LDA model per topic count and score each with c_v coherence.

    Topic counts are taken from ``range(start, limit, step)``. Returns a pair
    ``(models, coherences)`` of equal length, ordered by topic count.
    """
    trained_models, scores = [], []
    for topic_count in range(start, limit, step):
        lda = run_lda(corpus, dictionary, topic_count)
        trained_models.append(lda)
        # CoherenceModel scores the quality of the topics found by a model:
        # https://radimrehurek.com/gensim/models/coherencemodel.html
        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        scores.append(scorer.get_coherence())

    return trained_models, scores

In [44]:
# Run and evaluate models
# Candidate topic counts, defined once so the training call and the report
# below always agree (the range used to be hard-coded in both places).
topic_range = range(2, 15, 1)
model_list, coherence_values = evaluate_models(corpus, dictionary, reviews,
                                               start=topic_range.start,
                                               limit=topic_range.stop,
                                               step=topic_range.step)

# Print coherence values to choose the best model
for m, cv in zip(topic_range, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

Num Topics = 2  has Coherence Value of 0.3682
Num Topics = 3  has Coherence Value of 0.3922
Num Topics = 4  has Coherence Value of 0.4311
Num Topics = 5  has Coherence Value of 0.4567
Num Topics = 6  has Coherence Value of 0.413
Num Topics = 7  has Coherence Value of 0.4716
Num Topics = 8  has Coherence Value of 0.4401
Num Topics = 9  has Coherence Value of 0.5054
Num Topics = 10  has Coherence Value of 0.4384
Num Topics = 11  has Coherence Value of 0.4697
Num Topics = 12  has Coherence Value of 0.4536
Num Topics = 13  has Coherence Value of 0.4399
Num Topics = 14  has Coherence Value of 0.4539
