Testing topic modeling on a TripAdvisor reviews dataset

1. Load the dataset and preprocess the reviews
2. Topic Modeling using two different libraries:
    1. sklearn LDA
        1. Tune the number of topics and learning decay values
    2. gensim LDA
        1. Tune the number of topics, chunksize and passes values

In [1]:
from typing import List
import os

import pandas as pd
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import nltk
from nltk import pos_tag, word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

# Fetch the NLTK stop-word corpus read by stopwords.words('english').
# NOTE(review): word_tokenize, pos_tag and WordNetLemmatizer rely on their own
# NLTK resources, which are not downloaded here - presumably they are already
# installed locally; confirm before running on a fresh environment.
nltk.download('stopwords')

# if you have incompatibility problems between gensim and scipy:
# - uninstall current version of scipy
# - run `pip install scipy==1.10.1`

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gianl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1. Load the dataset and preprocess the reviews

In [2]:
lemmatizer = WordNetLemmatizer()

# Build the stop-word set once. Calling stopwords.words('english') for every
# token rebuilds the word list each time and turns every membership test into
# a linear scan over a list.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess(review: str) -> List[str]:
    """Convert a raw review into a list of lemmatized noun tokens.

    Steps, in order:
      1. lower-case and tokenize the text;
      2. keep only purely alphabetic tokens (drops punctuation and numbers)
         that are not English stop words;
      3. lemmatize each remaining token;
      4. POS-tag the lemmas and keep those whose tag starts with 'N' (nouns).

    Args:
        review: the raw review text.

    Returns:
        The noun tokens of the review, in their original order (duplicates kept).
    """
    tokens = word_tokenize(review.lower())
    # remove punctuation and stopwords
    tokens = [
        word for word in tokens
        if word.isalpha() and word not in ENGLISH_STOPWORDS
    ]
    # lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # keep only nouns
    return [word for word, pos in pos_tag(tokens) if pos.startswith('N')]

### If the preprocessed reviews file does not exist, preprocess the reviews and save them to a file; otherwise load the preprocessed reviews from that file

In [10]:
# The preprocessed reviews are cached in a text file: one review per line,
# tokens separated by commas.
if 'reviews_preprocessed.txt' not in os.listdir('resources'):
    # No cache yet: read the first 3000 reviews and preprocess them.
    df = pd.read_csv('resources/reviews.csv', nrows=3000)
    # strip surrounding whitespace/newlines, then remove punctuation and
    # stopwords, lemmatize and keep only nouns
    reviews = [preprocess(review.strip()) for review in df['Review']]
    # save the preprocessed reviews to a file
    with open('resources/reviews_preprocessed.txt', 'w') as f:
        f.writelines(','.join(review) + '\n' for review in reviews)
else:
    # load the preprocessed reviews
    with open('resources/reviews_preprocessed.txt', 'r') as f:
        stripped_lines = (line.strip() for line in f)
        # A review without any token is stored as an empty line; ''.split(',')
        # would yield [''] (a bogus empty token), so map it back to [] to match
        # what the preprocessing branch produces.
        reviews = [line.split(',') if line else [] for line in stripped_lines]

Print the first review after preprocessing

In [4]:
# Display the token list of the first preprocessed review.
reviews[0]

['hotel',
 'parking',
 'deal',
 'hotel',
 'evening',
 'review',
 'valet',
 'check',
 'view',
 'room',
 'room',
 'size',
 'woke',
 'pillow',
 'soundproof',
 'heard',
 'music',
 'room',
 'night',
 'morning',
 'loud',
 'bang',
 'door',
 'closing',
 'people',
 'neighbor',
 'bath',
 'product',
 'stay',
 'advantage',
 'location',
 'distance',
 'experience',
 'pay',
 'parking',
 'night']

## 2. Topic Modeling

### 2.1. sklearn LDA

Evaluate the LDA model using sklearn's implementation tuning the following parameters
- Number of topics
- Learning decay values

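Note: the search parameters can be different from the ones tuned for gensim because the two implementations expose different hyperparameters (sklearn: learning decay; gensim: chunksize and passes).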

In [6]:
def sklearn_lda_evaluate_models(reviews: List[List[str]], search_params: dict):
    """Grid-search sklearn's LDA over the tokenized reviews.

    Args:
        reviews: tokenized reviews, one list of tokens per review.
        search_params: parameter grid handed to GridSearchCV as ``param_grid``
            (e.g. values for ``n_components`` and ``learning_decay``).

    Returns:
        The fitted GridSearchCV object.
    """
    # CountVectorizer works on strings, so rebuild one space-separated
    # document per review before counting terms.
    documents = [' '.join(tokens) for tokens in reviews]
    doc_term_matrix = CountVectorizer().fit_transform(documents)

    # Search the grid with GridSearchCV's default settings.
    grid_search = GridSearchCV(LatentDirichletAllocation(), param_grid=search_params)
    grid_search.fit(doc_term_matrix)
    return grid_search

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 5}
Best Log Likelihood Score:  -228915.35298277353
Model Perplexity:  899.1034547729259


### 2.2. gensim LDA

#### Create a dictionary and a corpus from the preprocessed reviews as required by gensim's LDA model

In [11]:
# gensim's LDA model needs two structures built from the tokenized reviews.

# 1) The dictionary: a mapping that gives every distinct token
#    (nouns only, after preprocessing) its own integer id.
dictionary = corpora.Dictionary(documents=reviews)

# 2) The corpus: one bag-of-words per review, i.e. a list of
#    (token id, number of occurrences in that review) tuples
#    produced by the dictionary's doc2bow method.
corpus = list(map(dictionary.doc2bow, reviews))

Print dictionary and corpus samples

In [12]:
# Dictionary sample: show the first 10 (id, token) pairs.
print("Dictionary Sample:")
for _, (word_id, word) in zip(range(10), dictionary.iteritems()):
    print(f"ID {word_id}: {word}")

print("\n")

# Corpus sample: show the first 10 bag-of-words entries of the first document,
# with each token id replaced by its token so the counts are readable.
print("Corpus Sample:")
formatted_doc = []
for word_id, count in corpus[0]:
    formatted_doc.append((dictionary[word_id], count))
print(formatted_doc[:10])

Dictionary Sample:
ID 0: advantage
ID 1: bang
ID 2: bath
ID 3: check
ID 4: closing
ID 5: deal
ID 6: distance
ID 7: door
ID 8: evening
ID 9: experience


Corpus Sample:
[('advantage', 1), ('bang', 1), ('bath', 1), ('check', 1), ('closing', 1), ('deal', 1), ('distance', 1), ('door', 1), ('evening', 1), ('experience', 1)]


#### Evaluate the LDA model using gensim's implementation for different numbers of topics, chunksize and passes values

In [15]:
def gensim_lda_evaluate_models(corpus: List[List[tuple]],
                    dictionary: corpora.Dictionary,
                    texts: List[List[str]],
                    topic_numbers: List[int],
                    chunksize_values: List[int],
                    passes_values: List[int],) -> List[tuple]:
    """Train one gensim LDA model per parameter combination and score it.

    Every combination of ``topic_numbers`` x ``chunksize_values`` x
    ``passes_values`` is evaluated, with the topic number varying slowest and
    the passes value fastest.

    Args:
        corpus: bag-of-words corpus, one list of (token id, count) tuples per
            document (as produced by ``dictionary.doc2bow``).
        dictionary: gensim dictionary mapping token ids to tokens.
        texts: the tokenized documents, used to compute the coherence score.
        topic_numbers: values to try for ``num_topics``.
        chunksize_values: values to try for ``chunksize``.
        passes_values: values to try for ``passes``.

    Returns:
        A list of ``(num_topics, chunksize, passes, coherence_score)`` tuples,
        one per evaluated combination, in evaluation order. The score is the
        'c_v' coherence of the trained model.
    """
    # Local import: keeps the block self-contained without touching the
    # notebook's import cell.
    from itertools import product

    results = []
    for num_topics, chunksize_value, passes_value in product(
            topic_numbers, chunksize_values, passes_values):
        print("Evaluating model with:")
        print(f"num_topics={num_topics}, chucksize={chunksize_value}, passes={passes_value}")

        # Fixed random_state so every combination is trained from the same seed.
        model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=100,
            chunksize=chunksize_value,
            passes=passes_value,
            alpha="auto",
            eta="auto",
            per_word_topics=True
        )
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_score = coherencemodel.get_coherence()
        results.append((num_topics, chunksize_value, passes_value, coherence_score))
    return results

### Run the model evaluation for sklearn LDA

In [None]:
# Grid of sklearn LDA hyperparameters to explore: number of topics and learning decay.
sklearn_search_params = {'n_components': [5, 10, 15], 'learning_decay': [0.5, 0.7, 0.9]}

# Keep the fitted search instead of discarding it, so the winning
# configuration and its score can be reported and reused afterwards.
sklearn_model = sklearn_lda_evaluate_models(reviews, sklearn_search_params)

print("Best Model's Params: ", sklearn_model.best_params_)
print("Best Log Likelihood Score: ", sklearn_model.best_score_)

### Run the model evaluation for gensim LDA

In [None]:
# Hyperparameter values to combine: 2 to 14 topics, three chunk sizes, three pass counts.
topic_numbers = [*range(2, 15)]
chunksize_values = [50, 100, 200]
passes_values = [5, 10, 20]

# Train and score one gensim LDA model per combination.
results = gensim_lda_evaluate_models(
    corpus=corpus,
    dictionary=dictionary,
    texts=reviews,
    topic_numbers=topic_numbers,
    chunksize_values=chunksize_values,
    passes_values=passes_values,
)

# Rank the combinations by coherence score (4th field), best first,
# and print the top 5 to help choose the best model.
results.sort(key=lambda row: row[3], reverse=True)
for num_topics, chucksize_value, passes_value, coherence_score in results[:5]:
    print(f"Num Topics: {num_topics}, Chucksize: {chucksize_value}, Passes: {passes_value}, Coherence Score: {coherence_score}")