Testing topic modeling on a TripAdvisor reviews dataset

1. Load the dataset and preprocess the reviews

In [23]:
from typing import List
import os

import pandas as pd
from gensim import corpora
import nltk
from nltk import pos_tag, word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch every NLTK resource the preprocessing step relies on, so the notebook
# runs from a fresh environment:
#   - stopwords                   -> stopwords.words('english')
#   - punkt                       -> word_tokenize
#   - wordnet                     -> WordNetLemmatizer
#   - averaged_perceptron_tagger  -> pos_tag
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead - add them here if a LookupError
# names them.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# if you have incompatibility problems between gensim and scipy:
# - uninstall current version of scipy
# - run `pip install scipy==1.10.1`

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amato\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1. Load the dataset and preprocess the reviews

In [24]:
lemmatizer = WordNetLemmatizer()

# Build the English stopword lookup once. Calling stopwords.words('english')
# inside the per-token filter rebuilds the word list for every token, and a
# list makes each membership test linear; a frozenset is built once and gives
# constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess(review: str) -> List[str]:
    """Turn a raw review into a list of lemmatized noun tokens.

    Steps, in order:
      1. lowercase the text and tokenize it with ``word_tokenize``;
      2. keep purely alphabetic tokens (drops punctuation and numbers);
      3. drop English stopwords;
      4. lemmatize each token with the module-level ``lemmatizer``;
      5. POS-tag the remaining tokens and keep those whose tag starts
         with ``'N'``.

    Args:
        review: The raw review text.

    Returns:
        The retained tokens, in their original order. May be empty.
    """
    tokens = word_tokenize(review.lower())
    # remove punctuation/non-alphabetic tokens and stopwords in a single pass
    tokens = [
        word for word in tokens
        if word.isalpha() and word not in _ENGLISH_STOPWORDS
    ]
    # lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # keep only nouns; tagging runs on the already filtered, lemmatized tokens
    tokens = [word for word, pos in pos_tag(tokens) if pos.startswith('N')]
    return tokens

In [25]:
if 'reviews_preprocessed.txt' not in os.listdir('resources'):
    # No cached result yet: preprocess the first 1000 raw reviews from the CSV.
    df = pd.read_csv('resources/reviews.csv', nrows=1000)
    # Drop missing values so .strip() is never called on a NaN entry.
    raw_reviews = df['Review'].dropna()
    # Strip surrounding whitespace/newlines, then remove punctuation and
    # stopwords, lemmatize and keep only nouns.
    reviews = [preprocess(review.strip()) for review in raw_reviews]
else:
    # Load the cached, preprocessed reviews (one comma-separated review per line).
    reviews = []
    with open('resources/reviews_preprocessed.txt', 'r') as f:
        for line in f:
            # A review that ended up with no tokens is stored as an empty line;
            # ''.split(',') would give [''] rather than [], so filter out empty
            # strings to get the same result as a fresh preprocessing run.
            reviews.append([token for token in line.strip().split(',') if token])

In [26]:
# Let's see what the first review looks like after preprocessing
reviews[0]

['hotel',
 'parking',
 'deal',
 'hotel',
 'evening',
 'review',
 'valet',
 'check',
 'view',
 'room',
 'room',
 'size',
 'woke',
 'pillow',
 'soundproof',
 'heard',
 'music',
 'room',
 'night',
 'morning',
 'loud',
 'bang',
 'door',
 'closing',
 'people',
 'neighbor',
 'bath',
 'product',
 'stay',
 'advantage',
 'location',
 'distance',
 'experience',
 'pay',
 'parking',
 'night']

In [27]:
# Cache the preprocessed reviews on disk (only when no cache file exists yet),
# one review per line with its tokens joined by commas.
cache_missing = 'reviews_preprocessed.txt' not in os.listdir('resources')
if cache_missing:
    with open('resources/reviews_preprocessed.txt', 'w') as out_file:
        out_file.writelines(','.join(tokens) + '\n' for tokens in reviews)

In [28]:
# Step 2: Dictionary and Corpus

# Dictionary: maps every unique token found in the preprocessed reviews
# (nouns, in this notebook) to an integer id.
dictionary = corpora.Dictionary(reviews)

# Corpus: the bag-of-words form of each review. doc2bow converts a token list
# into a list of (token_id, count_in_document) tuples, so the corpus is a list
# with one such list per review.
corpus = list(map(dictionary.doc2bow, reviews))

In [34]:
# Dictionary sample: show the first 10 (id, token) pairs.
print("Dictionary Sample:")
for rank, (word_id, word) in zip(range(10), dictionary.iteritems()):
    print(f"ID {word_id}: {word}")

print("\n")

# Corpus sample: take the bag-of-words of the first document, swap each token
# id for its token via the dictionary, and show the first 10 (token, count) pairs.
print("Corpus Sample:")
formatted_doc = [
    (dictionary[token_id], frequency)
    for token_id, frequency in corpus[0]
]
print(formatted_doc[:10])

Dictionary Sample:
ID 0: advantage
ID 1: bang
ID 2: bath
ID 3: check
ID 4: closing
ID 5: deal
ID 6: distance
ID 7: door
ID 8: evening
ID 9: experience


Corpus Sample:
[('advantage', 1), ('bang', 1), ('bath', 1), ('check', 1), ('closing', 1), ('deal', 1), ('distance', 1), ('door', 1), ('evening', 1), ('experience', 1)]


In [33]:
# Show only the first few documents: printing the whole corpus floods the
# notebook with one (token_id, count) list per review.
print(corpus[:3])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 3), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1)], [(11, 7), (14, 1), (17, 2), (24, 6), (29, 1), (31, 1), (32, 3), (33, 2), (34, 2), (35, 1), (36, 1), (37, 1), (38, 3), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 2), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 3), (54, 4), (55, 1), (56, 1), (57, 1), (58, 3), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 3), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 3), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 2), (93, 2), (94, 1), (95, 1), (96, 1), (97, 3), (98, 1), (99, 1), (100, 1), (101, 1), (102, 2), (103, 1), (104, 1), (105, 1)], 