Testing topic modeling on a Tripadvisor reviews dataset

1. Load the dataset and preprocess the reviews

In [23]:
from typing import List
import os

import pandas as pd
from gensim import corpora
import nltk
from nltk import pos_tag, word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords

# The preprocessing cell below uses word_tokenize, WordNetLemmatizer and pos_tag, and each of
# them needs its own NLTK data package; without these a fresh environment fails with a
# LookupError. Both the legacy and the current package names are requested because newer
# NLTK releases look up 'punkt_tab' / 'averaged_perceptron_tagger_eng' instead, and some
# releases additionally require 'omw-1.4' for WordNet.
for resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource, quiet=True)

# kept as the last expression so the cell still reports the stopwords download status
nltk.download('stopwords')

# if you have incompatibility problems between gensim and scipy:
# - uninstall current version of scipy
# - run `pip install scipy==1.10.1`

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amato\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1. Load the dataset and preprocess the reviews

In [24]:
lemmatizer = WordNetLemmatizer()

# Load the English stopword list once. stopwords.words() returns a freshly built list on
# every call, and membership in a list is a linear scan, so calling it per token (as the
# comprehension used to) made preprocessing needlessly slow. A frozenset gives constant-time
# lookups and cannot be mutated by accident.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def preprocess(review: str) -> List[str]:
    """Turn a raw review into the list of lemmatized nouns it contains.

    Steps, in order: lowercase and tokenize, drop tokens that are not purely
    alphabetic, drop English stopwords, lemmatize, then keep only the tokens
    whose part-of-speech tag starts with 'N'.

    Args:
        review: The raw review text.

    Returns:
        The surviving tokens in their original order (possibly empty).
    """
    tokens = word_tokenize(review.lower())
    # remove punctuation (isalpha also drops numbers and mixed tokens such as "4th")
    tokens = [word for word in tokens if word.isalpha()]
    # remove stopwords
    tokens = [word for word in tokens if word not in _ENGLISH_STOPWORDS]
    # lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # keep only nouns: tags beginning with 'N'
    tokens = [word for word, pos in pos_tag(tokens) if pos.startswith('N')]
    return tokens

In [25]:
if 'reviews_preprocessed.txt' not in os.listdir('resources'):
    # No cached result yet: preprocess the first 1000 reviews of the raw CSV.
    df = pd.read_csv('resources/reviews.csv', nrows=1000)
    reviews = df['Review']
    reviews = [review.strip() for review in reviews] # remove newline characters from each review
    # remove punctuation, stopwords, lemmatize and keep only nouns
    reviews = [preprocess(review) for review in reviews]
else:
    # Load the cached result: one review per line, tokens separated by commas.
    reviews = []
    with open('resources/reviews_preprocessed.txt', 'r') as f:
        for line in f:
            line = line.strip()
            # A review without any noun is stored as an empty line. ''.split(',') would
            # return [''] and inject an empty-string token into the vocabulary, so an
            # empty line is mapped back to the empty token list it was written from.
            reviews.append(line.split(',') if line else [])

In [26]:
# Let's see what the first review looks like after preprocessing (its list of tokens)
reviews[0]

['hotel',
 'parking',
 'deal',
 'hotel',
 'evening',
 'review',
 'valet',
 'check',
 'view',
 'room',
 'room',
 'size',
 'woke',
 'pillow',
 'soundproof',
 'heard',
 'music',
 'room',
 'night',
 'morning',
 'loud',
 'bang',
 'door',
 'closing',
 'people',
 'neighbor',
 'bath',
 'product',
 'stay',
 'advantage',
 'location',
 'distance',
 'experience',
 'pay',
 'parking',
 'night']

In [27]:
if 'reviews_preprocessed.txt' not in os.listdir('resources'):
    # Cache the preprocessed reviews on disk: one review per line, tokens joined by commas.
    with open('resources/reviews_preprocessed.txt', 'w') as f:
        f.writelines(','.join(review) + '\n' for review in reviews)

In [28]:
# Step 2: Dictionary and Corpus

# Build the vocabulary from the preprocessed reviews.
# The gensim Dictionary maps every distinct token (a noun, in this notebook) to an integer id.
dictionary = corpora.Dictionary(reviews)

# Turn each review into its bag-of-words representation.
# doc2bow converts a list of tokens into a list of (token id, count in that review) tuples,
# so the corpus ends up holding one such list per review.
corpus = list(map(dictionary.doc2bow, reviews))