In [1]:
import itertools
import re
from collections import Counter
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import tensorflow as tf
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l1, l2
from wordcloud import WordCloud

# Load data

In [None]:
# Location of the TripAdvisor hotel reviews CSV (a Kaggle input-directory path).
reviews_csv = "/kaggle/input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv"
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


The reviews appear to be somewhat preprocessed:
* There are no capital letters or full stops, but there is other punctuation like commas and apostrophes.
* Sentences don't flow very well, presumably because stop words like "I", "the" and "a" have been removed.
* Words have clearly been tokenized, because "n't" appears in isolation a lot.
* There are many plurals, suggesting that words were not converted to singular.
* Some words weren't tokenized correctly due to missing spaces, for example "shower.before".
* There are some misspellings, for example "ass told" instead of "as told".
* The reviews seem to all end with commas, suggesting that full stops have been converted to commas.

It seems we can leverage this preprocessing and just tokenize by splitting on spaces.

The ratings appear to be integers between 1 and 5.

In [None]:
# Print column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

There are 20491 data points. There are no null reviews or ratings. All ratings are integers.

# Exploratory data analysis

## Are all the ratings between 1 and 5? What's the distribution of ratings?

In [None]:
# Number of reviews for each distinct rating value, most frequent first.
df["Rating"].value_counts()

In [None]:
# Bar chart of how many reviews carry each rating.
rating_ax = sns.countplot(data=df, x="Rating", palette="viridis")
rating_ax.set_title("Distribution of ratings")

The ratings are all integers between 1 and 5.

The ratings are unevenly distributed: higher ratings are more common than lower ones.

## What's the longest length of a review?

In [None]:
# Review length measured in characters.
df["Length"] = df["Review"].map(len)
df.head()

In [None]:
# Kernel density estimate of review length in characters, one filled curve per rating.
sns.displot(data=df, x="Length", hue="Rating", palette="viridis", kind="kde", fill=True, aspect=4)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the character lengths.
df["Length"].describe()

In [None]:
# Most frequent character length; mode() returns every value tied for most common.
df["Length"].mode()

Now let's look at the length in terms of words.

In [None]:
# Review length measured in whitespace-separated tokens.
df["Length in words"] = df["Review"].map(lambda review: len(review.split()))
df.head()

In [None]:
# Kernel density estimate of review length in words, one filled curve per rating.
sns.displot(data=df, x="Length in words", hue="Rating", palette="viridis", kind="kde", fill=True, aspect=4)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the word counts.
df["Length in words"].describe()

In [None]:
# Most frequent word count; mode() returns every value tied for most common.
df["Length in words"].mode()

The reviews range from 44 to 13501 characters, with a median of 537. The most common lengths are 403 and 444.

In terms of words, the reviews range from length 7 to 1931, with a median of 77. The most common length is 48.

## Are there any odd characters?

In [None]:
# Every distinct character that occurs in any review, in sorted order.
# Each review string is treated as an iterable of characters by set.union.
chars = sorted(set().union(*df["Review"]))

print(chars)

Indeed, there are some odd characters. We should probably do something about them.

## Are all the reviews in English?

From the list of characters, we see that a lot of non-English characters appear. Let's find some reviews containing these characters.

In [None]:
def find_review_by_character(c):
    """Return the first review shorter than 1000 characters that contains `c`.

    Reviews are scanned in the order they appear in ``df["Review"]``.
    Returns None when no review qualifies.
    """
    candidates = (text for text in df["Review"] if len(text) < 1000 and c in text)
    return next(candidates, None)

# Print a review for each non-English character
non_english_chars = ['À', 'Â', 'Ä', 'Ç', 'È', 'Ù', 'Û', 'Ü', 'à', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ö']
for c in non_english_chars:
    example_review = find_review_by_character(c)
    pprint(example_review)
    print()

The reviews above are all English, and the non-English characters look like artifacts from some kind of data processing or encoding error.

## What are the most common words?

In [None]:
# Word cloud over the concatenation of all reviews (up to 1000 words drawn).
plt.figure(figsize=(20,20))
all_reviews_text = " ".join(df["Review"])
wc = WordCloud(
    width=1600,
    height=800,
    max_words=1000,
    min_font_size=10,
    background_color="white",
    colormap="viridis",
).generate(all_reviews_text)

plt.imshow(wc)

In [None]:
# Count whitespace-separated tokens across all reviews and chart the 30 most frequent.
texts = df["Review"]
counter = Counter(word for review in texts for word in review.split())

# Unzip the (word, count) pairs into parallel lists for plotting.
x, y = map(list, zip(*counter.most_common(30)))

bar_marker = dict(
    color='rgba(50, 171, 96, 0.6)',
    line=dict(color='rgba(50, 171, 96, 1.0)', width=1),
)
# Horizontal bars: counts run along the x axis, words label the y axis.
fig = go.Figure(
    go.Bar(x=y, y=x, orientation='h', marker=bar_marker, name='Most common Word')
)

fig.update_layout(
    title=dict(text="Most Common Words", y=0.9, x=0.5, xanchor='center', yanchor='top'),
    font=dict(family="Courier New, monospace", size=18, color="RebeccaPurple"),
)

fig.show()

## What are some common n-grams?

In [None]:
def _get_top_ngram(corpus, n=None, top_k=15):
    """Return the most frequent n-grams in `corpus` with their total counts.

    Parameters
    ----------
    corpus : iterable of str
        Documents to extract n-grams from.
    n : int, optional
        Length of the n-grams. ``None`` is treated as 1 (single words); the
        bare default would otherwise produce ``ngram_range=(None, None)``,
        which is not a valid range for CountVectorizer.
    top_k : int, default 15
        Maximum number of (ngram, count) pairs to return.

    Returns
    -------
    list of (str, count) tuples
        Sorted by descending count and truncated to `top_k` entries.
    """
    if n is None:
        n = 1

    # max_df=0.9 ignores n-grams that appear in more than 90% of the documents.
    vec = CountVectorizer(ngram_range=(n, n), max_df=0.9)
    # fit_transform learns the vocabulary and builds the count matrix in a single pass.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total number of occurrences of each n-gram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:top_k]

In [None]:
# Show most common 3-grams

fig = make_subplots(rows=1, cols=1)

texts = df["Review"]

# _get_top_ngram already returns the 15 most frequent n-grams as (ngram, count) pairs.
top_trigrams = _get_top_ngram(texts, 3)
x, y = map(list, zip(*top_trigrams))

# Horizontal bars: counts run along the x axis, trigrams label the y axis.
fig.add_trace(
    go.Bar(
        x=y,
        y=x,
        orientation='h',
        name="3-grams",
        marker=dict(color="lightgreen"),
    ),
    row=1,
    col=1,
)

fig.update_layout(
    autosize=False,
    width=2000,
    height=600,
    title=dict(
        text='<b>Most Common trigrams</b>',
        x=0.5,
        y=0.95,
        font=dict(
            family="Courier New, monospace",
            size=24,
            color="RebeccaPurple"
        )
    ))

fig.show()

## How many distinct words are there?

In [None]:
# Tokenise each review on whitespace, then collect the distinct tokens.
reviews = [review.split() for review in df["Review"]]
words = set(itertools.chain.from_iterable(reviews))
print(f"There are {len(words)} distinct words.")

In [None]:
# Sanity check: peek at 10 entries of the `words` set. A set has no defined
# order, so which words are shown can differ between runs.
print("Some example words:", list(itertools.islice(words, 10)))

There are 102,008 different words, which is quite a lot!

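According to [a blog post on wordcounter.io](https://wordcounter.io/blog/how-many-words-are-in-the-english-language#:~:text=The%20Second%20Edition%20of%20the,Section%2C%20includes%20some%20470%2C000%20entries.), 3,000 commonly used words cover 95% of everyday writing. This suggests that a large share of the distinct "words" counted above are spurious: because the count splits on whitespace only, tokens with attached punctuation (for example "stay," versus "stay"), misspellings, and words run together by a missing space are each counted as separate words. We should probably clean up this data.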

# Train model

In [None]:
# Fixed seed so the train/validation split is the same on every run.
RANDOM_SEED = 42

# Hold out 20% of the reviews for validation.
X_train, X_val, y_train, y_val = train_test_split(
    df["Review"], df["Rating"], test_size=0.2, random_state=RANDOM_SEED
)

# Sparse categorical crossentropy needs labels between 0 and N-1, so we need to subtract 1 from the ratings.
# New Series are created instead of using `-=`, so the objects returned by the split are not mutated in place.
y_train = y_train - 1
y_val = y_val - 1

In [None]:
# Throwaway vectorizer used only to measure the vocabulary of the raw training text.
tmp_vectorize = tf.keras.layers.TextVectorization()
tmp_train_ds = tf.data.Dataset.from_tensor_slices(X_train)
tmp_vectorize.adapt(tmp_train_ds)

vocab_size = tmp_vectorize.vocabulary_size()
print(f"Vocab size before preprocessing: {vocab_size}")

In [None]:
stemmer = SnowballStemmer("english")

def custom_preprocessing(text):
    """Stem every whitespace-separated token and mark sentence boundaries.

    Stemming reduces the number of distinct words, which should reduce
    overfitting. Tokens are passed to the stemmer exactly as split, so a token
    with attached punctuation (e.g. a trailing comma) is stemmed with that
    punctuation still on it. Afterwards every ", " (the sentence separator in
    this dataset) becomes " newsentence " so that it gets its own token and is
    not stripped out.
    """
    stemmed_tokens = [stemmer.stem(token) for token in text.split()]
    stemmed_text = " ".join(stemmed_tokens)
    return re.sub(r', ', r' newsentence ', stemmed_text)

In [None]:
# Replace the raw review text with its stemmed, "newsentence"-tagged form.
# NOTE(review): X_train and X_val are overwritten here, so re-running this cell
# applies custom_preprocessing a second time to already-processed text.
X_train = X_train.apply(custom_preprocessing)
X_val = X_val.apply(custom_preprocessing)

In [None]:
# Throwaway vectorizer used only to measure the vocabulary of the preprocessed training text.
tmp_vectorize = tf.keras.layers.TextVectorization()
tmp_train_ds = tf.data.Dataset.from_tensor_slices(X_train)
tmp_vectorize.adapt(tmp_train_ds)

vocab_size = tmp_vectorize.vocabulary_size()
print(f"Vocab size after preprocessing: {vocab_size}")

In [None]:
batch_size = 32
VOCAB_TOKENS = 50000

# Training data is shuffled over the whole training set before batching.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(len(X_train))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation data is only evaluated, never trained on, so it is deliberately NOT
# shuffled: predictions made over val_ds then come back in the same order as
# X_val / y_val and can be compared with y_val directly.
val_ds = (
    tf.data.Dataset.from_tensor_slices((X_val, y_val))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

# Text-only view of the training data, used to fit the vectorizer's vocabulary.
train_text = train_ds.map(lambda text, labels: text)

# Keep at most VOCAB_TOKENS vocabulary entries.
vectorize = tf.keras.layers.TextVectorization(max_tokens=VOCAB_TOKENS)

vectorize.adapt(train_text)

vocab_size = vectorize.vocabulary_size()
print(f"Total distinct words: {vocab_size}")

In [None]:
epochs = 20

def fit_model(model):
    """Fit `model` on train_ds for at most `epochs` epochs, validating on val_ds.

    Training stops early once val_accuracy has gone 3 epochs without improving.
    Returns the History object produced by ``model.fit``.
    """
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=3)
    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        callbacks=[early_stopping],
    )
    return history

def plot_metrics(history):
    """Draw two line charts from a Keras History: accuracy curves, then loss curves.

    Each chart shows the training metric next to its validation counterpart,
    one point per epoch.
    """
    metrics = pd.DataFrame(history.history)
    for columns in (['accuracy', 'val_accuracy'], ['loss', 'val_loss']):
        metrics[columns].plot()

In [None]:
%%time

model = tf.keras.models.Sequential([
    vectorize,
    tf.keras.layers.Embedding(vocab_size, 8),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(5, activation='softmax')])

model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = fit_model(model)
print(f"\n***Final val_accuracy: {history.history['val_accuracy'][-1]:.2%}***\n")

plot_metrics(history)

# Model analysis

## Confusion matrix

In [None]:
# Predict over the training reviews in their ORIGINAL order. train_ds is
# shuffled, so predictions taken from it would not line up row-for-row with
# y_train and the confusion matrix would compare unrelated pairs.
train_text = tf.data.Dataset.from_tensor_slices(X_train).batch(batch_size)

pred = model.predict(train_text)
true_labels = y_train
pred_labels = np.argmax(pred, axis=-1)

# Labels are 0-4 internally; fixing `labels` keeps the matrix 5x5 even if a
# class is never predicted, so it always matches the 1-5 display labels.
cm = confusion_matrix(true_labels, pred_labels, labels=[0, 1, 2, 3, 4])
cm_disp = ConfusionMatrixDisplay(cm, display_labels=[1, 2, 3, 4, 5])
cm_disp.plot()

In [None]:
# Predict over the validation reviews in their ORIGINAL order, built directly
# from X_val, so that predictions line up row-for-row with y_val regardless of
# how val_ds orders its elements.
val_text = tf.data.Dataset.from_tensor_slices(X_val).batch(batch_size)

pred = model.predict(val_text)
true_labels = y_val
pred_labels = np.argmax(pred, axis=-1)

# Labels are 0-4 internally; fixing `labels` keeps the matrix 5x5 even if a
# class is never predicted, so it always matches the 1-5 display labels.
cm = confusion_matrix(true_labels, pred_labels, labels=[0, 1, 2, 3, 4])
cm_disp = ConfusionMatrixDisplay(cm, display_labels=[1, 2, 3, 4, 5])
cm_disp.plot()

## Show bad predictions

In [None]:
# Save some incorrect predictions as a csv

num_samples = 20

# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end, instead of growing a DataFrame with pd.concat inside the loop.
incorrect_records = []

for review_batch, label_batch in val_ds:
    # verbose=0 suppresses the per-batch progress bar.
    pred_batch = np.argmax(model.predict(review_batch, verbose=0), axis=-1)
    true_batch = label_batch.numpy()
    incorrect_flags = pred_batch != true_batch

    for review, true_label, pred_label in zip(
        review_batch.numpy()[incorrect_flags],
        true_batch[incorrect_flags],
        pred_batch[incorrect_flags],
    ):
        incorrect_records.append(
            {
                # String tensors yield bytes; decode so the CSV holds plain text
                # rather than a b'...' representation.
                "Review": review.decode("utf-8"),
                # Shift the 0-4 class ids back to the 1-5 rating scale.
                "True label": int(true_label) + 1,
                "Predicted label": int(pred_label) + 1,
            }
        )

    if len(incorrect_records) >= num_samples:
        break

df_incorrect = pd.DataFrame(
    incorrect_records[:num_samples],
    columns=["Review", "True label", "Predicted label"],
)

df_incorrect.to_csv("incorrect.csv", index=False)

In [None]:
# Save some correct predictions as a csv

num_samples = 20

# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end, instead of growing a DataFrame with pd.concat inside the loop.
correct_records = []

for review_batch, label_batch in val_ds:
    # verbose=0 suppresses the per-batch progress bar.
    pred_batch = np.argmax(model.predict(review_batch, verbose=0), axis=-1)
    true_batch = label_batch.numpy()
    correct_flags = pred_batch == true_batch

    for review, true_label, pred_label in zip(
        review_batch.numpy()[correct_flags],
        true_batch[correct_flags],
        pred_batch[correct_flags],
    ):
        correct_records.append(
            {
                # String tensors yield bytes; decode so the CSV holds plain text
                # rather than a b'...' representation.
                "Review": review.decode("utf-8"),
                # Shift the 0-4 class ids back to the 1-5 rating scale.
                "True label": int(true_label) + 1,
                "Predicted label": int(pred_label) + 1,
            }
        )

    if len(correct_records) >= num_samples:
        break

df_correct = pd.DataFrame(
    correct_records[:num_samples],
    columns=["Review", "True label", "Predicted label"],
)

df_correct.to_csv("correct.csv", index=False)

# Next steps

* Do stemming after stripping punctuation, so that words next to punctuation can be stemmed as well.
* Tune `TextVectorization` layer `max_tokens`.