# Trip advisor reviews sentiment classification [PyTorch]
* Multiclass sentiment classification of hotel's reviews.
* Reference notebook: <https://www.kaggle.com/code/emirkocak/in-depth-series-sentiment-analysis-w-transformers/notebook>
* Dataset: <https://www.kaggle.com/code/emirkocak/in-depth-series-sentiment-analysis-w-transformers/input>

## Import libraries

In [2]:
import os
import re
import string
import random
import warnings
from typing import Union, List
from dataclasses import dataclass

import nltk
import torch
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from sklearn.metrics import classification_report
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

import giskard
from giskard import Dataset, Model

2023-05-19 13:44:12.676739: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Notebook-level settings

In [3]:
# Disable warnings prints.
warnings.filterwarnings('ignore')

# Download list of english stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mykytaalekseiev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Define constants

In [4]:
# Constants.
STOP_WORDS = set(stopwords.words('english'))

PRETRAINED_WEIGHTS_NAME = "distilbert-base-uncased"

TEXT_COLUMN_NAME = "Review"
TARGET_COLUMN_NAME = "label"

RANDOM_SEED = 0

# Paths.
DATA_PATH = os.path.join(".", "datasets", "trip_advisor_sentiment_classification_dataset", "tripadvisor_hotel_reviews.csv")

## Set random seeds

In [5]:
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

## Load data

In [6]:
def create_label(x: int) -> int:
    """Map rating to the label."""
    if x in [1, 2]:
        return 0
    if x == 3:
        return 1
    if x in [4, 5]:
        return 2

In [7]:
def load_data(**kwargs) -> pd.DataFrame:
    """Load data."""
    df = pd.read_csv(DATA_PATH, **kwargs)

    # Obtain labels for our task.
    df[TARGET_COLUMN_NAME] = df.Rating.apply(lambda x: create_label(x))
    df.drop(columns="Rating", inplace=True)

    return df

reviews_df = load_data(nrows=2000)
reviews_df.head()

Unnamed: 0,Review,label
0,nice hotel expensive parking got good deal sta...,2
1,ok nothing special charge diamond member hilto...,0
2,nice rooms not 4* experience hotel monaco seat...,1
3,"unique, great stay, wonderful time hotel monac...",2
4,"great stay great stay, went seahawk game aweso...",2


## Wrap dataset

In [8]:
wrapped_data = Dataset(reviews_df,
                       name="trip_advisor_reviews_sentiment",
                       target=TARGET_COLUMN_NAME,
                       column_types={TEXT_COLUMN_NAME: "text"})

Hint: "Your target variable values are numeric. It is recommended to have Human readable string as your target values to make results more understandable in Giskard."


## Data preprocessing

In [9]:
class CleanText:
    """Helper class to preprocess review's text."""
    def __init__(self, clean_pattern: str = r"[^A-ZĞÜŞİÖÇIa-zğüı'şöç0-9.\"',()]"):
        """Constructor of the class."""
        self.clean_pattern = clean_pattern

    def __call__(self, text: Union[str, list]) -> List[List[str]]:
        """Perform cleaning."""
        if isinstance(text, str):
            docs = [[text]]

        if isinstance(text, list):
            docs = text

        text = [[re.sub(self.clean_pattern, " ", sentence) for sentence in sentences] for sentences in docs]
        return text
    
def remove_emoji(data: str) -> str:
    """Remove emoji from the text."""
    emoji = re.compile("["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002500-\U00002BEF"  
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoji, '', data)

regex = re.compile('[%s]' % re.escape(string.punctuation))
def remove_punctuation(text: str) -> str:
    """Remove punctuation from the text."""
    text = regex.sub(" ", text)
    return text

clean = CleanText()

In [10]:
def text_preprocessor(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess text."""
    # Remove emoji.
    df[TEXT_COLUMN_NAME] = df[TEXT_COLUMN_NAME].apply(lambda x: remove_emoji(x))

    # Lower.
    df[TEXT_COLUMN_NAME] = df[TEXT_COLUMN_NAME].apply(lambda x: x.lower())

    # Clean.
    df[TEXT_COLUMN_NAME] = df[TEXT_COLUMN_NAME].apply(lambda x: clean(x)[0][0])

    # Remove punctuation.
    df[TEXT_COLUMN_NAME] = df[TEXT_COLUMN_NAME].apply(lambda x: remove_punctuation(x))

    return df

reviews_df = text_preprocessor(reviews_df)

# Init Distill-BERT model

In [11]:
@dataclass
class Config:
    """Configuration of Distill-BERT model."""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    batch_size = 128
    seq_length = 150
    add_special_tokens = True
    return_attention_mask = True
    pad_to_max_length = True
    return_tensors = 'pt'

In [12]:
# Load tokenizer.
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_WEIGHTS_NAME)

# Load model.
model = DistilBertForSequenceClassification.from_pretrained(PRETRAINED_WEIGHTS_NAME,
                                                            num_labels=3,
                                                            output_attentions=False,
                                                            output_hidden_states=False).to(Config.device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [13]:
def create_dataloader(df: pd.DataFrame) -> DataLoader:
    """Create dataloader object with input data."""
    def _create_dataset(_encoded_data: dict) -> TensorDataset:
        """Create dataset object with input data."""
        input_ids = encoded_data['input_ids']
        attention_masks = encoded_data['attention_mask']
        return TensorDataset(input_ids, attention_masks)

    # Tokenize data.
    encoded_data = tokenizer.batch_encode_plus(df.Review.values,
                                               add_special_tokens=Config.add_special_tokens,
                                               return_attention_mask=Config.return_attention_mask,
                                               pad_to_max_length=Config.pad_to_max_length,
                                               max_length=Config.seq_length,
                                               return_tensors=Config.return_tensors)

    # Create dataset object.
    dataset = _create_dataset(encoded_data)

    # Create and return dataloader object.
    return DataLoader(dataset, batch_size=Config.batch_size)

dataloader = create_dataloader(reviews_df)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


## Inference

In [14]:
def infer_predictions(_model: torch.nn.Module, _dataloader: DataLoader) -> np.ndarray:
    """Perform inference using given model on given dataloader."""
    _model.eval()

    y_pred = list()
    for batch in _dataloader:
        batch = tuple(b.to(Config.device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1]
        }

        with torch.no_grad():
            outputs = _model(**inputs)

        probs = torch.nn.functional.softmax(outputs.logits).detach().cpu().numpy()
        y_pred.append(probs)

    y_pred = np.concatenate(y_pred, axis=0)
    return y_pred

In [17]:
raw_predictions = infer_predictions(model, dataloader)
print(classification_report(np.argmax(raw_predictions, axis=1).flatten(), reviews_df.label))

              precision    recall  f1-score   support

           0       0.48      0.20      0.29        54
           1       0.33      0.10      0.15        42
           2       0.03      0.50      0.06         4

    accuracy                           0.17       100
   macro avg       0.28      0.27      0.16       100
weighted avg       0.40      0.17      0.22       100

CPU times: user 19.5 s, sys: 576 ms, total: 20.1 s
Wall time: 11.1 s


## Create custom model wrapper

In [17]:
class CustomWrapper(Model):
    """Custom giskard model wrapper."""
    def model_predict(self, df: pd.DataFrame) -> np.ndarray:
        """Perform inference using overwritten prediction logic."""
        cleaned_df = text_preprocessor(df)
        data_loader = create_dataloader(cleaned_df)
        predicted_probabilities = infer_predictions(self.model, data_loader)
        return predicted_probabilities

In [18]:
wrapped_model = CustomWrapper(model,
                              model_type="classification",
                              classification_labels=[0, 1, 2],
                              name="trip_advisor_sentiment_classifier",
                              feature_names=[TEXT_COLUMN_NAME])

## Validate wrapped model

In [31]:
wrapped_predictions = wrapped_model.predict(wrapped_data)
validation_result = (wrapped_predictions.all_predictions.values == raw_predictions).all()
print(f"Outputs of raw and wrapped model are equal: {validation_result}")

Outputs of raw and wrapped model are equal: True


## Perform scan

In [17]:
%%time
scanning_results = giskard.scan(wrapped_model, wrapped_data)

Your model is successfully validated.
Running scan…
Running detector ModelBiasDetector… 15 issues detected.
Running detector TextPerturbationDetector…Hint: "Your target variable values are numeric. It is recommended to have Human readable string as your target values to make results more understandable in Giskard."
Hint: "Your target variable values are numeric. It is recommended to have Human readable string as your target values to make results more understandable in Giskard."
Hint: "Your target variable values are numeric. It is recommended to have Human readable string as your target values to make results more understandable in Giskard."
Hint: "Your target variable values are numeric. It is recommended to have Human readable string as your target values to make results more understandable in Giskard."
Hint: "Your target variable values are numeric. It is recommended to have Human readable string as your target values to make results more understandable in Giskard."
Hint: "Your tar

In [18]:
display(scanning_results)