# Resturant Review Sentiment - Transformer
### Matthew Newton
* Can a transformer model provide better results?
* Imported an existing LLM for review classification "bert-base-multilingual-uncased-sentiment".
* Tune model on training data.
* The training data will be input into this model to predict the review scores.

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_review = pd.read_pickle("./cleaned_data/reviews_cleaned_nltk.pickle")

In [3]:
# Split training data into train data, cross validation and test data
#df_review = df_long
df_long = df_review
df_review = df_review.dropna()
df_review['text'] = df_review['title'] + " " + df_review['text']
df_review = df_review[:100]
features = ['text']
#features = ['text', 'title', 'type', 'priceInterval', 'date']
X_train, X_cv, y_train, y_cv = train_test_split(df_review[features], df_review['rating'], test_size = 0.30, random_state = 0)
X_cv, X_test, y_cv, y_test = train_test_split(X_cv[features], y_cv, test_size = 0.50, random_state = 0)

In [4]:
y_train = np.array(y_train) - 1
y_cv = np.array(y_cv) - 1
y_test = np.array(y_test) - 1

(70,)


In [6]:
# Extract text data from DataFrame
X_train = X_train['text'].values
X_cv = X_cv['text'].values
X_test = X_test['text'].values

In [7]:
from transformers import BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment') # ('bert-base-uncased')

def tokenize_data(texts, labels=None, max_len=128):
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_data = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',  # Ensure padding to max_len
            truncation=True,       # Ensure truncation to max_len
            return_attention_mask=True,
            return_tensors='pt'    # Return PyTorch tensors
        )
        input_ids.append(encoded_data['input_ids'].squeeze(0))  # Remove extra dimension
        attention_masks.append(encoded_data['attention_mask'].squeeze(0))  # Remove extra dimension

    # Concatenate tensors along the batch dimension
    input_ids = torch.stack(input_ids)  # Stack tensors along a new dimension
    attention_masks = torch.stack(attention_masks)  # Stack tensors along a new dimension

    # Convert labels to tensor if provided
    if labels is not None:
        labels = torch.tensor(labels, dtype=torch.long)
        return input_ids, attention_masks, labels
    else:
        return input_ids, attention_masks

# Example usage
train_inputs, train_masks, train_labels = tokenize_data(X_train, y_train)
val_inputs, val_masks, val_labels = tokenize_data(X_cv, y_cv)
test_inputs, test_masks, test_labels = tokenize_data(X_test, y_test)  # Test labels might not be available


In [9]:
class ReviewDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

train_dataset = ReviewDataset(train_inputs, train_masks, train_labels)
val_dataset = ReviewDataset(val_inputs, val_masks, val_labels)
test_dataset = ReviewDataset(test_inputs, test_masks, test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False)


In [10]:
# print("Train inputs shape:", train_inputs.shape)
# print("Train masks shape:", train_masks.shape)
# print("Train labels shape:", train_labels.shape)

# print("Val inputs shape:", val_inputs.shape)
# print("Val masks shape:", val_masks.shape)
# print("Val labels shape:", val_labels.shape)

# print("Test inputs shape:", test_inputs.shape)
# print("Test masks shape:", test_masks.shape)

Train inputs shape: torch.Size([70, 128])
Train masks shape: torch.Size([70, 128])
Train labels shape: torch.Size([70])
Val inputs shape: torch.Size([15, 128])
Val masks shape: torch.Size([15, 128])
Val labels shape: torch.Size([15])
Test inputs shape: torch.Size([15, 128])
Test masks shape: torch.Size([15, 128])


In [11]:
model = BertForSequenceClassification.from_pretrained(
    'nlptown/bert-base-multilingual-uncased-sentiment', 
    num_labels=5,  # Number of classes
    output_attentions=False,
    output_hidden_states=False
)

# NOT RECOMMENED TO RUN ON CPU, USE COLLAB FOR FREE GPU USAGE
# If GPU available, move the model to it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [12]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 4
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=total_steps
)

In [13]:
from torch.nn import CrossEntropyLoss

loss_fn = CrossEntropyLoss()

def train(model, train_dataloader, val_dataloader):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)

        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Training loss: {avg_train_loss:.4f}")

        eval_model(model, val_dataloader)

def eval_model(model, val_dataloader):
    model.eval()
    val_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            val_loss += loss.item()

            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_dataloader)
    accuracy = accuracy_score(true_labels, predictions)

    print(f"Validation loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(classification_report(true_labels, predictions))

train(model, train_dataloader, val_dataloader)

Epoch 1/4
Training loss: 1.0032
Validation loss: 0.5493
Validation Accuracy: 0.7333
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         3
           4       0.77      1.00      0.87        10

    accuracy                           0.73        15
   macro avg       0.35      0.40      0.37        15
weighted avg       0.58      0.73      0.65        15

Epoch 2/4
Training loss: 0.7178
Validation loss: 0.5516
Validation Accuracy: 0.7333
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         3
           4       0.77      1.00      0.87        10

    accuracy     

In [14]:
# Predict
model.eval()
test_predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        test_predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())

In [15]:
y_test_pred = np.array(test_predictions) + 1
y_test = y_test + 1
print(y_test_pred)
print(y_test)

[5 2 4 5 5 2 4 4 5 4 4 1 4 5 5]
[5 1 4 5 5 4 5 4 5 4 3 1 4 5 4]


In [16]:
# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

Accuracy: 0.6666666666666666

Classification Report:
               precision    recall  f1-score   support

           1       1.00      0.50      0.67         2
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         1
           4       0.67      0.67      0.67         6
           5       0.83      0.83      0.83         6

    accuracy                           0.67        15
   macro avg       0.50      0.40      0.43        15
weighted avg       0.73      0.67      0.69        15


Confusion Matrix:
 [[1 1 0 0 0]
 [0 0 0 0 0]
 [0 0 0 1 0]
 [0 1 0 4 1]
 [0 0 0 1 5]]
