<h3> Import libraries </h3>

In [99]:
import os

# Debugging aid: makes CUDA kernel launches synchronous so a failing kernel is
# reported at the call that caused it. Set before torch is imported (same order
# as before). It slows GPU execution, so consider removing it for real training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers
import math
import tqdm

import warnings

# `np.VisibleDeprecationWarning` was removed from the top-level namespace in
# NumPy 2.0; the class lives in `numpy.exceptions` (available since NumPy 1.25).
# Fall back to the old location for older NumPy versions.
try:
    from numpy.exceptions import VisibleDeprecationWarning
except ImportError:
    VisibleDeprecationWarning = np.VisibleDeprecationWarning
warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)

# Debugging aid: autograd records extra information so NaN/inf-producing
# backward ops can be traced to their forward call. This adds overhead to every
# training step; disable it once the model trains cleanly.
torch.autograd.set_detect_anomaly(True)

from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.nn import MultiheadAttention
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import Embedding

# Single device handle used by the training / evaluation cells below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

<h3> Load data </h3>

In [100]:
# Raw hotel-review table; later cells read its 'Review' and 'Rating' columns.
reviews_csv = './data/unprocessed/tripadvisor_hotel_reviews.csv'
df = pd.read_csv(reviews_csv)

# Tokenizer loaded from the pretrained 'bert-base-uncased' checkpoint. Only the
# tokenizer (text -> token ids) is taken from it in this notebook.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)

In [129]:
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one review per item and pairs it with its rating.

    Args:
        reviews: Sequence of review texts (list, array or pandas Series).
        targets: Sequence of ratings aligned with ``reviews``.
        tokenizer: Object exposing ``encode_plus`` that returns ``input_ids`` and
            ``attention_mask`` tensors with a leading batch dimension of 1.
        max_length: Every review is truncated / padded to this many tokens.
    """

    def __init__(self, reviews, targets, tokenizer, max_length=512):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.reviews)

    @staticmethod
    def _item_at(values, position):
        """Return the element at integer ``position``.

        A pandas Series indexed with ``series[i]`` looks ``i`` up as a *label*,
        which fails (or returns the wrong row) when the index is not 0..n-1,
        e.g. after a train/test split or filtering. DataLoader samplers always
        hand out positions, so use ``.iloc`` whenever it is available.
        """
        positional = getattr(values, 'iloc', None)
        if positional is not None:
            return positional[position]
        return values[position]

    def __getitem__(self, idx):
        """Tokenize the review at position ``idx``.

        Returns:
            dict with
            ``"review"``: token ids, shape ``(max_length,)``;
            ``"attention_mask"``: 1 for real tokens and 0 for padding, shape ``(max_length,)``;
            ``"target"``: the rating as a float scalar tensor.
        """
        review = str(self._item_at(self.reviews, idx))
        target = self._item_at(self.targets, idx)

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        return {
            # The tokenizer returns (1, max_length); drop the batch dimension so
            # the DataLoader can stack items into (batch, max_length).
            "review": encoding['input_ids'].squeeze(0),
            "attention_mask": encoding['attention_mask'].squeeze(0),
            "target": torch.tensor(target, dtype=torch.float)
        }

# Every review is truncated / padded to this many tokens.
max_length = 100
dataset = ReviewDataset(df['Review'], df['Rating'], tokenizer, max_length=max_length)

# Inverse-frequency weight per rating (1..5): rarer ratings get a larger weight.
num_samples = len(df)
class_counts = df['Rating'].value_counts().sort_index()
class_weights = [num_samples / class_counts[rating] for rating in range(1, 6)]

# One weight per row: look up the weight of that row's rating (ratings are 1-based).
sample_weights = [class_weights[rating - 1] for rating in df['Rating']]

# Draw as many samples per epoch as there are rows, with probability
# proportional to the per-row weight.
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights))
dataloader = DataLoader(dataset=dataset, batch_size=64, sampler=sampler)

# Sanity check: inspect the first batch only.
for data in dataloader:
    for field in ('review', 'attention_mask', 'target'):
        print(data[field].shape)
    print("Review:", data['review'][0])
    print("Attention mask:", data['attention_mask'][0])
    print("Target:", data['target'][0])
    break

# Number of batches per epoch.
print(len(dataloader))

torch.Size([32, 100])
torch.Size([32, 100])
torch.Size([32])
Review: tensor([  101, 12476,  2155, 10885,  2074,  2288,  2155,  1018,  4268,  1019,
         1020,  2307,  2051,  1010,  4766, 19184,  1012,  4734,  7564,  3528,
         1010,  3733,  6942,  2342,  1010,  1050,  1005,  1056,  4895, 23947,
         2305,  2298,  7001,  2215,  1010,  2155,  2282,  2542,  2282,  1059,
         1013,  1016,  2420,  9705,  4606,  5010,  1016,  7695,  1016, 28942,
         1010, 11673,  2694,  7163,  1011, 16716,  1010,  3822,  2723,  4734,
        24616, 21098,  1010,  3253, 10417,  2098,  2542, 29020,  1012,  2833,
         3492,  2204,  1010,  6142,  2489,  3059,  1022,  9397,  1013,  1018,
         3211,  5104,  2307,  1010,  2657, 23621,  2423,  2204,  1010, 13675,
        13699,  2229,  3256,  6949, 24608,  2379,  3509,  1010,  3347,   102])
Attention mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

<h3> Transformer Architecture </h3>

In [130]:
class TransformerBlock(nn.Module):
    """Post-norm Transformer encoder block: self-attention followed by a feed-forward net.

    Inputs and outputs are batch-first, ``(batch, seq_len, embed_dim)``.

    Args:
        embed_dim: Size of each token vector.
        num_heads: Number of attention heads (must divide ``embed_dim``).
        ff_dim: Hidden width of the position-wise feed-forward network.
        rate: Dropout probability.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.2):
        super(TransformerBlock, self).__init__()
        # batch_first=True is required: the model feeds (batch, seq, embed) tensors.
        # With the default (seq, batch, embed) layout, attention would be computed
        # across the batch dimension, i.e. between different reviews.
        self.att = nn.MultiheadAttention(num_heads=num_heads, embed_dim=embed_dim, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.layernorm1 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.dropout = nn.Dropout(rate)

    def forward(self, x, mask=None):
        """Apply the block.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_dim)``.
            mask: Optional ``(batch, seq_len)`` attention mask where True / 1 marks
                a real token and False / 0 marks padding. Padded positions are
                excluded as attention keys.

        Returns:
            Tensor of the same shape as ``x``.
        """
        key_padding_mask = None
        if mask is not None:
            # nn.MultiheadAttention uses the opposite convention: True = ignore.
            key_padding_mask = ~mask.to(torch.bool)

        # The attention weights are never used, so skip computing them.
        attn_output, _ = self.att(x, x, x, key_padding_mask=key_padding_mask, need_weights=False)

        # Apply dropout to attention output
        attn_output = self.dropout(attn_output)

        # Add & Normalize
        out1 = self.layernorm1(x + attn_output)

        # Feed-forward network
        ffn_output = self.ffn(out1)

        # Add & Normalize
        out2 = self.layernorm2(out1 + ffn_output)

        # Apply dropout to entire output
        return self.dropout(out2)

class TokenAndPositionEmbedding(nn.Module):
    """Sum of a learned token embedding and a learned absolute-position embedding.

    Args:
        vocab_size: Number of distinct token ids.
        embed_dim: Size of each embedding vector.
        max_length: Largest sequence length that can be embedded.
    """

    def __init__(self, vocab_size, embed_dim, max_length):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(max_length, embed_dim)

    def forward(self, x):
        """Embed ``x`` of shape ``(batch, seq_len)`` into ``(batch, seq_len, embed_dim)``."""
        batch_size, seq_length = x.size()
        # Position ids 0..seq_len-1, repeated for every row of the batch.
        positions = torch.arange(0, seq_length, dtype=torch.long, device=x.device).expand(batch_size, seq_length)
        x = self.token_emb(x)
        assert not torch.isnan(x).any(), "There are NaN values in the token embeddings tensor."
        pos = self.pos_emb(positions)
        return x + pos

class Transformer(nn.Module):
    """Encoder-only Transformer classifier: embed, encode, mean-pool, project to logits.

    Args:
        num_classes: Number of output classes.
        embed_size: Token embedding size used throughout the encoder.
        num_layers: Number of stacked ``TransformerBlock`` layers.
        heads: Attention heads per block.
        forward_expansion: Multiplier applied to ``embed_size`` to get the hidden
            width of each block's feed-forward network.
        dropout: Dropout probability used inside every block.
        max_length: Maximum sequence length supported by the position embedding.
        vocab_size: Number of distinct token ids.
    """

    def __init__(self, num_classes, embed_size, num_layers, heads, forward_expansion, dropout, max_length, vocab_size):
        super(Transformer, self).__init__()

        self.embedding = TokenAndPositionEmbedding(vocab_size, embed_size, max_length)
        # `forward_expansion` is an expansion factor, not an absolute width: passing
        # it through unchanged (e.g. 4) would squeeze every token through a 4-unit
        # bottleneck in the feed-forward network.
        ff_dim = forward_expansion * embed_size
        self.layers = nn.ModuleList([TransformerBlock(embed_size, heads, ff_dim, dropout)
                                        for _ in range(num_layers)])
        self.fc = nn.Linear(embed_size, num_classes)

    def forward(self, x, mask=None):
        """Compute class logits.

        Args:
            x: Token ids of shape ``(batch, seq_len)``.
            mask: Optional ``(batch, seq_len)`` attention mask where True / 1 marks
                a real token and False / 0 marks padding. When given, padding is
                ignored both by attention and by the mean pooling.

        Returns:
            Logits of shape ``(batch, num_classes)``.
        """
        x = self.embedding(x)
        assert not torch.isnan(x).any(), "There are NaN values in the embedding layer output."
        for layer in self.layers:
            x = layer(x, mask=mask)
        assert not torch.isnan(x).any(), "There are NaN values in the layer masking."
        if mask is None:
            x = torch.mean(x, dim=1)
        else:
            # Average over real tokens only so padding does not dilute the pooled vector.
            keep = mask.to(x.dtype).unsqueeze(-1)
            # clamp avoids 0/0 should a row contain no real tokens at all.
            x = (x * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)
        x = self.fc(x)
        return x

<h3> Train model </h3>

In [132]:
# Build the classifier (one output class per rating 1..5) and move it to the
# selected device in one step.
model = Transformer(
    num_classes=5,
    embed_size=32,
    num_layers=4,
    heads=2,
    forward_expansion=4,
    dropout=0.1,
    max_length=max_length,
    vocab_size=tokenizer.vocab_size,
).to(device)

# Multi-class classification loss over the raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

def train(dataloader, model, criterion, optimizer):
    """Run one optimisation pass over ``dataloader`` and print training metrics.

    Args:
        dataloader: Yields dict batches with ``'review'``, ``'attention_mask'``
            and ``'target'`` tensors.
        model: Called as ``model(reviews, mask=mask)`` and expected to return logits.
        criterion: Loss called as ``criterion(logits, targets)``.
        optimizer: Optimizer stepping the model's parameters.

    Prints accuracy, weighted precision / recall / F1 and the mean batch loss.
    Tensors are moved to the module-level ``device``.
    """
    model.train()
    running_loss = 0
    seen_predictions, seen_targets = [], []

    for batch in tqdm.tqdm(dataloader):
        token_ids = batch['review'].to(device)
        # Shift ratings down by one so they are zero-based class indices.
        labels = batch['target'].long().to(device) - 1
        attention = batch['attention_mask'].bool().to(device)

        optimizer.zero_grad()
        logits = model(token_ids, mask=attention)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Predicted class = index of the largest logit in each row.
        best_class = torch.max(logits, 1).indices
        seen_predictions.extend(best_class.cpu().numpy())
        seen_targets.extend(labels.cpu().numpy())

    average_loss = running_loss / len(dataloader)
    accuracy = accuracy_score(seen_targets, seen_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        seen_targets, seen_predictions, average='weighted'
    )

    report = (
        ("Accuracy: ", accuracy),
        ("Precision: ", precision),
        ("Recall: ", recall),
        ("F1 Score: ", f1),
        ("Average Training Loss: ", average_loss),
    )
    for caption, value in report:
        print(caption, value)

def evaluate(dataloader, model, criterion):
    """Print the mean per-batch loss of ``model`` over ``dataloader`` without updating it.

    Args:
        dataloader: Yields dict batches with ``'review'``, ``'attention_mask'``
            and ``'target'`` tensors.
        model: Called as ``model(reviews, mask=mask)`` and expected to return logits.
        criterion: Loss called as ``criterion(logits, targets)``.

    The model is switched to eval mode and gradients are disabled for the pass.
    Tensors are moved to the module-level ``device``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            token_ids = batch['review'].to(device)
            # Shift ratings down by one so they are zero-based class indices.
            labels = batch['target'].long().to(device) - 1
            attention = batch['attention_mask'].bool().to(device)

            logits = model(token_ids, mask=attention)
            running_loss += criterion(logits, labels).item()

    average_loss = running_loss / len(dataloader)
    print("Average Evaluation Loss: ", average_loss)

# Number of full passes over the data.
num_epochs = 5

# NOTE(review): evaluate() receives the same `dataloader` that train() uses, so
# the reported "evaluation" loss is measured on training data rather than on a
# held-out split.
for epoch in range(num_epochs):
    train(dataloader, model, criterion, optimizer)
    evaluate(dataloader, model, criterion)

  2%|▏         | 14/641 [00:05<04:28,  2.34it/s]


KeyboardInterrupt: 

<h3> Evaluate model </h3>

In [None]:
# A handful of hand-written reviews with the rating we would expect for each.
data = {
    'Review': [
        'It was fine.',
        'Poor service and the food quality was below average.',
        'An excellent stay, the staff was friendly and the location perfect.',
        'The hotel is overpriced for the quality of the amenities provided.',
        'Great experience, the room was well-appointed and the service impeccable.',
        'Good service',
        'Bad service',
    ],
    'Rating': [3, 2, 5, 2, 5, 4, 1],
}
test_df = pd.DataFrame(data)

dataset = ReviewDataset(test_df['Review'], test_df['Rating'], tokenizer, max_length=max_length)

# Inference only: eval mode and no gradient tracking.
model.eval()

with torch.no_grad():
    for idx in range(len(dataset)):
        sample = dataset[idx]

        # Each item is a single example; add a leading batch dimension of 1.
        review = sample['review'][None, :].to(device)
        mask = sample['attention_mask'][None, :].bool().to(device)
        # Shift the rating down by one to get a zero-based class index.
        target = sample['target'][None].long().to(device) - 1

        output = model(review, mask=mask)
        probabilities = output.softmax(dim=1)
        predictions = probabilities.argmax(dim=1)

        # Class indices are zero-based; add 1 to report them as ratings again.
        print(
            f"Review: {test_df['Review'][idx]}",
            f"Actual Rating: {target[0] + 1}",
            f"Predicted Rating: {predictions[0] + 1}",
            f"Probabilities: {probabilities}",
            "",
            sep="\n",
        )