## Loading Dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
import json

# The file is read as JSON Lines: one JSON object per non-empty line.
# Lines are streamed straight from the file handle instead of being loaded
# with readlines() first, and malformed lines are counted so that any data
# loss is visible rather than silent.
parsed_data = []
skipped_lines = 0

with open("review-Iowa_10.json", 'r', encoding="utf-8") as f:
    for line in f:
        stripped_line = line.strip()
        if not stripped_line:
            continue
        try:
            parsed_data.append(json.loads(stripped_line))
        except json.JSONDecodeError:
            # Best-effort load: drop the line, but keep a tally for the report below.
            skipped_lines += 1

print(f"Parsed {len(parsed_data)} records; skipped {skipped_lines} malformed lines.")

# Create DataFrame from the list of parsed JSON objects
df = pd.DataFrame(parsed_data)
df.head()

In [None]:
df.info()

In [None]:
df['rating'].unique()

In [None]:
drop_columns = ["user_id", "name", "time", "pics", "resp", "gmap_id"]
df = df.drop(columns=drop_columns)
df.head()

In [None]:
df.dropna(inplace=True)

In [None]:
# One frame per star rating (1-5). The names df1..df5 are reused by the
# rebalancing cell that follows, so they are kept.
df1, df2, df3, df4, df5 = [df[df['rating'] == stars] for stars in (1, 2, 3, 4, 5)]

# Print (rows, columns) for each rating bucket, lowest rating first.
for rating_frame in (df1, df2, df3, df4, df5):
    print(rating_frame.shape)

In [None]:
"""
Since this dataset is extremely skewed for positive ratings, and this is
realistic with real-world data but for learning and fine-tuning BERT, I think
it's good to rebalance by randomly sampling some percentage of the data.

For ratings 1 and 2, I will keep 2k reviews and making this class 'negative'.
For rating 3, I will keep 4k reviews and make this class 'neutral'.
For ratings 4 and 5, I will keep 2k reviews and make this class 'positive'.
"""
# Draw the per-rating subsets with a fixed seed. The notebook only seeds NumPy
# further down, so without this the sampled dataset changes on every run.
SAMPLE_SEED = 42

# Sampling is without replacement, so (as before) this raises if a rating
# bucket holds fewer rows than requested.
df1 = df1.sample(n=2000, random_state=SAMPLE_SEED)
df2 = df2.sample(n=2000, random_state=SAMPLE_SEED)
df3 = df3.sample(n=4000, random_state=SAMPLE_SEED)
df4 = df4.sample(n=2000, random_state=SAMPLE_SEED)
df5 = df5.sample(n=2000, random_state=SAMPLE_SEED)

# Stack the buckets back together; rows stay grouped by rating, lowest first.
df = pd.concat([df1, df2, df3, df4, df5])
df

In [None]:
def create_label(row):
  """Map a review's star rating to a sentiment class id.

  Reads `row['rating']` and returns 0 for a rating of 2 or lower, 1 for a
  rating of exactly 3, and 2 for anything else.
  """
  stars = row['rating']
  if stars == 3:
    return 1
  return 0 if stars <= 2 else 2

df['label'] = df.apply(create_label, axis=1)
df.head()

In [None]:
df.tail()

In [None]:
from pathlib import Path

# Frozen snapshot of the rebalanced, labelled reviews. If the snapshot is
# missing (e.g. a fresh checkout), write it from the frame built above instead
# of failing; an existing snapshot is never overwritten.
REVIEWS_CSV = Path('final_review.csv')
if not REVIEWS_CSV.exists():
    df.to_csv(REVIEWS_CSV)

df = pd.read_csv(REVIEWS_CSV)
# The snapshot stores the old index as an unnamed first column; drop it.
df = df.drop(columns=['Unnamed: 0'])
df.head()

In [None]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

counts = df['rating'].value_counts().sort_index()

sns.barplot(x=counts.index, y=counts.values)
plt.xlabel("Review Scores")
plt.ylabel("Count")

In [None]:
# Class balance after collapsing star ratings into the three sentiment labels.
counts = df['label'].value_counts().sort_index()

sns.barplot(x=counts.index, y=counts.values)
# This chart shows sentiment labels, not star ratings, so label the axis accordingly.
plt.xlabel("Sentiment label (0 = negative, 1 = neutral, 2 = positive)")
plt.ylabel("Count");

## Text Preprocessing

In [None]:
!pip install transformers

In [None]:
import transformers
import torch

import copy
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from torch import nn, optim
from torch.utils import data

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
sample_sentence = "When was I last outside? I've been at home for the last 4 days..."

In [None]:
tokens = tokenizer.tokenize(sample_sentence)

In [None]:
print(len(tokens))
print(tokens)

In [None]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [None]:
print(len(token_ids))
print(token_ids)

## Special Tokens

These tokens let BERT know that we are interested in sequence classification.

 - [SEP] separates the sentences.
 - [CLS] is a special token whose final embedding is essentially a summary of the whole input sequence and is commonly used for classification.
 - [PAD] is used to pad sequences to the same length for batching.
 - [UNK] is a word/token that isn't recognized by the tokenizer, unknown.

In [None]:
tokenizer.sep_token, tokenizer.sep_token_id

In [None]:
tokenizer.cls_token, tokenizer.cls_token_id

In [None]:
tokenizer.pad_token, tokenizer.pad_token_id

In [None]:
tokenizer.unk_token, tokenizer.unk_token_id

In [None]:
encoding = tokenizer.encode_plus(
    sample_sentence,
    max_length=32,
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='pt'
)

encoding.keys()

In [None]:
encoding['input_ids'][0]

# Below you can see the '101' and '102' ids, which are the classification and separation
# special tokens.

In [None]:
encoding['attention_mask'][0]

# In the print statement below, we can see how we don't make the model pay attention
# to the padding, and only on the original tokens.

### Choosing sequence length

Most of my reviews seem to be roughly 0-100 tokens long, so I think a maximum sequence length of 128 is a good choice.

In [None]:
# Tokenized length of every review (truncated at 512), used to choose MAX_LEN.
token_lens = []

for article in df['text']:
  # str() mirrors PTDataset.__getitem__: a blank text field can come back from
  # the CSV as NaN (a float), which the tokenizer cannot encode.
  tokens = tokenizer.encode(str(article), max_length=512, truncation=True)
  token_lens.append(len(tokens))

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the equivalent figure.
sns.histplot(token_lens, kde=True, stat="density")
plt.xlabel("Tokens per review");

### Creating PyTorch Dataset

In [None]:
# The base class is spelled out in full: the bare name `data` is rebound to a
# sample batch later in the notebook, which would make `data.Dataset` fail if
# this cell were re-run afterwards.
class PTDataset(torch.utils.data.Dataset):
  """Map-style dataset that tokenizes one review per item.

  Args:
    text: indexable collection of review texts.
    target: indexable collection of integer class labels, aligned with `text`.
    tokenizer: object exposing `encode_plus` (e.g. a BERT tokenizer).
    max_len: length every encoded sequence is padded/truncated to.
  """

  def __init__(self, text, target, tokenizer, max_len):
    self.text = text
    self.target = target
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of reviews in the dataset."""
    return len(self.text)

  def __getitem__(self, item):
    """Return the raw text, encoded inputs and label for position `item`."""
    # Coerce to str so non-string entries (e.g. NaN) can still be tokenized.
    text = str(self.text[item])

    encoding = self.tokenizer.encode_plus(
      text,
      max_length=self.max_len,
      add_special_tokens=True,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_token_type_ids=False,
      return_tensors='pt'
    )

    return {
        'text': text,
        # The tokenizer returns (1, max_len) tensors; flatten to (max_len,) so
        # the DataLoader can stack items into (batch, max_len).
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'targets': torch.tensor(self.target[item], dtype=torch.long)
    }

In [None]:
# Sequence length every review is padded/truncated to when tokenized.
MAX_LEN = 128
# Number of examples per DataLoader batch.
BATCH_SIZE = 8
# Upper bound on training epochs; the training loop may stop earlier.
EPOCHS = 20

In [None]:
# Train/validation/test split: hold out 20% of the rows, then split that
# hold-out in half into validation and test sets. Both splits use RANDOM_SEED
# so they are repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

In [None]:
df_train.shape, df_val.shape, df_test.shape

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
    """Wrap a reviews DataFrame in a PTDataset and return a DataLoader over it.

    Args:
        df: frame with a 'text' column (review text) and a 'label' column (class id).
        tokenizer: tokenizer handed to PTDataset.
        max_len: sequence length handed to PTDataset.
        batch_size: number of examples per batch.
        shuffle: whether the loader reshuffles the data every epoch.
        num_workers: number of DataLoader worker processes (default 2, as before).

    Returns:
        A DataLoader yielding the dict batches produced by PTDataset.
    """
    ds = PTDataset(
        text=df['text'].to_numpy(),
        target=df['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    # Use the fully-qualified name: the notebook later rebinds the bare name
    # `data` to a sample batch, after which `data.DataLoader` would fail here.
    return torch.utils.data.DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle
    )

In [None]:
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
val_data_loader   = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=False)
test_data_loader  = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=False)

In [None]:
# Pull a single batch from the training loader to inspect what it yields.
# NOTE(review): this rebinds `data`, which until this cell referred to the
# `torch.utils.data` module imported earlier. Any cell that uses
# `data.Dataset` / `data.DataLoader` will fail if it is run after this one.
data = next(iter(train_data_loader))
data.keys()

In [None]:
print(data['input_ids'])
data['input_ids'].shape

In [None]:
print(data['attention_mask'])
data['attention_mask'].shape

In [None]:
data['targets'].shape

## Loading BERT Model

In [None]:
from transformers import BertModel
bert_model = BertModel.from_pretrained("bert-base-uncased")

In [None]:
# Inspection-only forward pass on the sample batch: no gradients are needed,
# so run under no_grad to avoid building (and holding) an autograd graph.
with torch.no_grad():
    outputs = bert_model(
        input_ids=data["input_ids"],
        attention_mask=data["attention_mask"]
    )

In [None]:
# last_hidden_state is the output of the final encoder layer, with shape
# (batch, seq_len, hidden_size): one 768-dim vector per token. Each token's
# vector is contextualised by every other token in the review, so this is
# BERT's final, deepest representation of each token. These contextual token
# embeddings are what BERT's first pre-training task (bidirectional, masked
# language modelling) trains.
#
# The explanation is a comment rather than a trailing string literal so that
# the shape below stays the cell's last expression and is what gets displayed.
last_hidden_state = outputs.last_hidden_state
last_hidden_state.shape

In [None]:
# pooler_output has shape (batch, hidden_size). It takes the [CLS] embedding,
# last_hidden_state[:, 0, :], and passes it through a linear layer followed by
# a tanh activation. It represents the whole sequence (sentence/review-level
# meaning) and is designed for classification tasks. It exists because BERT
# used [CLS] for next-sentence prediction, its second pre-training task.
#
# The explanation is a comment rather than a trailing string literal so that
# the shape below stays the cell's last expression and is what gets displayed.
pooled_output = outputs.pooler_output
pooled_output.shape

## Building the Sentiment Classifier

In [None]:
class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear layer that scores each class.

  Args:
    n_classes: number of output classes (width of the final linear layer).
    pretrained_name: checkpoint name passed to `BertModel.from_pretrained`.
    dropout: drop probability applied to the pooled output before the linear layer.
  """

  def __init__(self, n_classes, pretrained_name="bert-base-uncased", dropout=0.3):
    super().__init__()
    self.bert = BertModel.from_pretrained(pretrained_name)
    self.drop = nn.Dropout(p=dropout)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    # Not applied in forward(); kept so code that accesses `model.softmax`
    # keeps working.
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, attention_mask):
    """Return unnormalised class scores (logits), one row per input sequence.

    The scores are NOT probabilities: apply a softmax over dim 1 to obtain
    them. Raw scores are what `nn.CrossEntropyLoss` expects.
    """
    outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )
    pooled_output = outputs.pooler_output
    output = self.drop(pooled_output)
    output = self.out(output)
    return output

In [None]:
model = SentimentClassifier(len(df['label'].unique()))
model = model.to(device)

In [None]:
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape)
print(attention_mask.shape)

In [None]:
model(input_ids, attention_mask)

### Training

In [None]:
from transformers import get_linear_schedule_with_warmup

# Adam over every parameter of the classifier (encoder and output layer alike).
optimizer = optim.Adam(model.parameters(), lr=2e-5)
# The scheduler is stepped once per batch in train_epoch, so the schedule
# length is batches-per-epoch times the maximum number of epochs.
total_steps = len(train_data_loader) * EPOCHS
# Learning-rate schedule from transformers, configured with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Loss applied to the raw scores returned by the model.
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Run one optimisation pass over `data_loader`.

  Args:
    model: module called as `model(input_ids=..., attention_mask=...)`.
    data_loader: iterable of dict batches with 'input_ids', 'attention_mask'
      and 'targets' tensors.
    loss_fn: callable taking (model outputs, targets) and returning the loss.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: number of examples the loader covers (accuracy denominator).

  Returns:
    (accuracy, mean_loss): accuracy is a 0-dim float64 tensor; mean_loss is
    the mean of the per-batch losses (NaN if the loader yielded no batches).
  """
  model = model.train()
  losses = []
  # Tensor accumulator so that an empty loader still returns a tensor instead
  # of failing on `int.double()`.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  for d in data_loader:
    input_ids = d['input_ids'].to(device)
    attention_mask = d['attention_mask'].to(device)
    targets = d['targets'].to(device)

    # Clear gradients BEFORE the backward pass: anything left on the
    # parameters (e.g. from an interrupted earlier run of this cell) must not
    # leak into this update.
    optimizer.zero_grad()

    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
    )

    # Predicted class = index of the highest score in each row.
    _, pred = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(pred == targets)
    losses.append(loss.item())

    loss.backward()

    # Gradient clipping to avoid gradient explosion
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()

  mean_loss = np.mean(losses) if losses else float('nan')
  return correct_predictions.double() / n_examples, mean_loss

In [None]:
from sklearn.metrics import f1_score

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` on `data_loader` without updating any weights.

  Args:
    model: module called as `model(input_ids=..., attention_mask=...)`.
    data_loader: iterable of dict batches with 'input_ids', 'attention_mask'
      and 'targets' tensors.
    loss_fn: callable taking (model outputs, targets) and returning the loss.
    device: device the batch tensors are moved to.
    n_examples: number of examples the loader covers (accuracy denominator).

  Returns:
    (accuracy, mean_loss, macro_f1): accuracy is a 0-dim float64 tensor;
    mean_loss is the mean of the per-batch losses; macro_f1 is the
    macro-averaged F1 over all examples. mean_loss and macro_f1 are NaN if
    the loader yielded no batches.
  """
  model = model.eval()
  losses = []
  # Tensor accumulator so that an empty loader still returns a tensor instead
  # of failing on `int.double()`.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)
  all_preds = []
  all_targets = []

  with torch.no_grad():
    for d in data_loader:
      input_ids = d['input_ids'].to(device)
      attention_mask = d['attention_mask'].to(device)
      targets = d['targets'].to(device)

      outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask
      )

      # Predicted class = index of the highest score in each row.
      _, pred = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, targets)
      all_preds.extend(pred.cpu().numpy())
      all_targets.extend(targets.cpu().numpy())
      correct_predictions += torch.sum(pred == targets)
      losses.append(loss.item())

  # Score once over the full set of predictions. Recomputing inside the loop
  # rescored every earlier batch again and left macro_f1 unbound when the
  # loader was empty.
  if all_targets:
    macro_f1 = f1_score(all_targets, all_preds, average="macro")
  else:
    macro_f1 = float('nan')
  mean_loss = np.mean(losses) if losses else float('nan')

  return correct_predictions.double() / n_examples, mean_loss, macro_f1

In [None]:
if torch.cuda.is_available():
    print("CUDA is available! Using GPU.")
    print(f"Current CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("CUDA is not available. Using CPU.")

In [None]:
%%time

from collections import defaultdict
history = defaultdict(list)
best_accuracy = 0

best_f1 = -1
patience = 3
patience_counter = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
      model,
      train_data_loader,
      loss_fn,
      optimizer,
      device,
      scheduler,
      len(df_train)
  )

  print(f'Train loss: {train_loss}, accuracy: {train_acc}')

  val_acc, val_loss, val_f1 = eval_model(
      model,
      val_data_loader,
      loss_fn,
      device,
      len(df_val)
  )

  print(f'Val loss: {val_loss}, accuracy: {val_acc}, macroF1: {val_f1}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  # if val_acc > best_accuracy:
  #   torch.save(model.state_dict(), 'best_model_state.bin')
  #   best_accuracy = val_acc
  if val_f1 > best_f1:
    torch.save(model.state_dict(), "best_model_state.bin")
    best_f1 = val_f1
    patience_counter = 0
  else:
    patience_counter += 1
    if patience_counter >= patience:
        print(f"Early stoppage: no improvement in macro F1 for {patience} epochs.")
        break

## Evaluation

In [None]:
# Class names, listed in label-id order (0, 1, 2).
class_names = ['negative', 'neutral', 'positive']
model = SentimentClassifier(len(class_names))
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))
model = model.to(device)

In [None]:
def get_reviews(model, data_loader, device=None):
    """Run `model` over `data_loader` and collect per-review predictions.

    Args:
        model: module called as `model(input_ids=..., attention_mask=...)`
            that returns one row of class scores per example.
        data_loader: iterable of dict batches with 'text', 'input_ids',
            'attention_mask' and 'targets' entries.
        device: device the inputs are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Returns:
        (review_texts, predictions, prediction_probs, real_values):
        the review texts as a list, then CPU tensors holding the predicted
        class ids (N,), the class probabilities (N, n_classes) and the true
        class ids (N,).
    """
    model = model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    review_texts = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # The model returns raw scores; softmax turns each row into
            # probabilities that sum to 1, which is what callers plot.
            probs = torch.softmax(outputs, dim=1)
            _, preds = torch.max(outputs, dim=1)

            review_texts.extend(d['text'])
            pred_batches.append(preds.cpu())
            prob_batches.append(probs.cpu())
            target_batches.append(d['targets'].cpu())

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(target_batches)

    return review_texts, predictions, prediction_probs, real_values

In [None]:
test_acc, test_loss, _ = eval_model(model, test_data_loader, loss_fn, device, len(df_test))

In [None]:
test_acc

### Accuracy

Accuracy was about 71%, which is not bad at all given how inherently noisy reviews can be. Take, for example, "Food was ok, drinks were really good, but prices are fair.": even for a human it can be difficult to say "Yes, this is definitely this sentiment class".

In [None]:
y_review_texts, y_pred, y_pred_probs, y_test = get_reviews(model, test_data_loader)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=class_names))

### Precision, Recall, F1

We can see that the model does a better job on the 'negative' and 'positive' classes than on 'neutral'. Given my example earlier, this makes sense: that review could reasonably be ruled either 'positive' or 'neutral', so the distinction can be hard even for humans.

In [None]:
def visualize_confusion_matrix(confusion_matrix):
    """Draw `confusion_matrix` as an annotated heatmap.

    Args:
        confusion_matrix: 2-D table of integer counts (e.g. a DataFrame whose
            index and columns carry the class names).
    """
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha="right")
    # Take the x tick labels from the x axis. They were previously copied from
    # the y axis, which mislabels the columns whenever rows and columns differ.
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=0, ha="right")
    # Assumes rows are true classes and columns are predictions, which is how
    # sklearn's confusion_matrix(y_true, y_pred) lays the table out.
    hmap.set_ylabel("True sentiment")
    hmap.set_xlabel("Predicted sentiment")

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
visualize_confusion_matrix(df_cm)

In [None]:
from textwrap import wrap

idx = 69
review_text = y_review_texts[idx]
true_sentiment = y_test[idx]

pred_df = pd.DataFrame(
    {
        'class_names': class_names,
        'values': y_pred_probs[idx]
    }
)

print("\n".join(wrap(review_text)))
print()
print(f'True Sentiment: {class_names[true_sentiment]}')

In [None]:
sns.barplot(x="values", y="class_names", data=pred_df, orient="h")
plt.ylabel('Sentiment')
plt.xlabel('Probability')
plt.xlim([0,1]);

## Predict Sentiment on Raw Text of our own

In [None]:
review_text = "I really like their chicken. It's so crispy and not dry!"

In [None]:
encoded_review = tokenizer.encode_plus(
    review_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='pt'
)

In [None]:
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

In [None]:
# Inference only: make sure dropout is disabled and skip gradient tracking.
model.eval()
with torch.no_grad():
    output = model(input_ids, attention_mask)
# Predicted class = index of the highest score.
_, pred = torch.max(output, dim=1)

In [None]:
print(f'Review Text: {review_text}')
print(f'Sentiment