In [None]:
# Mount Google Drive at /content/drive/ so the project files stored on Drive are reachable from this runtime
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Install the transformers library
!pip install transformers



In [None]:
# Make the project folder on Drive the working directory
%cd /content/drive/MyDrive/CS5344_Project_L/

/content/drive/MyDrive/CS5344_Project_L


In [None]:
import os
# Display the current working directory to confirm the %cd above took effect
os.getcwd()

'/content/drive/MyDrive/CS5344_Project_L'

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

# Torch ML libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Misc.
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the train / test / validation splits from the project's Dataset folder.
# The folder is named once so the three paths cannot drift apart
# (two of the original paths carried a stray doubled slash).
DATASET_DIR = '/content/drive/MyDrive/CS5344_Project_L/Dataset'
train_df = pd.read_csv(f'{DATASET_DIR}/train.csv')
test_df = pd.read_csv(f'{DATASET_DIR}/test.csv')
val_df = pd.read_csv(f'{DATASET_DIR}/val.csv')

In [None]:
# Preview the first rows of the training split
train_df.head()

Unnamed: 0,asin,overall,vote,reviewTime,reviewerID,reviewerName,reviewText,summary,label
0,B00005MOZG,1.0,0.0,"01 27, 2008",A2QQOKUF4XWHFC,Mirian Vandama,i got the speakers for the laptop i was hoping...,Disappointed!,0.0
1,B00005N6KG,3.0,0.0,"01 18, 2012",A2BGBYTKP2PWNQ,ES,i use mine every day the only problem with the...,Breaks easily,1.0
2,B00005B8M3,1.0,28.0,"11 10, 2006",A1X8JQMZF1WJR,Henry J. Eichman,i have no idea why amazon recommends this for ...,Garmin C330 adapter 010-10085-00 cigarette lig...,0.0
3,B000062VUQ,5.0,0.0,"01 25, 2014",A38RDAV40TAD6O,Amazon Customer,the 1 star reviews for these are from people w...,All the loser reviews...,2.0
4,B00005BKZZ,3.0,23.0,"05 30, 2001",A231WM2Z2JL0U3,Rheumor,series camera and do not have a battery rechar...,If you've bought a Kodak DX.........,1.0


In [None]:
# Set constants for model and data processing
MAX_LEN = 512                      # token length every review is padded / truncated to by the dataset
BATCH_SIZE = 16                    # batch size handed to every DataLoader
model_name = 'bert-base-uncased'   # pre-trained checkpoint used for both the tokenizer and the BERT encoder

In [None]:
# Load the BERT tokenizer that matches `model_name` ('bert-base-uncased').
# `model_max_length` is the tokenizer option that sets the maximum sequence length;
# it is tied to MAX_LEN so the limit is defined in exactly one place.
tokenizer = BertTokenizer.from_pretrained(model_name, model_max_length=MAX_LEN)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Dataset that tokenizes review texts on demand for BERT
class AmazonReviewDataset(Dataset):
    """Map-style dataset pairing review texts with their labels.

    Each item is tokenized lazily in ``__getitem__`` and padded / truncated to
    ``max_len`` tokens.

    Args:
        reviews: Indexable collection of review texts (items are passed through ``str``).
        targets: Indexable collection of labels, aligned with ``reviews``.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Sequence length every encoded review is padded / truncated to.
    """

    # Constructor Function
    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    # Length magic method
    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    # get item magic method
    def __getitem__(self, item):
        """Encode the review at position ``item``.

        Returns:
            dict with keys ``review_text`` (str), ``input_ids`` and
            ``attention_mask`` (flattened tensors from the tokenizer) and
            ``targets`` (the label as a ``torch.long`` tensor).
        """
        # str() keeps non-string entries (e.g. missing values) from breaking the tokenizer
        review = str(self.reviews[item])
        target = self.targets[item]

        # Encoded format to be returned
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': review,
            # return_tensors='pt' yields a leading batch axis; flatten it away for a single sample
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [None]:
# Define a function to create a data loader for the AmazonReviewDataset
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
    """Wrap a review DataFrame in an AmazonReviewDataset and return a DataLoader over it.

    Args:
        df: DataFrame exposing ``reviewText`` and ``label`` columns.
        tokenizer: Tokenizer forwarded to the dataset.
        max_len: Sequence length forwarded to the dataset.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the data on every pass.
            Defaults to False, which keeps the previous (ordered) behaviour;
            pass True for a training loader.
        num_workers: Number of loader worker processes (default 2, as before).

    Returns:
        A ``DataLoader`` yielding the dict batches produced by the dataset.
    """
    ds = AmazonReviewDataset(
        reviews=df.reviewText.to_numpy(),
        targets=df.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers
    )

In [None]:

# Create data loaders for the train, validation and test pandas DataFrames.
# No shuffling is requested here, so each loader iterates its DataFrame in row order.
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# Number of examples in the training dataset
print(len(train_data_loader.dataset))

36330


In [None]:
# Examples: pull the first batch from the training loader and list its keys
data = next(iter(train_data_loader))
print(data.keys())

# Shapes of the batched tensors, followed by the raw token ids
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)
print(data['input_ids'])

dict_keys(['review_text', 'input_ids', 'attention_mask', 'targets'])
torch.Size([16, 512])
torch.Size([16, 512])
torch.Size([16])
tensor([[ 101, 1045, 2288,  ...,    0,    0,    0],
        [ 101, 1045, 2224,  ...,    0,    0,    0],
        [ 101, 1045, 2031,  ...,    0,    0,    0],
        ...,
        [ 101, 1045, 2031,  ...,    0,    0,    0],
        [ 101, 4921, 2063,  ...,    0,    0,    0],
        [ 101, 1045, 2031,  ...,    0,    0,    0]])


In [None]:
# Load pre-trained BERT model as a standalone object.
# NOTE(review): SentimentClassifier loads its own encoder, so in the cells visible here this copy
# is only used to print the hidden size.
bert_model = BertModel.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Sentiment classifier: pre-trained BERT encoder, dropout, then a linear output layer
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output units of the final linear layer.
    """

    def __init__(self, n_classes):
        super().__init__()
        # Encoder weights come from the checkpoint named by the notebook-level `model_name`
        self.bert = BertModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.5)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and classify its pooled output.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The output of the linear layer applied to the dropped-out ``pooler_output``.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.out(self.drop(encoded.pooler_output))

In [None]:
# Human-readable sentiment class names
# NOTE(review): presumably list position matches the integer label (0, 1, 2) — confirm against the dataset
class_names = ['negative', 'neutral', 'positive']

In [None]:
# Random seed for reproducibility (NumPy and PyTorch are seeded with the same value)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Instantiate the three-class classifier and move it to the selected device
model = SentimentClassifier(n_classes = 3)
model = model.to(device)

In [None]:
# Hidden size reported by the configuration of the standalone BERT model
print(bert_model.config.hidden_size)

768


In [None]:
# Number of training epochs
EPOCHS = 10

# Optimizer: AdamW over all model parameters
# NOTE(review): this AdamW is the one imported from transformers; it is reportedly deprecated upstream
# in favour of torch.optim.AdamW — confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# train_epoch steps the scheduler once per batch, so the schedule must span every batch of every epoch
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule with no warm-up steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Set the loss function
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
# Function for a single training epoch
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a double-precision tensor
        (correct predictions / ``n_examples``); mean_loss is the mean of the
        per-batch losses (NaN when the loader yields no batch).
    """
    model = model.train()
    losses = []
    # A tensor accumulator keeps `.double()` valid even when the loader yields no batch
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # Drop any gradients left behind by an earlier, interrupted run so they
    # cannot leak into the first optimizer step of this epoch
    optimizer.zero_grad()

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Predicted class = index of the largest output along dim 1
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        # Backward prop
        loss.backward()

        # Gradient Descent: clip the gradient norm to 1.0 before the update
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss


In [None]:
# Evaluate the model on a data loader without tracking gradients
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Compute accuracy and mean loss of ``model`` over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a double-precision tensor
        (correct predictions / ``n_examples``); mean_loss is the mean of the
        per-batch losses.
    """
    model = model.eval()

    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["targets"].to(device)

            logits = model(input_ids=ids, attention_mask=mask)

            # Predicted class = index of the largest output along dim 1
            predicted = torch.max(logits, dim=1).indices

            batch_losses.append(loss_fn(logits, labels).item())
            n_correct += (predicted == labels).sum()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [None]:
#  Define a function to save model checkpoints
def save_checkpoint(model, optimizer, epoch, train_loss, train_acc, val_loss, val_acc,
                    checkpoint_dir='/content/drive/MyDrive/CS5344_Project_L/checkpoint/'):
    """Save model / optimizer state and the epoch's metrics to ``checkpoint_epoch_{epoch}.pth``.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch number; stored in the checkpoint and used in the file name.
        train_loss, train_acc, val_loss, val_acc: Scalar metrics for the epoch
            (Python numbers, NumPy scalars or 0-dim tensors).
        checkpoint_dir: Directory the file is written to; created if missing.
            Defaults to the project's checkpoint folder on Drive.
    """
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # Metrics are stored as plain floats so the file does not embed
        # device-bound tensors or NumPy scalar objects
        'train_loss': float(train_loss),
        'train_acc': float(train_acc),
        'val_loss': float(val_loss),
        'val_acc': float(val_acc)
    }

    # torch.save does not create missing directories
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pth')
    torch.save(checkpoint, checkpoint_path)
    print(f"Checkpoint saved: {checkpoint_path}")

In [None]:
# Directory that saved checkpoints are loaded from when resuming training
checkpoint_dir = '/content/drive/MyDrive/CS5344_Project_L/checkpoint/'

In [None]:
# Set to True to resume training from a previously saved checkpoint
load_checkpoint = False
# Checkpoint filename to load if load_checkpoint = True
checkpoint_file_to_load = 'checkpoint_epoch_2.pth'

# Load checkpoint if specified
if load_checkpoint:
    # map_location=device remaps the stored tensors onto whichever device this
    # runtime selected, so a checkpoint written on a GPU also loads on a
    # CPU-only runtime. (Comparing a torch.device against the string 'cpu'
    # is never true, so it cannot be used to choose the mapping.)
    checkpoint = torch.load(
        os.path.join(checkpoint_dir, checkpoint_file_to_load),
        map_location=device
    )

    print('Loaded {} checkpoint...'.format(checkpoint_file_to_load))
    # Only model and optimizer state are restored here; the learning-rate
    # scheduler is not restored by this cell.
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch_to_start = checkpoint['epoch'] + 1
    print('Will start training from epoch {} onwards...'.format(epoch_to_start))
else:
    # Fresh run: epochs are numbered starting from 1
    epoch_to_start = 1

In [None]:
%%time
# Training
history = defaultdict(list)
best_val_acc = 0

for epoch in range(epoch_to_start, EPOCHS):
    # Show details
    print(f"Epoch {epoch}/{EPOCHS}")
    print("-" * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train_df)
    )


    # Get model performance (accuracy and loss)
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(val_df)
    )

   # Print or log training and validation metrics
    print(f"Train loss: {train_loss}, Train accuracy: {train_acc}")
    print(f"Validation loss: {val_loss}, Validation accuracy: {val_acc}")

    # Save checkpoint if validation accuracy improves
    save_checkpoint(model, optimizer, epoch, train_loss, train_acc, val_loss, val_acc)

Epoch 1/10
----------
Train loss: 0.40243272053734785, Train accuracy: 0.8425268373245252
Validation loss: 0.5762602985584857, Validation accuracy: 0.7774395399966176
Checkpoint saved: /content/drive/MyDrive/CS5344_Project_L/checkpoint/checkpoint_epoch_1.pth
Epoch 2/10
----------
Train loss: 0.27629117380506973, Train accuracy: 0.9067712634186623
Validation loss: 0.7782654498143726, Validation accuracy: 0.769660071029934
Checkpoint saved: /content/drive/MyDrive/CS5344_Project_L/checkpoint/checkpoint_epoch_2.pth
Epoch 3/10
----------
Train loss: 0.20153439454154043, Train accuracy: 0.9416184971098266
Validation loss: 0.9580456946880519, Validation accuracy: 0.7770167427701674
Checkpoint saved: /content/drive/MyDrive/CS5344_Project_L/checkpoint/checkpoint_epoch_3.pth
Epoch 4/10
----------
Train loss: 0.1591280746301993, Train accuracy: 0.956785026149188
Validation loss: 1.090356416920376, Validation accuracy: 0.7755792322002367
Checkpoint saved: /content/drive/MyDrive/CS5344_Project_L/ch