# Finetune a BERT model to classify reviews in HuggingFace

# NOTEBOOK DEFUNCT: SEE 3_finetune_bert.py
The dataprocessing step remains the same and is useful for understanding the data pipeline

## Data Processing

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

### Read data and get labels

In [2]:
from src.data_processing.process_labels import *

In [3]:
# Read data
df = pd.read_csv('data/raw_reviews/reviews_v1.csv')
# Separate reviews and labels
X = df.text # review text
food_labels = df.food
service_labels = df.service

In [5]:
y = label_generator(food_labels=food_labels.values, 
                    service_labels=service_labels.values).trim_and_fetch_labels()

In [6]:
# Trim reviews to size of labels (y)
X = X[:len(y)].copy()

In [7]:
len(X), len(y)

(1000, 1000)

### Train/validate/test split

In [7]:
from src.data_processing.train_val_test import train_val_test

In [8]:
X_train, X_test, _ = train_val_test(data=X, train_frac=0.8, val_frac=0.2, test_frac=0)
y_train, y_test, _ = train_val_test(data=y, train_frac=0.8, val_frac=0.2, test_frac=0)

### Tokenize reviews using BERT tokenizer

In [20]:
import transformers
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [10]:
# Load Bert tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [11]:
# Get Bert encodings
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, return_tensors='pt')

In [12]:
# Create custom PyTorch datasets
class CustomDataset(Dataset):
    def __init__(self, encodings, targets):
        """
        Params: 
        encodings -- dictionary, contains 'input_ids', 'token_type_ids', 'attention_mask'
        targets -- Pytorch tensor of shape (# reviews, 4), one-hot labels
        """
        self.input_ids = encodings['input_ids'] # tensor of shape (# reviews, max review length)
        self.token_type_ids = encodings['token_type_ids'] # tensor of shape (# reviews, max review length)
        self.attention_mask = encodings['attention_mask'] # tensor of shape (# reviews, max review length)
        self.targets = targets # tensor of shape (# reviews, 4)
        return
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return {
            'ids' : self.input_ids[index, :],
            'mask' : self.attention_mask[index, :],
            'token_type_ids' : self.token_type_ids[index, :],
            'targets' : self.targets[index, :]
        }


In [13]:
# Create training and testing PyTorch datasets
training_set = CustomDataset(train_encodings, y_train)
testing_set = CustomDataset(test_encodings, y_test)

In [14]:
# Create PyTorch dataloaders
train_params = {'batch_size': 100,
                'shuffle': True}

test_params = {'batch_size': 100,
                'shuffle': True}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

## Finetune BERT model

In [23]:
from src.models.model_zoo import *
from torch import optim

In [24]:
# instantiate model
bert_model = BERTClass()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(bert_model.parameters(), lr=0.0001)

In [30]:
def bert_train_one_epoch(bert_model, criterion, optimizer, dataloader_train):
    """ 
    Fine tune BERT model for sequence classification for ONE EPOCH
    Params:
    bert_model -- custom torch model to finetune BERT
    criterion -- torch criterion object to evaluate loss
    optimizer -- torch optimizer object
    dataloader_train -- torch dataloader for train set, each minibatch is a dictionary containing 
                        ['ids', 'mask', 'token_type_ids', 'targets']
    """
    bert_model.train()
    total_loss = 0  #initialize total loss
    for data in dataloader_train: # data is a dictionary
        optimizer.zero_grad()

        # cast data types
        ids = data['ids'].to(dtype = torch.long)
        mask = data['mask'].to(dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(dtype = torch.long)
        targets = data['targets'].to(dtype = torch.float)

        # forward pass
        outputs = bert_model(ids, mask, token_type_ids)

        # calculate loss over batch
        loss = criterion(outputs, targets)

        # backward pass and optimization step
        loss.backward() # find gradients
        optimizer.step() # updating parameters

        # update total loss
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader_train) # average over number of minibatches
    return avg_loss

In [None]:
def bert_val_one_epoch(bert_model, criterion, dataloader_test):
    """ 
    Validate fine tuning BERT model for ONE EPOCH
    Params:
    bert_model -- custom torch model to finetune BERT
    criterion -- torch criterion object to evaluate loss
    dataloader_test -- torch dataloader for test set, each minibatch is a dictionary containing 
                        ['ids', 'mask', 'token_type_ids', 'targets']
    """
    bert_model.eval()
    with torch.no_grad():
        total_val_loss = 0
        for data in dataloader_test:
            # cast data types
            ids = data['ids'].to(dtype = torch.long)
            mask = data['mask'].to(dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(dtype = torch.long)
            val_targets = data['targets'].to(dtype = torch.float)

            val_outputs = bert_model(ids, mask, token_type_ids)
            val_loss = criterion(val_outputs, val_targets)
            total_val_loss += val_loss.item()

        avg_val_loss = total_val_loss / len(dataloader_test)
    
    return avg_val_loss

In [31]:
bert_train_one_epoch(bert_model=bert_model, criterion=criterion, optimizer=optimizer, dataloader_train=training_loader)

RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1258291200 bytes.