# Fine-tune a BERT model to classify reviews with Hugging Face

## Data Processing

In [46]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, TensorDataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

### Read data and get labels

In [15]:
from src.data_processing.process_labels import *

In [20]:
# Load the raw review table from disk
df = pd.read_csv('data/raw_reviews/reviews_v1.csv')
# Pull out the review text and the two label columns by name
X = df['text']  # review text
food_labels = df['food']
service_labels = df['service']

In [21]:
# Build the label generator from the raw food/service label arrays, then
# fetch the labels it produces.
# NOTE(review): presumably the result is trimmed to the annotated rows
# (the reviews are cut to len(y) afterwards) -- confirm in process_labels.
labeler = label_generator(food_labels=food_labels.values,
                          service_labels=service_labels.values)
y = labeler.trim_and_fetch_labels()

In [22]:
# Keep only as many reviews as there are labels so X and y line up row by row
n_labeled = len(y)
X = X.iloc[:n_labeled].copy()

In [23]:
# Sanity check: after trimming, reviews and labels should have the same length
len(X), len(y)

(1000, 1000)

### Train/validate/test split

In [None]:
from src.data_processing.train_val_test import train_val_test

In [29]:
# Use the same fractions for reviews and labels: 80% train, 20% requested via
# val_frac, nothing for test_frac. The 20% portion is bound to the *_test names.
# NOTE(review): X and y are split by two separate calls, so review/label pairs
# stay aligned only if train_val_test splits deterministically -- confirm.
split_fracs = {'train_frac': 0.8, 'val_frac': 0.2, 'test_frac': 0}
X_train, X_test, _ = train_val_test(data=X, **split_fracs)
y_train, y_test, _ = train_val_test(data=y, **split_fracs)

### Tokenize reviews using BERT tokenizer

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [26]:
# Load the pretrained BERT tokenizer for the chosen checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [32]:
# Tokenize both splits with identical settings and return PyTorch tensors.
# padding=True pads to the longest review within each call, so the train and
# test tensors may end up with different sequence lengths.
encode_kwargs = {'truncation': True, 'padding': True, 'return_tensors': 'pt'}
train_encodings = tokenizer(list(X_train), **encode_kwargs)
test_encodings = tokenizer(list(X_test), **encode_kwargs)

In [62]:
# Create custom PyTorch datasets
class CustomDataset(Dataset):
    """Map-style dataset pairing BERT tokenizer encodings with review labels."""

    def __init__(self, encodings, targets):
        """
        Params:
        encodings -- dictionary, contains 'input_ids', 'token_type_ids', 'attention_mask';
                     each value holds one row per review
        targets -- labels with one row per review (in this notebook: a PyTorch
                   tensor of shape (# reviews, 4), one-hot labels)

        Raises:
        ValueError -- if any encoding field or the targets do not have the same
                      number of rows as 'input_ids'
        """
        self.input_ids = encodings['input_ids']  # (# reviews, max review length)
        self.token_type_ids = encodings['token_type_ids']  # (# reviews, max review length)
        self.attention_mask = encodings['attention_mask']  # (# reviews, max review length)
        self.targets = targets  # one row of labels per review

        # Fail fast on misaligned inputs: otherwise the mismatch only shows up
        # as an IndexError mid-epoch, or extra targets are silently ignored.
        n_reviews = len(self.input_ids)
        for field_name, rows in (('token_type_ids', self.token_type_ids),
                                 ('attention_mask', self.attention_mask),
                                 ('targets', self.targets)):
            if len(rows) != n_reviews:
                raise ValueError(
                    f"{field_name} has {len(rows)} rows but input_ids has {n_reviews}"
                )

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """
        Return the sample at `index` as a dictionary with keys
        'ids', 'mask', 'token_type_ids' and 'targets'.
        """
        # Indexing only the first axis is equivalent to [index, :] for 2-D
        # data and additionally works when targets are 1-D (one label each).
        return {
            'ids': self.input_ids[index],
            'mask': self.attention_mask[index],
            'token_type_ids': self.token_type_ids[index],
            'targets': self.targets[index],
        }


In [63]:
# Wrap each split's encodings and labels in a CustomDataset
training_set = CustomDataset(encodings=train_encodings, targets=y_train)
testing_set = CustomDataset(encodings=test_encodings, targets=y_test)

In [68]:
# Create PyTorch dataloaders
# Training batches are reshuffled at every epoch.
train_params = {'batch_size': 100,
                'shuffle': True}

# Evaluation gains nothing from shuffling; a fixed order keeps predictions
# aligned with the rows of the held-out set and makes runs reproducible.
test_params = {'batch_size': 100,
               'shuffle': False}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)