In [3]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [4]:
import transformers
from transformers import BertTokenizer

In [5]:
from src.data_processing.process_labels import *
from src.data_processing.process_reviews import *
from src.data_processing.train_val_test import train_val_test
from src.models.model_evalaute import *

## Data processing
Get BERT encodings for the train and test set.

In [6]:
### DATA PROCESSING ###
# Load the raw review table from disk.
df = pd.read_csv('data/raw_reviews/reviews_v1.csv')

# Pull out the review text (model input) and the two raw label columns.
X = df['text']
food_labels = df['food']
service_labels = df['service']

# Combine the food/service columns into the final label array.
# NOTE(review): label_generator comes from a star import; the exact label encoding
# produced by trim_and_fetch_labels() is not visible here — confirm against process_labels.
y = label_generator(
    food_labels=food_labels.values,
    service_labels=service_labels.values,
).trim_and_fetch_labels()

In [7]:
# Split the reviews and the labels using identical fractions (0.8 / 0.2 / 0).
# NOTE(review): X and y are split by two independent calls. The (review, label) pairs only
# stay aligned if train_val_test picks the same rows on every call (i.e. no unseeded
# shuffling) — confirm against the helper's implementation.
# NOTE(review): the second returned split is bound to *_test although it is requested via
# val_frac=0.2 with test_frac=0 — presumably the validation split doubles as the test set; verify.
X_train, X_test, _ = train_val_test(data=X, train_frac=0.8, val_frac=0.2, test_frac=0)
y_train, y_test, _ = train_val_test(data=y, train_frac=0.8, val_frac=0.2, test_frac=0)

In [9]:
# Build BERT encodings for both splits using the pretrained uncased tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _encode(texts):
    """Tokenize an iterable of review strings with truncation and padding, returning PyTorch tensors."""
    return tokenizer(list(texts), truncation=True, padding=True, return_tensors='pt')


train_encodings = _encode(X_train)
test_encodings = _encode(X_test)

## Load the saved fine-tuned BERT model

In [10]:
from src.models.model_zoo import *
from src.models.model_train import *

# Instantiate the model architecture (its fine-tuned weights are loaded from disk in a later cell)
bert_model = BERTClass()

In [11]:
# Display the model's parameter count (count_parameters is provided by one of the star imports above)
count_parameters(bert_model)

109485316

In [12]:
# Load the fine-tuned weights into the freshly instantiated model.
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a CPU-only machine;
# the rest of this notebook runs the model on CPU (outputs are converted with .numpy() directly).
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
state_dict = torch.load('src/models/saved_models/bert_fine_tuned.pt', map_location='cpu')

# Switch to evaluation mode (disables dropout) before running any inference.
bert_model.eval()

# Kept as the last expression so the key-matching report is displayed as the cell output.
bert_model.load_state_dict(state_dict)

<All keys matched successfully>

## Sanity check: try made-up reviews

In [13]:
# Hand-written review used as a smoke test for the loaded model.
# The tokenizer is called with a list of strings, so the review is wrapped in a one-element list.
input_list = ["Their customer service is horrible!"]
# NOTE(review): `input` shadows the Python builtin of the same name for the rest of the
# kernel session; kept because other cells may reference it — consider renaming.
input = input_list[0]

In [14]:
# Encode the single made-up review with the same tokenizer settings used for the train/test sets.
tokenized_input = tokenizer(input_list,truncation=True, padding=True, return_tensors='pt')

In [15]:
# Unpack the three tensors the model's forward pass takes, in call order.
ids, mask, token_type_ids = [
    tokenized_input[key]
    for key in ('input_ids', 'attention_mask', 'token_type_ids')
]


In [16]:
# Get BERT prediction output (per-class probabilities) for the made-up review.
# This is inference only, so run the forward pass under no_grad to avoid building
# an autograd graph that is never used.
with torch.no_grad():
    sample_probs = bert_model(ids, mask, token_type_ids)

# Last expression of the cell: display the probabilities.
sample_probs

tensor([[0.0116, 0.0081, 0.9660, 0.0143]], grad_fn=<SoftmaxBackward0>)

In [20]:
# Index of the highest-probability class for each input row (here a single review).
np.argmax(bert_model(ids, mask, token_type_ids).detach().numpy(), axis=1)

array([2], dtype=int64)

## Evaluate performance on test set

In [24]:
# Rebind the model-input names to the test-set tensors.
# NOTE: this overwrites the single-review tensors from the sanity check above.
ids, mask, token_type_ids = [
    test_encodings[key]
    for key in ('input_ids', 'attention_mask', 'token_type_ids')
]

In [None]:
# Get softmax probabilities for test set.
# Running the whole test set through BERT in one forward pass (with autograd enabled) can
# exhaust memory, so score it in fixed-size batches under no_grad and stitch the results
# back together. The inputs were already padded to a common length when they were encoded,
# so slicing rows does not change what the model sees for each review.
EVAL_BATCH_SIZE = 32  # rows per forward pass; lower this if memory is tight

with torch.no_grad():
    y_scores = torch.cat([
        bert_model(
            ids[start:start + EVAL_BATCH_SIZE],
            mask[start:start + EVAL_BATCH_SIZE],
            token_type_ids[start:start + EVAL_BATCH_SIZE],
        )
        for start in tqdm(range(0, ids.shape[0], EVAL_BATCH_SIZE))
    ])