<a href="https://colab.research.google.com/github/JRasmusBm/chatbot-
epsilon/blob/master/Trainer.ipynb" target="_parent"><img
src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In
Colab"/></a>

# This is the file in which we perform training of the NN

# Load
Data

## In Colab

In [1]:
#from google.colab import files
#uploaded = files.upload()
#file_name = "amazon_cells_labelled.txt"
#uploaded[file_name].decode("utf-8")

## Locally

In [2]:
data_folder = "../../data"
trained_models_folder = "../../trained_models"
file_name = f"{data_folder}/amazon_cells_labelled.txt"
json_file = f"{data_folder}/amazon_cells_labelled.json"

# Import code (from TA)

# Imports

In [3]:
from torchtext import data
import torch
import torch.nn as nn
import json
import time
import numpy as np
from transformers import BertModel, BertTokenizer

# Extract Data

First, we create lists of labels and sentences. The indices in
the one
correspond to those in the other. Due to restrictions in torchtext,
write it as
json to disk.

In [4]:
with open(file_name) as f:
    contents = f.read()
labels = []
sentences = []
for line in (l for l in contents.split("\n") if l):
    labels.append(int(line[-1]))
    sentence = str.strip(line[:-1])
    while len(sentence.split(" ")) < 5:
        sentence += " a"
    sentences.append(sentence)
data_json = [
    dict(label=label, text=text) for label, text in zip(labels, sentences)
]
with open(json_file, "w") as f:
    text = "\n".join(json.dumps(line) for line in data_json)
    f.write(text)

## Validate

In [5]:
with open(json_file) as f:
    json_written = [json.loads(line) for line in f.read().split("\n")]
    for line in json_written:
        if line["label"] not in [0, 1]:
            print(line)
        if len(line["text"].split(" ")) < 5:
            print(line)

# Generate Torchtext Dataset

In [6]:
def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(" ".join(n_gram))
    return x

In [7]:
import random

from IPython.core.debugger import set_trace
from torch.utils.data.dataset import random_split

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# help(dataset)

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

init_token_index = tokenizer.cls_token_id
end_of_string_token_index = tokenizer.sep_token_id
padding_token_index = tokenizer.pad_token_id
unknown_token_index = tokenizer.unk_token_id

In [9]:
max_input_length = tokenizer.max_model_input_sizes["bert-base-uncased"]

TEXT = data.Field(
    batch_first=True,
    use_vocab=False,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=init_token_index,
    eos_token=end_of_string_token_index,
    pad_token=padding_token_index,
    unk_token=unknown_token_index,
)
LABEL = data.LabelField(dtype=torch.float)

fields = dict(text=("text", TEXT), label=("label", LABEL),)

dataset = data.TabularDataset(path=json_file, format="json", fields=fields,)

In [10]:
training_data, test_data, validation_data = dataset.split(
    split_ratio=[0.7, 0.2, 0.1], random_state=random.seed(SEED)
)

In [11]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = (tokens[: max_input_length - 2],)
    return tokens

## Validate

In [12]:
print(f"Length (Training Data): {len(training_data)}")
print(f"Length (Test Data): {len(test_data)}")
print(f"Length (Validation Data): {len(validation_data)}")

Length (Training Data): 700
Length (Test Data): 100
Length (Validation Data): 200


# Build Vocab

In [13]:
MAX_VOCAB_SIZE = 25_000

LABEL.build_vocab(training_data)

## Validate

In [14]:
print(f"Unique tokens in TEXT vocabulary: {len(tokenizer.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 30522
Unique tokens in LABEL vocabulary: 2


# Create Iterators

In [15]:
BATCH_SIZE = 64

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

training_iterator, validation_iterator, test_iterator = data.BucketIterator.splits(
    (training_data, validation_data, test_data),
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True,
    device = device)

# Build Model

In [16]:
bert = BertModel.from_pretrained("bert-base-uncased")

class BERTGRUSentiment(nn.Module):
    def __init__(
        self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout
    ):
        super().__init__()
        self.bert = bert
        embedding_dim = bert.config.to_dict()["hidden_size"]
        self.rnn = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=0 if n_layers < 2 else dropout,
        )
        self.out = nn.Linear(
            hidden_dim * 2 if bidirectional else hidden_dim, output_dim
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        with torch.no_grad():
            embedded = bert(text)[0]
        _, hidden = self.rnn(embedded)
        if self.rnn.bidirectional:
            hidden = self.dropout(
                torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
            )
        else:
            hidden = self.dropout(hidden[-1, :, :])
        output = self.out(hidden)
        return output

# Instantiate Model

In [17]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(bert,
    HIDDEN_DIM,
    OUTPUT_DIM,
    N_LAYERS,
    BIDIRECTIONAL,
    DROPOUT,
)

## Validate

In [18]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


# Freeze Parameters

In [19]:
for name, param in model.named_parameters():
  if name.startswith("bert"):
    param.requires_grad = False

## Validate

In [20]:
for name, param in model.named_parameters():
  if param.requires_grad:
    print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [21]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division
    acc = correct.sum() / len(correct)
    return acc

In [22]:
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)


def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [23]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [24]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# Run Training

In [None]:
N_EPOCHS = 10

best_validation_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    training_loss, training_acc = train(model, training_iterator, optimizer, criterion)
    validation_loss, validation_acc = evaluate(model, validation_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        torch.save(model.state_dict(), f"{trained_models_folder}/bert_10_given_dataset.pt")
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\ttraining Loss: {training_loss:.3f} | training Acc: {training_acc*100:.2f}%')
    print(f'\t Val. Loss: {validation_loss:.3f} |  Val. Acc: {validation_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 33s
	training Loss: 0.754 | training Acc: 51.87%
	 Val. Loss: 0.700 |  Val. Acc: 44.14%
Epoch: 02 | Epoch Time: 0m 33s
	training Loss: 0.636 | training Acc: 66.07%
	 Val. Loss: 0.648 |  Val. Acc: 67.19%
Epoch: 03 | Epoch Time: 0m 37s
	training Loss: 0.553 | training Acc: 72.91%
	 Val. Loss: 0.737 |  Val. Acc: 59.38%
Epoch: 04 | Epoch Time: 0m 39s
	training Loss: 0.631 | training Acc: 68.02%
	 Val. Loss: 0.879 |  Val. Acc: 43.36%
Epoch: 05 | Epoch Time: 0m 34s
	training Loss: 0.656 | training Acc: 61.92%
	 Val. Loss: 0.655 |  Val. Acc: 64.45%
Epoch: 06 | Epoch Time: 0m 34s
	training Loss: 0.524 | training Acc: 73.48%
	 Val. Loss: 0.497 |  Val. Acc: 76.56%
Epoch: 07 | Epoch Time: 0m 33s
	training Loss: 0.496 | training Acc: 73.87%
	 Val. Loss: 0.532 |  Val. Acc: 73.05%
Epoch: 08 | Epoch Time: 0m 33s
	training Loss: 0.465 | training Acc: 75.34%
	 Val. Loss: 0.518 |  Val. Acc: 71.88%
