In [None]:
# Mount Google Drive so the course folder (dataset CSV and saved checkpoints) is reachable.
from google.colab import drive
drive.mount('/content/gdrive')

# Put the project folder on the import path; presumably it holds the local
# `transformer` module imported in the next cell — TODO confirm.
import sys
sys.path.append('/content/gdrive/My Drive/ECE 661/Final')

# Install Hugging Face transformers into the runtime (note: version is not pinned).
! pip install transformers

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import functools
import sys
import numpy as np
import tqdm
import random
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from torch.utils.data import Dataset
import transformers
from transformer import TransformerEncoder
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [None]:
# USE_PRETRAINED_TOKENIZER = True

# --- Vocabulary / padding conventions ---
PAD_INDEX = 0            # id used to pad batches (forwarded to the collate functions as pad_index)
UNK_INDEX = 1
PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'

# --- Data / model / optimisation hyper-parameters ---
BATCH_SIZE = 16
MAX_LENGTH = 256         # every sample is padded/truncated to this many tokens
HIDDEN_DIM = 256
OUTPUT_DIM = 2           # number of classes for the binary sentence-level heads
N_LAYERS = 3
ATTN_HEADS = 4
DROPOUT_RATE = 0.1
LR = 3e-4
N_EPOCHS = 20


# Seed every RNG the notebook draws from (torch, random, numpy).
seed = 0
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

# Only the tokenizer (and therefore the vocabulary) of the pretrained checkpoint is loaded here.
transformer_name = 'bert-base-uncased'
tokenizer = transformers.AutoTokenizer.from_pretrained(transformer_name)
vocab_size = len(tokenizer)
pad_index = PAD_INDEX

In [None]:
def load_imdb(base_csv:str = './IMDBDataset.csv'):
    """
    Read the IMDB review CSV and split it into train / validation / test parts.

    The "review" column provides the inputs and the "sentiment" column the
    targets. 30% of the rows are held out first, and that hold-out is then cut
    in half into test and validation; both splits use random_state=42.

    :param base_csv: the path of the dataset file.
    :return: x_train, x_valid, x_test, y_train, y_valid, y_test (in that order).
    """
    frame = pd.read_csv(base_csv)
    features = frame["review"]
    targets = frame["sentiment"]

    # First cut: training set versus a temporary hold-out.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        features, targets, test_size=0.3, random_state=42)
    # Second cut: the hold-out is shared equally between test and validation.
    x_test, x_valid, y_test, y_valid = train_test_split(
        x_holdout, y_holdout, test_size=0.5, random_state=42)

    print(f'shape of train data is {x_train.shape}')
    print(f'shape of test data is {x_test.shape}')
    print(f'shape of valid data is {x_valid.shape}')

    splits = (x_train, x_valid, x_test, y_train, y_valid, y_test)
    return splits

# Load the splits from the mounted Drive folder (absolute Colab path; requires the drive mount above).
x_train, x_valid, x_test, y_train, y_valid, y_test = load_imdb('/content/gdrive/My Drive/ECE 661/Final/IMDBDataset.csv')

shape of train data is (35000,)
shape of test data is (7500,)
shape of valid data is (7500,)


In [None]:
def create_bag(data, span):
    """
    Cut every review into segments of at most `span` consecutive sentences.

    Each review is split on '.', and every run of `span` consecutive pieces is
    re-joined with '.' to form one segment. Empty pieces (e.g. after a trailing
    '.') are kept as they are. A review with fewer than `span` pieces yields a
    single segment containing the whole review.

    The segment size is applied independently to every review: a short review
    no longer shrinks the segment size used for the reviews that follow it.

    :param data: iterable of review strings.
    :param span: maximum number of '.'-separated pieces per segment (>= 1).
    :return: (bag, bag_size) - the list of all segments and its length.
    :raises ValueError: if `span` is smaller than 1.
    """
    if span < 1:
        raise ValueError(f'span must be >= 1, got {span}')

    bag = []
    for review in data:
        pieces = review.split('.')
        bag.extend('.'.join(pieces[i:i + span]) for i in range(0, len(pieces), span))
    return bag, len(bag)

def create_nsp_sop_data(data, bag, bag_size, span, mode):
    """
    Build sentence pairs for next-sentence prediction (NSP) or sentence-order
    prediction (SOP).

    Every review is split on '.', empty pieces are dropped, and the remaining
    sentences are grouped into segments of at most `span` sentences. Reviews
    that yield fewer than two segments (including empty reviews) are skipped.
    For the others one adjacent pair of segments is picked at random:

    * mode == 'nsp': sentence A is the first segment of the pair; sentence B is
      either the true next segment (label 1) or a random element of `bag`
      (label 0).
    * any other mode (SOP): the pair is emitted in its original order
      (label 1) or swapped (label 0).

    The segment size is applied independently to every review, so one short
    review does not change how the following reviews are segmented.

    :param data: iterable of review strings.
    :param bag: list of segments used as random negatives in NSP mode.
    :param bag_size: number of elements of `bag` that may be sampled.
    :param span: maximum number of sentences per segment (>= 1).
    :param mode: 'nsp' for next-sentence prediction, anything else for SOP.
    :return: (sentence_a, sentence_b, label) - three parallel lists.
    :raises ValueError: if `span` is smaller than 1.
    """
    if span < 1:
        raise ValueError(f'span must be >= 1, got {span}')

    sentence_a = []
    sentence_b = []
    label = []
    for review in data:
        sentences = [sentence for sentence in review.split('.') if sentence != '']
        segments = ['.'.join(sentences[i:i + span]) for i in range(0, len(sentences), span)]
        if len(segments) < 2:
            # Nothing to pair up (also covers reviews without any sentence).
            continue

        start = random.randint(0, len(segments) - 2)
        if mode == 'nsp':
            sentence_a.append(segments[start])
            if random.random() > 0.5:
                # Negative example: unrelated segment drawn from the bag.
                sentence_b.append(bag[random.randint(0, bag_size - 1)])
                label.append(0)
            else:
                sentence_b.append(segments[start + 1])
                label.append(1)
        else:
            if random.random() > 0.5:
                sentence_a.append(segments[start])
                sentence_b.append(segments[start + 1])
                label.append(1)
            else:
                # Negative example: same two segments in swapped order.
                sentence_a.append(segments[start + 1])
                sentence_b.append(segments[start])
                label.append(0)
    return sentence_a, sentence_b, label

def create_sp_data(bag):
    """
    Turn a bag of segments into (text, label) pairs for a binary
    "is this segment in its original order?" task.

    Each segment is split on '.' with empty pieces removed. Single-sentence
    segments are always kept as-is with label 1. Any other segment is, with a
    random draw above 0.5, replaced by a random permutation of its sentences;
    the label is 0 unless the permutation happens to reproduce the original
    order. Otherwise the segment is kept in order with label 1.

    :param bag: iterable of segment strings.
    :return: (sentences, label) - two parallel lists.
    """
    texts, targets = [], []
    for segment in bag:
        parts = [piece for piece in segment.split('.') if piece != '']

        # The random draw only happens for segments that are not single-sentence.
        keep_order = len(parts) == 1 or not (random.random() > 0.5)
        if keep_order:
            texts.append('.'.join(parts))
            targets.append(1)
            continue

        permuted = random.sample(parts, len(parts))
        texts.append('.'.join(permuted))
        # A permutation identical to the input still counts as "in order".
        targets.append(1 if permuted == parts else 0)
    return texts, targets

In [None]:
class IMDB(Dataset):
    """
    IMDB reviews exposed as a torch Dataset for one of four pre-training
    objectives, selected with `mode`:

    * 'mlm' - masked language modelling: tokens are masked on the fly and the
      label holds the original id at masked positions, IGNORE_INDEX elsewhere.
    * 'nsp' - next-sentence prediction on precomputed sentence pairs.
    * 'sop' - sentence-order prediction on precomputed sentence pairs.
    * 'sp'  - sentence permutation: the input is the tokenised review and the
      label is the tokenised review with its sentences randomly permuted.

    Every item is a dict with keys "ids", "length" and "label".
    """

    # Token ids hard-coded by the masking logic; they correspond to the
    # [CLS], [SEP], [MASK] and [PAD] entries of the bert-base-uncased vocabulary.
    CLS_ID = 101
    SEP_ID = 102
    MASK_ID = 103
    PAD_ID = 0
    MASK_PROB = 0.15       # probability of masking an eligible token
    IGNORE_INDEX = -100    # label for positions that carry no MLM target

    def __init__(self, x, y, tokenizer, max_length=256, mode = 'mlm') -> None:
        """
        :param x: review texts; must support ``.iloc[idx]`` and ``len()``
                  (e.g. a pandas Series).
        :param y: sentiment labels; stored but not used by any mode.
        :param tokenizer: callable tokenizer accepting one or two texts plus
                          ``return_tensors``, ``max_length``, ``padding`` and
                          ``truncation`` and returning a mapping with "input_ids".
        :param max_length: length every sample is padded / truncated to.
        :param mode: one of 'mlm', 'nsp', 'sop', 'sp'.
        """
        self.x = x
        self.y = y
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mode = mode

        SPAN = 5  # number of '.'-separated sentences grouped into one segment

        if self.mode == 'mlm':
            # MLM masks tokens lazily in __getitem__; nothing to precompute.
            return

        bag, bag_size = create_bag(self.x, SPAN)
        if self.mode == 'nsp':
            self.sentence_a, self.sentence_b, self.label = create_nsp_sop_data(self.x, bag, bag_size, SPAN, 'nsp')
        elif self.mode == 'sop':
            self.sentence_a, self.sentence_b, self.label = create_nsp_sop_data(self.x, bag, bag_size, SPAN, 'sop')
        elif self.mode == 'sp':
            self.sentences, self.label = create_sp_data(bag)

    def _encode(self, *texts):
        """Tokenise one text (or a text pair) to a fixed-length 'pt' encoding."""
        # Always use the tokenizer handed to the constructor, never a global.
        return self.tokenizer(*texts,
                              return_tensors='pt',
                              max_length=self.max_length,
                              padding='max_length',
                              truncation=True)

    def _mlm_item(self, idx: int):
        """Mask random non-special tokens of review `idx` for MLM."""
        input_ids = self._encode(self.x.iloc[idx])['input_ids']
        labels = input_ids.detach().clone()

        rand = torch.rand(input_ids.shape)
        special = (input_ids == self.CLS_ID) | (input_ids == self.SEP_ID) | (input_ids == self.PAD_ID)
        masked = (rand < self.MASK_PROB) & ~special

        input_ids[masked] = self.MASK_ID
        # Only masked positions contribute a target; everything else is ignored.
        labels[~masked] = self.IGNORE_INDEX

        return {"ids": input_ids.flatten(),
                "length": input_ids.shape[1],
                "label": labels.flatten()}

    def _pair_item(self, idx: int):
        """Tokenise the precomputed sentence pair `idx` (NSP / SOP)."""
        ids = self._encode(self.sentence_a[idx], self.sentence_b[idx])['input_ids'].flatten()
        # Shape-[1] long tensor; built from the single label rather than from
        # the whole label list on every access.
        target = torch.LongTensor([self.label[idx]])
        return {'ids': ids,
                'length': len(ids),
                'label': target}

    def _sp_item(self, idx: int):
        """Tokenise review `idx` and a sentence-permuted copy of it."""
        text = self.x.iloc[idx]
        input_ids = self._encode(text)['input_ids']

        # Compare by value (not identity) and drop whitespace-only sentences too.
        sents = [sent.strip() for sent in text.split(".") if sent.strip() != '']
        permuted_idx = np.random.permutation(len(sents))
        permuted_sents = ". ".join(sents[i] for i in permuted_idx)
        label = self._encode(permuted_sents)['input_ids'].flatten()

        return {"ids": input_ids.flatten(),
                "length": input_ids.shape[1],
                "label": label}

    def __getitem__(self, idx: int):
        """
        :param idx: sample index.
        :return: dict with "ids" (1-D token ids), "length" (number of ids) and
                 "label" (mode dependent); None for an unrecognised mode.
        """
        if self.mode == 'mlm':
            return self._mlm_item(idx)
        elif self.mode == 'nsp' or self.mode == 'sop':
            return self._pair_item(idx)
        elif self.mode == 'sp':
            return self._sp_item(idx)

    def __len__(self) -> int:
        """Number of samples: reviews for 'mlm'/'sp', sentence pairs otherwise."""
        if self.mode == 'mlm' or self.mode == 'sp':
            return len(self.x)
        else:
            return len(self.label)

In [None]:
def mlm_collate(batch, pad_index):
    """
    Collate samples whose label is a token-level sequence.

    Both the "ids" and the "label" entry of every sample are converted to long
    tensors and right-padded with `pad_index` to the longest sequence in the
    batch (note that labels are padded with `pad_index` as well).

    :param batch: list of dicts with "ids" and "label" sequences.
    :param pad_index: value used for padding.
    :return: dict with "ids" and "label", each of shape [batch size, seq len].
    """
    def _pad_field(key):
        rows = [torch.LongTensor(sample[key]) for sample in batch]
        return nn.utils.rnn.pad_sequence(rows, batch_first=True, padding_value=pad_index)

    return {'ids': _pad_field('ids'), 'label': _pad_field('label')}
def collate(batch, pad_index):
    """
    Collate samples that carry one class label each.

    The "ids" entries are converted to long tensors and right-padded with
    `pad_index`; the per-sample labels are gathered into one long tensor.

    :param batch: list of dicts with an "ids" sequence and a scalar-like "label".
    :param pad_index: value used to pad the id sequences.
    :return: dict with "ids" of shape [batch size, seq len] and "label" of
             shape [batch size].
    """
    id_rows, targets = [], []
    for sample in batch:
        id_rows.append(torch.LongTensor(sample['ids']))
        targets.append(sample['label'])

    padded_ids = nn.utils.rnn.pad_sequence(id_rows, padding_value=pad_index, batch_first=True)
    return {'ids': padded_ids, 'label': torch.LongTensor(targets)}

# Bind the padding id so both functions match the single-argument collate_fn
# signature expected by DataLoader. The original names are rebound to the partials.
mlm_collate = functools.partial(mlm_collate, pad_index=pad_index)
collate = functools.partial(collate, pad_index=pad_index)

In [None]:
# mlm dataloader
mlm_train_data = IMDB(x_train, y_train, tokenizer, MAX_LENGTH, mode = 'mlm')
mlm_valid_data = IMDB(x_valid, y_valid, tokenizer, MAX_LENGTH, mode = 'mlm')
mlm_test_data = IMDB(x_test, y_test, tokenizer, MAX_LENGTH, mode = 'mlm')

mlm_train_dataloader = torch.utils.data.DataLoader(mlm_train_data, batch_size=BATCH_SIZE, collate_fn=mlm_collate, shuffle=True)
mlm_valid_dataloader = torch.utils.data.DataLoader(mlm_valid_data, batch_size=BATCH_SIZE, collate_fn=mlm_collate)
mlm_test_dataloader = torch.utils.data.DataLoader(mlm_test_data, batch_size=BATCH_SIZE, collate_fn=mlm_collate)
# nsp dataloader
nsp_train_data = IMDB(x_train, y_train, tokenizer, MAX_LENGTH, mode = 'nsp')
nsp_valid_data = IMDB(x_valid, y_valid, tokenizer, MAX_LENGTH, mode = 'nsp')
nsp_test_data = IMDB(x_test, y_test, tokenizer, MAX_LENGTH, mode = 'nsp')

nsp_train_dataloader = torch.utils.data.DataLoader(nsp_train_data, batch_size=BATCH_SIZE, collate_fn=collate, shuffle=True)
nsp_valid_dataloader = torch.utils.data.DataLoader(nsp_valid_data, batch_size=BATCH_SIZE, collate_fn=collate)
nsp_test_dataloader = torch.utils.data.DataLoader(nsp_test_data, batch_size=BATCH_SIZE, collate_fn=collate)
# sop dataloader
sop_train_data = IMDB(x_train, y_train, tokenizer, MAX_LENGTH, mode = 'sop')
sop_valid_data = IMDB(x_valid, y_valid, tokenizer, MAX_LENGTH, mode = 'sop')
sop_test_data = IMDB(x_test, y_test, tokenizer, MAX_LENGTH, mode = 'sop')

sop_train_dataloader = torch.utils.data.DataLoader(sop_train_data, batch_size=BATCH_SIZE, collate_fn=collate, shuffle=True)
sop_valid_dataloader = torch.utils.data.DataLoader(sop_valid_data, batch_size=BATCH_SIZE, collate_fn=collate)
sop_test_dataloader = torch.utils.data.DataLoader(sop_test_data, batch_size=BATCH_SIZE, collate_fn=collate)
# sp dataloader 
sp_train_data = IMDB(x_train, y_train, tokenizer, MAX_LENGTH, mode = 'sp')
sp_valid_data = IMDB(x_valid, y_valid, tokenizer, MAX_LENGTH, mode = 'sp')
sp_test_data = IMDB(x_test, y_test, tokenizer, MAX_LENGTH, mode = 'sp')

sp_train_dataloader = torch.utils.data.DataLoader(sp_train_data, batch_size=BATCH_SIZE, collate_fn=mlm_collate, shuffle=True)
sp_valid_dataloader = torch.utils.data.DataLoader(sp_valid_data, batch_size=BATCH_SIZE, collate_fn=mlm_collate)
sp_test_dataloader = torch.utils.data.DataLoader(sp_test_data, batch_size=BATCH_SIZE, collate_fn=mlm_collate)

In [None]:
# Sanity check: pull a single batch from the SOP training loader and display it.
next(iter(sop_train_dataloader))

{'ids': tensor([[  101,  1049,   102,  ...,     0,     0,     0],
         [  101,  1996,  3772,  ...,     0,     0,     0],
         [  101,  1996, 12703,  ...,     0,     0,     0],
         ...,
         [  101,  1996,  3772,  ...,     0,     0,     0],
         [  101,  1037,  2210,  ...,     0,     0,     0],
         [  101,  2059,  2009,  ...,     0,     0,     0]]),
 'label': tensor([0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0])}

In [None]:
class Transformer(nn.Module):
    """
    Sequence-level classifier: an encoder followed by a linear head that reads
    the hidden state of the first position only.
    """

    def __init__(self, transformer, output_dim, freeze):
        """
        :param transformer: encoder module exposing a `hidden` attribute with
                            its hidden size.
        :param output_dim: number of output classes.
        :param freeze: if truthy, the encoder parameters are excluded from
                       gradient updates (the head stays trainable).
        """
        super().__init__()
        self.transformer = transformer
        self.fc = nn.Linear(transformer.hidden, output_dim)

        if not freeze:
            return
        for encoder_param in self.transformer.parameters():
            encoder_param.requires_grad = False

    def forward(self, ids):
        """
        :param ids: [batch size, seq len]
        :return: prediction of size [batch size, output dim]
        """
        hidden_states = self.transformer(ids)
        first_position = hidden_states[:, 0, :]
        return self.fc(torch.tanh(first_position))

class Transformer_mlm(nn.Module):
    """
    Token-level classifier: an encoder followed by a linear head applied to the
    hidden state of every position.
    """

    def __init__(self, transformer, output_dim, freeze):
        """
        :param transformer: encoder module exposing a `hidden` attribute with
                            its hidden size.
        :param output_dim: number of classes predicted per position.
        :param freeze: if truthy, the encoder parameters are excluded from
                       gradient updates (the head stays trainable).
        """
        super().__init__()
        self.transformer = transformer
        self.fc = nn.Linear(transformer.hidden, output_dim)

        if not freeze:
            return
        for encoder_param in self.transformer.parameters():
            encoder_param.requires_grad = False

    def forward(self, ids):
        """
        :param ids: [batch size, seq len]
        :return: the linear head applied to every position of the encoder
                 output (one output_dim-sized vector per position)
        """
        hidden_states = self.transformer(ids)
        return self.fc(torch.tanh(hidden_states))

def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

def train(dataloader, model, criterion, optimizer, device):
    """
    Run one optimisation pass over `dataloader`.

    The prediction is passed to `criterion` unchanged, together with the batch
    label.

    :param dataloader: iterable of batches with "ids" and "label" tensors.
    :param model: module mapping ids to predictions.
    :param criterion: loss called as criterion(prediction, label).
    :param optimizer: optimizer updating the model parameters.
    :param device: device the batch tensors are moved to.
    :return: (losses, accuracies) - one float per batch, in iteration order.
    """
    model.train()
    history = []

    progress = tqdm.tqdm(dataloader, desc='training...', file=sys.stdout)
    for minibatch in progress:
        token_ids = minibatch['ids'].to(device)
        targets = minibatch['label'].to(device)

        logits = model(token_ids)
        batch_loss = criterion(logits, targets)
        batch_acc = get_accuracy(logits, targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        history.append((batch_loss.item(), batch_acc.item()))

    step_losses = [loss_value for loss_value, _ in history]
    step_accs = [acc_value for _, acc_value in history]
    return step_losses, step_accs

def train_mlm(dataloader, model, criterion, optimizer, device):
    """
    Run one optimisation pass over `dataloader` for token-level targets.

    Dimensions 1 and 2 of the prediction are swapped before it is handed to
    `criterion`; accuracy is computed on the untransposed prediction.

    :param dataloader: iterable of batches with "ids" and "label" tensors.
    :param model: module mapping ids to per-position predictions.
    :param criterion: loss called as criterion(prediction.transpose(1, 2), label).
    :param optimizer: optimizer updating the model parameters.
    :param device: device the batch tensors are moved to.
    :return: (losses, accuracies) - one float per batch, in iteration order.
    """
    model.train()
    history = []

    progress = tqdm.tqdm(dataloader, desc='training...', file=sys.stdout)
    for minibatch in progress:
        token_ids = minibatch['ids'].to(device)
        targets = minibatch['label'].to(device)

        logits = model(token_ids)
        # The loss receives the prediction with dims 1 and 2 swapped.
        batch_loss = criterion(logits.transpose(1, 2), targets)
        batch_acc = get_accuracy(logits, targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        history.append((batch_loss.item(), batch_acc.item()))

    step_losses = [loss_value for loss_value, _ in history]
    step_accs = [acc_value for _, acc_value in history]
    return step_losses, step_accs

def evaluate(dataloader, model, criterion, device):
    """
    Score the model on `dataloader` without updating it.

    The model is switched to eval mode and all batches are processed under
    torch.no_grad().

    :param dataloader: iterable of batches with "ids" and "label" tensors.
    :param model: module mapping ids to predictions.
    :param criterion: loss called as criterion(prediction, label).
    :param device: device the batch tensors are moved to.
    :return: (losses, accuracies) - one float per batch, in iteration order.
    """
    model.eval()
    history = []

    with torch.no_grad():
        progress = tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout)
        for minibatch in progress:
            token_ids = minibatch['ids'].to(device)
            targets = minibatch['label'].to(device)

            logits = model(token_ids)
            batch_loss = criterion(logits, targets)
            batch_acc = get_accuracy(logits, targets)

            history.append((batch_loss.item(), batch_acc.item()))

    step_losses = [loss_value for loss_value, _ in history]
    step_accs = [acc_value for _, acc_value in history]
    return step_losses, step_accs

def evaluate_mlm(dataloader, model, criterion, device):
    """
    Score a token-level model on `dataloader` without updating it.

    The model is switched to eval mode and all batches are processed under
    torch.no_grad(). Dimensions 1 and 2 of the prediction are swapped before it
    is handed to `criterion`; accuracy uses the untransposed prediction.

    :param dataloader: iterable of batches with "ids" and "label" tensors.
    :param model: module mapping ids to per-position predictions.
    :param criterion: loss called as criterion(prediction.transpose(1, 2), label).
    :param device: device the batch tensors are moved to.
    :return: (losses, accuracies) - one float per batch, in iteration order.
    """
    model.eval()
    history = []

    with torch.no_grad():
        progress = tqdm.tqdm(dataloader, desc='evaluating...', file=sys.stdout)
        for minibatch in progress:
            token_ids = minibatch['ids'].to(device)
            targets = minibatch['label'].to(device)

            logits = model(token_ids)
            # The loss receives the prediction with dims 1 and 2 swapped.
            batch_loss = criterion(logits.transpose(1, 2), targets)
            batch_acc = get_accuracy(logits, targets)

            history.append((batch_loss.item(), batch_acc.item()))

    step_losses = [loss_value for loss_value, _ in history]
    step_accs = [acc_value for _, acc_value in history]
    return step_losses, step_accs

def get_accuracy(prediction, label, ignore_index=-100):
    """
    Fraction of positions whose arg-max class equals the label.

    Works for labels of any rank: [batch] class labels as well as
    [batch, seq len] token labels. Positions whose label equals `ignore_index`
    are excluded from both numerator and denominator, so the metric is computed
    over the same positions that contribute to a loss configured with that
    ignore index.

    :param prediction: scores with the class dimension last; its leading
                       dimensions must match the shape of `label`.
    :param label: integer class labels.
    :param ignore_index: label value marking positions to skip (default -100).
    :return: 0-dim float tensor; 0.0 when no position is eligible.
    """
    predicted_classes = prediction.argmax(dim=-1)
    eligible = label.ne(ignore_index)
    correct_predictions = (predicted_classes.eq(label) & eligible).sum()
    # clamp avoids a 0/0 NaN when every position is ignored
    return correct_predictions / eligible.sum().clamp(min=1)


In [None]:
# Model
# Encoder built from the hyper-parameters defined in the configuration cell.
# NOTE(review): the same encoder instance is handed to all four models below, so
# its weights are shared and every training loop keeps updating the same encoder.
# Confirm that this sequential multi-task set-up is intended.
transformer = TransformerEncoder(len(tokenizer), hidden=HIDDEN_DIM, n_layers=N_LAYERS,
                                 attn_heads=ATTN_HEADS, dropout=DROPOUT_RATE)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): the training loops call scheduler.step() once per epoch, so the
# warm-up and total step counts below are expressed in epochs. If the warm-up
# multiplier starts at 0 (see the transformers scheduler documentation), no
# update is applied during the first epoch - TODO confirm and consider stepping
# per batch instead.
# ----- masked language modelling: per-token prediction over the vocabulary
mlm_model = Transformer_mlm(transformer, vocab_size, False)
print(f'The mlm model has {count_parameters(mlm_model):,} trainable parameters')
mlm_optimizer = optim.Adam(mlm_model.parameters(), lr=LR)
mlm_scheduler = transformers.get_cosine_schedule_with_warmup(mlm_optimizer,num_warmup_steps = N_EPOCHS//2,
                                                         num_training_steps = N_EPOCHS)
mlm_criterion= nn.CrossEntropyLoss(reduction='mean')
mlm_model = mlm_model.to(device)
mlm_criterion = mlm_criterion.to(device)
# ----- next-sentence prediction: binary classification
nsp_model = Transformer(transformer, OUTPUT_DIM, False)
print(f'The model has {count_parameters(nsp_model):,} trainable parameters')
nsp_optimizer = optim.Adam(nsp_model.parameters(), lr=LR)
nsp_scheduler = transformers.get_cosine_schedule_with_warmup(nsp_optimizer,num_warmup_steps = N_EPOCHS//2,
                                                         num_training_steps = N_EPOCHS)
nsp_criterion = nn.CrossEntropyLoss()
nsp_model = nsp_model.to(device)
nsp_criterion = nsp_criterion.to(device)
# ----- sentence-order prediction: binary classification
sop_model = Transformer(transformer, OUTPUT_DIM, False)
print(f'The model has {count_parameters(sop_model):,} trainable parameters')
sop_optimizer = optim.Adam(sop_model.parameters(), lr=LR)
sop_scheduler = transformers.get_cosine_schedule_with_warmup(sop_optimizer,num_warmup_steps = N_EPOCHS//2,
                                                         num_training_steps = N_EPOCHS)
sop_criterion = nn.CrossEntropyLoss()
sop_model = sop_model.to(device)
sop_criterion = sop_criterion.to(device)
# ----- sentence permutation: the 'sp' dataset yields a token-id sequence as label
# and is trained with train_mlm / evaluate_mlm, which transpose dims 1 and 2 of the
# prediction. The model therefore needs the per-token vocabulary head
# (Transformer_mlm); the 2-way Transformer head returns a 2-D tensor and fails there.
sp_model = Transformer_mlm(transformer, vocab_size, False)
print(f'The model has {count_parameters(sp_model):,} trainable parameters')
sp_optimizer = optim.Adam(sp_model.parameters(), lr=LR)
sp_scheduler = transformers.get_cosine_schedule_with_warmup(sp_optimizer,num_warmup_steps = N_EPOCHS//2,
                                                         num_training_steps = N_EPOCHS)
sp_criterion = nn.CrossEntropyLoss()
sp_model = sp_model.to(device)
sp_criterion = sp_criterion.to(device)

The mlm model has 18,027,834 trainable parameters
The model has 10,184,194 trainable parameters
The model has 10,184,194 trainable parameters
The model has 10,184,194 trainable parameters


In [None]:
# Start training
# MLM pre-training: per-batch metrics are accumulated across all epochs, and the
# weights with the lowest mean validation loss so far are checkpointed.
mlm_best_valid_loss = float('inf')
mlm_train_losses = []
mlm_train_accs = []
mlm_valid_losses = []
mlm_valid_accs = []

for epoch in range(N_EPOCHS):

    mlm_train_loss, mlm_train_acc = train_mlm(mlm_train_dataloader, mlm_model, mlm_criterion, mlm_optimizer, device)
    mlm_valid_loss, mlm_valid_acc= evaluate_mlm(mlm_valid_dataloader, mlm_model, mlm_criterion, device)

    # The learning-rate schedule advances once per epoch.
    mlm_scheduler.step()

    mlm_train_losses.extend(mlm_train_loss)
    mlm_train_accs.extend(mlm_train_acc)
    mlm_valid_losses.extend(mlm_valid_loss)
    mlm_valid_accs.extend(mlm_valid_acc)

    mlm_epoch_train_loss = np.mean(mlm_train_loss)
    mlm_epoch_train_acc = np.mean(mlm_train_acc)
    mlm_epoch_valid_loss = np.mean(mlm_valid_loss)
    mlm_epoch_valid_acc = np.mean(mlm_valid_acc)

    if mlm_epoch_valid_loss < mlm_best_valid_loss:
        mlm_best_valid_loss = mlm_epoch_valid_loss
        torch.save(mlm_model.state_dict(), '/content/gdrive/My Drive/ECE 661/Final/mlm_pretrain.pt')

    print(f'epoch: {epoch+1}')
    print(f'train_loss: {mlm_epoch_train_loss:.3f}, train_acc: {mlm_epoch_train_acc:.3f}')
    # Fixed: a stray quote inside this f-string made the whole cell a SyntaxError.
    print(f'valid_loss: {mlm_epoch_valid_loss:.3f}, valid_acc: {mlm_epoch_valid_acc:.3f}')

training...: 100%|██████████| 2188/2188 [05:57<00:00,  6.11it/s]
evaluating...: 100%|██████████| 469/469 [00:40<00:00, 11.72it/s]
epoch: 1
train_loss: 10.393
valid_loss: 10.402
training...: 100%|██████████| 2188/2188 [05:58<00:00,  6.11it/s]
evaluating...: 100%|██████████| 469/469 [00:39<00:00, 11.73it/s]
epoch: 2
train_loss: 7.053
valid_loss: 6.719
training...: 100%|██████████| 2188/2188 [05:58<00:00,  6.11it/s]
evaluating...: 100%|██████████| 469/469 [00:39<00:00, 11.74it/s]
epoch: 3
train_loss: 6.679
valid_loss: 6.730
training...: 100%|██████████| 2188/2188 [05:58<00:00,  6.11it/s]
evaluating...: 100%|██████████| 469/469 [00:40<00:00, 11.70it/s]
epoch: 4
train_loss: 6.674
valid_loss: 6.688
training...: 100%|██████████| 2188/2188 [05:58<00:00,  6.10it/s]
evaluating...: 100%|██████████| 469/469 [00:39<00:00, 11.74it/s]
epoch: 5
train_loss: 6.580
valid_loss: 6.527
training...: 100%|██████████| 2188/2188 [05:58<00:00,  6.11it/s]
evaluating...: 100%|██████████| 469/469 [00:39<00:00, 11.7

In [None]:
# Start training
# NSP pre-training: per-batch metrics are accumulated across all epochs, and the
# weights with the lowest mean validation loss so far are checkpointed.
nsp_best_valid_loss = float('inf')
nsp_train_losses = []
nsp_train_accs = []
nsp_valid_losses = []
nsp_valid_accs = []

for epoch in range(N_EPOCHS):

    nsp_train_loss, nsp_train_acc = train(nsp_train_dataloader, nsp_model, nsp_criterion, nsp_optimizer, device)
    nsp_valid_loss, nsp_valid_acc = evaluate(nsp_valid_dataloader, nsp_model, nsp_criterion, device)

    # The learning-rate schedule advances once per epoch.
    nsp_scheduler.step()

    # Keep the per-batch history (the x-axis of a later plot would be updates, not epochs).
    nsp_train_losses.extend(nsp_train_loss)
    nsp_train_accs.extend(nsp_train_acc)
    nsp_valid_losses.extend(nsp_valid_loss)
    nsp_valid_accs.extend(nsp_valid_acc)

    # Epoch-level summaries are plain means over the batches of this epoch.
    nsp_epoch_train_loss = np.mean(nsp_train_loss)
    nsp_epoch_train_acc = np.mean(nsp_train_acc)
    nsp_epoch_valid_loss = np.mean(nsp_valid_loss)
    nsp_epoch_valid_acc = np.mean(nsp_valid_acc)

    # Checkpoint only when the validation loss improves.
    if nsp_epoch_valid_loss < nsp_best_valid_loss:
        nsp_best_valid_loss = nsp_epoch_valid_loss
        torch.save(nsp_model.state_dict(), '/content/gdrive/My Drive/ECE 661/Final/nsp_pretrain.pt')

    print(f'epoch: {epoch+1}')
    print(f'train_loss: {nsp_epoch_train_loss:.3f}, train_acc: {nsp_epoch_train_acc:.3f}')
    print(f'valid_loss: {nsp_epoch_valid_loss:.3f}, valid_acc: {nsp_epoch_valid_acc:.3f}')

training...: 100%|██████████| 2178/2178 [01:48<00:00, 20.11it/s]
evaluating...: 100%|██████████| 467/467 [00:08<00:00, 57.96it/s]
epoch: 1
train_loss: 0.740, train_acc: 0.503
valid_loss: 0.730, valid_acc: 0.503
training...: 100%|██████████| 2178/2178 [01:48<00:00, 20.06it/s]
evaluating...: 100%|██████████| 467/467 [00:08<00:00, 58.02it/s]
epoch: 2
train_loss: 0.704, train_acc: 0.519
valid_loss: 0.687, valid_acc: 0.534
training...: 100%|██████████| 2178/2178 [01:49<00:00, 19.95it/s]
evaluating...: 100%|██████████| 467/467 [00:08<00:00, 58.24it/s]
epoch: 3
train_loss: 0.687, train_acc: 0.538
valid_loss: 0.668, valid_acc: 0.560
training...: 100%|██████████| 2178/2178 [01:48<00:00, 20.13it/s]
evaluating...: 100%|██████████| 467/467 [00:08<00:00, 57.84it/s]
epoch: 4
train_loss: 0.663, train_acc: 0.560
valid_loss: 0.649, valid_acc: 0.571
training...: 100%|██████████| 2178/2178 [01:48<00:00, 20.06it/s]
evaluating...: 100%|██████████| 467/467 [00:08<00:00, 58.23it/s]
epoch: 5
train_loss: 0.655

In [None]:
# Start training
# SOP pre-training: per-batch metrics are accumulated across all epochs, and the
# weights with the lowest mean validation loss so far are checkpointed.
sop_best_valid_loss = float('inf')
sop_train_losses = []
sop_train_accs = []
sop_valid_losses = []
sop_valid_accs = []

for epoch in range(N_EPOCHS):

    sop_train_loss, sop_train_acc = train(sop_train_dataloader, sop_model, sop_criterion, sop_optimizer, device)
    sop_valid_loss, sop_valid_acc = evaluate(sop_valid_dataloader, sop_model, sop_criterion, device)

    # The learning-rate schedule advances once per epoch.
    sop_scheduler.step()

    # Keep the per-batch history (the x-axis of a later plot would be updates, not epochs).
    sop_train_losses.extend(sop_train_loss)
    sop_train_accs.extend(sop_train_acc)
    sop_valid_losses.extend(sop_valid_loss)
    sop_valid_accs.extend(sop_valid_acc)

    # Epoch-level summaries are plain means over the batches of this epoch.
    sop_epoch_train_loss = np.mean(sop_train_loss)
    sop_epoch_train_acc = np.mean(sop_train_acc)
    sop_epoch_valid_loss = np.mean(sop_valid_loss)
    sop_epoch_valid_acc = np.mean(sop_valid_acc)

    # Checkpoint only when the validation loss improves.
    if sop_epoch_valid_loss < sop_best_valid_loss:
        sop_best_valid_loss = sop_epoch_valid_loss
        torch.save(sop_model.state_dict(), '/content/gdrive/My Drive/ECE 661/Final/sop_pretrain.pt')

    print(f'epoch: {epoch+1}')
    print(f'train_loss: {sop_epoch_train_loss:.3f}, train_acc: {sop_epoch_train_acc:.3f}')
    print(f'valid_loss: {sop_epoch_valid_loss:.3f}, valid_acc: {sop_epoch_valid_acc:.3f}')

training...: 100%|██████████| 2178/2178 [01:49<00:00, 19.97it/s]
evaluating...: 100%|██████████| 467/467 [00:08<00:00, 57.69it/s]
epoch: 1
train_loss: 0.744, train_acc: 0.498
valid_loss: 0.751, valid_acc: 0.496
training...: 100%|██████████| 2178/2178 [01:48<00:00, 19.98it/s]
evaluating...: 100%|██████████| 467/467 [00:08<00:00, 57.50it/s]
epoch: 2
train_loss: 0.710, train_acc: 0.504
valid_loss: 0.698, valid_acc: 0.500
training...: 100%|██████████| 2178/2178 [01:49<00:00, 19.95it/s]
evaluating...: 100%|██████████| 467/467 [00:08<00:00, 57.63it/s]
epoch: 3
train_loss: 0.707, train_acc: 0.499
valid_loss: 0.694, valid_acc: 0.510
training...: 100%|██████████| 2178/2178 [01:48<00:00, 20.02it/s]
evaluating...: 100%|██████████| 467/467 [00:08<00:00, 57.43it/s]
epoch: 4
train_loss: 0.703, train_acc: 0.501
valid_loss: 0.694, valid_acc: 0.499
training...: 100%|██████████| 2178/2178 [01:48<00:00, 20.05it/s]
evaluating...: 100%|██████████| 467/467 [00:08<00:00, 57.60it/s]
epoch: 5
train_loss: 0.701

In [None]:
# Start training
# Sentence-permutation pre-training: per-batch metrics are accumulated across all
# epochs, and the weights with the lowest mean validation loss so far are checkpointed.
sp_best_valid_loss = float('inf')
sp_train_losses = []
sp_train_accs = []
sp_valid_losses = []
sp_valid_accs = []

for epoch in range(N_EPOCHS):

    # Uses the *_mlm loops, which transpose dims 1 and 2 of the prediction before the loss.
    # NOTE(review): sp_model must therefore return a 3-D tensor - verify its head matches.
    sp_train_loss, sp_train_acc = train_mlm(sp_train_dataloader, sp_model, sp_criterion, sp_optimizer, device)
    sp_valid_loss, sp_valid_acc = evaluate_mlm(sp_valid_dataloader, sp_model, sp_criterion, device)

    # The learning-rate schedule advances once per epoch.
    sp_scheduler.step()

    # Keep the per-batch history (the x-axis of a later plot would be updates, not epochs).
    sp_train_losses.extend(sp_train_loss)
    sp_train_accs.extend(sp_train_acc)
    sp_valid_losses.extend(sp_valid_loss)
    sp_valid_accs.extend(sp_valid_acc)

    # Epoch-level summaries are plain means over the batches of this epoch.
    sp_epoch_train_loss = np.mean(sp_train_loss)
    sp_epoch_train_acc = np.mean(sp_train_acc)
    sp_epoch_valid_loss = np.mean(sp_valid_loss)
    sp_epoch_valid_acc = np.mean(sp_valid_acc)

    # Checkpoint only when the validation loss improves.
    if sp_epoch_valid_loss < sp_best_valid_loss:
        sp_best_valid_loss = sp_epoch_valid_loss
        torch.save(sp_model.state_dict(), '/content/gdrive/My Drive/ECE 661/Final/sp_pretrain.pt')

    print(f'epoch: {epoch+1}')
    print(f'train_loss: {sp_epoch_train_loss:.3f}, train_acc: {sp_epoch_train_acc:.3f}')
    print(f'valid_loss: {sp_epoch_valid_loss:.3f}, valid_acc: {sp_epoch_valid_acc:.3f}')

In [None]:
# fig = plt.figure(figsize=(10,6))
# ax = fig.add_subplot(1,1,1)
# ax.plot(train_losses, label='train loss')
# ax.plot(valid_losses, label='valid loss')
# plt.legend()
# ax.set_xlabel('updates')
# ax.set_ylabel('loss')

# fig = plt.figure(figsize=(10,6))
# ax = fig.add_subplot(1,1,1)
# ax.plot(train_accs, label='train accuracy')
# ax.plot(valid_accs, label='valid accuracy')
# plt.legend()
# ax.set_xlabel('updates')
# ax.set_ylabel('accuracy')

# model.load_state_dict(torch.load('/content/gdrive/My Drive/ECE 661/Final/nsp.pt'))

# test_loss, test_acc = evaluate(test_dataloader, model, criterion, device)

# epoch_test_loss = np.mean(test_loss)
# epoch_test_acc = np.mean(test_acc)
# print(f'test_loss: {epoch_test_loss:.3f}, test_acc: {epoch_test_acc:.3f}')