# Loading and fine-tuning/benchmarking

This notebook covers only the serial case. See the companion notebook for the parallel version.

In [1]:
import torch
import torch.nn as nn

# Needed for parallel 
from collections import OrderedDict

## Let's load the serial model

In [2]:
# Checkpoint dictionary; its 'model_state' entry holds the weights applied to the model below.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model_dicts = torch.load('model_serial_checkpoint_0_epoch=10')

# The model object itself, saved whole rather than as a state dict.
# NOTE(review): recent PyTorch releases default to weights_only=True, which rejects
# whole-model pickles like this one — pass weights_only=False there if loading fails.
model_serial = torch.load('serialnet_bert_32')

In [3]:
# Copy the checkpoint's 'model_state' weights into the loaded model; the returned
# key-matching report is displayed as the cell output.
model_serial.load_state_dict(model_dicts['model_state'])

<All keys matched successfully>

In [4]:
# Sanity check: show the first ten flattened query-projection weights of serial_nn[0]
model_serial.serial_nn[0].layer.layer.layer.attention.self.query.weight.flatten()[:10]

tensor([ 8.2220e-20, -3.1414e-21,  8.5347e-19, -3.4963e-23, -1.8939e-12,
        -8.1679e-17,  4.8803e-15, -2.3748e-15, -1.2632e-13,  1.1441e-06],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [5]:
#print(model_dicts['model_state'].keys())

# With the model loaded, it's time to adapt it for training

Remember that this fine-tuning task means we have to alter the model itself and transfer the pretrained pieces over to the new one.

In [6]:
from network_architecture_v2 import MyBertForSequenceClassification

In [7]:
# Display the pooler submodule of the loaded model to check its layer sizes
model_serial.close_nn_nsp.pooler

BertPooler(
  (dense): Linear(in_features=1024, out_features=1024, bias=True)
  (activation): Tanh()
)

In [8]:
# Wrap the loaded serial model in the sequence-classification class used for fine-tuning
training_serial = MyBertForSequenceClassification(model_serial)

## With the models defined, load the dataset

In [9]:
from datasets import load_dataset, load_metric
from transformers import BertTokenizer
from transformers import Trainer, TrainingArguments
import numpy as np

In [10]:
# GLUE / SST-2 splits
dataset = load_dataset('glue', 'sst2')

# NOTE: the tokenizer choice is recalled from memory, not verified —
# TODO confirm it matches the one used when the checkpoint was trained.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the 'sentence' field of a batch of examples.

    Every sequence is padded or truncated to exactly 64 tokens so that all
    examples share one fixed length.
    """
    sentences = examples['sentence']
    return tokenizer(
        sentences,
        padding="max_length",
        max_length=64,
        truncation=True,
    )


# batched=True hands tokenize_function many examples per call
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [11]:
# Spot-check that examples come out at the fixed padded length
for row in (1, 0):
    print(len(tokenized_datasets['train'][row]['input_ids']))

64
64


In [12]:
# Show one tokenized training example: the original fields plus the tokenizer's outputs
print(tokenized_datasets['train'][0])

{'sentence': 'hide new secretions from the parental units ', 'label': 0, 'idx': 0, 'input_ids': [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [13]:
# Hyperparameters for the fine-tuning run
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation schedule
    num_train_epochs=10,
    learning_rate=1e-4,
    # warmup_steps=10,
    # weight_decay=0.01,
    # Batching
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # NOTE(review): drop_last appears to apply to the evaluation dataloader as well,
    # so a trailing partial validation batch is never scored. The accuracies reported
    # by this run are exact multiples of 1/768 (six full batches of 128), which is
    # consistent with that — confirm this is intended before quoting the numbers.
    dataloader_drop_last=True,
    # Evaluate on the validation split at the end of every epoch
    evaluation_strategy="epoch",
)


In [14]:
# Accuracy is computed directly with NumPy: `datasets.load_metric` is deprecated
# (it emitted a warning in this notebook), and plain accuracy needs no metric script.

def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Parameters
    ----------
    eval_pred
        A ``(logits, labels)`` pair; the predicted class is the argmax of the
        logits over the last axis.

    Returns
    -------
    dict
        ``{"accuracy": <float in [0, 1]>}``, the fraction of predictions that
        equal their label.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Mean of the element-wise match mask is the fraction of correct predictions
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}


  metric = load_metric("accuracy")


In [15]:
# Initialize the Trainer
trainer = Trainer(
    model=training_serial,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
# Run the fine-tuning loop; validation loss and accuracy are reported once per epoch.
# The recorded run took roughly 7,000 s ('train_runtime' in the output).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6896,0.694709,0.507812
2,0.6903,0.695426,0.507812
3,0.4116,0.461916,0.798177
4,0.268,0.458745,0.809896
5,0.209,0.53853,0.790365
6,0.2045,0.548699,0.807292
7,0.1668,0.605398,0.796875
8,0.1433,0.669445,0.802083
9,0.1432,0.700075,0.78776
10,0.1151,0.724311,0.791667


TrainOutput(global_step=5260, training_loss=0.32825333231302267, metrics={'train_runtime': 7064.3358, 'train_samples_per_second': 95.337, 'train_steps_per_second': 0.745, 'total_flos': 0.0, 'train_loss': 0.32825333231302267, 'epoch': 10.0})