# Loading and fine-tuning/benchmarking

This notebook covers only the parallel case (checkpoints written by a multi-process run). See the other notebook for the serial case.

In [1]:
import torch
import torch.nn as nn

# Needed for parallel 
from collections import OrderedDict

## Now let's load the parallel cases 

In [2]:
# Read both checkpoint files; the file names suggest one file per process of a
# 2-process run (suffixes _0 and _1).
# NOTE(review): torch.load unpickles its input, so only point it at trusted files.
checkpoint_0, checkpoint_1 = (
    torch.load(checkpoint_path)
    for checkpoint_path in (
        'model_checkpoint_procs=2_0_epoch=2',
        'model_checkpoint_procs=2_1_epoch=2',
    )
)

In [3]:
# Parameter-name views of each checkpoint's 'model_state' dictionary.
keys_0, keys_1 = (
    ckpt['model_state'].keys() for ckpt in (checkpoint_0, checkpoint_1)
)

In [4]:
# Merge the two checkpoints' 'model_state' dicts into one state dict
# (`new_dict`) whose keys follow the naming expected by the serial model, so it
# can be handed to load_state_dict.


def _remap_parallel_key(key, layer_offset=0):
    """Translate one 'parallel_nn' parameter name to the serial naming scheme.

    The key is split on '.'; the first component is dropped, the second is
    replaced by 'serial_nn', the third (an integer layer index local to its
    checkpoint) is shifted by ``layer_offset``, and a 'layer' component is
    inserted right after the index.
    Example: 'a.b.3.c.weight' -> 'serial_nn.3.layer.c.weight' (offset 0).

    Args:
        key: Parameter name containing 'parallel_nn'.
        layer_offset: Amount added to the checkpoint-local layer index.

    Returns:
        Tuple ``(new_key, local_index)`` where ``local_index`` is the
        unshifted layer index parsed from the key.

    Raises:
        IndexError: If the key has fewer than three dot-separated components.
        ValueError: If the third component is not an integer.
    """
    parts = key.split('.')
    local_index = int(parts[2])
    parts[1] = 'serial_nn'
    parts[2] = str(local_index + layer_offset)
    parts.insert(3, 'layer')
    return '.'.join(parts[1:]), local_index


def _rename_close_head(key):
    """Rename the leading component of closing-head parameter names.

    Keys containing 'close_nsp' get 'close_nn_nsp' as their first component,
    keys containing 'close_mlm' get 'close_nn_mlm'. If a key contains both
    markers the 'close_mlm' rename wins. Other keys are returned unchanged.
    """
    new_key = key
    for marker, replacement in (('close_nsp', 'close_nn_nsp'),
                                ('close_mlm', 'close_nn_mlm')):
        if marker in key:
            parts = key.split('.')
            parts[0] = replacement
            new_key = '.'.join(parts)
    return new_key


new_dict = OrderedDict()

# First checkpoint: layer indices are kept as they are. `counter` tracks the
# largest layer index seen so the second checkpoint can be numbered after it.
keys_0 = checkpoint_0['model_state'].keys()
counter = 0
for key in keys_0:
    if 'parallel_nn' in key:
        new_key, local_index = _remap_parallel_key(key)
        counter = max(counter, local_index)
    else:
        new_key = _rename_close_head(key)
    new_dict[new_key] = checkpoint_0['model_state'][key]
print(counter)

# Second checkpoint: its local layer indices restart at 0, so shift them to
# continue right after the last index of the first checkpoint. Non-parallel
# keys are copied under their original name (no closing-head rename here).
keys_1 = checkpoint_1['model_state'].keys()
for key in keys_1:
    if 'parallel_nn' in key:
        new_key, _ = _remap_parallel_key(key, layer_offset=counter + 1)
    else:
        new_key = key
    new_dict[new_key] = checkpoint_1['model_state'][key]

16


In [5]:
# Load the saved model object from disk, then copy the merged weights into it.
# The return value of load_state_dict is the cell's displayed output.
# NOTE(review): this unpickles a whole model object; only load trusted files.
serial_model_path = 'serialnet_bert_32'
model_parallel = torch.load(serial_model_path)
model_parallel.load_state_dict(new_dict)

<All keys matched successfully>

# With the model loaded, adapt it for fine-tuning

Remember that this fine-tuning task requires changing the model itself (it gets wrapped for sequence classification), so the pretrained weights have to be carried over into the new model.

In [6]:
from network_architecture_v2 import MyBertForSequenceClassification

In [7]:
# Inspect the `pooler` submodule under `close_nn_nsp`; as the cell's last
# expression, its repr is shown as the cell output.
model_parallel.close_nn_nsp.pooler

BertPooler(
  (dense): Linear(in_features=1024, out_features=1024, bias=True)
  (activation): Tanh()
)

In [8]:
# Build the model used for fine-tuning by passing the loaded model to the
# sequence-classification class imported from network_architecture_v2.
# NOTE(review): how the wrapper reuses model_parallel's weights is defined in
# that module, not in this notebook — confirm there.
training_parallel = MyBertForSequenceClassification(model_parallel)

## With the model defined, load the dataset

In [9]:
from datasets import load_dataset, load_metric
from transformers import BertTokenizer
from transformers import Trainer, TrainingArguments
import numpy as np

In [10]:
# Fetch the 'sst2' configuration of the 'glue' dataset.
dataset = load_dataset('glue', 'sst2')

# NOTE(review): the original author was not certain this is the tokenizer the
# model was pretrained with — confirm before trusting the results.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the 'sentence' field of a batch of examples.

    Every sequence is padded or truncated to exactly 64 tokens.
    """
    encoding_options = dict(padding="max_length", max_length=64, truncation=True)
    return tokenizer(examples['sentence'], **encoding_options)

# batched=True hands tokenize_function many examples per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [11]:
# Sanity check: print the input_ids length of training examples 1 and 0
# (in that order); both should equal the padded length.
for example_index in (1, 0):
    print(len(tokenized_datasets['train'][example_index]['input_ids']))

64
64


In [12]:
# Show every field of the first tokenized training example.
first_train_example = tokenized_datasets['train'][0]
print(first_train_example)

{'sentence': 'hide new secretions from the parental units ', 'label': 0, 'idx': 0, 'input_ids': [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [13]:
# Fine-tuning configuration handed to the Trainer.
training_args = TrainingArguments(
    # --- optimisation schedule ---
    learning_rate=1e-4,
    num_train_epochs=10,
    # warmup_steps=10,
    # weight_decay=0.01,
    # --- batching ---
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # NOTE(review): drops the last incomplete batch. This appears to apply to
    # evaluation too (the logged accuracies are multiples of 1/768, i.e. six
    # full batches of 128), so part of the validation set is never scored —
    # confirm this is intended.
    dataloader_drop_last=True,
    # --- outputs, logging, evaluation ---
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)


In [14]:
# Accuracy is computed directly with NumPy. The previous implementation went
# through datasets.load_metric("accuracy"), which is deprecated (it emits a
# FutureWarning) and is absent from newer `datasets` releases.
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        dict: ``{"accuracy": fraction of predictions equal to labels}`` with
        the value as a plain Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Element-wise comparison; the mean of the boolean array is the accuracy.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}


  metric = load_metric("accuracy")


In [15]:
# Assemble the Trainer: the wrapped model, the run configuration, the tokenized
# 'train' split for optimisation and the 'validation' split for evaluation,
# scored with compute_metrics.
trainer = Trainer(
    args=training_args,
    model=training_parallel,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
# Run fine-tuning with the configuration above. In the saved output this run
# logged 8 evaluated epochs and was then stopped with a KeyboardInterrupt.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5634,0.538941,0.729167
2,0.3823,0.476314,0.773438
3,0.3461,0.474827,0.803385
4,0.305,0.474221,0.803385
5,0.2651,0.523299,0.795573
6,0.2682,0.521275,0.795573
7,0.2566,0.495785,0.791667
8,0.1914,0.581848,0.78125


KeyboardInterrupt: 

In [None]:
# Second training call; this cell has no execution count, so it was not run in
# the saved notebook.
# NOTE(review): calling train() again without resume_from_checkpoint presumably
# starts a fresh schedule on the weights already updated by the interrupted
# run — confirm that is what is wanted before executing.
trainer.train()
