# Benchmark serial case

We first load the data and model

In [33]:
import torch
import torch.nn as nn

# Needed for parallel 
from collections import OrderedDict

# For training 
from network_architecture_v2 import MyBertForSequenceClassification

In [34]:
# For fine tuning
from datasets import load_dataset #, load_metric
from transformers import BertTokenizer
from transformers import Trainer, TrainingArguments
import numpy as np

In [35]:
# Load dataset
dataset = load_dataset('glue', 'sst2')

# I believe this is the tokenizer I used... 
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding="max_length", 
                     max_length=64, truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)



# Load the saved model! 


In [36]:
# Load dictionary
model_dicts = torch.load(f'bert-save-1/model_serial_checkpoint_batch_idx=80000')
new_dict = OrderedDict(model_dicts['model_state_dict'])
# Load actual model 
model_serial = torch.load('serialnet_bert_32')

In [37]:
model_serial.load_state_dict(new_dict)

<All keys matched successfully>

# With model loaded, time to change the final layers

In [38]:
training_serial = MyBertForSequenceClassification(model_serial)

# Define trainers and train

In [39]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=1e-4,
    adam_beta1=0.9,
    adam_beta2=0.988,
    adam_epsilon=1e-6,
    dataloader_drop_last=True,
    warmup_steps=100,
    weight_decay=1e-4,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)


In [40]:

# Define the compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).astype(np.float32).mean().item()
    return {"accuracy": accuracy}

In [41]:
# Initialize the Trainer
trainer = Trainer(
    model=training_serial,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [42]:
sum(p.numel() for p in training_serial.parameters() if p.requires_grad)

251241218

In [43]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2518,0.390535,0.826389


KeyboardInterrupt: 

In [None]:
training_serial = MyBertForSequenceClassification(model_serial)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1158,0.468917,0.840278


In [None]:
# Load dataset
dataset = load_dataset('glue', 'cola')

# I believe this is the tokenizer I used... 
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding="max_length", 
                     max_length=128, truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
training_serial = MyBertForSequenceClassification(model_serial)


In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=1e-4,
    adam_beta1=0.9,
    adam_beta2=0.988,
    adam_epsilon=1e-6,
    dataloader_drop_last=True,
    warmup_steps=100,
    weight_decay=1e-4,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

# Initialize the Trainer
trainer = Trainer(
    model=training_serial,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics
)


In [None]:
# For COLA
trainer.train()

In [None]:
# Load dataset
dataset = load_dataset('glue', 'mrpc')
training_serial = MyBertForSequenceClassification(model_serial)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["sentence1"], 
        examples["sentence2"], 
        padding="max_length", 
        truncation=True,
        max_length=256
    )
    
tokenized_datasets = dataset.map(tokenize_function, batched=True)



In [None]:
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
# Load dictionary
model_dicts = torch.load(f'bert-save-1/model_serial_checkpoint_batch_idx=80000')
new_dict = OrderedDict(model_dicts['model_state'])
# Load actual model 
model_serial = torch.load('serialnet_bert_32')

model_serial.load_state_dict(new_dict)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    adam_beta1=0.9,
    adam_beta2=0.988,
    adam_epsilon=1e-8,
    dataloader_drop_last=True,
    warmup_steps=5,
    weight_decay=1e-4,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

# Initialize the Trainer
trainer = Trainer(
    model=training_serial,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics
)


In [None]:
# For MRPC
trainer.train()

In [None]:
from transformers import get_linear_schedule_with_warmup
optimizer = trainer.optimizer

In [None]:
optim_schedule = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=10,  # Number of warmup steps
    num_training_steps=800000   # Total number of training steps
)

from matplotlib import pyplot as plt
learning_rates = []
for step in range(800000):
    # Get the learning rate for the current step
    lr = optim_schedule.get_lr()[0]  # Get the learning rate for the first parameter group
    learning_rates.append(lr)
    
    # Step the scheduler
    optim_schedule.step()

# Plot the learning rate schedule
plt.figure(figsize=(10, 5))
plt.plot(learning_rates)
plt.title('Learning Rate Schedule')
plt.xlabel('Training Steps')
plt.ylabel('Learning Rate')
plt.grid()
plt.show()

In [None]:
optim_schedule
optim_schedule.state_dict()

In [None]:
optim_schedule.step()
optim_schedule.state_dict()

In [None]:
optimizer.state_dict()