In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from opacus import PrivacyEngine
import torch
import tqdm
import numpy as np
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from opacus.utils.uniform_sampler import UniformWithReplacementSampler
# Load GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'sep_token': '[SEP]'})
tokenizer.add_special_tokens({'bos_token': '[BOS]'})
model.resize_token_embeddings(len(tokenizer))
LABELS = [
    "sadness",
    "joy",
    "love",
    "anger",
    "fear",
    "suprise"
]
dataset = load_dataset("dair-ai/emotion", split="train")
test_dataset = load_dataset("dair-ai/emotion", split="test")
MAX_LEN = 128
input_ids = [
    tokenizer.encode(
        t, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True
    )
    for t in dataset["text"]
]
labels = dataset["label"]
test_input_ids = [
    tokenizer.encode(
        t, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True
    )
    for t in test_dataset["text"]
]
test_labels = test_dataset["label"]

# print(f"Actual text before tokenization: {df_train_syn['text'][2]}")
# print(f"Encoded input: {input_ids[2]}")

attention_masks = []
attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]
test_attention_masks = [[float(i > 0) for i in seq] for seq in test_input_ids]

#print(attention_masks[2])

#nvert all our data into torch tensors, required data type for our model
train_inputs = torch.tensor(input_ids)
train_labels = torch.tensor(labels)
train_masks = torch.tensor(attention_masks)


test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(test_labels)

# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 4

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop,
# with an iterator the entire dataset does not need to be loaded into memory
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)





# def tokenization(example):
#     # print(f"I have {len(example['text'])} rows in example")
#     # print(example)

#     for idx, row in enumerate(example["text"]):
#         example["text"][idx] = f'{tokenizer.bos_token}{LABELS[example["label"][idx]]}{tokenizer.sep_token}{row}{tokenizer.eos_token}'
     
#     out_dict = tokenizer(example["text"], max_length=MAX_LEN, pad_to_max_length=True)

#     out_dict["text"] = example["text"]

#     return out_dict

# BATCH_SIZE = 4

# dataset = dataset.map(tokenization, batched=True,remove_columns=["label"], )
# dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# test_dataset = test_dataset.map(tokenization, batched=True,remove_columns=["label"])
# test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
# # test_dataset = test_dataset.with_format("torch", devi# ce=device)
# test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=BATCH_SIZE)


# Convert model for DP
model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, eps=1e-8)

model = model.train()


EPOCHS = 3
LOGGING_INTERVAL = 5000 # once every how many steps we run evaluation cycle and report metrics
EPSILON = 7.5
DELTA = 1 / len(train_dataloader) # Parameter for privacy accounting. Probability of not achieving privacy guarantees
MAX_GRAD_NORM = 0.1
MAX_PHYSICAL_BATCH_SIZE = 8


# Initialize Privacy Engine
privacy_engine = PrivacyEngine()
model, optimizer, data_loader = privacy_engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader= train_dataloader,
    target_delta=DELTA,
    target_epsilon=EPSILON, 
    epochs=EPOCHS,
    max_grad_norm=MAX_GRAD_NORM,
)

import numpy as np
from tqdm.notebook import tqdm

def accuracy(preds, labels):
    return (preds == labels).mean()

# define evaluation cycle
def evaluate(model):    
    model.eval()

    loss_arr = []
    accuracy_arr = []
    
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      }

            outputs = model(**inputs)
            loss, logits = outputs[:2]
            
            preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
            labels = inputs['labels'].detach().cpu().numpy()
            
            loss_arr.append(loss.item())
            accuracy_arr.append(accuracy(preds, labels))
    
    model.train()
    return np.mean(loss_arr), np.mean(accuracy_arr)




# # Define the number of training epochs
# num_epochs = 1  # Set the number of epochs

# # Training loop
# for epoch in range(num_epochs):
#     for batch in data_loader:
#         inputs = batch["input_ids"].to(model.device)
#         labels = batch["labels"].to(model.device)  # Assuming labels are provided in the data_loader

#         # Forward pass
#         outputs = model(inputs, labels=labels)
#         loss = outputs.loss

#         # Backward pass and optimization
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#     # Track privacy budget
#     epsilon, best_alpha = privacy_engine.get_privacy_spent(delta=1e-5)
#     print(f"Epoch {epoch + 1}: Spent privacy budget: ε = {epsilon}, δ = {best_alpha}")

# # Save the model
# model.save_pretrained("my_private_gpt2_model")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  z = np.log((np.exp(t) + q - 1) / q)
  warn_deprecated('make_functional', 'torch.func.functional_call')
  warn_deprecated('grad')
  warn_deprecated('vmap', 'torch.vmap')


In [13]:
from opacus.utils.batch_memory_manager import BatchMemoryManager

for epoch in range(1, EPOCHS+1):
    losses = []

    with BatchMemoryManager(
        data_loader=train_dataloader, 
        max_physical_batch_size=MAX_PHYSICAL_BATCH_SIZE, 
        optimizer=optimizer
    )   as memory_safe_data_loader:

        for step, batch in enumerate(tqdm(train_dataloader)):
            optimizer.zero_grad()
            batch = tuple(t.to(device) for t in batch)
  
            inputs = {'input_ids':    batch[0],
                    'attention_mask': batch[1]
    
                    }

            outputs = model(**inputs, labels=batch[0]) # output = loss, logits, hidden_states, attentions
            # loss = optimizer(logits.view(-1, logits.shape[-1]), train_labels.view(-1))
            loss = outputs[0]
            loss = outputs.loss
           
            losses.append(loss.item())
            loss.backward()
            optimizer.step()

            if step > 0 and step % LOGGING_INTERVAL == 0:
                train_loss = np.mean(losses)
                eps = privacy_engine.get_epsilon(DELTA)

                eval_loss, eval_accuracy = evaluate(model)

                print(
                  f"Epoch: {epoch} | "
                  f"Step: {step} | "
                  f"Train loss: {train_loss:.3f} | "
                  f"Eval loss: {eval_loss:.3f} | "
                  f"Eval accuracy: {eval_accuracy:.3f} | "
                  f"ɛ: {eps:.2f}"
                )


  0%|          | 0/4000 [00:00<?, ?it/s]

ValueError: Per sample gradient is not initialized. Not updated in backward pass?