In [1]:
import transformers
import torch
import torch.nn.functional as F
import opacus
from opacus import PrivacyEngine
from datasets import load_dataset
from torch.utils.data import DataLoader

In [2]:

# Set up device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the GPT-2 model
model = transformers.GPT2LMHeadModel.from_pretrained('distilgpt2').to(device)
tokenizer = transformers.GPT2Tokenizer.from_pretrained('distilgpt2')
# Set up the optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [3]:

dataset = load_dataset("dair-ai/emotion", split="test")
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples['text'], return_tensors="pt", max_length=512, truncation=True, padding="max_length")

tokenized_datasets = dataset.map(tokenize_function, batched=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
# Convert the dataset to PyTorch format
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [6]:


train_loader = DataLoader(tokenized_datasets, batch_size=4, shuffle=True)

In [7]:
# Initialize the Privacy Engine
privacy_engine = PrivacyEngine(
    model,
    batch_size=4,
    sample_size=len(tokenized_datasets),
    epochs=3,
    max_grad_norm=0.1,
    target_epsilon=3,
)
privacy_engine.attach(optimizer)

In [8]:
# Training loop
num_epochs = 3  # Set the number of epochs

for epoch in range(num_epochs):
    for batch in train_loader:
        # Ensure model is in train mode

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        model.train()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        
        # GPT-2 outputs loss as a scalar by default, averaging over the entire batch.
        # We reshape it to (-1, model's output size) to compute per-token loss
        logits = outputs.logits.view(-1, outputs.logits.size(-1))
        labels = input_ids.view(-1)
        loss = F.cross_entropy(logits, labels, reduction='none')

        # Reshape loss to obtain per-example loss (assuming equal length sequences)
        # loss = loss.view(batch['input_ids'].size(0), -1).mean(dim=1)
        # Backward and optimize
        optimizer.zero_grad()  # Clear existing gradients
        optimizer.step(loss=loss)

    # Print privacy budget spent so far
    epsilon, best_alpha = optimizer.
    print(f"Epoch: {epoch}, Epsilon spent: {epsilon}, Best alpha: {best_alpha}")





AttributeError: 'PrivacyEngine' object has no attribute 'accountant'

In [None]:
# Save the model
model.save_pretrained("my_private_gpt2_model")