# Setup


In [1]:
# 1. Imports e setup
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from opacus import PrivacyEngine

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.metrics import accuracy_score, f1_score
def evaluate_model(model, dataloader):
    """
    Avalia o modelo em um conjunto de dados e retorna as métricas de acurácia e F1-score
    """
    # Certifique-se de que o modelo esteja em modo de avaliação
    model.eval()

    # Coletar todas as previsões e rótulos reais
    y_true = []
    y_pred = []

    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            preds = torch.argmax(logits, dim=1)
            y_true.extend(y.cpu().tolist())
            y_pred.extend(preds.cpu().tolist())

    # Calcular métricas
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')  # usa média ponderada por classe

    print(f"Acurácia: {acc:.4f}")
    print(f"F1-score (weighted): {f1:.4f}")


In [3]:
# 2. Carregamento dos dados tokenizados
df = pd.read_csv("dataPrep/data/movies_tokenized.csv")  # Certifique-se de que esse arquivo existe

# Extraindo apenas o primeiro gênero como rótulo (ou 'Unknown' se vazio)
df['genre'] = df['genres'].fillna("[]").apply(lambda x: eval(x)[0]['name'] if eval(x) else 'Unknown')

# Converter a string dos tokens para lista de inteiros
df['overview_tokens'] = df['overview_tokens'].apply(lambda x: eval(x) if isinstance(x, str) else [])

# Remover linhas com tokens vazios
df = df[df['overview_tokens'].apply(len) > 0]

# Codificar os gêneros em inteiros
le = LabelEncoder()
df['label'] = le.fit_transform(df['genre'])

  df = pd.read_csv("dataPrep/data/movies_tokenized.csv")  # Certifique-se de que esse arquivo existe


In [4]:
# 3. Dataset PyTorch
class MovieDataset(Dataset):
    def __init__(self, token_ids, labels, max_len=128):
        self.token_ids = token_ids
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.token_ids)

    def __getitem__(self, idx):
        x = self.token_ids[idx][:self.max_len]
        x = x + [0] * (self.max_len - len(x))  # Padding manual
        return torch.tensor(x), torch.tensor(self.labels[idx])

# Criar o dataset e dataloader
dataset = MovieDataset(df['overview_tokens'].tolist(), df['label'].tolist())
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model Definition

In [5]:
# 4. Modelo simples: embedding + média + linear
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)              # [B, T, D]
        pooled = embedded.mean(dim=1)             # [B, D]
        return self.fc(pooled)

# Tamanho do vocabulário e número de classes
vocab_size = max([max(seq) for seq in df['overview_tokens']]) + 1
num_classes = df['label'].nunique()

# Instanciar o modelo
model = TextClassifier(vocab_size, 128, num_classes).to(device)

# DP-SGD

In [6]:
# 5. Treinamento com DP-SGD (Opacus)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Attach o PrivacyEngine
privacy_engine = PrivacyEngine()
model, optimizer, dataloader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=dataloader,
    noise_multiplier=1.0,
    max_grad_norm=1.0,
)




In [None]:
# Loop de treino
model.train()
for epoch in range(3):
    total_loss = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss:.4f}")

In [None]:
# 6. Cálculo do epsilon após o treino E avaliação do modelo
epsilon = privacy_engine.accountant.get_epsilon(delta=1e-5)
print(f"ε = {epsilon:.2f}, δ = 1e-5")
evaluate_model(model, dataloader)

  mesh_size = eps_error / np.sqrt(
  t_min = np.floor(t_min / dt) * dt
  t_max = np.ceil(t_max / dt) * dt


ValueError: cannot convert float NaN to integer

# Baseline

In [None]:
# --- BASELINE SEM PRIVACIDADE ---
from torch.optim import Adam

model_baseline = TextClassifier(vocab_size, 128, num_classes).to(device)
optimizer = Adam(model_baseline.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

model_baseline.train()
for epoch in range(3):
    total_loss = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model_baseline(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"[BASELINE] Epoch {epoch+1} - Loss: {total_loss:.4f}")


KeyboardInterrupt: 

In [None]:
evaluate_model(model_baseline, dataloader)

Acurácia: 0.4552
F1-score (weighted): 0.3749


# Embedding Noise

In [None]:
def add_noise_to_embeddings(embedding_layer, sigma=0.1):
    with torch.no_grad():
        noise = torch.randn_like(embedding_layer.weight) * sigma
        embedding_layer.weight.add_(noise)


In [None]:
# --- EMBEDDING PERTURBATION ---
model_embed = TextClassifier(vocab_size, 128, num_classes).to(device)

# Aplica ruído antes do treino
add_noise_to_embeddings(model_embed.embedding, sigma=0.1)

optimizer = Adam(model_embed.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

model_embed.train()
for epoch in range(3):
    total_loss = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model_embed(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"[EMBED NOISE] Epoch {epoch+1} - Loss: {total_loss:.4f}")


[EMBED NOISE] Epoch 1 - Loss: 3374.7267
[EMBED NOISE] Epoch 2 - Loss: 3036.5760
[EMBED NOISE] Epoch 3 - Loss: 2778.3562


In [None]:
evaluate_model(model_embed, dataloader)

Acurácia: 0.4502
F1-score (weighted): 0.3667


In [None]:
import numpy as np

# Parâmetros
sigma = 0.1          # mesmo que você usou ao aplicar ruído
delta = 1e-5
sensitivity = 1.0    # padrão

# Fórmula para mecanismo Gaussiano (epsilon aproximado)
epsilon = (np.sqrt(2 * np.log(1.25 / delta)) * sensitivity) / sigma

# Se aplicou ruído T vezes (ex: por época), multiplique:
T = 3  # ou 3, se adicionou ruído por época
epsilon_total = epsilon * T

print(f"ε ≈ {epsilon_total:.4f} (para σ = {sigma}, δ = {delta}, T = {T})")


ε ≈ 1.4534 (para σ = 10, δ = 1e-05, T = 3)


# TEM

In [None]:
model_tem = TextClassifier(vocab_size, 128, num_classes).to(device)

In [None]:
def apply_tem_noise(embedding_layer: torch.nn.Embedding, sigma: float = 0.1):
    """
    Aplica ruído gaussiano diretamente na camada de embeddings.
    """
    with torch.no_grad():
        noise = torch.randn_like(embedding_layer.weight) * sigma
        embedding_layer.weight.add_(noise)

# --- Aplicar TEM antes do treino ---
sigma_tem = 1.0  # ajuste para obter melhor privacidade (ε ↓)
apply_tem_noise(model_tem.embedding, sigma=sigma_tem)
print(f"TEM aplicado com sigma = {sigma_tem}")


TEM aplicado com sigma = 1.0


In [None]:
optimizer = torch.optim.Adam(model_tem.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

model_tem.train()
for epoch in range(3):
    total_loss = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model_tem(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"[TEM] Epoch {epoch+1} - Loss: {total_loss:.4f}")


[TEM] Epoch 1 - Loss: 3390.7705
[TEM] Epoch 2 - Loss: 3147.3419
[TEM] Epoch 3 - Loss: 2942.7239


In [None]:
evaluate_model(model_tem, dataloader)

Acurácia: 0.4174
F1-score (weighted): 0.3331


# MADLIB - TODO

# DP-SGD + TEM

In [None]:
# Instancia o modelo
model_combo = TextClassifier(vocab_size, 128, num_classes).to(device)

# --- Aplica ruído TEM às embeddings ---
sigma_tem = 1.0
def apply_tem_noise(embedding_layer: torch.nn.Embedding, sigma: float = 0.1):
    with torch.no_grad():
        noise = torch.randn_like(embedding_layer.weight) * sigma
        embedding_layer.weight.add_(noise)

apply_tem_noise(model_combo.embedding, sigma=sigma_tem)
print(f"[COMBO] TEM aplicado com σ = {sigma_tem}")


[COMBO] TEM aplicado com σ = 1.0


In [None]:
optimizer = torch.optim.Adam(model_combo.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Configura o PrivacyEngine
privacy_engine = PrivacyEngine()
model_combo, optimizer, dataloader_combo = privacy_engine.make_private(
    module=model_combo,
    optimizer=optimizer,
    data_loader=dataloader,
    noise_multiplier=1.0,      # σ do DP-SGD
    max_grad_norm=1.0,
)

# Treinamento
model_combo.train()
for epoch in range(3):
    total_loss = 0
    for x, y in dataloader_combo:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model_combo(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"[COMBO] Epoch {epoch+1} - Loss: {total_loss:.4f}")




[COMBO] Epoch 1 - Loss: 3618.0475
[COMBO] Epoch 2 - Loss: 3465.2856
[COMBO] Epoch 3 - Loss: 3453.5662


In [None]:
epsilon_combo = privacy_engine.accountant.get_epsilon(delta=1e-5)
print(f"[COMBO] ε (DP-SGD): {epsilon_combo:.2f} | δ = 1e-5")
print(f"[COMBO] TEM aplicado com σ = {sigma_tem} (ε estimado separadamente ≈ {4.84 if sigma_tem==1.0 else 'recalcular'})")
evaluate_model(model_combo, dataloader)

[COMBO] ε (DP-SGD): 0.22 | δ = 1e-5
[COMBO] TEM aplicado com σ = 1.0 (ε estimado separadamente ≈ 4.84)
Acurácia: 0.2633
F1-score (weighted): 0.1101
