In [33]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import random
import pandas as pd

# ==========================================
# 1. SETUP DO MODELO
# ==========================================
model_name = "SamLowe/roberta-base-go_emotions"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
model.eval()

labels = model.config.id2label
label2id = model.config.label2id

# Configurações do Alvo
base_text = "I am having a great day at the park."
target_emotion = "anger"
joy_emotion = "joy"

target_idx = label2id[target_emotion]
joy_idx = label2id[joy_emotion]

# ==========================================
# 2. FUNÇÕES DE ATAQUE
# ==========================================

def get_loss_and_grads(model, input_ids, target_idx, joy_idx):
    embeds = model.get_input_embeddings()(input_ids).detach().requires_grad_(True)
    attention_mask = torch.ones(input_ids.shape, device=device)
    
    logits = model(inputs_embeds=embeds, attention_mask=attention_mask).logits
    
    # LOGIT MARGIN LOSS: Puxa Anger pra cima e Joy pra baixo
    loss = logits[0, joy_idx] - logits[0, target_idx]
    
    model.zero_grad()
    loss.backward()
    return loss.detach(), embeds.grad.detach()

def get_top_k_candidates(grads, suffix_slice, topk=256):
    suffix_grads = grads[0, suffix_slice, :]
    return torch.topk(-suffix_grads, topk, dim=-1).indices

def get_best_candidate(model, input_ids, suffix_slice, top_candidates, target_idx, joy_idx, batch_size=512):
    batch_input_ids = input_ids.repeat(batch_size, 1)
    
    # Criar variações aleatórias baseadas nos Top-K
    for i in range(batch_size):
        pos = random.randint(0, (suffix_slice.stop - suffix_slice.start) - 1)
        new_token = random.choice(top_candidates[pos])
        batch_input_ids[i, suffix_slice.start + pos] = new_token
        
    with torch.no_grad():
        logits = model(batch_input_ids).logits
        # Perda: queremos o menor valor de (Joy - Anger)
        losses = logits[:, joy_idx] - logits[:, target_idx]
        
    best_idx = torch.argmin(losses)
    return batch_input_ids[best_idx : best_idx + 1], losses[best_idx]

# ==========================================
# 3. LOOP DE ATAQUE COM PACIÊNCIA
# ==========================================

# O fator multiplicador quanto maior o número, mais "agressivo" será o sufixo adversarial, 

adv_suffix = "!!! " * 4
input_ids = tokenizer.encode(base_text + " " + adv_suffix, return_tensors="pt").to(device)

base_len = len(tokenizer.encode(base_text, add_special_tokens=False))
suffix_len = len(tokenizer.encode(adv_suffix, add_special_tokens=False))
suffix_slice = slice(base_len + 1, base_len + 1 + suffix_len)

history = []
best_prob = 0.0
best_input_ids = input_ids.clone()
patience = 0
max_patience = 40  # Se não melhorar em 40 passos, reverte

print(f"Buscando 'Jailbreak' Semântico,: {joy_emotion} -> {target_emotion}")
print("base_text:", repr(base_text))
for i in range(1000):
    # Passo de Gradiente
    loss, grads = get_loss_and_grads(model, input_ids, target_idx, joy_idx)
    top_candidates = get_top_k_candidates(grads, suffix_slice, topk=512)
    
    # Busca de Candidatos
    input_ids, best_loss = get_best_candidate(model, input_ids, suffix_slice, top_candidates, target_idx, joy_idx)
    
    # Avaliação
    with torch.no_grad():
        res_logits = model(input_ids).logits
        probs = torch.softmax(res_logits, dim=-1)[0]
        prob_target = probs[target_idx].item()
        pred_label = labels[torch.argmax(res_logits).item()]
    
    # Lógica de Best Suffix e Reversão
    if prob_target > best_prob:
        best_prob = prob_target
        best_input_ids = input_ids.clone()
        patience = 0
    else:
        patience += 1
    
    # Reversão se estagnar
    if patience >= max_patience:
        input_ids = best_input_ids.clone()
        # Injeta mutação aleatória para sair do buraco
        for _ in range(2):
            p_mut = random.randint(0, suffix_len - 1)
            input_ids[0, suffix_slice.start + p_mut] = random.randint(0, tokenizer.vocab_size - 1)
        patience = 0
        print(f" -> [REVERSÃO] Voltando para melhor prob: {best_prob:.2%}")

    history.append({'passo': i, 'adv_suffix': repr(sfx_text), 'prob': prob_target, 'pred': pred_label})

    if i % 20 == 0 or prob_target > 0.7:
        sfx_text = tokenizer.decode(input_ids[0, suffix_slice])
        print(f"Passo {i:03d} | Prob Anger: {prob_target:.2%} | Pred: {pred_label} | Sufixo: {repr(sfx_text)[:40]}")
        
        if prob_target > 0.7:
            print("\n!!! SUCESSO !!!")
            break

# Exportar Resultados
df = pd.DataFrame(history)
print("\nTop 5 Melhores Momentos:")
print(df.sort_values(by='prob', ascending=False).head(5))

Buscando 'Jailbreak' Semântico,: joy -> anger
base_text: 'I am having a great day at the park.'
Passo 000 | Prob Anger: 0.05% | Pred: excitement | Sufixo: ' !!! when!!! !!! !!!'
Passo 020 | Prob Anger: 0.35% | Pred: desire | Sufixo: ' want some international." because G de
Passo 040 | Prob Anger: 16.05% | Pred: desire | Sufixo: ' want that international." F You very d
Passo 060 | Prob Anger: 22.37% | Pred: desire | Sufixo: ' want that 2017." F You political K'
Passo 080 | Prob Anger: 23.03% | Pred: desire | Sufixo: ' want that 2018." F You little director
 -> [REVERSÃO] Voltando para melhor prob: 33.64%
Passo 100 | Prob Anger: 7.52% | Pred: desire | Sufixo: ' want that May." 0 You politicalit'
Passo 120 | Prob Anger: 34.67% | Pred: anger | Sufixo: ' want that National members." You polit
Passo 140 | Prob Anger: 21.04% | Pred: desire | Sufixo: ' want that Mr health." You political F'
Passo 160 | Prob Anger: 22.03% | Pred: desire | Sufixo: ' want that Mr health." You total F'
Passo 180 |

In [49]:
model.eval()

adv_suffix = ' but run away of." You billion F'
prompt = base_text + " " + adv_suffix
print(prompt)
with torch.no_grad():
    # Passa o texto base + o sufixo adversarial encontrado
    final_input = tokenizer(prompt, return_tensors="pt").to(device)
    logits = model(**final_input).logits
    
    # Pega o índice da classe com maior valor
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    prediction_label = model.config.id2label[predicted_class_id]
    
    # Calcula as probabilidades (0 a 100%)
    probs = torch.softmax(logits, dim=-1)[0]
    
    print(f"\nRESULTADO DO ATAQUE:")
    print(f"Frase: {base_text} {adv_suffix}")
    print(f"Predição Final: {prediction_label} ({probs[predicted_class_id]:.2%})")
    print(f"Confiança na classe alvo ({target_emotion}): {probs[target_idx]:.2%}")

I am having a great day at the park.  but run away of." You billion F

RESULTADO DO ATAQUE:
Frase: I am having a great day at the park.  but run away of." You billion F
Predição Final: anger (42.48%)
Confiança na classe alvo (anger): 42.48%
