In [1]:
!pip install torch transformers --quiet

In [2]:
import torch
from transformers import AutoTokenizer
print("PyTorch version:", torch.__version__)

PyTorch version: 2.7.0+cpu


In [3]:
# Load dataset and prepare "surprise" tweets
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch

# 1. Load emotion dataset
dataset = load_dataset("dair-ai/emotion")
df = pd.DataFrame(dataset["train"])

# 2. Get "surprise" tweets (label=5)
surprise_tweets = df[df["label"] == 5]["text"].tolist()

# 3. Tokenize with BERT
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
inputs = tokenizer(surprise_tweets, return_tensors="pt", padding=True, truncation=True, max_length=64)

# 4. Get embeddings
with torch.no_grad():
    outputs = model(**inputs)
    real_embeddings = outputs.last_hidden_state.mean(dim=1)

print(f"Preprocessed {len(real_embeddings)} 'surprise' tweets")

Preprocessed 572 'surprise' tweets


In [4]:
import torch.nn as nn

class Generator(nn.Module):
    def __init__(self):
        super().__init__()
        self.main = nn.Sequential(
            nn.Linear(100 + 6, 256),  # Noise (100) + one-hot label (6 emotions)
            nn.LeakyReLU(0.2),
            nn.Linear(256, 768)  # Output size = BERT embedding size
        )
    
    def forward(self, noise, labels):
        one_hot = torch.zeros(labels.size(0), 6)
        one_hot.scatter_(1, labels.unsqueeze(1), 1)
        x = torch.cat([noise, one_hot], dim=1)
        return self.main(x)

In [5]:
class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.main = nn.Sequential(
            nn.Linear(768 + 6, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )
    
    def forward(self, emb, labels):
        one_hot = torch.zeros(labels.size(0), 6)
        one_hot.scatter_(1, labels.unsqueeze(1), 1)
        x = torch.cat([emb, one_hot], dim=1)
        return self.main(x)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator = Generator().to(device)
discriminator = Discriminator().to(device)

optimizer_G = torch.optim.Adam(generator.parameters(), lr=0.0002)
optimizer_D = torch.optim.Adam(discriminator.parameters(), lr=0.0002)

# Training loop
for epoch in range(1000):
    # Real data
    real_emb = real_embeddings.to(device)
    real_labels = torch.full((32,), 5).to(device)  # All "surprise"
    
    # Train Discriminator
    noise = torch.randn(32, 100).to(device)
    fake_emb = generator(noise, real_labels)
    
    d_real = discriminator(real_emb[:32], real_labels)
    d_fake = discriminator(fake_emb.detach(), real_labels)
    d_loss = -(torch.log(d_real).mean() + torch.log(1 - d_fake).mean())
    
    optimizer_D.zero_grad()
    d_loss.backward()
    optimizer_D.step()
    
    # Train Generator
    g_loss = -torch.log(discriminator(fake_emb, real_labels)).mean()
    optimizer_G.zero_grad()
    g_loss.backward()
    optimizer_G.step()
    
    if epoch % 100 == 0:
        print(f"Epoch {epoch}: D_loss={d_loss.item():.4f}, G_loss={g_loss.item():.4f}")

Epoch 0: D_loss=1.3844, G_loss=0.6940
Epoch 100: D_loss=0.7941, G_loss=1.2693
Epoch 200: D_loss=0.7440, G_loss=1.6468
Epoch 300: D_loss=0.3233, G_loss=2.4636
Epoch 400: D_loss=0.3661, G_loss=1.8779
Epoch 500: D_loss=0.7391, G_loss=1.0690
Epoch 600: D_loss=0.8308, G_loss=1.0031
Epoch 700: D_loss=0.6263, G_loss=1.3456
Epoch 800: D_loss=0.6436, G_loss=1.3527
Epoch 900: D_loss=1.1535, G_loss=0.9475


In [7]:
# Generate fake "surprise" tweets
noise = torch.randn(1000, 100).to(device)
fake_labels = torch.full((1000,), 5).to(device)  # All "surprise"
fake_embeddings = generator(noise, fake_labels)

# Save for later
torch.save(fake_embeddings, "fake_surprise_embeddings.pt")
print("Generated 1000 synthetic 'surprise' samples!")

Generated 1000 synthetic 'surprise' samples!
