<a href="https://colab.research.google.com/github/Naomie25/DI-Bootcamp/blob/main/Week8_Day4_DailyChallenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import string

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertEncoder
from sklearn.metrics import roc_auc_score

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
TRAIN_PATH = "/content/train_essays.csv"
TEST_PATH = "/content/test_essays.csv"
PROMPT_PATH = "/content/train_prompts.csv"

src_train = pd.read_csv(TRAIN_PATH)
src_prompt = pd.read_csv(PROMPT_PATH)
src_test=pd.read_csv(TEST_PATH)
#src_sub = TODO

In [4]:
src_train.head()

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


In [5]:
print(src_train.shape)
print(src_prompt.shape)
print(src_test.shape)

(1378, 4)
(2, 4)
(3, 3)


In [20]:
tokenizer_save_path = "bert-base-uncased"
model_save_path = "fine_tuned_bert"

tokenizer = BertTokenizer.from_pretrained(tokenizer_save_path)

pretrained_model = BertForSequenceClassification.from_pretrained(tokenizer_save_path, num_labels=2)

embedding_model = BertEncoder(BertConfig())


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
#4. Set Hyperparameters
train_batch_size = 16           # or 32 if your GPU allows it
test_batch_size = 32            # inference usually uses a larger batch size
lr = 2e-5                       # standard learning rate for BERT fine-tuning
beta1 = 0.9                     # Adam optimizer's beta1 parameter (common default)
nz = 100                        # latent vector size (used in GANs, not typical for BERT)
num_epochs = 3                  # start with 3-5 epochs for BERT fine-tuning
num_hidden_layers = 12          # BERT-base uses 12 hidden layers
train_ratio = 0.8               # 80% of the data for training, 20% for validation

In [22]:
print(src_train.columns)

Index(['id', 'prompt_id', 'text', 'generated'], dtype='object')


In [23]:
#5. Prepare the Data for Training
# Assuming src_train = pd.read_csv(TRAIN_PATH)
# It contains columns: 'text' (input), 'label' (0=human, 1=AI-generated)

all_num = len(src_train)
train_num = int(all_num * train_ratio)
test_num = all_num - train_num

# Split your dataset
train_set = src_train.iloc[:train_num]
test_set = pd.concat([
    src_train.iloc[train_num:]
]).reset_index(drop=True)

# Define Dataset class (if not already defined)
class GANDAIGDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Create dataset objects
train_dataset = GANDAIGDataset(
    texts=list(train_set['text']),
    labels=list(train_set['generated'])
)

test_dataset = GANDAIGDataset(
    texts=list(test_set['text']),
    labels=list(test_set['generated'])
)

# Create DataLoaders
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)


In [33]:
#6. Generator definition
from transformers import BertModel

config = BertConfig(num_hidden_layers=num_hidden_layers)

class Generator(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, 256 * 128)

        self.conv_net = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2),
            nn.Conv1d(256, 256, kernel_size=3, padding=1),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2)
        )

        self.fc_embed = nn.Linear(256, 768)

        # Use BertModel instead of BertEncoder here
        self.bert_model = BertModel(config)

    def forward(self, x):
        x = self.fc(x)
        x = x.view(-1, 256, 128)
        x = self.conv_net(x)
        x = x.transpose(1, 2)
        x = self.fc_embed(x)

        # Now pass inputs_embeds to BertModel, not BertEncoder
        outputs = self.bert_model(inputs_embeds=x)

        return outputs.last_hidden_state





In [46]:
#7. Define the Discriminator Model

class SumBertPooler(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        mean_embeddings = hidden_states.mean(dim=1)
        return mean_embeddings
class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert_encoder = BertEncoder(config)
        self.bert_encoder.layer = nn.ModuleList([
            layer for layer in pretrained_model.bert.encoder.layer[:6]
        ])
        self.pooler = SumBertPooler()
        self.classifier = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 1)
        )

    def forward(self, hidden_states, attention_mask=None):
        out = self.bert_encoder(hidden_states, attention_mask=attention_mask)
        pooled = self.pooler(out.last_hidden_state)  # or pass attention_mask if you update pooler
        logits = self.classifier(pooled)
        return torch.sigmoid(logits).view(-1)


In [47]:
#8. Train the Model
# ===== Functions =====

embedding_model = BertModel.from_pretrained('bert-base-uncased').to(device)

def preparation_embedding(texts):
    encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
    input_ids = encodings['input_ids']
    token_type_ids = encodings.get('token_type_ids')
    attention_mask = encodings['attention_mask']

    outputs = embedding_model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    return outputs.last_hidden_state


def eval_auc(model):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch in test_loader:
            texts, labels = batch
            embeded = preparation_embedding(texts)
            outputs = model(embeded)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(labels.float().numpy())

    auc = roc_auc_score(actuals, predictions)
    print("AUC:", auc)
    return auc

def get_model_info_dict(model, epoch, auc_score):
    current_device = next(model.parameters()).device
    model.to('cpu')
    model_info = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'auc_score': auc_score,
    }
    model.to(current_device)
    return model_info

def GAN_step(optimizerG, optimizerD, netG, netD, real_data, label, epoch, i):
    netD.zero_grad()
    batch_size = real_data.size(0)

    output = netD(real_data)
    errD_real = criterion(output, label)
    errD_real.backward()
    D_x = output.mean().item()

    noise = torch.randn(batch_size, nz, device=device)
    fake_data = netG(noise).detach()
    label_fake = torch.zeros(batch_size, device=device)

    output = netD(fake_data)
    errD_fake = criterion(output, label_fake)
    errD_fake.backward()
    D_G_z1 = output.mean().item()

    errD = errD_real + errD_fake
    optimizerD.step()

    netG.zero_grad()
    label_gen = torch.ones(batch_size, device=device)
    output = netD(fake_data)
    errG = criterion(output, label_gen)
    errG.backward()
    D_G_z2 = output.mean().item()
    optimizerG.step()

    if i % 50 == 0:
        print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f' % (
            epoch, num_epochs, i, len(train_loader), errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

    return optimizerG, optimizerD, netG, netD

# ===== Initialize models, criterion, optimizers =====
netG = Generator(input_dim=nz).to(device)
netD = Discriminator().to(device)

criterion = nn.BCELoss()
optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999))

# ===== Training =====
model_infos = []
for epoch in range(num_epochs):
    netG.train()
    netD.train()
    for i, data in enumerate(train_loader):
        texts, labels = data
        labels = labels.float().to(device)

        with torch.no_grad():
            embeded = preparation_embedding(texts).to(device)

        optimizerG, optimizerD, netG, netD = GAN_step(
            optimizerG=optimizerG,
            optimizerD=optimizerD,
            netG=netG,
            netD=netD,
            real_data=embeded,
            label=labels,
            epoch=epoch, i=i)

    auc_score = eval_auc(netD)
    model_infos.append(get_model_info_dict(netD, epoch, auc_score))

print('Train complete!')



[0/3][0/69] Loss_D: 1.2923 Loss_G: 0.9517 D(x): 0.4627 D(G(z)): 0.4885 / 0.3864
[0/3][50/69] Loss_D: 0.0234 Loss_G: 4.5839 D(x): 0.0123 D(G(z)): 0.0110 / 0.0103
AUC: 0.9927272727272727
[1/3][0/69] Loss_D: 0.0140 Loss_G: 5.0309 D(x): 0.0069 D(G(z)): 0.0071 / 0.0067
[1/3][50/69] Loss_D: 0.0066 Loss_G: 5.8194 D(x): 0.0035 D(G(z)): 0.0031 / 0.0030
AUC: 0.9745454545454545
[2/3][0/69] Loss_D: 0.0066 Loss_G: 6.0350 D(x): 0.0038 D(G(z)): 0.0028 / 0.0024
[2/3][50/69] Loss_D: 0.0037 Loss_G: 6.4340 D(x): 0.0021 D(G(z)): 0.0017 / 0.0017
AUC: 0.9672727272727273
Train complete!
