### 📦 **1. Setup environnement**

In [1]:
!pip install -q wandb transformers==4.41.0 kaggle

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25h

### 📁 **2. Téléchargement des données**


In [2]:
import os, zipfile, shutil
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd, numpy as np
import wandb
wandb.login()  # 🔑 Collez votre clé API quand demandé
from sklearn.metrics import roc_auc_score
from transformers import BertTokenizer, BertModel, BertConfig
from google.colab import drive, files
drive.mount('/content/drive')



if not os.path.exists("llm-detect-ai-generated-text"):
    !kaggle competitions download -c llm-detect-ai-generated-text
    with zipfile.ZipFile("llm-detect-ai-generated-text.zip") as zf:
        zf.extractall()

TRAIN_PATH  = "train_essays.csv"
TEST_PATH   = "test_essays.csv"
PROMPT_PATH = "train_prompts.csv"


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mk_benyahia[0m ([33mk_benyahia-pstb[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Mounted at /content/drive
Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 4, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.11/dist-packages/kaggle/__init__.py", line 6, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.11/dist-packages/kaggle/api/kaggle_api_extended.py", line 434, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.config/kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/


### ⚙️ **4. Hyper-paramètres**

In [15]:
train_batch_size  = 64
test_batch_size   = 64
max_length        = 128
num_hidden_layers = 2
train_ratio       = 0.9
nz                = 100
lr                = 1e-4
beta1             = 0.3
num_epochs        = 50
device            = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer        = BertTokenizer.from_pretrained("bert-base-uncased")
embedding_model  = BertModel.from_pretrained("bert-base-uncased").to(device).eval()

### 📚 **5. Dataset**

In [16]:
src_train = pd.read_csv(TRAIN_PATH)
all_num   = len(src_train)
train_num = int(all_num * train_ratio)

class GANDataset(Dataset):
    def __init__(self, texts, labels=None):
        self.texts, self.labels = texts, labels
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        if self.labels is not None:
            return self.texts[idx], self.labels[idx]
        else:
            return self.texts[idx]

train_ds = GANDataset(src_train["text"][:train_num].tolist(), src_train["generated"][:train_num].tolist())
test_ds  = GANDataset(src_train["text"][train_num:].tolist(), src_train["generated"][train_num:].tolist())

train_loader = DataLoader(train_ds, batch_size=train_batch_size, shuffle=True, drop_last=True)
test_loader  = DataLoader(test_ds,  batch_size=test_batch_size, shuffle=False)

### 🧠 **6. Modèles**

In [17]:
from transformers.models.bert.modeling_bert import BertModel

class Generator(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Sequential(nn.Linear(input_dim, 512), nn.ReLU(),
                                nn.Linear(512, 256*128), nn.ReLU())
        self.conv = nn.Sequential(
            nn.ConvTranspose1d(256, 128, 4, 2, 1), nn.ReLU(),
            nn.ConvTranspose1d(128, 64, 4, 2, 1),  nn.ReLU(),
            nn.ConvTranspose1d(64, 768, 4, 2, 1),  nn.Tanh())
        cfg = BertConfig(hidden_size=768, num_hidden_layers=num_hidden_layers,
                         max_position_embeddings=max_length, vocab_size=1)
        self.bert = BertModel(cfg, add_pooling_layer=False)

    def forward(self, z):
        x = self.fc(z).view(-1, 256, 128)
        x = self.conv(x).transpose(1,2)[:, :max_length, :]
        return self.bert(inputs_embeds=x).last_hidden_state

class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        cfg = BertConfig(hidden_size=768, num_hidden_layers=2,
                         max_position_embeddings=max_length, vocab_size=1)
        self.bert = BertModel(cfg, add_pooling_layer=False)
        self.pool = lambda h: h.mean(dim=1)
        self.clf  = nn.Sequential(nn.Linear(768, 256), nn.ReLU(), nn.Linear(256, 1))

    def forward(self, x, mask=None):
        h = self.bert(inputs_embeds=x, attention_mask=mask).last_hidden_state
        return torch.sigmoid(self.clf(self.pool(h))).squeeze(-1)


### 🔧 **7. Embedder helper**

In [18]:
def embed(texts):
    enc = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=max_length)
    ids, mask = enc["input_ids"].to(device), enc["attention_mask"].to(device)
    with torch.no_grad():
        emb = embedding_model(input_ids=ids, attention_mask=mask).last_hidden_state
    return emb, mask

### 🚀 **8. Entraînement avec wandb**

In [19]:
netG = Generator(nz).to(device)
netD = Discriminator().to(device)
crit = nn.BCELoss()
optD = optim.Adam(netD.parameters(), lr=lr, betas=(beta1, 0.999))
optG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999))

run = wandb.init(
    project="llm-detect-ai-gan-v3",
    config={
        "lr": lr, "batch_size": train_batch_size, "epochs": num_epochs,
        "nz": nz, "max_len": max_length, "beta1": beta1
    }
)

for epoch in range(num_epochs):
    netG.train(); netD.train()
    for i, (texts, labels) in enumerate(train_loader):
        emb, mask = embed(texts)
        y = labels.float().to(device)

        # Discriminateur
        optD.zero_grad()
        real = netD(emb, mask)
        loss_real = crit(real, torch.full_like(y, 0.9))
        noise = torch.randn(len(y), nz, device=device)
        fake = netG(noise)
        fake_mask = torch.ones(fake.size(0), fake.size(1), device=device)
        loss_fake = crit(netD(fake.detach(), fake_mask), torch.full_like(y, 0.1))
        lossD = loss_real + loss_fake
        lossD.backward(); optD.step()

        # Générateur
        optG.zero_grad()
        lossG = crit(netD(fake, fake_mask), torch.full_like(y, 0.9))
        lossG.backward(); optG.step()

        if i % 50 == 0:
            wandb.log({"epoch": epoch, "step": i, "lossD": lossD.item(), "lossG": lossG.item()})

    # AUC sur test
    netD.eval(); all_preds, all_labels = [], []
    with torch.no_grad():
        for texts, labels in test_loader:
            emb, mask = embed(texts)
            preds = netD(emb, mask)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.numpy())
    auc = roc_auc_score(all_labels, all_preds)
    wandb.log({"epoch": epoch, "AUC": auc})

wandb.finish()

0,1
AUC,█▅▇▅▃▇▅▅▂▁▁▂▂▅▅██▇▇▆▇▇▆▆▇▅▅▅▅▅▅▅▅▆▇▆▆▆▆▆
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████
lossD,▇▂▄▇▁▂▅▃▂▃▂█▄▆█▆▄▄▄▃▃▄▂▄▄▄▄▅▃▄▄▅▄▄▅▃▄▄▄▄
lossG,▂▃▄▃▄▅▂▁▄▁▄█▃▁▃▂▃▁▂▂▂▂▃▂▁▂▁▂▁▂▁▁▁▂▃▂▂▃▂▁
step,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
AUC,0.54015
epoch,49.0
lossD,1.07454
lossG,1.19226
step,0.0


### 📤 **9. Inférence & soumission**

In [14]:
# Inférence
netD.eval()
test_df = pd.read_csv(TEST_PATH)
all_preds = []

with torch.no_grad():
    for texts in DataLoader(GANDataset(test_df["text"].tolist()), batch_size=test_batch_size):
        emb, mask = embed(texts)
        preds = netD(emb)
        all_preds.extend(preds.cpu().numpy().flatten())

sub = pd.DataFrame({"id": test_df["id"], "generated": all_preds})
sub.to_csv("submission.csv", index=False)
print("\n📄 Aperçu submission.csv :")
print(sub.head())
files.download("submission.csv")


📄 Aperçu submission.csv :
         id  generated
0  0000aaaa   0.616513
1  1111bbbb   0.622738
2  2222cccc   0.721280


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>