In [6]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install transformers scikit-learn pandas tqdm joblib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://download.pytorch.org/whl/cu121

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [7]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install dirtyjson

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# ================================================================
#  Text‑+‑Meta  •  clean split (800 logs train / 200 logs eval)
#  Computes F1, ROC‑AUC and false‑positive rate
# ================================================================
import json, os, random, numpy as np, pandas as pd, torch, torch.nn as nn
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
import joblib

# -------------------- Parameters ---------------------------------
SEED       = 42                 # shared seed for random, NumPy and PyTorch (applied below)
BATCH_SIZE = 16                 # batch size passed to both DataLoaders
EPOCHS     = 3                  # number of train + eval passes
LR         = 2e-5               # learning rate handed to AdamW
BACKBONE   = "roberta-base"     # checkpoint name used for the tokenizer and the encoder

# Names of the 20 features read from each record's "extra" dict, in the order
# they are laid out in the "meta" vector.
META_COLS = [
    "mean_pause_time_in_secs_threshold200",
    "total_insertions_chars_exclu_space",
    "total_pause_time_in_secs_threshold2000",
    "product_process_ratio",
    "mean_insertion_length_chars_exclu_space",
    "total_deletions_words",
    "mean_pause_time_before_sents_threshold200",
    "mean_deletion_length_chars",
    "num_of_insertions",
    "median_insertion_length_chars_exclu_space",
    "num_of_pause_within_words_threshold200",
    "sd_strokes_per_min_5_intervals",
    "median_length_Rburst_sec",
    "median_pause_time_between_words_threshold200",
    "num_of_pause_after_words_threshold200",
    "num_of_revisions",
    "sd_pause_time_before_words_threshold200",
    "total_number_of_pauses_threshold2000",
    "median_pause_time_before_words_threshold200",
    "mean_pause_time_before_words_threshold200",
]

# Seed every RNG used in this section.
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# ================================================================
# 1. Load enriched JSONL   (train‑* vs valid‑*/test‑*)
# ================================================================
def load_dataset(folder: str, label: int, split: str):
    """Load the ``*_enriched.jsonl`` files of one split from ``folder``.

    Files whose name starts with ``train-`` form the training split; every
    other enriched file (valid-*, test-*, ...) forms the evaluation split.

    Args:
        folder: Directory containing the JSONL files.
        label: Class label attached to every returned row.
        split: ``"train"`` keeps only ``train-*`` files, ``"eval"`` keeps only
            the remaining enriched files. Any other value applies no filter.

    Returns:
        A list of dicts with keys ``"text"``, ``"meta"`` (one value per entry
        of ``META_COLS``; 0.0 when the key is absent from ``obj["extra"]``)
        and ``"label"``.
    """
    rows = []
    # Sorted so the row order does not depend on the directory listing order.
    for fn in sorted(os.listdir(folder)):
        if not fn.endswith("_enriched.jsonl"):       # skip raw files
            continue
        is_train = fn.startswith("train-")
        if (split == "train" and not is_train) or (split == "eval" and is_train):
            continue
        # The context manager closes the handle; iterating a bare open() left it
        # to the garbage collector.
        with open(Path(folder) / fn, encoding="utf-8") as fh:
            for line in fh:
                if not line.strip():                 # blank lines are not JSON records
                    continue
                obj = json.loads(line)
                meta = [obj["extra"].get(c, 0.0) for c in META_COLS]
                rows.append({"text": obj["text"], "meta": meta, "label": label})
    return rows

# Rows from OpenLLMText_Human get label 0, rows from OpenLLMText_ChatGPT label 1.
# train-* files feed the training frame; the other enriched files feed the
# evaluation frame (named test_df).
train_rows = (load_dataset("OpenLLMText_Human", 0, "train") +
              load_dataset("OpenLLMText_ChatGPT", 1, "train"))
eval_rows  = (load_dataset("OpenLLMText_Human", 0, "eval")  +
              load_dataset("OpenLLMText_ChatGPT", 1, "eval"))

train_df = pd.DataFrame(train_rows)
test_df  = pd.DataFrame(eval_rows)
print(f"Train DF: {len(train_df)}   –   Eval DF: {len(test_df)}")

# ================================================================
# 2. Feature scaling  (fit on TRAIN only)
# ================================================================
# The scaler is fitted on the training features only and then applied to both
# frames. The "meta" column is overwritten in place, so running these lines
# again on the same frames would standardise already-standardised values.
scaler = StandardScaler().fit(np.vstack(train_df["meta"]))
train_df["meta"] = list(scaler.transform(np.vstack(train_df["meta"])))
test_df ["meta"] = list(scaler.transform(np.vstack(test_df ["meta"])))

# ================================================================
# 3. PyTorch Dataset / Loader
# ================================================================
# Tokenizer for BACKBONE; TextMetaDS.__getitem__ reads it as a module-level global.
tokenizer = AutoTokenizer.from_pretrained(BACKBONE)

class TextMetaDS(Dataset):
    """Dataset pairing each text with its meta-feature vector and label.

    Texts are tokenised lazily, one item at a time, with the module-level
    ``tokenizer``; meta vectors and labels are converted to float32 tensors
    once, at construction.
    """

    def __init__(self, df):
        # Stack the per-row vectors of the "meta" column into a single 2-D array.
        meta_matrix = np.vstack(df["meta"])
        label_values = df["label"].values
        self.texts = df["text"].tolist()
        self.metas = torch.tensor(meta_matrix, dtype=torch.float32)
        self.labels = torch.tensor(label_values, dtype=torch.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Every text is truncated/padded to exactly 512 tokens.
        encoded = tokenizer(
            self.texts[idx],
            max_length=512,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # The tokenizer output carries a leading batch axis of size 1; drop it.
        item = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["meta"] = self.metas[idx]
        item["label"] = self.labels[idx]
        return item

# Only the training loader shuffles; the evaluation loader is built without shuffle.
train_loader = DataLoader(TextMetaDS(train_df), batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(TextMetaDS(test_df),  batch_size=BATCH_SIZE)

# ================================================================
# 4. Model definition
# ================================================================
class TextMetaClassifier(nn.Module):
    """Binary classifier fusing a transformer text encoder with meta features.

    The encoder's pooled output is concatenated with a small MLP projection of
    the meta vector and mapped to a single logit.

    Args:
        meta_dim: Number of meta features per example.
        hidden_meta: Width of the meta-feature projection.
    """

    def __init__(self, meta_dim=20, hidden_meta=32):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(BACKBONE)

        # Freeze the embeddings and the first 6 encoder layers; the remaining
        # encoder parameters stay trainable.
        frozen_modules = [self.encoder.embeddings, *self.encoder.encoder.layer[:6]]
        for module in frozen_modules:
            for weight in module.parameters():
                weight.requires_grad = False

        self.meta_fc = nn.Sequential(
            nn.Linear(meta_dim, hidden_meta),
            nn.ReLU(),
            nn.Dropout(0.1),
        )
        fused_width = self.encoder.config.hidden_size + hidden_meta
        self.classifier = nn.Linear(fused_width, 1)

    def forward(self, ids, mask, meta):
        """Return one logit per example (the trailing size-1 axis is squeezed)."""
        text_repr = self.encoder(ids, attention_mask=mask).pooler_output
        meta_repr = self.meta_fc(meta)
        fused = torch.cat([text_repr, meta_repr], dim=1)
        return self.classifier(fused).squeeze(1)

# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model  = TextMetaClassifier().to(device)
# The model outputs raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
# Only parameters that still require gradients are handed to the optimiser.
optim     = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()),
                              lr=LR)

# ================================================================
# 5. Train & evaluate
# ================================================================
def run(loader, train=True):
    """Run one pass over ``loader`` with the module-level model.

    Args:
        loader: Iterable of batches; each batch maps "input_ids",
            "attention_mask", "meta" and "label" to tensors.
        train: When true the model is in training mode and the optimiser
            steps once per batch; otherwise the model is in eval mode and
            gradients are disabled.

    Returns:
        ``(preds, trues, mean_loss)``: sigmoid probabilities and labels as
        NumPy arrays, plus the unweighted mean of the per-batch losses.
    """
    if train:
        model.train()
    else:
        model.eval()

    batch_losses, probs, targets = [], [], []
    with torch.set_grad_enabled(train):
        for batch in tqdm(loader, leave=False):
            input_ids, attn_mask, meta_feats, labels = (
                batch[key].to(device)
                for key in ("input_ids", "attention_mask", "meta", "label")
            )
            logits = model(input_ids, attn_mask, meta_feats)
            loss = criterion(logits, labels)
            if train:
                optim.zero_grad()
                loss.backward()
                optim.step()
            batch_losses.append(loss.item())
            # Probabilities are detached and moved to CPU before being collected.
            probs.extend(torch.sigmoid(logits).detach().cpu().numpy())
            targets.extend(labels.cpu().numpy())
    return np.array(probs), np.array(targets), np.mean(batch_losses)

# One training pass followed by one evaluation pass per epoch. Predictions are
# turned into labels with a 0.5 probability threshold for F1 and the confusion
# matrix; ROC-AUC uses the probabilities directly.
for ep in range(1, EPOCHS+1):
    _, _, tr_loss = run(train_loader, True)
    y_pred, y_true, val_loss = run(test_loader, False)
    f1  = f1_score(y_true, y_pred >= 0.5)
    auc = roc_auc_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred >= 0.5).ravel()
    # False-positive rate: label-0 examples predicted as 1, over all label-0 examples.
    fp_rate = fp / (fp + tn)
    print(f"[Ep {ep}] train_loss={tr_loss:.4f}  val_loss={val_loss:.4f}  "
          f"F1={f1:.3f}  ROC‑AUC={auc:.3f}  FP‑rate={fp_rate:.2%}")

# ================================================================
# 6. Save artefacts
# ================================================================
# Persist the weights after the last epoch, together with the fitted scaler.
Path("saved").mkdir(exist_ok=True)
torch.save(model.state_dict(), "saved/text_meta_split.pt")
joblib.dump(scaler, "saved/meta_scaler_split.gz")
print("✔️  Model + scaler saved.")

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Text‑only LLM classifier  — Human vs ChatGPT (English)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Input   : OpenLLMText_Human/*_enriched.jsonl
          OpenLLMText_ChatGPT/*_enriched.jsonl
Outputs : saved/text_only_roberta.pt
          (no scaler, since there are no meta variables)
"""

from __future__ import annotations
import json, os, random, numpy as np, pandas as pd, torch, torch.nn as nn
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

# ─── PARAMS ──────────────────────────────────────────────────────────
SEED       = 42                 # shared seed for random, NumPy, PyTorch and the train/test split
BATCH_SIZE = 16                 # batch size passed to both DataLoaders
EPOCHS     = 3                  # number of train + eval passes
LR         = 2e-5               # learning rate handed to AdamW
BACKBONE   = "roberta-base"     # checkpoint name used for the tokenizer and the encoder

# Seed every RNG used in this section.
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# ─── 1. DATA LOAD (ignore les méta) ──────────────────────────────────
def load_dataset(folder: str, label: int):
    """Load the text of every ``*_enriched.jsonl`` record in ``folder``.

    The meta features stored in the records are ignored.

    NOTE(review): an earlier definition of ``load_dataset`` in this notebook
    takes an extra ``split`` argument; this definition replaces it in the
    kernel namespace.

    Args:
        folder: Directory containing the JSONL files.
        label: Class label attached to every returned row.

    Returns:
        A list of ``{"text": ..., "label": label}`` dicts.
    """
    data = []
    # Sorted so the row order does not depend on the directory listing order.
    for fn in sorted(os.listdir(folder)):
        if not fn.endswith("_enriched.jsonl"):
            continue
        # The context manager closes the handle; iterating a bare open() left it
        # to the garbage collector.
        with open(Path(folder) / fn, encoding="utf-8") as fh:
            for line in fh:
                if not line.strip():                 # blank lines are not JSON records
                    continue
                obj = json.loads(line)
                data.append({"text": obj["text"], "label": label})
    return data

# Human rows get label 0, ChatGPT rows label 1; all enriched files are pooled.
df = pd.DataFrame(
      load_dataset("OpenLLMText_Human", 0)
    + load_dataset("OpenLLMText_ChatGPT", 1)
)
print(f"Dataset size: {len(df)} examples")

# Row-level 80/20 split, stratified on the label.
# NOTE(review): this rebinds train_df / test_df to frames that only have "text"
# and "label" columns; code that later reads train_df["meta"] cannot run on them.
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=SEED, stratify=df["label"]
)

# ─── 2. DATASET / DATALOADER ─────────────────────────────────────────
# Tokenizer for BACKBONE; TextOnlyDS.__getitem__ reads it as a module-level global.
tok = AutoTokenizer.from_pretrained(BACKBONE)

class TextOnlyDS(Dataset):
    """Dataset yielding tokenised text and its label.

    Texts are tokenised lazily, one item at a time, with the module-level
    ``tok``; labels are converted to a float32 tensor at construction.
    """

    def __init__(self, df):
        label_values = df["label"].values
        self.texts = df["text"].tolist()
        self.labels = torch.tensor(label_values, dtype=torch.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Every text is truncated/padded to exactly 512 tokens.
        encoded = tok(
            self.texts[idx],
            max_length=512,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # The tokenizer output carries a leading batch axis of size 1; drop it.
        item = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["label"] = self.labels[idx]
        return item

# Only the training loader shuffles; the evaluation loader is built without shuffle.
train_loader = DataLoader(TextOnlyDS(train_df), BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(TextOnlyDS(test_df),  BATCH_SIZE)

# ─── 3. MODEL ────────────────────────────────────────────────────────
class TextOnlyClassifier(nn.Module):
    """Binary classifier made of a transformer encoder and a linear head."""

    def __init__(self):
        super().__init__()
        self.enc = AutoModel.from_pretrained(BACKBONE)

        # Freeze the embeddings and the first 6 encoder layers; the remaining
        # encoder parameters stay trainable.
        frozen_modules = [self.enc.embeddings, *self.enc.encoder.layer[:6]]
        for module in frozen_modules:
            for weight in module.parameters():
                weight.requires_grad = False

        self.clf = nn.Linear(self.enc.config.hidden_size, 1)

    def forward(self, ids, mask):
        """Return one logit per example (the trailing size-1 axis is squeezed)."""
        pooled = self.enc(ids, attention_mask=mask).pooler_output
        logits = self.clf(pooled)
        return logits.squeeze(1)

# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model  = TextOnlyClassifier().to(device)
# The model outputs raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
# Only parameters that still require gradients are handed to the optimiser.
optim     = torch.optim.AdamW(filter(lambda p: p.requires_grad,
                                     model.parameters()), lr=LR)

# ─── 4. TRAIN / EVAL LOOP ────────────────────────────────────────────
def run_epoch(loader, train=True):
    """Run one pass over ``loader`` with the module-level model.

    Args:
        loader: Iterable of batches; each batch maps "input_ids",
            "attention_mask" and "label" to tensors.
        train: When true the model is in training mode and the optimiser
            steps once per batch; otherwise the model is in eval mode and
            gradients are disabled.

    Returns:
        ``(mean_loss, preds, trues)``: the unweighted mean of the per-batch
        losses, then sigmoid probabilities and labels as NumPy arrays.
    """
    if train:
        model.train()
    else:
        model.eval()

    batch_losses, probs, targets = [], [], []
    with torch.set_grad_enabled(train):
        for batch in tqdm(loader, leave=False):
            input_ids, attn_mask, labels = (
                batch[key].to(device)
                for key in ("input_ids", "attention_mask", "label")
            )

            logits = model(input_ids, attn_mask)
            loss = criterion(logits, labels)
            if train:
                optim.zero_grad()
                loss.backward()
                optim.step()

            batch_losses.append(loss.item())
            # Probabilities are detached and moved to CPU before being collected.
            probs.extend(torch.sigmoid(logits).detach().cpu().numpy())
            targets.extend(labels.cpu().numpy())
    return np.mean(batch_losses), np.array(probs), np.array(targets)

# One training pass followed by one evaluation pass per epoch. F1 uses a 0.5
# probability threshold; ROC-AUC uses the probabilities directly.
for ep in range(1, EPOCHS+1):
    tr_loss, _, _        = run_epoch(train_loader, True)
    val_loss, y_hat, yt  = run_epoch(test_loader,  False)
    f1  = f1_score(yt, y_hat >= 0.5)
    auc = roc_auc_score(yt, y_hat)
    print(f"[Epoch {ep}] train_loss={tr_loss:.4f} "
          f"val_loss={val_loss:.4f}  F1={f1:.3f}  ROC‑AUC={auc:.3f}")

# ─── 5. SAVE MODEL ───────────────────────────────────────────────────
# Persist the weights after the last epoch (this section has no scaler to save).
Path("saved").mkdir(exist_ok=True)
torch.save(model.state_dict(), "saved/text_only_roberta.pt")
print("✅  Text‑only model saved to ./saved/")

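# Cell to run once train_df / test_df already exist
#  and contain the columns: text, meta (list[20]), label.
# NOTE: the text-only section above rebinds train_df / test_df to frames that
#  have no "meta" column, so the text+meta frames must be rebuilt before this part.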

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
import torch, torch.nn as nn, numpy as np
from tqdm.auto import tqdm
import joblib, os
from pathlib import Path

# Seed, batch size, number of epochs and Adam learning rate for the meta-only MLP.
# EPOCHS and LR are rebound here; earlier sections of the notebook define them too.
SEED, BATCH, EPOCHS, LR = 42, 64, 3, 3e-4
torch.manual_seed(SEED); np.random.seed(SEED)

# 1 ── NORMALISER les 20 features ---------------------------------------
# Keep a copy of the features as they are the first time this runs, and always
# fit/transform from that copy. Overwriting "meta" in place and refitting on it
# meant that every re-run standardised already-standardised values.
# NOTE(review): this assumes "meta" holds unscaled features on the first run; if
# an earlier cell already standardised the column, rebuild the frames first.
for _frame in (train_df, test_df):
    if "meta_raw" not in _frame.columns:
        _frame["meta_raw"] = _frame["meta"]

# The scaler is fitted on the training features only, then applied to both frames.
scaler = StandardScaler().fit(np.vstack(train_df["meta_raw"]))
train_df["meta"] = list(scaler.transform(np.vstack(train_df["meta_raw"])))
test_df ["meta"] = list(scaler.transform(np.vstack(test_df ["meta_raw"])))

# 2 ── DATASET / LOADER --------------------------------------------------
class MetaDS(Dataset):
    """Dataset over the meta-feature vectors and labels of a frame.

    Both the "meta" and the "label" columns are converted to float32 tensors
    at construction; items are ``{"meta": ..., "label": ...}`` dicts.
    """

    def __init__(self, df):
        # Stack the per-row vectors of the "meta" column into a single 2-D array.
        features = np.vstack(df["meta"])
        targets = df["label"].values
        self.x = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, i):
        sample = {"meta": self.x[i]}
        sample["label"] = self.y[i]
        return sample

# Only the training loader shuffles; the evaluation loader is built without shuffle.
train_loader = DataLoader(MetaDS(train_df), BATCH, shuffle=True)
test_loader  = DataLoader(MetaDS(test_df),  BATCH)

# 3 ── MODELE ------------------------------------------------------------
class MetaOnlyMLP(nn.Module):
    """Three-layer MLP mapping a meta-feature vector to a single logit.

    Args:
        meta_dim: Number of input features.
        hidden: Width of the first hidden layer; the second is ``hidden // 2``.
    """

    def __init__(self, meta_dim=20, hidden=64):
        super().__init__()
        half = hidden // 2
        layers = [
            nn.Linear(meta_dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden, half),
            nn.ReLU(),
            nn.Linear(half, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, m):
        """Return one logit per example (the trailing size-1 axis is squeezed)."""
        scores = self.net(m)
        return scores.squeeze(1)

# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model  = MetaOnlyMLP().to(device)
# The model outputs raw logits, so the loss applies the sigmoid itself.
lossf  = nn.BCEWithLogitsLoss()
optim  = torch.optim.Adam(model.parameters(), lr=LR)

# 4 ── BOUCLE ------------------------------------------------------------
def run(loader, train=True):
    """Run one pass over ``loader`` with the module-level meta-only model.

    Args:
        loader: Iterable of batches; each batch maps "meta" and "label" to tensors.
        train: When true the model is in training mode and the optimiser
            steps once per batch; otherwise the model is in eval mode and
            gradients are disabled.

    Returns:
        ``(preds, trues, mean_loss)``: sigmoid probabilities and labels as
        NumPy arrays, plus the unweighted mean of the per-batch losses.
    """
    if train:
        model.train()
    else:
        model.eval()

    probs, targets, batch_losses = [], [], []
    with torch.set_grad_enabled(train):
        for batch in tqdm(loader, leave=False):
            features = batch["meta"].to(device)
            labels = batch["label"].to(device)

            logits = model(features)
            loss = lossf(logits, labels)

            if train:
                optim.zero_grad()
                loss.backward()
                optim.step()

            batch_losses.append(loss.item())
            # detach() is required before .numpy() while gradients are enabled.
            probs.extend(torch.sigmoid(logits).detach().cpu().numpy())
            targets.extend(labels.cpu().numpy())
    return np.array(probs), np.array(targets), np.mean(batch_losses)


# One training pass followed by one evaluation pass per epoch. Predictions are
# turned into labels with a 0.5 probability threshold for F1 and the confusion
# matrix; ROC-AUC uses the probabilities directly.
for ep in range(1, EPOCHS+1):
    _, _, tr_loss       = run(train_loader, True)
    y_pred, y_true, vl  = run(test_loader,  False)
    f1  = f1_score(y_true, y_pred >= 0.5)
    auc = roc_auc_score(y_true, y_pred)
    tn, fp, _, _ = confusion_matrix(y_true, y_pred >= 0.5).ravel()
    # False-positive rate: label-0 examples predicted as 1, over all label-0 examples.
    fp_rate = fp / (fp + tn)
    print(f"[Ep {ep}] train_loss={tr_loss:.4f}  val_loss={vl:.4f}  "
          f"F1={f1:.3f}  ROC‑AUC={auc:.3f}  FP‑rate={fp_rate:.2%}")

# 5 ── SAUVEGARDE --------------------------------------------------------
Path("saved").mkdir(exist_ok=True)
torch.save(model.state_dict(), "saved/meta_only_mlp_split.pt")
# The scaler gets its own file name. "saved/meta_scaler_split.gz" is the path the
# text+meta section writes its scaler to, and the scaler fitted in this section
# is a separate fit that would otherwise overwrite it.
# NOTE(review): code that loads the meta-only scaler must use this new path.
joblib.dump(scaler, "saved/meta_only_scaler_split.gz")
print("✔️  Meta‑only model + scaler saved.")



Train DF: 35238   –   Eval DF: 16636


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                   

[Ep 1] train_loss=0.3493  val_loss=0.2862  F1=0.900  ROC‑AUC=0.984  FP‑rate=66.78%


 33%|███▎      | 719/2203 [18:31<38:28,  1.56s/it]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

                                                     

[Ep 2] train_loss=0.2514  val_loss=0.2375  F1=0.940  ROC‑AUC=0.989  FP‑rate=37.99%


                                                     

[Ep 3] train_loss=0.1651  val_loss=0.1970  F1=0.954  ROC‑AUC=0.989  FP‑rate=28.38%
✔️  Model + scaler saved.
Dataset size: 51874 examples


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 20/2594 [01:03<2:15:18,  3.15s/it]

In [2]:
# This cell relies on `model` and `scaler` left in the kernel by an earlier cell.
# `model` is also rebound by the text-only and meta-only sections, so refuse to
# write anything other than the text+meta network under this file name.
if type(model).__name__ != "TextMetaClassifier":
    raise RuntimeError(
        f"`model` is a {type(model).__name__}, not a TextMetaClassifier; "
        "re-run the text+meta training cell before saving."
    )
Path("saved").mkdir(exist_ok=True)
torch.save(model.state_dict(), "saved/text_meta_split.pt")
joblib.dump(scaler, "saved/meta_scaler_split.gz")
print("✔️  Model + scaler saved.")

✔️  Model + scaler saved.


In [6]:
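# Cell to run once train_df / test_df already exist
#  and contain the columns: text, meta (list[20]), label.
# NOTE: the frames produced by the text-only cell have no "meta" column, so
#  train_df / test_df must come from the text+meta cell.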

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
import torch, torch.nn as nn, numpy as np
from tqdm.auto import tqdm
import joblib, os
from pathlib import Path

# Seed, batch size, number of epochs and Adam learning rate for the meta-only MLP.
# EPOCHS and LR are rebound here; other cells of the notebook define them too.
SEED, BATCH, EPOCHS, LR = 42, 64, 3, 3e-4
torch.manual_seed(SEED); np.random.seed(SEED)

# 1 ── NORMALISER les 20 features ---------------------------------------
# Keep a copy of the features as they are the first time this runs, and always
# fit/transform from that copy. Overwriting "meta" in place and refitting on it
# meant that every re-run of this cell standardised already-standardised values.
# NOTE(review): this assumes "meta" holds unscaled features on the first run; if
# an earlier cell already standardised the column, rebuild the frames first.
for _frame in (train_df, test_df):
    if "meta_raw" not in _frame.columns:
        _frame["meta_raw"] = _frame["meta"]

# The scaler is fitted on the training features only, then applied to both frames.
scaler = StandardScaler().fit(np.vstack(train_df["meta_raw"]))
train_df["meta"] = list(scaler.transform(np.vstack(train_df["meta_raw"])))
test_df ["meta"] = list(scaler.transform(np.vstack(test_df ["meta_raw"])))

# 2 ── DATASET / LOADER --------------------------------------------------
class MetaDS(Dataset):
    """Dataset over the meta-feature vectors and labels of a frame.

    Both the "meta" and the "label" columns are converted to float32 tensors
    at construction; items are ``{"meta": ..., "label": ...}`` dicts.
    """

    def __init__(self, df):
        # Stack the per-row vectors of the "meta" column into a single 2-D array.
        features = np.vstack(df["meta"])
        targets = df["label"].values
        self.x = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, i):
        sample = {"meta": self.x[i]}
        sample["label"] = self.y[i]
        return sample

# Only the training loader shuffles; the evaluation loader is built without shuffle.
train_loader = DataLoader(MetaDS(train_df), BATCH, shuffle=True)
test_loader  = DataLoader(MetaDS(test_df),  BATCH)

# 3 ── MODELE ------------------------------------------------------------
class MetaOnlyMLP(nn.Module):
    """Three-layer MLP mapping a meta-feature vector to a single logit.

    Args:
        meta_dim: Number of input features.
        hidden: Width of the first hidden layer; the second is ``hidden // 2``.
    """

    def __init__(self, meta_dim=20, hidden=64):
        super().__init__()
        half = hidden // 2
        layers = [
            nn.Linear(meta_dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden, half),
            nn.ReLU(),
            nn.Linear(half, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, m):
        """Return one logit per example (the trailing size-1 axis is squeezed)."""
        scores = self.net(m)
        return scores.squeeze(1)

# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model  = MetaOnlyMLP().to(device)
# The model outputs raw logits, so the loss applies the sigmoid itself.
lossf  = nn.BCEWithLogitsLoss()
optim  = torch.optim.Adam(model.parameters(), lr=LR)

# 4 ── BOUCLE ------------------------------------------------------------
def run(loader, train=True):
    """Run one pass over ``loader`` with the module-level meta-only model.

    Args:
        loader: Iterable of batches; each batch maps "meta" and "label" to tensors.
        train: When true the model is in training mode and the optimiser
            steps once per batch; otherwise the model is in eval mode and
            gradients are disabled.

    Returns:
        ``(preds, trues, mean_loss)``: sigmoid probabilities and labels as
        NumPy arrays, plus the unweighted mean of the per-batch losses.
    """
    if train:
        model.train()
    else:
        model.eval()

    probs, targets, batch_losses = [], [], []
    with torch.set_grad_enabled(train):
        for batch in tqdm(loader, leave=False):
            features = batch["meta"].to(device)
            labels = batch["label"].to(device)

            logits = model(features)
            loss = lossf(logits, labels)

            if train:
                optim.zero_grad()
                loss.backward()
                optim.step()

            batch_losses.append(loss.item())
            # detach() is required before .numpy() while gradients are enabled.
            probs.extend(torch.sigmoid(logits).detach().cpu().numpy())
            targets.extend(labels.cpu().numpy())
    return np.array(probs), np.array(targets), np.mean(batch_losses)


# One training pass followed by one evaluation pass per epoch. Predictions are
# turned into labels with a 0.5 probability threshold for F1 and the confusion
# matrix; ROC-AUC uses the probabilities directly.
for ep in range(1, EPOCHS+1):
    _, _, tr_loss       = run(train_loader, True)
    y_pred, y_true, vl  = run(test_loader,  False)
    f1  = f1_score(y_true, y_pred >= 0.5)
    auc = roc_auc_score(y_true, y_pred)
    tn, fp, _, _ = confusion_matrix(y_true, y_pred >= 0.5).ravel()
    # False-positive rate: label-0 examples predicted as 1, over all label-0 examples.
    fp_rate = fp / (fp + tn)
    print(f"[Ep {ep}] train_loss={tr_loss:.4f}  val_loss={vl:.4f}  "
          f"F1={f1:.3f}  ROC‑AUC={auc:.3f}  FP‑rate={fp_rate:.2%}")

# 5 ── SAUVEGARDE --------------------------------------------------------
Path("saved").mkdir(exist_ok=True)
torch.save(model.state_dict(), "saved/meta_only_mlp_split.pt")
# The scaler gets its own file name. "saved/meta_scaler_split.gz" is the path the
# text+meta cells write their scaler to, and the scaler fitted in this cell is a
# separate fit that would otherwise overwrite it.
# NOTE(review): code that loads the meta-only scaler must use this new path.
joblib.dump(scaler, "saved/meta_only_scaler_split.gz")
print("✔️  Meta‑only model + scaler saved.")


                                                  

[Ep 1] train_loss=0.2035  val_loss=0.0733  F1=0.986  ROC‑AUC=0.994  FP‑rate=7.57%


                                                  

[Ep 2] train_loss=0.0529  val_loss=0.0446  F1=0.991  ROC‑AUC=0.997  FP‑rate=5.33%


                                                  

[Ep 3] train_loss=0.0353  val_loss=0.0308  F1=0.994  ROC‑AUC=0.997  FP‑rate=3.59%
✔️  Meta‑only model + scaler saved.


