In [2]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.1
    Uninstalling pip-25.1:
      Successfully uninstalled pip-25.1
Successfully installed pip-25.1.1


In [3]:
!pip install dirtyjson

Collecting dirtyjson
  Downloading dirtyjson-1.0.8-py3-none-any.whl.metadata (11 kB)
Downloading dirtyjson-1.0.8-py3-none-any.whl (25 kB)
Installing collected packages: dirtyjson
Successfully installed dirtyjson-1.0.8

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [12]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers scikit-learn pandas tqdm joblib


Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (780.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.4/780.4 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp312-cp312-linux_x86_64.whl (7.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m90.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from torch)
  Downloading https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collec

In [7]:
import dirtyjson as dj  # pip install dirtyjson



In [6]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Enrichit les JSONL OpenLLMText_Human avec des métriques clavier authentic
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Prérequis  : pip install pandas numpy dirtyjson
Exécution  : python enrich_openllmtext.py
Structure  :
    .
    ├── ai_detect_keystroke_logging_data_anon_github.csv
    └── OpenLLMText_Human/
        ├── train-dirty.jsonl
        ├── valid-dirty.jsonl
        └── test-dirty.jsonl
Le script crée pour chaque fichier *_dirty.jsonl* un *_enriched.jsonl*.
"""

import os, json, random, numpy as np, pandas as pd, dirtyjson as dj
from pathlib import Path

# ───────────────────────────────────────────────────────────────────────
# 1. PARAMÈTRES
# ───────────────────────────────────────────────────────────────────────
CSV_PATH = "ai_detect_keystroke_logging_data_anon_github.csv"
DATASET_DIRS = ["OpenLLMText_Human"]      # ajoutez d’autres dossiers si besoin
OUT_SUFFIX = "_enriched.jsonl"            # suffixe de sortie
RANDOM_SEED = 42                          # pour reproductibilité du sample

# ───────────────────────────────────────────────────────────────────────
# 2. COLONNES À EXTRAIRE (20 demandées + condition)
# ───────────────────────────────────────────────────────────────────────
COLS = [
    "mean_pause_time_in_secs_threshold200",
    "total_insertions_chars_exclu_space",
    "total_pause_time_in_secs_threshold2000",
    "product_process_ratio",
    "mean_insertion_length_chars_exclu_space",
    "total_deletions_words",
    "mean_pause_time_before_sents_threshold200",
    "mean_deletion_length_chars",
    "num_of_insertions",
    "median_insertion_length_chars_exclu_space",
    "num_of_pause_within_words_threshold200",
    "sd_strokes_per_min_5_intervals",
    "median_length_Rburst_sec",
    "median_pause_time_between_words_threshold200",
    "num_of_pause_after_words_threshold200",
    "num_of_revisions",
    "sd_pause_time_before_words_threshold200",
    "total_number_of_pauses_threshold2000",
    "median_pause_time_before_words_threshold200",
    "mean_pause_time_before_words_threshold200",
]

# ───────────────────────────────────────────────────────────────────────
# 3. CHARGER LE CSV & CONTRÔLER LES COLONNES
# ───────────────────────────────────────────────────────────────────────
missing = set(COLS + ["condition"]) - set(pd.read_csv(CSV_PATH, nrows=0).columns)
if missing:
    raise ValueError(f"Colonnes manquantes dans le CSV: {missing}")

df_auth = (
    pd.read_csv(CSV_PATH, usecols=["condition"] + COLS)
      .query("condition == 'authentic'")
      .reset_index(drop=True)[COLS]
)
rng = np.random.default_rng(RANDOM_SEED)

# ───────────────────────────────────────────────────────────────────────
# 4. UTILITAIRE : LIRE UN OBJET JSON, MÊME SUR PLUSIEURS LIGNES
# ───────────────────────────────────────────────────────────────────────
def iter_json_objects(path):
    """Yield chaque objet JSON contenu dans *path*, même s’il s’étale sur
       plusieurs lignes. Utilise un comptage d'accolades et dirtyjson."""
    buf, depth = [], 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            depth += line.count("{") - line.count("}")
            buf.append(line)
            if depth == 0 and buf:
                yield dj.loads("".join(buf))
                buf.clear()

# ───────────────────────────────────────────────────────────────────────
# 5. FONCTION D’ENRICHISSEMENT
# ───────────────────────────────────────────────────────────────────────
def enrich_jsonl(file_path: Path, text_source: str) -> None:
    """Crée <file_path>_enriched.jsonl avec un champ 'extra' additionnel."""
    out_path = file_path.with_name(file_path.stem + OUT_SUFFIX)
    n = 0
    with out_path.open("w", encoding="utf-8") as fout:
        for item in iter_json_objects(file_path):
            #  échantillonner UNE ligne authentic aléatoire
            meta_row = df_auth.iloc[rng.integers(len(df_auth))].to_dict()
            # convertir les types numpy -> natifs
            meta_row = {k: (None if pd.isna(v)
                            else v.item() if hasattr(v, "item") else v)
                        for k, v in meta_row.items()}
            # fusionner dans item["extra"]
            item.setdefault("extra", {}).update(
                {"text_source": text_source, **meta_row}
            )
            fout.write(json.dumps(item, ensure_ascii=False) + "\n")
            n += 1
    print(f"✓ {file_path.name:25s} → {out_path.name:25s} — {n} objets enrichis")

# ───────────────────────────────────────────────────────────────────────
# 6. BOUCLE PRINCIPALE
# ───────────────────────────────────────────────────────────────────────
def main():
    for folder in DATASET_DIRS:
        for fname in os.listdir(folder):
            if fname.endswith(".jsonl") and "_enriched" not in fname:

                fpath = Path(folder) / fname
                enrich_jsonl(
                    fpath,
                    text_source=folder.replace("OpenLLMText_", "").rstrip("/"),
                )

if __name__ == "__main__":
    main()


✓ valid-dirty.jsonl         → valid-dirty_enriched.jsonl — 737 objets enrichis
✓ train-dirty.jsonl         → train-dirty_enriched.jsonl — 2506 objets enrichis
✓ test-dirty.jsonl          → test-dirty_enriched.jsonl — 1762 objets enrichis


In [9]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Enrichit les JSONL OpenLLMText_Human avec des métriques clavier authentic
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Prérequis  : pip install pandas numpy dirtyjson
Exécution  : python enrich_openllmtext.py
Structure  :
    .
    ├── ai_detect_keystroke_logging_data_anon_github.csv
    └── OpenLLMText_Human/
        ├── train-dirty.jsonl
        ├── valid-dirty.jsonl
        └── test-dirty.jsonl
Le script crée pour chaque fichier *_dirty.jsonl* un *_enriched.jsonl*.
"""

import os, json, random, numpy as np, pandas as pd, dirtyjson as dj
from pathlib import Path

# ───────────────────────────────────────────────────────────────────────
# 1. PARAMÈTRES
# ───────────────────────────────────────────────────────────────────────
CSV_PATH = "ai_detect_keystroke_logging_data_anon_github.csv"
DATASET_DIRS = ["OpenLLMText_ChatGPT"]      # ajoutez d’autres dossiers si besoin
OUT_SUFFIX = "_enriched.jsonl"            # suffixe de sortie
RANDOM_SEED = 42                          # pour reproductibilité du sample

# ───────────────────────────────────────────────────────────────────────
# 2. COLONNES À EXTRAIRE (20 demandées + condition)
# ───────────────────────────────────────────────────────────────────────
COLS = [
    "mean_pause_time_in_secs_threshold200",
    "total_insertions_chars_exclu_space",
    "total_pause_time_in_secs_threshold2000",
    "product_process_ratio",
    "mean_insertion_length_chars_exclu_space",
    "total_deletions_words",
    "mean_pause_time_before_sents_threshold200",
    "mean_deletion_length_chars",
    "num_of_insertions",
    "median_insertion_length_chars_exclu_space",
    "num_of_pause_within_words_threshold200",
    "sd_strokes_per_min_5_intervals",
    "median_length_Rburst_sec",
    "median_pause_time_between_words_threshold200",
    "num_of_pause_after_words_threshold200",
    "num_of_revisions",
    "sd_pause_time_before_words_threshold200",
    "total_number_of_pauses_threshold2000",
    "median_pause_time_before_words_threshold200",
    "mean_pause_time_before_words_threshold200",
]

# ───────────────────────────────────────────────────────────────────────
# 3. CHARGER LE CSV & CONTRÔLER LES COLONNES
# ───────────────────────────────────────────────────────────────────────
missing = set(COLS + ["condition"]) - set(pd.read_csv(CSV_PATH, nrows=0).columns)
if missing:
    raise ValueError(f"Colonnes manquantes dans le CSV: {missing}")

df_auth = (
    pd.read_csv(CSV_PATH, usecols=["condition"] + COLS)
      .query("condition == 'transcribed'")
      .reset_index(drop=True)[COLS]
)
rng = np.random.default_rng(RANDOM_SEED)

# ───────────────────────────────────────────────────────────────────────
# 4. UTILITAIRE : LIRE UN OBJET JSON, MÊME SUR PLUSIEURS LIGNES
# ───────────────────────────────────────────────────────────────────────
def iter_json_objects(path):
    """Yield chaque objet JSON contenu dans *path*, même s’il s’étale sur
       plusieurs lignes. Utilise un comptage d'accolades et dirtyjson."""
    buf, depth = [], 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            depth += line.count("{") - line.count("}")
            buf.append(line)
            if depth == 0 and buf:
                yield dj.loads("".join(buf))
                buf.clear()

# ───────────────────────────────────────────────────────────────────────
# 5. FONCTION D’ENRICHISSEMENT
# ───────────────────────────────────────────────────────────────────────
def enrich_jsonl(file_path: Path, text_source: str) -> None:
    """Crée <file_path>_enriched.jsonl avec un champ 'extra' additionnel."""
    out_path = file_path.with_name(file_path.stem + OUT_SUFFIX)
    n = 0
    with out_path.open("w", encoding="utf-8") as fout:
        for item in iter_json_objects(file_path):
            #  échantillonner UNE ligne authentic aléatoire
            meta_row = df_auth.iloc[rng.integers(len(df_auth))].to_dict()
            # convertir les types numpy -> natifs
            meta_row = {k: (None if pd.isna(v)
                            else v.item() if hasattr(v, "item") else v)
                        for k, v in meta_row.items()}
            # fusionner dans item["extra"]
            item.setdefault("extra", {}).update(
                {"text_source": text_source, **meta_row}
            )
            fout.write(json.dumps(item, ensure_ascii=False) + "\n")
            n += 1
    print(f"✓ {file_path.name:25s} → {out_path.name:25s} — {n} objets enrichis")

# ───────────────────────────────────────────────────────────────────────
# 6. BOUCLE PRINCIPALE
# ───────────────────────────────────────────────────────────────────────
def main():
    for folder in DATASET_DIRS:
        for fname in os.listdir(folder):
            if fname.endswith(".jsonl") and "_enriched" not in fname:

                fpath = Path(folder) / fname
                enrich_jsonl(
                    fpath,
                    text_source=folder.replace("OpenLLMText_", "").rstrip("/"),
                )

if __name__ == "__main__":
    main()


✓ valid-dirty.jsonl         → valid-dirty_enriched.jsonl — 11708 objets enrichis


Error: Invalid \X escape sequence 'o': line 1 column 391 (char 390)

In [13]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Enrichit les JSONL OpenLLMText_Human et consigne les objets invalides
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    ai_detect_keystroke_logging_data_anon_github.csv
    OpenLLMText_Human/
        ├── train-dirty.jsonl
        ├── valid-dirty.jsonl
        └── test-dirty.jsonl

Résultat :
    OpenLLMText_Human/
        ├── train-dirty_enriched.jsonl
        ├── train-dirty_rejects.jsonl
        └── …
"""

from __future__ import annotations
import os, json, random, numpy as np, pandas as pd, dirtyjson as dj
from pathlib import Path
from contextlib import nullcontext

# ──────────────────────────────────────────────────
# 1. PARAMÈTRES
# ──────────────────────────────────────────────────
CSV_PATH     = "ai_detect_keystroke_logging_data_anon_github.csv"
DATASET_DIRS = ["OpenLLMText_ChatGPT"]
OUT_SUFFIX   = "_enriched.jsonl"
REJ_SUFFIX   = "_rejects.jsonl"
RANDOM_SEED  = 42

# 20 colonnes demandées
COLS = [
    "mean_pause_time_in_secs_threshold200",
    "total_insertions_chars_exclu_space",
    "total_pause_time_in_secs_threshold2000",
    "product_process_ratio",
    "mean_insertion_length_chars_exclu_space",
    "total_deletions_words",
    "mean_pause_time_before_sents_threshold200",
    "mean_deletion_length_chars",
    "num_of_insertions",
    "median_insertion_length_chars_exclu_space",
    "num_of_pause_within_words_threshold200",
    "sd_strokes_per_min_5_intervals",
    "median_length_Rburst_sec",
    "median_pause_time_between_words_threshold200",
    "num_of_pause_after_words_threshold200",
    "num_of_revisions",
    "sd_pause_time_before_words_threshold200",
    "total_number_of_pauses_threshold2000",
    "median_pause_time_before_words_threshold200",
    "mean_pause_time_before_words_threshold200",
]

# ──────────────────────────────────────────────────
# 2. CHARGER LE CSV AUTHENTIC
# ──────────────────────────────────────────────────
missing = set(COLS + ["condition"]) - set(pd.read_csv(CSV_PATH, nrows=0).columns)
if missing:
    raise ValueError(f"Colonnes manquantes dans le CSV : {missing}")

df_auth = (
    pd.read_csv(CSV_PATH, usecols=["condition"] + COLS)
      .query("condition == 'transcribed'")
      .reset_index(drop=True)[COLS]
)
rng = np.random.default_rng(RANDOM_SEED)

# ──────────────────────────────────────────────────
# 3. GÉNÉRATEUR ROBUSTE + CONSIGNATION DES OBJETS CASSÉS
# ──────────────────────────────────────────────────
def iter_json_objects(path: Path, reject_path: Path | None = None):
    """Yield chaque objet JSON complet, même multi‑lignes.
       Les objets indécodables sont écrits dans *reject_path* et ignorés."""
    buf, depth = [], 0
    with path.open(encoding="utf-8") as fin, \
         (reject_path.open("w", encoding="utf-8") if reject_path
          else nullcontext()) as rej:
        for line in fin:
            depth += line.count("{") - line.count("}")
            buf.append(line)
            if depth == 0 and buf:
                raw = "".join(buf)
                try:
                    yield dj.loads(raw)
                except Exception as e:
                    if rej:
                        rej.write(raw + "\n")     # consigne le JSON brut
                    else:
                        print(f"⚠︎ Rejeté ({e})")
                buf.clear()

# ──────────────────────────────────────────────────
# 4. ENRICHISSEMENT
# ──────────────────────────────────────────────────
def enrich_jsonl(file_path: Path, text_source: str) -> None:
    out_path = file_path.with_name(file_path.stem + OUT_SUFFIX)
    rej_path = file_path.with_name(file_path.stem + REJ_SUFFIX)
    n_ok = n_rej = 0

    with out_path.open("w", encoding="utf-8") as fout:
        for item in iter_json_objects(file_path, reject_path=rej_path):
            if item is None:            # objet rejeté
                n_rej += 1
                continue

            # 1 ligne authentic aléatoire
            meta_row = df_auth.iloc[rng.integers(len(df_auth))].to_dict()
            meta_row = {k: (None if pd.isna(v) else v.item()
                            if hasattr(v, "item") else v)
                        for k, v in meta_row.items()}

            item.setdefault("extra", {}).update(
                {"text_source": text_source, **meta_row}
            )
            fout.write(json.dumps(item, ensure_ascii=False) + "\n")
            n_ok += 1

    print(f"✓ {file_path.name:25s} → {out_path.name:25s} — "
          f"{n_ok} enrichis, {n_rej} rejetés")

# ──────────────────────────────────────────────────
# 5. BOUCLE PRINCIPALE
# ──────────────────────────────────────────────────
def main():
    for folder in DATASET_DIRS:
        for fname in os.listdir(folder):
            if fname.endswith(".jsonl") and "_enriched" not in fname:
                fpath = Path(folder) / fname
                enrich_jsonl(
                    fpath,
                    text_source=folder.replace("OpenLLMText_", "").rstrip("/"),
                )

if __name__ == "__main__":
    main()


✓ train-dirty_rejects.jsonl → train-dirty_rejects_enriched.jsonl — 0 enrichis, 0 rejetés
✓ valid-dirty.jsonl         → valid-dirty_enriched.jsonl — 11708 enrichis, 0 rejetés
✓ train-dirty.jsonl         → train-dirty_enriched.jsonl — 27722 enrichis, 0 rejetés
✓ valid-dirty_rejects.jsonl → valid-dirty_rejects_enriched.jsonl — 0 enrichis, 0 rejetés
✓ test-dirty.jsonl          → test-dirty_enriched.jsonl — 780 enrichis, 0 rejetés
✓ test-dirty_rejects.jsonl  → test-dirty_rejects_enriched.jsonl — 0 enrichis, 0 rejetés


In [None]:
# ================================================================
#  Text‑+‑Meta  •  clean split (800 logs train / 200 logs eval)
#  Computes F1, ROC‑AUC and false‑positive rate
# ================================================================
import json, os, random, numpy as np, pandas as pd, torch, torch.nn as nn
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
import joblib

# -------------------- Parameters ---------------------------------
SEED       = 42
BATCH_SIZE = 16
EPOCHS     = 2
LR         = 2e-5
BACKBONE   = "roberta-base"

META_COLS = [
    "mean_pause_time_in_secs_threshold200",
    "total_insertions_chars_exclu_space",
    "total_pause_time_in_secs_threshold2000",
    "product_process_ratio",
    "mean_insertion_length_chars_exclu_space",
    "total_deletions_words",
    "mean_pause_time_before_sents_threshold200",
    "mean_deletion_length_chars",
    "num_of_insertions",
    "median_insertion_length_chars_exclu_space",
    "num_of_pause_within_words_threshold200",
    "sd_strokes_per_min_5_intervals",
    "median_length_Rburst_sec",
    "median_pause_time_between_words_threshold200",
    "num_of_pause_after_words_threshold200",
    "num_of_revisions",
    "sd_pause_time_before_words_threshold200",
    "total_number_of_pauses_threshold2000",
    "median_pause_time_before_words_threshold200",
    "mean_pause_time_before_words_threshold200",
]

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# ================================================================
# 1. Load enriched JSONL   (train‑* vs valid‑*/test‑*)
# ================================================================
def load_dataset(folder: str, label: int, split: str):
    """split='train'  -> train-* files
       split='eval'   -> valid-* and test-* files"""
    rows = []
    for fn in os.listdir(folder):
        if not fn.endswith("_enriched.jsonl"):       # skip raw files
            continue
        is_train = fn.startswith("train-")
        if (split == "train" and not is_train) or (split == "eval" and is_train):
            continue
        for line in open(Path(folder)/fn, encoding="utf-8"):
            obj  = json.loads(line)
            meta = [obj["extra"].get(c, 0.0) for c in META_COLS]
            rows.append({"text": obj["text"], "meta": meta, "label": label})
    return rows

train_rows = (load_dataset("OpenLLMText_Human", 0, "train") +
              load_dataset("OpenLLMText_ChatGPT", 1, "train"))
eval_rows  = (load_dataset("OpenLLMText_Human", 0, "eval")  +
              load_dataset("OpenLLMText_ChatGPT", 1, "eval"))

train_df = pd.DataFrame(train_rows)
test_df  = pd.DataFrame(eval_rows)
print(f"Train DF: {len(train_df)}   –   Eval DF: {len(test_df)}")

# ================================================================
# 2. Feature scaling  (fit on TRAIN only)
# ================================================================
scaler = StandardScaler().fit(np.vstack(train_df["meta"]))
train_df["meta"] = list(scaler.transform(np.vstack(train_df["meta"])))
test_df ["meta"] = list(scaler.transform(np.vstack(test_df ["meta"])))

# ================================================================
# 3. PyTorch Dataset / Loader
# ================================================================
tokenizer = AutoTokenizer.from_pretrained(BACKBONE)

class TextMetaDS(Dataset):
    def __init__(self, df):
        self.texts  = df["text"].tolist()
        self.metas  = torch.tensor(np.vstack(df["meta"]), dtype=torch.float32)
        self.labels = torch.tensor(df["label"].values, dtype=torch.float32)
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        enc = tokenizer(self.texts[idx], max_length=512, truncation=True,
                        padding="max_length", return_tensors="pt")
        return {"input_ids": enc["input_ids"].squeeze(0),
                "attention_mask": enc["attention_mask"].squeeze(0),
                "meta": self.metas[idx],
                "label": self.labels[idx]}

train_loader = DataLoader(TextMetaDS(train_df), batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(TextMetaDS(test_df),  batch_size=BATCH_SIZE)

# ================================================================
# 4. Model definition
# ================================================================
class TextMetaClassifier(nn.Module):
    def __init__(self, meta_dim=20, hidden_meta=32):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(BACKBONE)
        # freeze embeddings + first 6 layers
        for p in self.encoder.embeddings.parameters(): p.requires_grad = False
        for layer in self.encoder.encoder.layer[:6]:
            for p in layer.parameters(): p.requires_grad = False
        self.meta_fc = nn.Sequential(nn.Linear(meta_dim, hidden_meta),
                                     nn.ReLU(), nn.Dropout(0.1))
        self.classifier = nn.Linear(self.encoder.config.hidden_size + hidden_meta, 1)
    def forward(self, ids, mask, meta):
        h_text = self.encoder(ids, attention_mask=mask).pooler_output
        h_meta = self.meta_fc(meta)
        return self.classifier(torch.cat([h_text, h_meta], dim=1)).squeeze(1)

device = "cuda" if torch.cuda.is_available() else "cpu"
model  = TextMetaClassifier().to(device)
criterion = nn.BCEWithLogitsLoss()
optim     = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()),
                              lr=LR)

# ================================================================
# 5. Train & evaluate
# ================================================================
def run(loader, train=True):
    model.train() if train else model.eval()
    losses, preds, trues = [], [], []
    with torch.set_grad_enabled(train):
        for batch in tqdm(loader, leave=False):
            ids  = batch["input_ids"].to(device)
            msk  = batch["attention_mask"].to(device)
            meta = batch["meta"].to(device)
            y    = batch["label"].to(device)
            logits = model(ids, msk, meta)
            loss   = criterion(logits, y)
            if train:
                optim.zero_grad(); loss.backward(); optim.step()
            losses.append(loss.item())
            preds.extend(torch.sigmoid(logits).detach().cpu().numpy())
            trues.extend(y.cpu().numpy())
    return np.array(preds), np.array(trues), np.mean(losses)

for ep in range(2, EPOCHS+1):
    _, _, tr_loss = run(train_loader, True)
    y_pred, y_true, val_loss = run(test_loader, False)
    f1  = f1_score(y_true, y_pred >= 0.5)
    auc = roc_auc_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred >= 0.5).ravel()
    fp_rate = fp / (fp + tn)
    print(f"[Ep {ep}] train_loss={tr_loss:.4f}  val_loss={val_loss:.4f}  "
          f"F1={f1:.3f}  ROC‑AUC={auc:.3f}  FP‑rate={fp_rate:.2%}")

# ================================================================
# 6. Save artefacts
# ================================================================
Path("saved").mkdir(exist_ok=True)
torch.save(model.state_dict(), "saved/text_meta_split.pt")
joblib.dump(scaler, "saved/meta_scaler_split.gz")
print("✔️  Model + scaler saved.")

  from .autonotebook import tqdm as notebook_tqdm


Dataset size : 51874 examples


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                   

[Epoch 1] train_loss=0.3162 val_loss=0.2698  F1=0.936  ROC‑AUC=0.981


                                                   

[Epoch 2] train_loss=0.2156 val_loss=0.1761  F1=0.977  ROC‑AUC=0.991


                                                 

KeyboardInterrupt: 

In [14]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Train Text+Meta Classifier  (Human vs ChatGPT) – English‑only version
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Prérequis :
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    pip install transformers scikit-learn pandas tqdm joblib

Données attendues :
    OpenLLMText_Human/*_enriched.jsonl
    OpenLLMText_ChatGPT/*_enriched.jsonl

Sorties :
    saved/text_meta_roberta.pt
    saved/meta_scaler.gz
"""

from __future__ import annotations
import json, os, random, numpy as np, pandas as pd, torch, torch.nn as nn
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
import joblib

# ────────────────────────────
# 1. PARAMÈTRES
# ────────────────────────────
SEED       = 42
BATCH_SIZE = 16
EPOCHS     = 3
LR         = 2e-5
BACKBONE   = "roberta-base"     # full‑English

META_COLS = [
    "mean_pause_time_in_secs_threshold200",
    "total_insertions_chars_exclu_space",
    "total_pause_time_in_secs_threshold2000",
    "product_process_ratio",
    "mean_insertion_length_chars_exclu_space",
    "total_deletions_words",
    "mean_pause_time_before_sents_threshold200",
    "mean_deletion_length_chars",
    "num_of_insertions",
    "median_insertion_length_chars_exclu_space",
    "num_of_pause_within_words_threshold200",
    "sd_strokes_per_min_5_intervals",
    "median_length_Rburst_sec",
    "median_pause_time_between_words_threshold200",
    "num_of_pause_after_words_threshold200",
    "num_of_revisions",
    "sd_pause_time_before_words_threshold200",
    "total_number_of_pauses_threshold2000",
    "median_pause_time_before_words_threshold200",
    "mean_pause_time_before_words_threshold200",
]

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ────────────────────────────
# 2. LECTURE DES JSONL ENRICHIS
# ────────────────────────────
def load_dataset(folder: str, label: int):
    data = []
    for fname in os.listdir(folder):
        if fname.endswith("_enriched.jsonl"):
            for line in open(Path(folder) / fname, encoding="utf-8"):
                obj = json.loads(line)
                meta = [obj["extra"].get(col, 0.0) for col in META_COLS]
                data.append({"text": obj["text"], "meta": meta, "label": label})
    return data

data = pd.DataFrame(
    load_dataset("OpenLLMText_Human", 0) +
    load_dataset("OpenLLMText_ChatGPT", 1)
)
print(f"Dataset size : {len(data)} examples")

# ────────────────────────────
# 3. TRAIN / TEST SPLIT
# ────────────────────────────
train_df, test_df = train_test_split(
    data, test_size=0.2, random_state=SEED, stratify=data["label"]
)

# Standardisation des 20 features
scaler = StandardScaler().fit(np.vstack(train_df["meta"]))
train_df["meta"] = list(scaler.transform(np.vstack(train_df["meta"])))
test_df["meta"]  = list(scaler.transform(np.vstack(test_df["meta"])))

# ────────────────────────────
# 4. DATASET PyTorch
# ────────────────────────────
tokenizer = AutoTokenizer.from_pretrained(BACKBONE)

class TextMetaDS(Dataset):
    def __init__(self, df):
        self.texts  = df["text"].tolist()
        self.metas  = torch.tensor(np.vstack(df["meta"]), dtype=torch.float32)
        self.labels = torch.tensor(df["label"].values, dtype=torch.float32)

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        enc = tokenizer(
            self.texts[idx],
            max_length=512,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "meta": self.metas[idx],
            "label": self.labels[idx]
        }

train_loader = DataLoader(TextMetaDS(train_df), batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(TextMetaDS(test_df),  batch_size=BATCH_SIZE)

# ────────────────────────────
# 5. MODÈLE
# ────────────────────────────
class TextMetaClassifier(nn.Module):
    def __init__(self, meta_dim=20, hidden_meta=32):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(BACKBONE)
        # Freeze embeddings + 6 premières couches
        for p in self.encoder.embeddings.parameters():
            p.requires_grad = False
        for layer in self.encoder.encoder.layer[:6]:
            for p in layer.parameters():
                p.requires_grad = False

        self.meta_fc = nn.Sequential(
            nn.Linear(meta_dim, hidden_meta),
            nn.ReLU(),
            nn.Dropout(0.1),
        )
        self.classifier = nn.Linear(
            self.encoder.config.hidden_size + hidden_meta, 1
        )

    def forward(self, input_ids, attention_mask, meta):
        h_text = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).pooler_output               # (bs, 768)
        h_meta = self.meta_fc(meta)    # (bs, hidden_meta)
        h = torch.cat([h_text, h_meta], dim=1)
        return self.classifier(h).squeeze(1)  # logits

device = "cuda" if torch.cuda.is_available() else "cpu"
model  = TextMetaClassifier().to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LR)

# ────────────────────────────
# 6. BOUCLE D’ENTRAÎNEMENT
# ────────────────────────────
def run_epoch(loader, train=True):
    model.train() if train else model.eval()
    losses, preds, trues = [], [], []
    with torch.set_grad_enabled(train):
        for batch in tqdm(loader, leave=False):
            ids  = batch["input_ids"].to(device)
            msk  = batch["attention_mask"].to(device)
            meta = batch["meta"].to(device)
            y    = batch["label"].to(device)

            logits = model(ids, msk, meta)
            loss   = criterion(logits, y)
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            losses.append(loss.item())
            preds.extend(torch.sigmoid(logits).detach().cpu().numpy())
            trues.extend(y.cpu().numpy())
    return np.mean(losses), np.array(preds), np.array(trues)

for epoch in range(1, EPOCHS + 1):
    tr_loss, _, _       = run_epoch(train_loader, train=True)
    val_loss, y_hat, y_true = run_epoch(test_loader,  train=False)

    f1  = f1_score(y_true, y_hat >= 0.5)
    auc = roc_auc_score(y_true, y_hat)
    print(f"[Epoch {epoch}] train_loss={tr_loss:.4f} "
          f"val_loss={val_loss:.4f}  F1={f1:.3f}  ROC‑AUC={auc:.3f}")

# ────────────────────────────
# 7. SAUVEGARDE
# ────────────────────────────
Path("saved").mkdir(exist_ok=True)
torch.save(model.state_dict(), "saved/text_meta_roberta.pt")
joblib.dump(scaler,              "saved/meta_scaler.gz")
print("🎉  Modèle et scaler sauvegardés dans ./saved/")


  from .autonotebook import tqdm as notebook_tqdm


Dataset size : 51874 examples


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                   

[Epoch 1] train_loss=0.3162 val_loss=0.2698  F1=0.936  ROC‑AUC=0.981


                                                   

[Epoch 2] train_loss=0.2156 val_loss=0.1761  F1=0.977  ROC‑AUC=0.991


                                                 

KeyboardInterrupt: 

🎉  Modèle et scaler sauvegardés dans ./saved/


In [18]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Text‑only LLM classifier  — Human vs ChatGPT (English)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Entrée  : OpenLLMText_Human/*_enriched.jsonl
          OpenLLMText_ChatGPT/*_enriched.jsonl
Sorties : saved/text_only_roberta.pt
          (aucun scaler, car pas de variables méta)
"""

from __future__ import annotations
import json, os, random, numpy as np, pandas as pd, torch, torch.nn as nn
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

# ─── PARAMS ──────────────────────────────────────────────────────────
SEED       = 42
BATCH_SIZE = 16
EPOCHS     = 2
LR         = 2e-5
BACKBONE   = "roberta-base"

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# ─── 1. DATA LOAD (ignore les méta) ──────────────────────────────────
def load_dataset(folder: str, label: int):
    data = []
    for fn in os.listdir(folder):
        if fn.endswith("_enriched.jsonl"):
            for line in open(Path(folder)/fn, encoding="utf-8"):
                obj = json.loads(line)
                data.append({"text": obj["text"], "label": label})
    return data

df = pd.DataFrame(
      load_dataset("OpenLLMText_Human", 0)
    + load_dataset("OpenLLMText_ChatGPT", 1)
)
print(f"Dataset size: {len(df)} examples")

train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=SEED, stratify=df["label"]
)

# ─── 2. DATASET / DATALOADER ─────────────────────────────────────────
tok = AutoTokenizer.from_pretrained(BACKBONE)

class TextOnlyDS(Dataset):
    def __init__(self, df):
        self.texts  = df["text"].tolist()
        self.labels = torch.tensor(df["label"].values, dtype=torch.float32)

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        enc = tok(self.texts[idx], max_length=512, truncation=True,
                  padding="max_length", return_tensors="pt")
        return {"input_ids": enc["input_ids"].squeeze(0),
                "attention_mask": enc["attention_mask"].squeeze(0),
                "label": self.labels[idx]}

train_loader = DataLoader(TextOnlyDS(train_df), BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(TextOnlyDS(test_df),  BATCH_SIZE)

# ─── 3. MODEL ────────────────────────────────────────────────────────
class TextOnlyClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.enc = AutoModel.from_pretrained(BACKBONE)
        # option : geler embeddings + moitié des couches
        for p in self.enc.embeddings.parameters():
            p.requires_grad = False
        for layer in self.enc.encoder.layer[:6]:
            for p in layer.parameters():
                p.requires_grad = False
        self.clf = nn.Linear(self.enc.config.hidden_size, 1)

    def forward(self, ids, mask):
        h = self.enc(ids, attention_mask=mask).pooler_output
        return self.clf(h).squeeze(1)     # logits

device = "cuda" if torch.cuda.is_available() else "cpu"
model  = TextOnlyClassifier().to(device)
criterion = nn.BCEWithLogitsLoss()
optim     = torch.optim.AdamW(filter(lambda p: p.requires_grad,
                                     model.parameters()), lr=LR)

# ─── 4. TRAIN / EVAL LOOP ────────────────────────────────────────────
def run_epoch(loader, train=True):
    model.train() if train else model.eval()
    losses, preds, trues = [], [], []
    with torch.set_grad_enabled(train):
        for batch in tqdm(loader, leave=False):
            ids  = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            y    = batch["label"].to(device)

            logits = model(ids, mask)
            loss   = criterion(logits, y)
            if train:
                optim.zero_grad(); loss.backward(); optim.step()

            losses.append(loss.item())
            preds.extend(torch.sigmoid(logits).detach().cpu().numpy())
            trues.extend(y.cpu().numpy())
    return np.mean(losses), np.array(preds), np.array(trues)

for ep in range(1, EPOCHS+1):
    tr_loss, _, _        = run_epoch(train_loader, True)
    val_loss, y_hat, yt  = run_epoch(test_loader,  False)
    f1  = f1_score(yt, y_hat >= 0.5)
    auc = roc_auc_score(yt, y_hat)
    print(f"[Epoch {ep}] train_loss={tr_loss:.4f} "
          f"val_loss={val_loss:.4f}  F1={f1:.3f}  ROC‑AUC={auc:.3f}")

# ─── 5. SAVE MODEL ───────────────────────────────────────────────────
Path("saved").mkdir(exist_ok=True)
torch.save(model.state_dict(), "saved/text_only_roberta.pt")
print("✅  Text‑only model saved to ./saved/")


Dataset size: 51874 examples


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                   

[Epoch 1] train_loss=0.3437 val_loss=0.3292  F1=0.827  ROC‑AUC=0.855


                                                   

[Epoch 2] train_loss=0.3199 val_loss=0.3222  F1=0.848  ROC‑AUC=0.866
✅  Text‑only model saved to ./saved/


In [23]:
# evaluate_f1.py --------------------------------------------------------
import json, os, joblib, torch, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

# ----- 1. Reprend la même classe que dans ton training -----------------
class TextMetaClassifier(torch.nn.Module):
    def __init__(self, meta_dim=20, hidden_meta=32, backbone="roberta-base"):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(backbone)
        for p in self.encoder.embeddings.parameters():
            p.requires_grad = False
        for layer in self.encoder.encoder.layer[:6]:
            for p in layer.parameters():
                p.requires_grad = False
        self.meta_fc = torch.nn.Sequential(
            torch.nn.Linear(meta_dim, hidden_meta),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
        )
        self.classifier = torch.nn.Linear(
            self.encoder.config.hidden_size + hidden_meta, 1
        )

    def forward(self, input_ids, attention_mask, meta):
        h_text = self.encoder(input_ids, attention_mask).pooler_output
        h_meta = self.meta_fc(meta)
        h = torch.cat([h_text, h_meta], dim=1)
        return self.classifier(h).squeeze(1)  # logits
# ----------------------------------------------------------------------

# ----- 2. paramètres ---------------------------------------------------
SEED, BATCH = 42, 16
BACKBONE = "roberta-base"
META_COLS = [
    "mean_pause_time_in_secs_threshold200",
    "total_insertions_chars_exclu_space",
    "total_pause_time_in_secs_threshold2000",
    "product_process_ratio",
    "mean_insertion_length_chars_exclu_space",
    "total_deletions_words",
    "mean_pause_time_before_sents_threshold200",
    "mean_deletion_length_chars",
    "num_of_insertions",
    "median_insertion_length_chars_exclu_space",
    "num_of_pause_within_words_threshold200",
    "sd_strokes_per_min_5_intervals",
    "median_length_Rburst_sec",
    "median_pause_time_between_words_threshold200",
    "num_of_pause_after_words_threshold200",
    "num_of_revisions",
    "sd_pause_time_before_words_threshold200",
    "total_number_of_pauses_threshold2000",
    "median_pause_time_before_words_threshold200",
    "mean_pause_time_before_words_threshold200",
]
tokenizer = AutoTokenizer.from_pretrained(BACKBONE)

# ----- 3. Recharger le jeu de données enrichi --------------------------
def load_dataset(folder, label):
    rows = []
    for fn in os.listdir(folder):
        if fn.endswith("_enriched.jsonl"):
            for line in open(Path(folder)/fn, encoding="utf-8"):
                obj = json.loads(line)
                meta = [obj["extra"].get(c, 0.0) for c in META_COLS]
                rows.append({"text": obj["text"], "meta": meta, "label": label})
    return rows

df = pd.DataFrame(
      load_dataset("OpenLLMText_Human", 0)
    + load_dataset("OpenLLMText_ChatGPT", 1)
)
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=SEED, stratify=df["label"]
)

# ----- 4. Appliquer le scaler sauvegardé -------------------------------
scaler = joblib.load("saved/meta_scaler.gz")
test_df["meta"] = list(scaler.transform(np.vstack(test_df["meta"])))

# ----- 5. DataLoader ----------------------------------------------------
class TextMetaDS(Dataset):
    def __init__(self, df):
        self.texts  = df["text"].tolist()
        self.metas  = torch.tensor(np.vstack(df["meta"]), dtype=torch.float32)
        self.labels = torch.tensor(df["label"].values, dtype=torch.float32)
    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        enc = tokenizer(self.texts[i], max_length=512, truncation=True,
                        padding="max_length", return_tensors="pt")
        return {"input_ids": enc["input_ids"].squeeze(0),
                "attention_mask": enc["attention_mask"].squeeze(0),
                "meta": self.metas[i],
                "label": self.labels[i]}

loader = DataLoader(TextMetaDS(test_df), batch_size=BATCH)

# ----- 6. Charger le modèle entraîné et évaluer ------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model = TextMetaClassifier(backbone=BACKBONE).to(device)
model.load_state_dict(torch.load("saved/text_meta_roberta.pt",
                                 map_location=device))
model.eval()

pred, true = [], []
with torch.no_grad():
    for batch in tqdm(loader):
        ids  = batch["input_ids"].to(device)
        msk  = batch["attention_mask"].to(device)
        meta = batch["meta"].to(device)
        y    = batch["label"].to(device)
        logits = model(ids, msk, meta)
        pred.extend(torch.sigmoid(logits).cpu().numpy())
        true.extend(y.cpu().numpy())

pred, true = np.array(pred), np.array(true)
f1  = f1_score(true, pred >= 0.5)
auc = roc_auc_score(true, pred)
tn, fp, fn, tp = confusion_matrix(true, pred >= 0.5).ravel()
fp_rate = fp / (fp + tn)
print(f"F1={f1:.3f}  ROC‑AUC={auc:.3f}  FP‑rate={fp_rate:.3%}")




Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("saved/text_meta_roberta.pt",
100%|██████████| 649/649 [04:52<00:00,  2.22it/s]

F1=0.969  ROC‑AUC=0.986  FP‑rate=19.460%





In [24]:
#!/usr/bin/env python3
# Evaluate text‑only RoBERTa model and compute FP‑rate
import json, os, torch, numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

# ---------------- parameters ------------------------------------------
SEED, BATCH = 42, 16
BACKBONE = "roberta-base"

torch.manual_seed(SEED); np.random.seed(SEED)

# ---------------- load enriched JSONL (text only) ---------------------
def load_dataset(folder, label):
    rows = []
    for fn in os.listdir(folder):
        if fn.endswith("_enriched.jsonl"):
            for line in open(Path(folder)/fn, encoding="utf-8"):
                obj = json.loads(line)
                rows.append({"text": obj["text"], "label": label})
    return rows

df = pd.DataFrame(
      load_dataset("OpenLLMText_Human", 0)
    + load_dataset("OpenLLMText_ChatGPT", 1)
)
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=SEED, stratify=df["label"]
)

# ---------------- DataLoader ------------------------------------------
tok = AutoTokenizer.from_pretrained(BACKBONE)
class TextOnlyDS(Dataset):
    def __init__(self, df):
        self.texts  = df["text"].tolist()
        self.labels = torch.tensor(df["label"].values, dtype=torch.float32)
    def __len__(self): return len(self.texts)
    def __getitem__(self, i):
        enc = tok(self.texts[i], max_length=512, truncation=True,
                  padding="max_length", return_tensors="pt")
        return {"input_ids": enc["input_ids"].squeeze(0),
                "attention_mask": enc["attention_mask"].squeeze(0),
                "label": self.labels[i]}

loader = DataLoader(TextOnlyDS(test_df), batch_size=BATCH)

# ---------------- model definition (same as training) -----------------
class TextOnlyClassifier(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.enc = AutoModel.from_pretrained(BACKBONE)
        for p in self.enc.embeddings.parameters():
            p.requires_grad = False
        for layer in self.enc.encoder.layer[:6]:
            for p in layer.parameters():
                p.requires_grad = False
        self.clf = torch.nn.Linear(self.enc.config.hidden_size, 1)
    def forward(self, ids, mask):
        h = self.enc(ids, attention_mask=mask).pooler_output
        return self.clf(h).squeeze(1)

device = "cuda" if torch.cuda.is_available() else "cpu"
model  = TextOnlyClassifier().to(device)
model.load_state_dict(torch.load("saved/text_only_roberta.pt",
                                 map_location=device))
model.eval()

# ---------------- inference & metrics ---------------------------------
pred, true = [], []
with torch.no_grad():
    for batch in tqdm(loader):
        ids  = batch["input_ids"].to(device)
        msk  = batch["attention_mask"].to(device)
        y    = batch["label"].to(device)
        logits = model(ids, msk)
        pred.extend(torch.sigmoid(logits).cpu().numpy())
        true.extend(y.cpu().numpy())

pred, true = np.array(pred), np.array(true)
threshold = 0.5
f1  = f1_score(true, pred >= threshold)
auc = roc_auc_score(true, pred)
tn, fp, fn, tp = confusion_matrix(true, pred >= threshold).ravel()
fp_rate = fp / (fp + tn)

print(f"F1={f1:.3f}   ROC‑AUC={auc:.3f}   FP‑rate={fp_rate:.3%}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("saved/text_only_roberta.pt",
100%|██████████| 649/649 [04:56<00:00,  2.19it/s]

F1=0.848   ROC‑AUC=0.866   FP‑rate=22.289%





In [None]:
# Cellule à lancer quand train_df / test_df existent déjà
#  et contiennent : text, meta (list[20]), label

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
import torch, torch.nn as nn, numpy as np
from tqdm.auto import tqdm
import joblib, os
from pathlib import Path

SEED, BATCH, EPOCHS, LR = 42, 64, 3, 3e-4
torch.manual_seed(SEED); np.random.seed(SEED)

# 1 ── NORMALISER les 20 features ---------------------------------------
scaler = StandardScaler().fit(np.vstack(train_df["meta"]))
train_df["meta"] = list(scaler.transform(np.vstack(train_df["meta"])))
test_df ["meta"] = list(scaler.transform(np.vstack(test_df ["meta"])))

# 2 ── DATASET / LOADER --------------------------------------------------
class MetaDS(Dataset):
    def __init__(self, df):
        self.x = torch.tensor(np.vstack(df["meta"]), dtype=torch.float32)
        self.y = torch.tensor(df["label"].values,     dtype=torch.float32)
    def __len__(self): return len(self.x)
    def __getitem__(self, i): return {"meta": self.x[i], "label": self.y[i]}

train_loader = DataLoader(MetaDS(train_df), BATCH, shuffle=True)
test_loader  = DataLoader(MetaDS(test_df),  BATCH)

# 3 ── MODELE ------------------------------------------------------------
class MetaOnlyMLP(nn.Module):
    def __init__(self, meta_dim=20, hidden=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(meta_dim, hidden), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(hidden, hidden//2), nn.ReLU(),
            nn.Linear(hidden//2, 1)
        )
    def forward(self, m): return self.net(m).squeeze(1)

device = "cuda" if torch.cuda.is_available() else "cpu"
model  = MetaOnlyMLP().to(device)
lossf  = nn.BCEWithLogitsLoss()
optim  = torch.optim.Adam(model.parameters(), lr=LR)

# 4 ── BOUCLE ------------------------------------------------------------
def run(loader, train=True):
    model.train() if train else model.eval()
    preds, trues, losses = [], [], []
    with torch.set_grad_enabled(train):
        for batch in tqdm(loader, leave=False):
            m = batch["meta"].to(device); y = batch["label"].to(device)

            logit = model(m)
            loss  = lossf(logit, y)

            if train:
                optim.zero_grad()
                loss.backward()
                optim.step()

            losses.append(loss.item())
            preds.extend(torch.sigmoid(logit).detach().cpu().numpy())  # ← fix
            trues.extend(y.cpu().numpy())
    return np.array(preds), np.array(trues), np.mean(losses)


for ep in range(1, EPOCHS+1):
    _, _, tr_loss       = run(train_loader, True)
    y_pred, y_true, vl  = run(test_loader,  False)
    f1  = f1_score(y_true, y_pred >= 0.5)
    auc = roc_auc_score(y_true, y_pred)
    tn, fp, _, _ = confusion_matrix(y_true, y_pred >= 0.5).ravel()
    fp_rate = fp / (fp + tn)
    print(f"[Ep {ep}] train_loss={tr_loss:.4f}  val_loss={vl:.4f}  "
          f"F1={f1:.3f}  ROC‑AUC={auc:.3f}  FP‑rate={fp_rate:.2%}")

# 5 ── SAUVEGARDE --------------------------------------------------------
Path("saved").mkdir(exist_ok=True)
torch.save(model.state_dict(), "saved/meta_only_mlp_split.pt")
joblib.dump(scaler, "saved/meta_scaler_split.gz")
print("✔️  Meta‑only model + scaler saved.")
