In [1]:
import os
from pathlib import Path

# ============================================================================
# CONFIGURATION: Update these paths based on your environment
# ============================================================================
# Base path to the dataset directory
BASE_DATASET_PATH = "/home/jivnesh/Harshit_Surge/dataset/eval_dataset"

# Alternatively, you can use relative paths:
# BASE_DATASET_PATH = Path(__file__).parent / "dataset" / "eval_dataset"

# Ensure the base path exists
if not os.path.exists(BASE_DATASET_PATH):
    print(f"Warning: Dataset path '{BASE_DATASET_PATH}' does not exist. Please update BASE_DATASET_PATH.")

# Define individual dataset file paths
RAID_EVAL_PATH = os.path.join(BASE_DATASET_PATH, "raid_eval.csv")
M4_EVAL_PATH = os.path.join(BASE_DATASET_PATH, "m4_eval.csv")
CHEAT_EVAL_PATH = os.path.join(BASE_DATASET_PATH, "cheat_eval.csv")
HC3_EVAL_PATH = os.path.join(BASE_DATASET_PATH, "hc3_eval.csv")
MAGE_EVAL_PATH = os.path.join(BASE_DATASET_PATH, "mage_eval.csv")

print("Configuration loaded:")
print(f"Base dataset path: {BASE_DATASET_PATH}")


Configuration loaded:
Base dataset path: /home/jivnesh/Harshit_Surge/dataset/eval_dataset


In [2]:
import pandas as pd
raid_eval = pd.read_csv(RAID_EVAL_PATH)
raid_eval["label"] = raid_eval["models"].replace({"human": 0, "ai": 1})
m4_eval = pd.read_csv(M4_EVAL_PATH)
cheat_eval = pd.read_csv(CHEAT_EVAL_PATH)
hc3_eval = pd.read_csv(HC3_EVAL_PATH)
mage_eval = pd.read_csv(MAGE_EVAL_PATH)


  raid_eval["label"] = raid_eval["models"].replace({"human": 0, "ai": 1})


In [4]:
# ============================================================
# Environment & Imports
# ============================================================

# !pip install torchview
# !pip install torchviz

import os
import json
import random
import warnings
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from tqdm import tqdm

warnings.filterwarnings("ignore")

# ============================================================
# Reproducibility & Device
# ============================================================

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

set_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

# ============================================================
# Style Contrastive Encoder
# ============================================================

class StyleContrastiveEncoder(nn.Module):
    def __init__(
        self,
        base_model: str = "microsoft/deberta-v3-base",
        embedding_dim: int = 256,
        dropout: float = 0.1,
        freeze_backbone: bool = False,
    ):
        super().__init__()

        self.backbone = AutoModel.from_pretrained(base_model)
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)

        if freeze_backbone:
            for p in self.backbone.parameters():
                p.requires_grad = False

        backbone_dim = self.backbone.config.hidden_size

        self.projection_head = nn.Sequential(
            nn.Linear(backbone_dim, backbone_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(backbone_dim // 2, embedding_dim),
            nn.LayerNorm(embedding_dim),
        )

        print(f"Initialized encoder: {backbone_dim} → {embedding_dim}")

    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        hidden_states = outputs.last_hidden_state
        attn = attention_mask.unsqueeze(-1).float()

        pooled = (hidden_states * attn).sum(1) / attn.sum(1)
        emb = self.projection_head(pooled)
        emb = F.normalize(emb, p=2, dim=1)

        return emb

# ============================================================
# InfoNCE Loss (unchanged)
# ============================================================

class InfoNCELoss(nn.Module):
    def __init__(self, temperature: float = 0.1):
        super().__init__()
        self.temperature = temperature

    def forward(self, anchor, positive, negative):
        pos_sim = torch.sum(anchor * positive, dim=1) / self.temperature
        neg_sim = torch.sum(anchor * negative, dim=1) / self.temperature

        logits = torch.stack([pos_sim, neg_sim], dim=1)
        labels = torch.zeros(anchor.size(0), dtype=torch.long, device=anchor.device)

        loss = F.cross_entropy(logits, labels)
        acc = (pos_sim > neg_sim).float().mean()

        return loss, acc

# ============================================================
# Embedding Extraction (Frozen Encoder)
# ============================================================

@torch.no_grad()
def extract_embeddings(
    model,
    tokenizer,
    texts: List[str],
    device,
    batch_size: int = 64,
    max_length: int = 512,
):
    model.eval()
    encoder = model.module if hasattr(model, "module") else model

    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Extracting embeddings"):
        batch = texts[i : i + batch_size]
        toks = tokenizer(
            batch,
            max_length=max_length,
            truncation=True,
            padding=True,
            return_tensors="pt",
        ).to(device)

        emb = encoder(toks["input_ids"], toks["attention_mask"])
        all_embs.append(emb.cpu())

    return torch.cat(all_embs, dim=0).numpy()

# ============================================================
# MLP Probe
# ============================================================

class MLPProbe(nn.Module):
    def __init__(
        self,
        emb_dim: int = 256,
        hidden_dim: int = 128,
        num_classes: int = 2,
        dropout: float = 0.2,
    ):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, x):
        return self.net(x)

# ============================================================
# Safe Torch Loading (PyTorch ≥2.6 compatible)
# ============================================================

try:
    torch.serialization.add_safe_globals([np.core.multiarray.scalar])
except Exception:
    pass

def safe_load(path: str, map_location):
    try:
        return torch.load(path, map_location=map_location)
    except Exception as e:
        print(f"Safe load failed for {path}: {e}")
        print("Retrying with weights_only=False (trusted file only)")
        return torch.load(path, map_location=map_location, weights_only=False)

# ============================================================
# MLP-Based Style Detector
# ============================================================

class MLPStyleDetector:
    def __init__(self, model_path: str, probe_path: str, device):
        self.device = device

        self.model = StyleContrastiveEncoder().to(device)
        ckpt = safe_load(model_path, device)
        enc_state = ckpt.get("model_state_dict", ckpt)

        if enc_state and next(iter(enc_state)).startswith("module."):
            enc_state = {k[7:]: v for k, v in enc_state.items()}

        self.model.load_state_dict(enc_state, strict=False)
        self.model.eval()
        self.tokenizer = self.model.tokenizer

        probe_ckpt = safe_load(probe_path, device)
        emb_dim = probe_ckpt["emb_dim"]

        self.probe = MLPProbe(emb_dim=emb_dim).to(device)
        self.probe.load_state_dict(probe_ckpt["probe_state_dict"])
        self.probe.eval()

        self.meta = probe_ckpt.get("meta", {})
        print(f"Loaded encoder + probe | best_val_f1={self.meta.get('best_val_f1', '?')}")

    @torch.no_grad()
    def batch_predict(self, texts: List[str], batch_size: int = 64):
        preds = []
        for i in tqdm(range(0, len(texts), batch_size), desc="MLPDetect"):
            batch = texts[i : i + batch_size]
            toks = self.tokenizer(
                batch,
                max_length=512,
                truncation=True,
                padding=True,
                return_tensors="pt",
            ).to(self.device)

            emb = self.model(toks["input_ids"], toks["attention_mask"])
            logits = self.probe(emb)
            preds.extend(logits.argmax(dim=1).cpu().tolist())

        return preds

# ============================================================
# Evaluation
# ============================================================

def evaluate_with_mlp(detector, df: pd.DataFrame, name: str, batch_size: int = 64):
    texts = df["text"].astype(str).tolist()
    labels = df["label"].tolist()

    preds = detector.batch_predict(texts, batch_size=batch_size)
    acc = accuracy_score(labels, preds)

    print(f"\nEvaluating {name} (n={len(df)})")
    print(classification_report(labels, preds, target_names=["Human", "AI"], zero_division=0))

    cm = confusion_matrix(labels, preds)
    tn, fp, fn, tp = cm.ravel()
    fpr = fp / (fp + tn) if (fp + tn) else 0.0

    print("Confusion Matrix:\n", cm)
    print(f"False Positive Rate: {fpr:.4f}")

    return {"name": name, "accuracy": acc, "fpr": fpr, "cm": cm}

# ============================================================
# Main Execution
# ============================================================

MODEL_PATH = "best_style_model.pt"
PROBE_PATH = "mlp_probe.pt"

if os.path.exists(MODEL_PATH) and os.path.exists(PROBE_PATH):
    detector = MLPStyleDetector(MODEL_PATH, PROBE_PATH, device)
    results = []

    if "raid_eval" in globals():
        results.append(evaluate_with_mlp(detector, raid_eval, "RAID Eval"))
    if "m4_eval" in globals():
        results.append(evaluate_with_mlp(detector, m4_eval, "M4 Eval"))
    if "cheat_eval" in globals():
        results.append(evaluate_with_mlp(detector, cheat_eval, "Cheat Eval"))
    if "hc3_eval" in globals():
        results.append(evaluate_with_mlp(detector, hc3_eval, "HC3 Eval"))
    if "mage_eval" in globals():
        results.append(evaluate_with_mlp(detector, mage_eval, "MAGE Eval"))

    if results:
        summary = pd.DataFrame(
            [{"Dataset": r["name"], "Accuracy": r["accuracy"], "FPR": r["fpr"]} for r in results]
        )
        print("\nSummary:\n", summary)
else:
    print("Missing best_style_model.pt or mlp_probe.pt")


Using device: cuda
GPU 0: NVIDIA A100 80GB PCIe
GPU 1: NVIDIA A100 80GB PCIe
GPU 2: NVIDIA A100 80GB PCIe
GPU 3: NVIDIA A100 80GB PCIe


Initialized encoder: 768 → 256
Safe load failed for mlp_probe.pt: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL numpy.core.multiarray.scalar was not an allowed global by default. Please use `torch.serialization.add_safe_globals([numpy.core.multiarray.scalar])` or the `torch.serialization.safe_globals([numpy.core.multiarray.scalar])` context manager to allowlist this global if you trust this class/function.

C

MLPDetect: 100%|██████████| 188/188 [03:39<00:00,  1.17s/it]



Evaluating RAID Eval (n=12000)
              precision    recall  f1-score   support

       Human       0.93      0.98      0.96      4000
          AI       0.99      0.97      0.98      8000

    accuracy                           0.97     12000
   macro avg       0.96      0.97      0.97     12000
weighted avg       0.97      0.97      0.97     12000

Confusion Matrix:
 [[3936   64]
 [ 275 7725]]
False Positive Rate: 0.0160


MLPDetect: 100%|██████████| 157/157 [02:18<00:00,  1.13it/s]



Evaluating M4 Eval (n=10000)
              precision    recall  f1-score   support

       Human       0.51      0.99      0.68      5000
          AI       0.82      0.07      0.12      5000

    accuracy                           0.53     10000
   macro avg       0.67      0.53      0.40     10000
weighted avg       0.67      0.53      0.40     10000

Confusion Matrix:
 [[4926   74]
 [4662  338]]
False Positive Rate: 0.0148


MLPDetect: 100%|██████████| 157/157 [00:58<00:00,  2.67it/s]


Evaluating Cheat Eval (n=10000)
              precision    recall  f1-score   support

       Human       1.00      0.94      0.97      5000
          AI       0.95      1.00      0.97      5000

    accuracy                           0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000

Confusion Matrix:
 [[4714  286]
 [  13 4987]]
False Positive Rate: 0.0572

Summary:
       Dataset  Accuracy     FPR
0   RAID Eval   0.97175  0.0160
1     M4 Eval   0.52640  0.0148
2  Cheat Eval   0.97010  0.0572





In [None]:
import os
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Union, List
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle

# --- Pre-computation Setup & Warnings ---
torch.set_grad_enabled(False)

# --- Metric Functions (UNCHANGED) ---
ce_loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
softmax_fn = torch.nn.Softmax(dim=-1)

def perplexity(encoding: transformers.BatchEncoding, logits: torch.Tensor, median: bool = False, temperature: float = 1.0):
    shifted_logits = logits[..., :-1, :].contiguous() / temperature
    shifted_labels = encoding.input_ids[..., 1:].contiguous()
    shifted_attention_mask = encoding.attention_mask[..., 1:].contiguous()
    if median:
        ce_nan = (ce_loss_fn(shifted_logits.transpose(1, 2), shifted_labels).masked_fill(~shifted_attention_mask.bool(), float("nan")))
        ppl = np.nanmedian(ce_nan.cpu().float().numpy(), 1)
    else:
        ppl = (ce_loss_fn(shifted_logits.transpose(1, 2), shifted_labels) * shifted_attention_mask).sum(1) / shifted_attention_mask.sum(1)
        ppl = ppl.to("cpu").float().numpy()
    return ppl

def entropy(p_logits: torch.Tensor, q_logits: torch.Tensor, encoding: transformers.BatchEncoding, pad_token_id: int, median: bool = False, sample_p: bool = False, temperature: float = 1.0):
    vocab_size = p_logits.shape[-1]
    total_tokens_available = q_logits.shape[-2]
    p_scores, q_scores = p_logits / temperature, q_logits / temperature
    p_proba = softmax_fn(p_scores).view(-1, vocab_size)
    if sample_p:
        p_proba = torch.multinomial(p_proba.view(-1, vocab_size), replacement=True, num_samples=1).view(-1)
    q_scores = q_scores.view(-1, vocab_size)
    ce = ce_loss_fn(input=q_scores, target=p_proba).view(-1, total_tokens_available)
    padding_mask = (encoding.input_ids != pad_token_id).type(torch.uint8)
    if median:
        ce_nan = ce.masked_fill(~padding_mask.bool(), float("nan"))
        agg_ce = np.nanmedian(ce_nan.cpu().float().numpy(), 1)
    else:
        agg_ce = (((ce * padding_mask).sum(1) / padding_mask.sum(1)).to("cpu").float().numpy())
    return agg_ce

# --- Huggingface token (if any) ---
huggingface_config = {"TOKEN": os.environ.get("HF_TOKEN", None)}

# NOTE: The threshold is based on the original Falcon models.
BINOCULARS_ACCURACY_THRESHOLD = 0.9015310749276843

# --- Device check ---
if not torch.cuda.is_available():
    print("Warning: CUDA not available. This code expects 4 GPUs for best performance.")
CUDA_DEVICE_COUNT = torch.cuda.device_count()
print(f"CUDA available: {torch.cuda.is_available()}, GPU count: {CUDA_DEVICE_COUNT}")

# --- Utilities to build max_memory map for 4 x A100 (80GB) ---
def build_max_memory_map_for_a100(reserve_mb_per_gpu: int = 2000):
    """
    Build a max_memory mapping tuned for 4x A100 80GB.
    reserve_mb_per_gpu - memory left free on each GPU for activations/tensors.
    """
    max_memory = {}
    for i in range(torch.cuda.device_count()):
        total_bytes = torch.cuda.get_device_properties(i).total_memory
        total_mb = total_bytes // (1024 * 1024)
        avail_mb = max(0, total_mb - reserve_mb_per_gpu)
        max_memory[f"cuda:{i}"] = f"{avail_mb}MB"
    # CPU budget for offloading
    max_memory["cpu"] = "30000MB"
    return max_memory

# --- Binoculars Class (tuned for 4xA100) ---
class Binoculars(object):
    def __init__(self,
                 observer_name_or_path: str = "tiiuae/falcon-7b",
                 performer_name_or_path: str = "tiiuae/falcon-7b-instruct",
                 use_bfloat16: bool = True,
                 max_token_observed: int = 512,
                 reserve_mb_per_gpu: int = 2000,
                 explicit_split: bool = False):
        """
        Robust loader that retries with integer-keyed max_memory if transformers complains
        'Device cuda:0 is not recognized, available devices are integers(...)'.
        """
        print("Initializing Binoculars (robust multi-GPU loader)...")
        self.threshold = BINOCULARS_ACCURACY_THRESHOLD
        dtype = torch.bfloat16 if use_bfloat16 else torch.float32
        hf_token = huggingface_config.get("TOKEN", None)
        auth_args = {"use_auth_token": hf_token} if hf_token else {}

        # build both styles of max_memory maps
        n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
        max_memory_str = {}
        max_memory_int = {}
        for i in range(n_gpus):
            total_bytes = torch.cuda.get_device_properties(i).total_memory
            total_mb = total_bytes // (1024 * 1024)
            avail_mb = max(0, total_mb - reserve_mb_per_gpu)
            max_memory_str[f"cuda:{i}"] = f"{avail_mb}MB"
            max_memory_int[i] = f"{avail_mb}MB"
        max_memory_str["cpu"] = "30000MB"
        max_memory_int["cpu"] = "30000MB"

        def try_from_pretrained(name_or_path, device_map, max_memory_map):
            """Wrap from_pretrained to show clearer errors."""
            return AutoModelForCausalLM.from_pretrained(
                name_or_path,
                device_map=device_map,
                max_memory=max_memory_map,
                torch_dtype=dtype,
                trust_remote_code=True,
                **auth_args
            )

        try:
            if explicit_split and n_gpus >= 4:
                # Create explicit split maps (both styles)
                max_memory_obs_str = {"cuda:0": max_memory_str.get("cuda:0", "0MB"),
                                      "cuda:1": max_memory_str.get("cuda:1", "0MB"),
                                      "cuda:2": "2000MB", "cuda:3": "2000MB", "cpu": "30000MB"}
                max_memory_perf_str = {"cuda:0": "2000MB", "cuda:1": "2000MB",
                                       "cuda:2": max_memory_str.get("cuda:2", "0MB"),
                                       "cuda:3": max_memory_str.get("cuda:3", "0MB"),
                                       "cpu": "30000MB"}

                max_memory_obs_int = {0: max_memory_int.get(0, "0MB"),
                                      1: max_memory_int.get(1, "0MB"),
                                      2: "2000MB", 3: "2000MB", "cpu": "30000MB"}
                max_memory_perf_int = {0: "2000MB", 1: "2000MB",
                                       2: max_memory_int.get(2, "0MB"),
                                       3: max_memory_int.get(3, "0MB"),
                                       "cpu": "30000MB"}

                # Try string-keyed first, then fallback to int-keyed if needed
                try:
                    print("Loading observer (explicit split) with string-keyed max_memory...")
                    self.observer_model = try_from_pretrained(observer_name_or_path, device_map="auto", max_memory_map=max_memory_obs_str)
                    print("Loading performer (explicit split) with string-keyed max_memory...")
                    self.performer_model = try_from_pretrained(performer_name_or_path, device_map="auto", max_memory_map=max_memory_perf_str)
                    used_int_keys = False
                except Exception as e:
                    msg = str(e)
                    if "Device cuda:0 is not recognized" in msg or "available devices are integers" in msg:
                        print("Detected device-keying mismatch; retrying explicit split with integer-keyed max_memory...")
                        self.observer_model = try_from_pretrained(observer_name_or_path, device_map="auto", max_memory_map=max_memory_obs_int)
                        self.performer_model = try_from_pretrained(performer_name_or_path, device_map="auto", max_memory_map=max_memory_perf_int)
                        used_int_keys = True
                    else:
                        raise

                # Set preferred input devices (string style for torch.device)
                self.obs_input_device = "cuda:0"
                self.perf_input_device = "cuda:2"

            else:
                # Auto-shard across GPUs: try string-keyed map first, fallback to integer-keyed map.
                try:
                    print("Attempting to load models with string-keyed max_memory map...")
                    print(f"max_memory map (string keys): {max_memory_str}")
                    self.observer_model = try_from_pretrained(observer_name_or_path, device_map="auto", max_memory_map=max_memory_str)
                    self.performer_model = try_from_pretrained(performer_name_or_path, device_map="auto", max_memory_map=max_memory_str)
                    used_int_keys = False
                except Exception as e:
                    msg = str(e)
                    if "Device cuda:0 is not recognized" in msg or "available devices are integers" in msg:
                        print("Transformers wants integer device keys — retrying with integer-keyed max_memory map...")
                        print(f"max_memory map (int keys): {max_memory_int}")
                        self.observer_model = try_from_pretrained(observer_name_or_path, device_map="auto", max_memory_map=max_memory_int)
                        self.performer_model = try_from_pretrained(performer_name_or_path, device_map="auto", max_memory_map=max_memory_int)
                        used_int_keys = True
                    else:
                        raise

                # choose default input devices
                self.obs_input_device = "cuda:0" if torch.cuda.is_available() and n_gpus > 0 else "cpu"
                self.perf_input_device = "cuda:0" if torch.cuda.is_available() and n_gpus > 0 else "cpu"

            # finalize
            self.observer_model.eval()
            self.performer_model.eval()
            self.tokenizer = AutoTokenizer.from_pretrained(observer_name_or_path, **auth_args)
            if not self.tokenizer.pad_token:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            self.max_token_observed = max_token_observed
            print("Binoculars initialized successfully.")
        except Exception as e:
            print("Error during model initialization (final):", e)
            raise


    def _tokenize(self, batch: list[str]) -> transformers.BatchEncoding:
        # tokenization on CPU; we will move encodings to GPU devices before model call
        return self.tokenizer(batch, return_tensors="pt", padding="longest", truncation=True, max_length=self.max_token_observed, return_token_type_ids=False)

    @torch.inference_mode()
    def _get_logits(self, encodings: transformers.BatchEncoding) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Send the encodings to each model's preferred input device.
        (With HF sharded models, sending to cuda:0 is typical; explicit_split uses different devices.)
        """
        # Move encodings to observer input device
        observer_device = torch.device(self.obs_input_device) if torch.cuda.is_available() else torch.device("cpu")
        performer_device = torch.device(self.perf_input_device) if torch.cuda.is_available() else torch.device("cpu")

        # .to(device) works on BatchEncoding
        obs_enc = {k: v.to(observer_device) for k, v in encodings.items()}
        perf_enc = {k: v.to(performer_device) for k, v in encodings.items()}

        observer_out = self.observer_model(**obs_enc)
        performer_out = self.performer_model(**perf_enc)

        # logits (shape: batch, seq_len, vocab)
        observer_logits = observer_out.logits
        performer_logits = performer_out.logits

        # synchronize if CUDA
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        return observer_logits, performer_logits

    def compute_score(self, input_text: Union[str, List[str]]) -> Union[float, List[float]]:
        batch = [input_text] if isinstance(input_text, str) else input_text
        encodings = self._tokenize(batch)
        observer_logits, performer_logits = self._get_logits(encodings)
        # we send encodings to performer device for ppl calculation to ensure mask alignments
        # choose the performer device for perplexity computation (works fine)
        perf_device = torch.device(self.perf_input_device) if torch.cuda.is_available() else torch.device("cpu")
        ppl_val = perplexity(encodings.to(perf_device), performer_logits)
        x_ppl_val = entropy(observer_logits.to(observer_logits.device), performer_logits.to(performer_logits.device), encodings.to(perf_device), self.tokenizer.pad_token_id)
        binoculars_scores = ppl_val / x_ppl_val
        return binoculars_scores.tolist()[0] if isinstance(input_text, str) else binoculars_scores.tolist()

# --- Evaluation Function (MODIFIED to allow big batch sizes) ---
def evaluate_on_dataframe(df: pd.DataFrame, batch_size: int = 32):
    print("\n--- Starting Binoculars Evaluation (labels: 0=human, 1=ai) ---")
    try:
        binoculars = Binoculars()
    except Exception as e:
        print(f"\n--- ERROR ---\nFailed to initialize Binoculars classifier: {e}")
        return

    text_samples = df["text"].tolist()
    y_true = df["label"].astype(int).tolist()
    y_pred, all_scores = [], []

    print(f"\nRunning predictions on {len(text_samples)} samples in batches of {batch_size}...")
    for i in tqdm(range(0, len(text_samples), batch_size), desc="Processing Batches"):
        batch_texts = text_samples[i:i + batch_size]
        batch_scores = binoculars.compute_score(batch_texts)
        batch_preds = (np.array(batch_scores) < binoculars.threshold).astype(int).tolist()  # 1=ai, 0=human
        y_pred.extend(batch_preds)
        all_scores.extend(batch_scores)

    results_df = pd.DataFrame({
        'true_label': y_true,
        'predicted_label': y_pred,
        'binoculars_score': all_scores
    })

    y_true_arr = np.array(y_true, dtype=int)
    y_pred_arr = np.array(y_pred, dtype=int)
    y_scores = -np.array(all_scores, dtype=float)  # higher => more likely ai

    print("\n" + "="*50)
    print("      Binoculars Classification Statistics")
    print("="*50)

    print("\n--- Classification Report (labels 0/1) ---")
    try:
        print(classification_report(y_true_arr, y_pred_arr, labels=[0, 1], digits=4))
    except Exception as e:
        print(f"Could not print classification report: {e}")

    accuracy = accuracy_score(y_true_arr, y_pred_arr)
    print(f"\nOverall Accuracy: {accuracy:.4f}")

    try:
        auc_score = roc_auc_score(y_true_arr, y_scores)
        print(f"ROC AUC Score: {auc_score:.4f}")
    except ValueError as e:
        print(f"Could not calculate ROC AUC Score: {e}")

    try:
        cm = confusion_matrix(y_true_arr, y_pred_arr, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

        print("\n--- Confusion Matrix (rows=true, cols=pred) ---")
        print(cm)
        print(f"\nFalse Positive Rate (human misclassified as ai): {fpr:.4f}")
    except ValueError as e:
        print(f"Could not calculate Confusion Matrix or FPR: {e}")

    print("\n" + "="*50)
    return results_df

# --- MAIN EXECUTION (example) ---
if __name__ == "__main__":
    print("Loading and preparing the dataset (assumes raid_eval, m4_eval, cheat_eval exist)...")
    # You can increase batch_size as your memory+seq length allows (try 32 -> 64 -> 128)
    evaluate_on_dataframe(raid_eval, batch_size=32)
    evaluate_on_dataframe(m4_eval, batch_size=32)
    evaluate_on_dataframe(cheat_eval, batch_size=32)


CUDA available: True, GPU count: 4
Loading and preparing the dataset (assumes raid_eval, m4_eval, cheat_eval exist)...

--- Starting Binoculars Evaluation (labels: 0=human, 1=ai) ---
Initializing Binoculars (robust multi-GPU loader)...
Attempting to load models with string-keyed max_memory map...
max_memory map (string keys): {'cuda:0': '79037MB', 'cuda:1': '79037MB', 'cuda:2': '79037MB', 'cuda:3': '79037MB', 'cpu': '30000MB'}


Transformers wants integer device keys — retrying with integer-keyed max_memory map...
max_memory map (int keys): {0: '79037MB', 1: '79037MB', 2: '79037MB', 3: '79037MB', 'cpu': '30000MB'}


Loading checkpoint shards: 100%|██████████| 2/2 [00:36<00:00, 18.07s/it]


Loading checkpoint shards: 100%|██████████| 2/2 [00:36<00:00, 18.49s/it]


Binoculars initialized successfully.

Running predictions on 12000 samples in batches of 32...


Processing Batches: 100%|██████████| 375/375 [29:08<00:00,  4.66s/it]



      Binoculars Classification Statistics

--- Classification Report (labels 0/1) ---
              precision    recall  f1-score   support

           0     0.6865    0.8912    0.7756      4000
           1     0.9361    0.7965    0.8607      8000

    accuracy                         0.8281     12000
   macro avg     0.8113    0.8439    0.8181     12000
weighted avg     0.8529    0.8281    0.8323     12000


Overall Accuracy: 0.8281
ROC AUC Score: 0.9301

--- Confusion Matrix (rows=true, cols=pred) ---
[[3565  435]
 [1628 6372]]

False Positive Rate (human misclassified as ai): 0.1087


--- Starting Binoculars Evaluation (labels: 0=human, 1=ai) ---
Initializing Binoculars (robust multi-GPU loader)...
Attempting to load models with string-keyed max_memory map...
max_memory map (string keys): {'cuda:0': '79037MB', 'cuda:1': '79037MB', 'cuda:2': '79037MB', 'cuda:3': '79037MB', 'cpu': '30000MB'}
Transformers wants integer device keys — retrying with integer-keyed max_memory map...
max_

Loading checkpoint shards: 100%|██████████| 2/2 [00:31<00:00, 15.82s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:21<00:00, 10.91s/it]


Binoculars initialized successfully.

Running predictions on 10000 samples in batches of 32...


Processing Batches:  52%|█████▏    | 163/313 [12:01<11:02,  4.42s/it]

In [3]:
import os
import time
import torch
import pandas as pd
import numpy as np
from scipy.stats import norm
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, accuracy_score
from sklearn.utils import shuffle
from transformers import AutoModelForCausalLM, AutoTokenizer

# A dictionary to map short names to Hugging Face model identifiers
model_fullnames = {
    'gemma3-4b': 'google/gemma-3-4b-it', # Use instruct-tuned version for better performance
}

def get_model_fullname(model_name):
    return model_fullnames.get(model_name, model_name)

def load_model(model_name, device, cache_dir, quantization=None):
    model_fullname = get_model_fullname(model_name)
    print(f'Loading model {model_fullname}...')
    model_kwargs = {"cache_dir": cache_dir}
    print("-> Loading model in bfloat16 (half-precision)...")
    model_kwargs["torch_dtype"] = torch.bfloat16
    model_kwargs["device_map"] = "auto"
    model = AutoModelForCausalLM.from_pretrained(model_fullname, **model_kwargs)
    model.eval()
    return model

def load_tokenizer(model_name, cache_dir):
    model_fullname = get_model_fullname(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_fullname, cache_dir=cache_dir)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    return tokenizer

def get_sampling_discrepancy_analytic(logits_ref, logits_score, labels):
    if logits_ref.size(-1) != logits_score.size(-1):
        vocab_size = min(logits_ref.size(-1), logits_score.size(-1))
        logits_ref = logits_ref[:, :, :vocab_size]
        logits_score = logits_score[:, :, :vocab_size]
    labels = labels.unsqueeze(-1) if labels.ndim == logits_score.ndim - 1 else labels
    lprobs_score = torch.log_softmax(logits_score, dim=-1)
    probs_ref = torch.softmax(logits_ref, dim=-1)
    log_likelihood = lprobs_score.gather(dim=-1, index=labels).squeeze(-1)
    mean_ref = (probs_ref * lprobs_score).sum(dim=-1)
    var_ref = (probs_ref * torch.square(lprobs_score)).sum(dim=-1) - torch.square(mean_ref)
    log_likelihood_sum = log_likelihood.sum(dim=-1)
    mean_ref_sum = mean_ref.sum(dim=-1)
    var_ref_sum = var_ref.sum(dim=-1)
    denominator = torch.sqrt(torch.relu(var_ref_sum)) + 1e-6
    discrepancy = (log_likelihood_sum - mean_ref_sum) / denominator
    return discrepancy.item()

def compute_prob_norm(x, mu0, sigma0, mu1, sigma1):
    pdf_value0 = norm.pdf(x, loc=mu0, scale=sigma0)
    pdf_value1 = norm.pdf(x, loc=mu1, scale=sigma1)
    prob = pdf_value1 / (pdf_value0 + pdf_value1 + 1e-6)
    return prob

class FastDetectGPTDetector:
    def __init__(self, scoring_model_name, sampling_model_name, device, cache_dir, quantization):
        self.scoring_model_name = scoring_model_name
        self.sampling_model_name = sampling_model_name
        self.scoring_tokenizer = load_tokenizer(scoring_model_name, cache_dir)
        self.scoring_model = load_model(scoring_model_name, device, cache_dir, quantization)
        if sampling_model_name == scoring_model_name:
            self.sampling_model = self.scoring_model
            self.sampling_tokenizer = self.scoring_tokenizer
        else:
            self.sampling_tokenizer = load_tokenizer(sampling_model_name, cache_dir)
            self.sampling_model = load_model(sampling_model_name, device, cache_dir, quantization)
        # Using pre-calibrated parameters
        self.classifier_params = {'mu0': -0.0707, 'sigma0': 0.9520, 'mu1': 2.9306, 'sigma1': 1.9039}

    def compute_prob(self, text):
        tokenized_score = self.scoring_tokenizer(text, truncation=True, return_tensors="pt", max_length=1024)
        labels = tokenized_score.input_ids[:, 1:].to(self.scoring_model.device)
        if labels.shape[1] == 0:
            return 0.0
        with torch.no_grad():
            inputs_score = {k: v.to(self.scoring_model.device) for k, v in tokenized_score.items()}
            logits_score = self.scoring_model(**inputs_score).logits[:, :-1]
            if self.sampling_model_name == self.scoring_model_name:
                logits_ref = logits_score
            else:
                tokenized_ref = self.sampling_tokenizer(text, truncation=True, return_tensors="pt", max_length=1024)
                inputs_ref = {k: v.to(self.sampling_model.device) for k, v in tokenized_ref.items()}
                logits_ref = self.sampling_model(**inputs_ref).logits[:, :-1]
        crit = get_sampling_discrepancy_analytic(logits_ref, logits_score, labels)
        prob = compute_prob_norm(crit, **self.classifier_params)
        return prob

# --- Script Configuration ---
SCORING_MODEL_NAME = "gemma3-4b"
SAMPLING_MODEL_NAME = "gemma3-4b" # Use the same model for simplicity
DEVICE = "cuda:03" if torch.cuda.is_available() else "cpu"
CACHE_DIR = "./model_cache"
OUTPUT_FILE = "fastdetectgpt_gemma_results.csv"

# --- Main Execution ---
print("--- Initializing Fast-DetectGPT Detector ---")
detector = FastDetectGPTDetector(
    scoring_model_name=SCORING_MODEL_NAME,
    sampling_model_name=SAMPLING_MODEL_NAME,
    device=DEVICE,
    cache_dir=CACHE_DIR,
    quantization=None
)


def evaluate_detector(df, detector):
    """
    Runs the Fast-DetectGPT detector on a given DataFrame and evaluates the results.

    Args:
        df (pd.DataFrame): DataFrame with 'text' and 'label' columns. 
                           'label' should be 0 for human and 1 for AI.
        detector (FastDetectGPTDetector): An initialized detector instance.
        output_filename (str): The path to save the detailed results CSV file.
    """
    print(f"\n--- Running detection on {len(df)} samples ---")
    all_probs = []
    true_labels = []

    # Ensure the DataFrame has the required columns
    if 'text' not in df.columns or 'label' not in df.columns:
        raise ValueError("Input DataFrame must contain 'text' and 'label' columns.")

    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing samples"):
        try:
            prob = detector.compute_prob(row['text'])
            all_probs.append(prob)
            true_labels.append(row['label'])
        except Exception as e:
            print(f"Error processing sample (index {index}): {e}. Skipping.")
            continue

    # --- Evaluating Results ---
    print("\n" + "="*50)
    print("      Fast-DetectGPT Classification Statistics")
    print("="*50)

    if len(all_probs) > 0 and len(set(true_labels)) > 1:
        # Convert probabilities to binary predictions for classification report
        binary_predictions = [1 if p > 0.5 else 0 for p in all_probs]

        # 1. Classification Report (Precision, Recall, F1-Score)
        print("\n--- Classification Report ---")
        print(classification_report(true_labels, binary_predictions, target_names=['Human', 'AI']))

        # 2. Overall Accuracy
        accuracy = accuracy_score(true_labels, binary_predictions)
        print(f"\nOverall Accuracy: {accuracy:.4f}")

        # 3. ROC AUC Score
        roc_auc = roc_auc_score(true_labels, all_probs)
        print(f"ROC AUC Score: {roc_auc:.4f}")

        # 4. Confusion Matrix and FPR
        cm = confusion_matrix(true_labels, binary_predictions)
        tn, fp, fn, tp = cm.ravel()
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

        print("\n--- Confusion Matrix ---")
        print(f"{'':<15} | {'Predicted Human':<15} | {'Predicted AI':<15}")
        print("-" * 50)
        print(f"{'Actual Human':<15} | {tn:<15} | {fp:<15}")
        print(f"{'Actual AI':<15} | {fn:<15} | {tp:<15}")
        print("-" * 50)
        print(f"\nFalse Positive Rate (FPR): {fpr:.4f} (Human text incorrectly flagged as AI)")
        print("\n" + "="*50)

        # results_df = pd.DataFrame({'text': df['text'], 'true_label': true_labels, 'predicted_prob_ai': all_probs})
        # output_filename="fastdetectgpt_gemma_results.csv"
        # results_df.to_csv(output_filename, index=False)
        # print(f"Detailed results saved to {output_filename}")
    else:
        print(f"Could not compute metrics. Processed {len(all_probs)} samples.")

# Example usage with one of the provided dataframes (e.g., cheat_eval)
# You can replace `cheat_eval` with `m4_eval` or `raid_eval`
# Note: `raid_eval` has a 'models' column, but the function uses the 'label' column as requested.
evaluate_detector(cheat_eval, detector)
evaluate_detector(m4_eval, detector)
evaluate_detector(raid_eval, detector)

  from .autonotebook import tqdm as notebook_tqdm


--- Initializing Fast-DetectGPT Detector ---
Loading model google/gemma-3-4b-it...
-> Loading model in bfloat16 (half-precision)...


`torch_dtype` is deprecated! Use `dtype` instead!
2026-01-31 05:49:28.722833: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-31 05:49:28.753859: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769806168.788149 1610340 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769806168.797663 1610340 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769806168.821830 1610340 computation_placer.cc:177] computation placer already r


--- Running detection on 10000 samples ---


Processing samples: 100%|██████████| 10000/10000 [13:09<00:00, 12.66it/s]



      Fast-DetectGPT Classification Statistics

--- Classification Report ---
              precision    recall  f1-score   support

       Human       0.52      1.00      0.69      5000
          AI       0.97      0.09      0.17      5000

    accuracy                           0.54     10000
   macro avg       0.75      0.54      0.43     10000
weighted avg       0.75      0.54      0.43     10000


Overall Accuracy: 0.5436
ROC AUC Score: 0.8214

--- Confusion Matrix ---
                | Predicted Human | Predicted AI   
--------------------------------------------------
Actual Human    | 4985            | 15             
Actual AI       | 4549            | 451            
--------------------------------------------------

False Positive Rate (FPR): 0.0030 (Human text incorrectly flagged as AI)


--- Running detection on 10000 samples ---


Processing samples: 100%|██████████| 10000/10000 [15:09<00:00, 11.00it/s]



      Fast-DetectGPT Classification Statistics

--- Classification Report ---
              precision    recall  f1-score   support

       Human       0.50      1.00      0.67      5000
          AI       0.00      0.00      0.00      5000

    accuracy                           0.50     10000
   macro avg       0.25      0.50      0.33     10000
weighted avg       0.25      0.50      0.33     10000


Overall Accuracy: 0.4999
ROC AUC Score: 0.4958

--- Confusion Matrix ---
                | Predicted Human | Predicted AI   
--------------------------------------------------
Actual Human    | 4999            | 1              
Actual AI       | 5000            | 0              
--------------------------------------------------

False Positive Rate (FPR): 0.0002 (Human text incorrectly flagged as AI)


--- Running detection on 12000 samples ---


Processing samples: 100%|██████████| 12000/12000 [15:58<00:00, 12.52it/s]


      Fast-DetectGPT Classification Statistics

--- Classification Report ---
              precision    recall  f1-score   support

       Human       0.34      1.00      0.50      4000
          AI       0.92      0.02      0.04      8000

    accuracy                           0.35     12000
   macro avg       0.63      0.51      0.27     12000
weighted avg       0.73      0.35      0.20     12000


Overall Accuracy: 0.3464
ROC AUC Score: 0.5795

--- Confusion Matrix ---
                | Predicted Human | Predicted AI   
--------------------------------------------------
Actual Human    | 3986            | 14             
Actual AI       | 7829            | 171            
--------------------------------------------------

False Positive Rate (FPR): 0.0035 (Human text incorrectly flagged as AI)




