<a href="https://colab.research.google.com/github/MalavikaSNairr/TRACE_AI_SAMPLE/blob/main/Roberta_freeze.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q torch transformers scikit-learn pandas tqdm


In [8]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import json
import torch
import numpy as np
import pandas as pd
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [6]:
def load_clean_jsonl(path):
    data = []
    skipped = 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
                # Removed the condition `if "text" in obj and "label" in obj:`
                # This ensures all valid JSON lines are loaded.
                data.append(obj)
            except json.JSONDecodeError:
                skipped += 1
    print(f"Loaded {len(data)} samples, Skipped {skipped} corrupted lines")
    return pd.DataFrame(data)


def extract_features(text):
    words = text.split()
    return np.array([
        len(words),
        np.mean([len(w) for w in words]) if words else 0,
        text.count(","),
        text.count(".")
    ], dtype=np.float32)

In [7]:
class HybridDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256,
                 train_mode=False, feature_stats=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.train_mode = train_mode

        # üîë FIX: Use training set statistics for normalization
        if feature_stats is None:
            all_feats = np.array([extract_features(t) for t in texts])
            self.feat_mean = all_feats.mean(axis=0)
            self.feat_std = all_feats.std(axis=0) + 1e-6
        else:
            self.feat_mean, self.feat_std = feature_stats

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        feats = extract_features(self.texts[idx])
        feats = (feats - self.feat_mean) / self.feat_std

        # üîë STRONGER augmentation during training
        if self.train_mode:
            feats += np.random.normal(0, 0.3, feats.shape)

        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "features": torch.tensor(feats, dtype=torch.float),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def get_feature_stats(self):
        return (self.feat_mean, self.feat_std)


In [8]:
class RobertaWithFeatures(nn.Module):
    def __init__(self, dropout_rate=0.5):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")

        # üîë FREEZE early layers to prevent overfitting
        for param in self.roberta.embeddings.parameters():
            param.requires_grad = False
        for layer in self.roberta.encoder.layer[:8]:
            for param in layer.parameters():
                param.requires_grad = False

        self.feature_fc = nn.Sequential(
            nn.Linear(4, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        # üîë SIMPLER classifier with more dropout
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(768 + 64, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 2)
        )

    def forward(self, input_ids, attention_mask, features):
        roberta_out = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).last_hidden_state[:, 0]

        feat_out = self.feature_fc(features)
        x = torch.cat([roberta_out, feat_out], dim=1)
        logits = self.classifier(x)

        # üîë AGGRESSIVE logit clamping
        logits = torch.clamp(logits, -2.5, 2.5)
        return logits


In [10]:
def train():
    # Load dataset
    df = load_clean_jsonl("final_dataset_no_emojis.jsonl")

    # üîë FIX: Validate dataset has both classes
    print("\n" + "=" * 60)
    print("VALIDATING DATASET")
    print("=" * 60)

    class_counts = df["label"].value_counts().sort_index()
    print(f"Total samples: {len(df)}")
    print(f"Class distribution: {class_counts.to_dict()}")

    if len(class_counts) < 2:
        print("\n‚ùå ERROR: Dataset must have BOTH classes!")
        print(f"   Found only: {list(class_counts.index)}")
        print(f"   Need: [0, 1]")
        print("\nüí° Check your JSONL file:")
        print("   - Label 0 = Human-written")
        print("   - Label 1 = AI-generated")
        return

    if 0 not in class_counts or 1 not in class_counts:
        print("\n‚ùå ERROR: Missing class label!")
        print(f"   Found labels: {list(class_counts.index)}")
        print(f"   Need labels: [0, 1]")
        return

    print("‚úÖ Dataset validation passed!")
    print("=" * 60 + "\n")

    # Split data
    train_df, val_df = train_test_split(
        df, test_size=0.2, stratify=df["label"], random_state=42
    )

    print(f"Training samples: {len(train_df)}")
    print(f"Validation samples: {len(val_df)}")
    print(f"Train distribution: {train_df['label'].value_counts().to_dict()}")
    print(f"Val distribution: {val_df['label'].value_counts().to_dict()}\n")

    # Load tokenizer
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    # Create datasets
    train_ds = HybridDataset(
        train_df.text.tolist(),
        train_df.label.tolist(),
        tokenizer,
        train_mode=True
    )

    val_ds = HybridDataset(
        val_df.text.tolist(),
        val_df.label.tolist(),
        tokenizer,
        train_mode=False,
        feature_stats=train_ds.get_feature_stats()
    )

    train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32)

    # üîë FIX: Calculate class weights for BOTH classes
    counts = train_df["label"].value_counts().sort_index()
    total = len(train_df)

    # Ensure we have weights for both class 0 and class 1
    raw_weights = []
    for i in [0, 1]:  # Explicitly for class 0 and 1
        if i in counts:
            raw_weights.append(total / (2 * counts[i]))
        else:
            raw_weights.append(1.0)  # Default weight if class missing

    # Dampen extreme weights
    class_weights = torch.tensor(
        [(w ** 0.5) for w in raw_weights],
        dtype=torch.float
    ).to(device)

    print("Class weights (dampened):", class_weights.tolist())

    # Initialize model
    model = RobertaWithFeatures(dropout_rate=0.6).to(device)

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-6, weight_decay=0.01)

    # Loss function
    loss_fn = nn.CrossEntropyLoss(
        weight=class_weights,
        label_smoothing=0.2
    )

    # Training
    print("\n" + "=" * 60)
    print("TRAINING FOR 1 EPOCH")
    print("=" * 60 + "\n")

    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc="Training Epoch 1/1"):
        optimizer.zero_grad()
        logits = model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
            batch["features"].to(device)
        )
        loss = loss_fn(logits, batch["labels"].to(device))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"\nEpoch 1 Train Loss: {avg_loss:.4f}")
    # Validation
    print("\n" + "=" * 60)
    print("VALIDATION")
    print("=" * 60)

    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            out = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["features"].to(device)
            )
            preds.extend(out.argmax(1).cpu().numpy())
            labels.extend(batch["labels"].numpy())

    f1 = f1_score(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    print(f"\n" + "=" * 60)
    print("FINAL RESULTS")
    print("=" * 60)
    print(f"Validation Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    print(f"Validation F1 Score: {f1:.4f}")
    print("=" * 60)

    # Save model
    torch.save({
        'model_state': model.state_dict(),
        'feature_stats': train_ds.get_feature_stats()
    }, "/content/drive/MyDrive/TraceAI_Hybrid_Best.pt")

    tokenizer.save_pretrained("/content/drive/MyDrive/TraceAI_Hybrid_Best")

    print(f"\n‚úÖ Model saved to Google Drive!")
    print(f"   Location: /content/drive/MyDrive/TraceAI_Hybrid_Best.pt")

In [11]:
train()

Loaded 8264 samples, Skipped 1 corrupted lines

VALIDATING DATASET
Total samples: 8264
Class distribution: {0: 8264}

‚ùå ERROR: Dataset must have BOTH classes!
   Found only: [0]
   Need: [0, 1]

üí° Check your JSONL file:
   - Label 0 = Human-written
   - Label 1 = AI-generated


In [7]:
import pandas as pd
import json

def load_jsonl_to_df(path):
    data = []
    skipped_count = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                skipped_count += 1
    if skipped_count > 0:
        print(f"Skipped {skipped_count} corrupted lines in {path}")
    return pd.DataFrame(data)

file_path = "final_dataset_no_emojis.jsonl"
df_content = load_jsonl_to_df(file_path)

print(f"Content of {file_path}:")
display(df_content.head())
print(f"Total entries: {len(df_content)}")
print(f"Value counts for 'label':\n{df_content['label'].value_counts()}")

Content of final_dataset_no_emojis.jsonl:


Unnamed: 0,text,label
0,the yangtze giant softshell turtle rafetus swi...,0
1,propensity scores are typically used in the ma...,0
2,cv using full set for model selection huh it s...,0
3,gift my closest people enough money to love co...,0
4,one example of an observational study was run ...,0


Total entries: 18512
Value counts for 'label':
label
0    12788
1     5724
Name: count, dtype: int64


**FINAL_ROBERTA_CODE**

In [1]:
!pip install -q torch transformers scikit-learn pandas tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import json
import torch
import numpy as np
import pandas as pd
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [4]:
def load_clean_jsonl(path):
    """Load JSONL file - FAILS if ANY line is corrupted"""
    data = []

    with open(path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:  # Skip empty lines
                continue

            try:
                obj = json.loads(line)

                # Validate required fields
                if "text" not in obj:
                    raise ValueError(f"Line {line_num}: Missing 'text' field")
                if "label" not in obj:
                    raise ValueError(f"Line {line_num}: Missing 'label' field")

                # Convert label to integer
                obj["label"] = int(obj["label"])

                # Validate label values
                if obj["label"] not in [0, 1]:
                    raise ValueError(f"Line {line_num}: Label must be 0 or 1, got {obj['label']}")

                data.append(obj)

            except json.JSONDecodeError as e:
                raise ValueError(f"Line {line_num}: Invalid JSON - {str(e)}")
            except ValueError as e:
                raise ValueError(f"Line {line_num}: {str(e)}")

    if len(data) == 0:
        raise ValueError("No valid data found in file!")

    df = pd.DataFrame(data)

    print(f"‚úÖ Successfully loaded {len(df)} samples")
    print(f"Class distribution: {df['label'].value_counts().to_dict()}")

    return df


def extract_features(text):
    """Extract statistical features from text"""
    words = text.split()
    return np.array([
        len(words),
        np.mean([len(w) for w in words]) if words else 0,
        text.count(","),
        text.count(".")
    ], dtype=np.float32)

In [5]:
class HybridDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256,
                 train_mode=False, feature_stats=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.train_mode = train_mode

        # Use training set statistics for normalization
        if feature_stats is None:
            all_feats = np.array([extract_features(t) for t in texts])
            self.feat_mean = all_feats.mean(axis=0)
            self.feat_std = all_feats.std(axis=0) + 1e-6
        else:
            self.feat_mean, self.feat_std = feature_stats

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        feats = extract_features(self.texts[idx])
        feats = (feats - self.feat_mean) / self.feat_std

        # Stronger augmentation during training
        if self.train_mode:
            feats += np.random.normal(0, 0.3, feats.shape)

        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "features": torch.tensor(feats, dtype=torch.float),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def get_feature_stats(self):
        return (self.feat_mean, self.feat_std)


In [6]:
class RobertaWithFeatures(nn.Module):
    def __init__(self, dropout_rate=0.5):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")

        # Freeze early layers to prevent overfitting
        for param in self.roberta.embeddings.parameters():
            param.requires_grad = False
        for layer in self.roberta.encoder.layer[:8]:
            for param in layer.parameters():
                param.requires_grad = False

        self.feature_fc = nn.Sequential(
            nn.Linear(4, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        # Classifier with strong dropout
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(768 + 64, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 2)
        )

    def forward(self, input_ids, attention_mask, features):
        roberta_out = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).last_hidden_state[:, 0]

        feat_out = self.feature_fc(features)
        x = torch.cat([roberta_out, feat_out], dim=1)
        logits = self.classifier(x)

        # Aggressive logit clamping
        logits = torch.clamp(logits, -2.5, 2.5)
        return logits

In [7]:
def train():
    print("=" * 60)
    print("LOADING DATASET")
    print("=" * 60)

    # Load dataset - will fail if any line is corrupted
    try:
        df = load_clean_jsonl("final_dataset_no_emojis.jsonl")
    except ValueError as e:
        print(f"\n‚ùå ERROR: {str(e)}")
        print("\nüí° Fix your JSONL file and try again.")
        return

    print("\n" + "=" * 60)
    print("VALIDATING DATASET")
    print("=" * 60)

    class_counts = df["label"].value_counts().sort_index()
    print(f"Total samples: {len(df)}")
    print(f"Class 0 (Human): {class_counts.get(0, 0)}")
    print(f"Class 1 (AI): {class_counts.get(1, 0)}")

    # Validate both classes exist
    if len(class_counts) < 2:
        print("\n‚ùå ERROR: Dataset must have BOTH classes!")
        print(f"   Found only: {list(class_counts.index)}")
        print(f"   Need: [0, 1]")
        return

    if 0 not in class_counts or 1 not in class_counts:
        print("\n‚ùå ERROR: Missing class label!")
        print(f"   Found labels: {list(class_counts.index)}")
        print(f"   Need labels: [0, 1]")
        return

    print("‚úÖ Dataset validation passed!")

    # Split data
    print("\n" + "=" * 60)
    print("SPLITTING DATA")
    print("=" * 60)

    train_df, val_df = train_test_split(
        df, test_size=0.2, stratify=df["label"], random_state=42
    )

    print(f"Training samples: {len(train_df)}")
    print(f"  - Class 0: {(train_df['label'] == 0).sum()}")
    print(f"  - Class 1: {(train_df['label'] == 1).sum()}")
    print(f"Validation samples: {len(val_df)}")
    print(f"  - Class 0: {(val_df['label'] == 0).sum()}")
    print(f"  - Class 1: {(val_df['label'] == 1).sum()}")

    # Load tokenizer
    print("\n" + "=" * 60)
    print("LOADING TOKENIZER")
    print("=" * 60)
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    print("‚úÖ Tokenizer loaded")

    # Create datasets
    print("\n" + "=" * 60)
    print("CREATING DATASETS")
    print("=" * 60)

    train_ds = HybridDataset(
        train_df.text.tolist(),
        train_df.label.tolist(),
        tokenizer,
        train_mode=True
    )

    val_ds = HybridDataset(
        val_df.text.tolist(),
        val_df.label.tolist(),
        tokenizer,
        train_mode=False,
        feature_stats=train_ds.get_feature_stats()
    )

    print("‚úÖ Datasets created")

    train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32)

    # Calculate class weights
    counts = train_df["label"].value_counts().sort_index()
    total = len(train_df)

    raw_weights = []
    for i in [0, 1]:
        if i in counts:
            raw_weights.append(total / (2 * counts[i]))
        else:
            raw_weights.append(1.0)

    # Dampen extreme weights
    class_weights = torch.tensor(
        [(w ** 0.5) for w in raw_weights],
        dtype=torch.float
    ).to(device)

    print(f"\nClass weights (dampened): {class_weights.tolist()}")

    # Initialize model
    print("\n" + "=" * 60)
    print("INITIALIZING MODEL")
    print("=" * 60)

    model = RobertaWithFeatures(dropout_rate=0.6).to(device)
    print("‚úÖ Model initialized")

    # Count trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Frozen parameters: {total_params - trainable_params:,}")

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-6, weight_decay=0.01)

    # Loss function
    loss_fn = nn.CrossEntropyLoss(
        weight=class_weights,
        label_smoothing=0.2
    )

    # Training
    print("\n" + "=" * 60)
    print("TRAINING (1 EPOCH)")
    print("=" * 60 + "\n")

    model.train()
    total_loss = 0
    num_batches = len(train_loader)

    for batch_idx, batch in enumerate(tqdm(train_loader, desc="Training")):
        optimizer.zero_grad()

        logits = model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
            batch["features"].to(device)
        )

        loss = loss_fn(logits, batch["labels"].to(device))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / num_batches
    print(f"\nTrain Loss: {avg_loss:.4f}")

    # Validation
    print("\n" + "=" * 60)
    print("VALIDATION")
    print("=" * 60 + "\n")

    model.eval()
    preds, labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            out = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["features"].to(device)
            )
            preds.extend(out.argmax(1).cpu().numpy())
            labels.extend(batch["labels"].numpy())

    # Calculate metrics
    f1 = f1_score(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    # Per-class accuracy
    labels_np = np.array(labels)
    preds_np = np.array(preds)

    class0_mask = labels_np == 0
    class1_mask = labels_np == 1

    class0_acc = accuracy_score(labels_np[class0_mask], preds_np[class0_mask])
    class1_acc = accuracy_score(labels_np[class1_mask], preds_np[class1_mask])

    print("\n" + "=" * 60)
    print("FINAL RESULTS")
    print("=" * 60)
    print(f"Overall Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    print(f"F1 Score: {f1:.4f}")
    print(f"Class 0 (Human) Accuracy: {class0_acc:.4f} ({class0_acc*100:.2f}%)")
    print(f"Class 1 (AI) Accuracy: {class1_acc:.4f} ({class1_acc*100:.2f}%)")
    print("=" * 60)

    # Save model
    print("\n" + "=" * 60)
    print("SAVING MODEL")
    print("=" * 60)

    torch.save({
        'model_state': model.state_dict(),
        'feature_stats': train_ds.get_feature_stats(),
        'accuracy': acc,
        'f1_score': f1
    }, "/content/drive/MyDrive/TraceAI_Hybrid_Best.pt")

    tokenizer.save_pretrained("/content/drive/MyDrive/TraceAI_Hybrid_Best")

    print(f"‚úÖ Model saved to: /content/drive/MyDrive/TraceAI_Hybrid_Best.pt")
    print(f"‚úÖ Tokenizer saved to: /content/drive/MyDrive/TraceAI_Hybrid_Best/")

In [8]:
train()

LOADING DATASET
‚úÖ Successfully loaded 20788 samples
Class distribution: {0: 12788, 1: 8000}

VALIDATING DATASET
Total samples: 20788
Class 0 (Human): 12788
Class 1 (AI): 8000
‚úÖ Dataset validation passed!

SPLITTING DATA
Training samples: 16630
  - Class 0: 10230
  - Class 1: 6400
Validation samples: 4158
  - Class 0: 2558
  - Class 1: 1600

LOADING TOKENIZER


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


‚úÖ Tokenizer loaded

CREATING DATASETS
‚úÖ Datasets created

Class weights (dampened): [0.9015572667121887, 1.1398327350616455]

INITIALIZING MODEL


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model initialized
Trainable parameters: 29,172,226
Frozen parameters: 95,703,552

TRAINING (1 EPOCH)



Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1040/1040 [06:44<00:00,  2.57it/s]



Train Loss: 0.5059

VALIDATION



Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 130/130 [00:58<00:00,  2.21it/s]



FINAL RESULTS
Overall Accuracy: 0.9947 (99.47%)
F1 Score: 0.9931
Class 0 (Human) Accuracy: 0.9953 (99.53%)
Class 1 (AI) Accuracy: 0.9938 (99.38%)

SAVING MODEL
‚úÖ Model saved to: /content/drive/MyDrive/TraceAI_Hybrid_Best.pt
‚úÖ Tokenizer saved to: /content/drive/MyDrive/TraceAI_Hybrid_Best/


In [11]:
def infer(text, temperature=3.0):
    """
    Classify text as Human, AI, or Uncertain.

    Args:
        text: Input text to classify
        temperature: Higher = more uncertain (try 2.5-4.0)

    Returns:
        Dictionary with prediction, probabilities, and confidence
    """
    tokenizer = RobertaTokenizer.from_pretrained(
        "/content/drive/MyDrive/TraceAI_Hybrid_Best"
    )

    # FIX: Add weights_only=False to allow loading of feature_stats (numpy arrays)
    checkpoint = torch.load("/content/drive/MyDrive/TraceAI_Hybrid_Best.pt", weights_only=False)

    model = RobertaWithFeatures().to(device)
    model.load_state_dict(checkpoint['model_state'])
    model.eval()

    enc = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256
    ).to(device)

    # Use saved training statistics
    feat_mean, feat_std = checkpoint['feature_stats']
    feats = extract_features(text)
    feats = (feats - feat_mean) / feat_std
    feats = torch.tensor(feats, dtype=torch.float).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(enc["input_ids"], enc["attention_mask"], feats)

    # Strong temperature scaling
    logits = logits / temperature
    probs = torch.softmax(logits, dim=1)[0]

    human_p, ai_p = probs[0].item(), probs[1].item()

    # Wider uncertainty threshold
    if abs(ai_p - human_p) < 0.30:
        label = "UNCERTAIN"
    elif ai_p > human_p:
        label = "AI"
    else:
        label = "HUMAN"

    return {
        "Prediction": label,
        "AI %": round(ai_p * 100, 2),
        "Human %": round(human_p * 100, 2),
        "Confidence": round(abs(ai_p - human_p) * 100, 2)
    }

In [12]:
text = input("Enter text to analyze: ")
result = infer(text, temperature=3.0)

print("\n" + "=" * 60)
print("ANALYSIS RESULTS")
print("=" * 60)
print(f"Prediction: {result['Prediction']}")
print(f"AI Probability: {result['AI %']}%")
print(f"Human Probability: {result['Human %']}%")
print(f"Confidence Gap: {result['Confidence']}%")
print("=" * 60)


Enter text to analyze: What makes me happy: My Mum, Dad, Grandparents (both maternal and paternal),  The Sun, trees, flowers, the sky, the beach, my house, my bed, sausages, chicken, pretzels, Love, School, my ex, Bunnies, Cats, Dogs, Chickens, Peppa Pig, listening to Lil Nas X, Driving, Telling Jokes, Scooting, Shopping, Concerts, Day outs, My sister, My cousins, my aunts and uncles, Meatballs, Potatoes, History, Archery, Geography, Math, Reading, Singing, Dancing, Farting, Comedy, MrBeast, Hikes, Making Memes, Saxophones, Shrek, The Minions, Playing the Piano, X(when it was Twitter), Facebook, Instagram, Michael Jackson, Typing, Sofas, Museums, Castles, My singing monsters, Elton John‚Äôs Music, IShowSpeed, The Mario Movie, Roses, Daisies, Mickey Mouse etc.  The World is such a happy place. Isn‚Äôt that right? 


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



ANALYSIS RESULTS
Prediction: AI
AI Probability: 70.9%
Human Probability: 29.1%
Confidence Gap: 41.8%


In [13]:
# Test with the samples you provided
sample_texts = [
    # Human text (label 0)
    "i think it depends on which hypothesis testing you are talking about the classical hypothesis testing neyman pearson is said to be defective because it does not appropriately condition on what actually happened",

    # Human text (label 0)
    "i have no desire to work at a fast food counter and have avoided it most of my life the put offs would be the itchy hot polyester uniforms grease and loud disgruntled customers",

    # AI text (label 1)
    "Limiting car usage has numerous advantages for both individuals and society as a whole. The passages provide examples of communities and cities that have implemented measures to promote alternative transportation and reduce car dependence."
]

print("TESTING WITH SAMPLE DATA")
print("=" * 80)

for i, text in enumerate(sample_texts, 1):
    result = infer(text, temperature=3.0)
    print(f"\nüìù Sample {i}:")
    print(f"Text: {text[:100]}{'...' if len(text) > 100 else ''}")
    print(f"‚Üí {result['Prediction']} | AI: {result['AI %']}% | Human: {result['Human %']}% | Confidence: {result['Confidence']}%")
    print("-" * 80)

TESTING WITH SAMPLE DATA


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìù Sample 1:
Text: i think it depends on which hypothesis testing you are talking about the classical hypothesis testin...
‚Üí HUMAN | AI: 33.22% | Human: 66.78% | Confidence: 33.56%
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìù Sample 2:
Text: i have no desire to work at a fast food counter and have avoided it most of my life the put offs wou...
‚Üí HUMAN | AI: 33.25% | Human: 66.75% | Confidence: 33.49%
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìù Sample 3:
Text: Limiting car usage has numerous advantages for both individuals and society as a whole. The passages...
‚Üí AI | AI: 70.18% | Human: 29.82% | Confidence: 40.36%
--------------------------------------------------------------------------------


In [14]:
# Test multiple texts
test_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "In accordance with the aforementioned stipulations, it is hereby declared that all parties must comply with the regulatory framework.",
    "I went to the store yesterday and bought some milk, bread, and eggs for dinner.",
    "The implementation of artificial intelligence systems requires careful consideration of ethical implications and potential societal impacts.",
    "hey whats up? nothing much just chilling at home lol"
]

print("BATCH TESTING RESULTS")
print("=" * 80)

for i, text in enumerate(test_texts, 1):
    result = infer(text, temperature=3.0)
    print(f"\nüìù Test {i}:")
    print(f"Text: {text[:70]}{'...' if len(text) > 70 else ''}")
    print(f"‚Üí {result['Prediction']} | AI: {result['AI %']}% | Human: {result['Human %']}% | Confidence: {result['Confidence']}%")
    print("-" * 80)

BATCH TESTING RESULTS


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìù Test 1:
Text: The quick brown fox jumps over the lazy dog.
‚Üí AI | AI: 68.2% | Human: 31.8% | Confidence: 36.4%
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìù Test 2:
Text: In accordance with the aforementioned stipulations, it is hereby decla...
‚Üí AI | AI: 69.79% | Human: 30.21% | Confidence: 39.57%
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìù Test 3:
Text: I went to the store yesterday and bought some milk, bread, and eggs fo...
‚Üí AI | AI: 68.31% | Human: 31.69% | Confidence: 36.62%
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìù Test 4:
Text: The implementation of artificial intelligence systems requires careful...
‚Üí AI | AI: 70.22% | Human: 29.78% | Confidence: 40.43%
--------------------------------------------------------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üìù Test 5:
Text: hey whats up? nothing much just chilling at home lol
‚Üí UNCERTAIN | AI: 40.05% | Human: 59.95% | Confidence: 19.9%
--------------------------------------------------------------------------------


In [16]:
# CELL 13: Display Saved Model Metrics
def display_saved_metrics():
    """
    Load and display all metrics saved during training
    """
    import torch

    print("=" * 80)
    print("LOADING SAVED MODEL METRICS")
    print("=" * 80)

    try:
        checkpoint = torch.load("/content/drive/MyDrive/TraceAI_Hybrid_Best.pt")

        # Extract metrics
        accuracy = checkpoint.get('accuracy', 'N/A')
        f1_score = checkpoint.get('f1_score', 'N/A')

        confusion_matrix = checkpoint.get('confusion_matrix', {})
        tn = confusion_matrix.get('true_negatives', 0)
        fp = confusion_matrix.get('false_positives', 0)
        fn = confusion_matrix.get('false_negatives', 0)
        tp = confusion_matrix.get('true_positives', 0)

        rates = checkpoint.get('rates', {})
        tpr = rates.get('true_positive_rate', 0)
        fpr = rates.get('false_positive_rate', 0)
        tnr = rates.get('true_negative_rate', 0)
        fnr = rates.get('false_negative_rate', 0)

        metrics = checkpoint.get('metrics', {})
        precision = metrics.get('precision', 0)
        recall = metrics.get('recall', 0)

        # Display results
        print("\n" + "=" * 80)
        print("CONFUSION MATRIX")
        print("=" * 80)
        print(f"{'':20s} Predicted")
        print(f"{'':20s} {'Human':>10s} {'AI':>10s}")
        print(f"{'Actual Human':20s} {tn:10d} {fp:10d}")
        print(f"{'Actual AI':20s} {fn:10d} {tp:10d}")

        print("\n" + "=" * 80)
        print("COUNT METRICS")
        print("=" * 80)
        print(f"{'True Negatives (TN)':40s} {tn:8d}  (Human correctly identified)")
        print(f"{'False Positives (FP)':40s} {fp:8d}  (Human wrongly labeled as AI)")
        print(f"{'False Negatives (FN)':40s} {fn:8d}  (AI wrongly labeled as Human)")
        print(f"{'True Positives (TP)':40s} {tp:8d}  (AI correctly identified)")

        print("\n" + "=" * 80)
        print("RATE METRICS (Percentages)")
        print("=" * 80)
        print(f"{'True Positive Rate (Sensitivity/Recall)':40s} {tpr:7.4f}  ({tpr*100:6.2f}%)")
        print(f"{'False Positive Rate':40s} {fpr:7.4f}  ({fpr*100:6.2f}%)")
        print(f"{'True Negative Rate (Specificity)':40s} {tnr:7.4f}  ({tnr*100:6.2f}%)")
        print(f"{'False Negative Rate':40s} {fnr:7.4f}  ({fnr*100:6.2f}%)")

        print("\n" + "=" * 80)
        print("OVERALL PERFORMANCE METRICS")
        print("=" * 80)
        print(f"{'Overall Accuracy':40s} {accuracy:7.4f}  ({accuracy*100:6.2f}%)")
        print(f"{'F1 Score':40s} {f1_score:7.4f}")
        print(f"{'Precision':40s} {precision:7.4f}  ({precision*100:6.2f}%)")
        print(f"{'Recall':40s} {recall:7.4f}  ({recall*100:6.2f}%)")

        print("\n" + "=" * 80)
        print("INTERPRETATION GUIDE")
        print("=" * 80)
        print("""
üìä Key Metrics Explained:

TRUE POSITIVE RATE (Sensitivity/Recall):
   ‚Üí How well the model identifies AI-generated text
   ‚Üí Higher is better (ideally > 0.85)

FALSE POSITIVE RATE:
   ‚Üí How often human text is misclassified as AI
   ‚Üí Lower is better (ideally < 0.15)

TRUE NEGATIVE RATE (Specificity):
   ‚Üí How well the model identifies human-written text
   ‚Üí Higher is better (ideally > 0.85)

FALSE NEGATIVE RATE:
   ‚Üí How often AI text is misclassified as human
   ‚Üí Lower is better (ideally < 0.15)

PRECISION:
   ‚Üí When model predicts AI, how often is it correct?
   ‚Üí Higher is better

RECALL (same as TPR):
   ‚Üí Of all actual AI texts, how many did we catch?
   ‚Üí Higher is better

F1 SCORE:
   ‚Üí Balanced measure of precision and recall
   ‚Üí Higher is better (ideally > 0.80)
        """)

        print("=" * 80)
        print("‚úÖ METRICS LOADED SUCCESSFULLY")
        print("=" * 80)

    except FileNotFoundError:
        print("\n‚ùå ERROR: Model file not found!")
        print("   Please train the model first using the train() function.")
    except Exception as e:
        print(f"\n‚ùå ERROR: {str(e)}")

# Run the function
display_saved_metrics()

LOADING SAVED MODEL METRICS

‚ùå ERROR: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL numpy._core.multiarray._reconstruct was not an allowed global by default. Please use `torch.serialization.add_safe_globals([numpy._core.multiarray._reconstruct])` or the `torch.serialization.safe_globals([numpy._core.multiarray._reconstruct])` context manager to allowlist this global if you trust this class/function.

Check 

In [17]:
# CELL 13: Display Saved Model Metrics
# ============================================================================
# Run this cell to see all training results and metrics
# ============================================================================

import torch

def display_saved_metrics():
    """
    Load and display all metrics saved during training
    """
    print("=" * 80)
    print("LOADING SAVED MODEL METRICS")
    print("=" * 80)

    try:
        # Load checkpoint with weights_only=False to handle numpy arrays
        checkpoint = torch.load(
            "/content/drive/MyDrive/TraceAI_Hybrid_Best.pt",
            map_location=device,
            weights_only=False
        )

        # Extract metrics
        accuracy = checkpoint.get('accuracy', 'N/A')
        f1_score = checkpoint.get('f1_score', 'N/A')

        confusion_matrix = checkpoint.get('confusion_matrix', {})
        tn = confusion_matrix.get('true_negatives', 0)
        fp = confusion_matrix.get('false_positives', 0)
        fn = confusion_matrix.get('false_negatives', 0)
        tp = confusion_matrix.get('true_positives', 0)

        rates = checkpoint.get('rates', {})
        tpr = rates.get('true_positive_rate', 0)
        fpr = rates.get('false_positive_rate', 0)
        tnr = rates.get('true_negative_rate', 0)
        fnr = rates.get('false_negative_rate', 0)

        metrics = checkpoint.get('metrics', {})
        precision = metrics.get('precision', 0)
        recall = metrics.get('recall', 0)

        # Calculate total samples
        total_samples = tn + fp + fn + tp

        # Display results
        print("\n" + "=" * 80)
        print("CONFUSION MATRIX")
        print("=" * 80)
        print(f"{'':20s} Predicted")
        print(f"{'':20s} {'Human':>10s} {'AI':>10s}")
        print(f"{'Actual Human':20s} {tn:10d} {fp:10d}")
        print(f"{'Actual AI':20s} {fn:10d} {tp:10d}")

        print("\n" + "=" * 80)
        print("COUNT METRICS")
        print("=" * 80)
        print(f"{'True Negatives (TN)':40s} {tn:8d}  (Human correctly identified)")
        print(f"{'False Positives (FP)':40s} {fp:8d}  (Human wrongly labeled as AI)")
        print(f"{'False Negatives (FN)':40s} {fn:8d}  (AI wrongly labeled as Human)")
        print(f"{'True Positives (TP)':40s} {tp:8d}  (AI correctly identified)")

        print("\n" + "=" * 80)
        print("RATE METRICS (Percentages)")
        print("=" * 80)
        print(f"{'True Positive Rate (Sensitivity/Recall)':40s} {tpr:7.4f}  ({tpr*100:6.2f}%)")
        print(f"{'False Positive Rate':40s} {fpr:7.4f}  ({fpr*100:6.2f}%)")
        print(f"{'True Negative Rate (Specificity)':40s} {tnr:7.4f}  ({tnr*100:6.2f}%)")
        print(f"{'False Negative Rate':40s} {fnr:7.4f}  ({fnr*100:6.2f}%)")

        print("\n" + "=" * 80)
        print("OVERALL PERFORMANCE METRICS")
        print("=" * 80)
        print(f"{'Overall Accuracy':40s} {accuracy:7.4f}  ({accuracy*100:6.2f}%)")
        print(f"{'F1 Score':40s} {f1_score:7.4f}")
        print(f"{'Precision':40s} {precision:7.4f}  ({precision*100:6.2f}%)")
        print(f"{'Recall':40s} {recall:7.4f}  ({recall*100:6.2f}%)")

        print("\n" + "=" * 80)
        print("SUMMARY STATISTICS")
        print("=" * 80)
        print(f"{'Total Validation Samples':40s} {total_samples:8d}")
        print(f"{'Correctly Classified':40s} {tn + tp:8d}  ({((tn+tp)/total_samples)*100:6.2f}%)")
        print(f"{'Misclassified':40s} {fp + fn:8d}  ({((fp+fn)/total_samples)*100:6.2f}%)")

        print("\n" + "=" * 80)
        print("INTERPRETATION GUIDE")
        print("=" * 80)
        print("""
üìä Key Metrics Explained:

TRUE POSITIVE RATE (Sensitivity/Recall):
   ‚Üí How well the model identifies AI-generated text
   ‚Üí Higher is better (ideally > 0.85)

FALSE POSITIVE RATE:
   ‚Üí How often human text is misclassified as AI
   ‚Üí Lower is better (ideally < 0.15)

TRUE NEGATIVE RATE (Specificity):
   ‚Üí How well the model identifies human-written text
   ‚Üí Higher is better (ideally > 0.85)

FALSE NEGATIVE RATE:
   ‚Üí How often AI text is misclassified as human
   ‚Üí Lower is better (ideally < 0.15)

PRECISION:
   ‚Üí When model predicts AI, how often is it correct?
   ‚Üí Higher is better

RECALL (same as TPR):
   ‚Üí Of all actual AI texts, how many did we catch?
   ‚Üí Higher is better

F1 SCORE:
   ‚Üí Balanced measure of precision and recall
   ‚Üí Higher is better (ideally > 0.80)
        """)

        print("=" * 80)
        print("‚úÖ METRICS LOADED SUCCESSFULLY")
        print("=" * 80)

    except FileNotFoundError:
        print("\n‚ùå ERROR: Model file not found!")
        print("   Path: /content/drive/MyDrive/TraceAI_Hybrid_Best.pt")
        print("   Please train the model first using the train() function.")
    except Exception as e:
        print(f"\n‚ùå ERROR: {str(e)}")
        import traceback
        traceback.print_exc()

# Run the function
display_saved_metrics()

LOADING SAVED MODEL METRICS

CONFUSION MATRIX
                     Predicted
                          Human         AI
Actual Human                  0          0
Actual AI                     0          0

COUNT METRICS
True Negatives (TN)                             0  (Human correctly identified)
False Positives (FP)                            0  (Human wrongly labeled as AI)
False Negatives (FN)                            0  (AI wrongly labeled as Human)
True Positives (TP)                             0  (AI correctly identified)

RATE METRICS (Percentages)
True Positive Rate (Sensitivity/Recall)   0.0000  (  0.00%)
False Positive Rate                       0.0000  (  0.00%)
True Negative Rate (Specificity)          0.0000  (  0.00%)
False Negative Rate                       0.0000  (  0.00%)

OVERALL PERFORMANCE METRICS
Overall Accuracy                          0.9947  ( 99.47%)
F1 Score                                  0.9931
Precision                                 0.0000  (  0

Traceback (most recent call last):
  File "/tmp/ipython-input-189099889.py", line 84, in display_saved_metrics
    print(f"{'Correctly Classified':40s} {tn + tp:8d}  ({((tn+tp)/total_samples)*100:6.2f}%)")
                                                          ~~~~~~~^^~~~~~~~~~~~~
ZeroDivisionError: division by zero


In [18]:
# CELL 7: Training Function (1 EPOCH, WITH COMPLETE METRICS)
def train():
    print("=" * 60)
    print("LOADING DATASET")
    print("=" * 60)

    # Load dataset - will fail if any line is corrupted
    try:
        df = load_clean_jsonl("final_dataset_no_emojis.jsonl")
    except ValueError as e:
        print(f"\n‚ùå ERROR: {str(e)}")
        print("\nüí° Fix your JSONL file and try again.")
        return

    print("\n" + "=" * 60)
    print("VALIDATING DATASET")
    print("=" * 60)

    class_counts = df["label"].value_counts().sort_index()
    print(f"Total samples: {len(df)}")
    print(f"Class 0 (Human): {class_counts.get(0, 0)}")
    print(f"Class 1 (AI): {class_counts.get(1, 0)}")

    # Validate both classes exist
    if len(class_counts) < 2:
        print("\n‚ùå ERROR: Dataset must have BOTH classes!")
        print(f"   Found only: {list(class_counts.index)}")
        print(f"   Need: [0, 1]")
        return

    if 0 not in class_counts or 1 not in class_counts:
        print("\n‚ùå ERROR: Missing class label!")
        print(f"   Found labels: {list(class_counts.index)}")
        print(f"   Need labels: [0, 1]")
        return

    print("‚úÖ Dataset validation passed!")

    # Split data
    print("\n" + "=" * 60)
    print("SPLITTING DATA")
    print("=" * 60)

    train_df, val_df = train_test_split(
        df, test_size=0.2, stratify=df["label"], random_state=42
    )

    print(f"Training samples: {len(train_df)}")
    print(f"  - Class 0: {(train_df['label'] == 0).sum()}")
    print(f"  - Class 1: {(train_df['label'] == 1).sum()}")
    print(f"Validation samples: {len(val_df)}")
    print(f"  - Class 0: {(val_df['label'] == 0).sum()}")
    print(f"  - Class 1: {(val_df['label'] == 1).sum()}")

    # Load tokenizer
    print("\n" + "=" * 60)
    print("LOADING TOKENIZER")
    print("=" * 60)
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    print("‚úÖ Tokenizer loaded")

    # Create datasets
    print("\n" + "=" * 60)
    print("CREATING DATASETS")
    print("=" * 60)

    train_ds = HybridDataset(
        train_df.text.tolist(),
        train_df.label.tolist(),
        tokenizer,
        train_mode=True
    )

    val_ds = HybridDataset(
        val_df.text.tolist(),
        val_df.label.tolist(),
        tokenizer,
        train_mode=False,
        feature_stats=train_ds.get_feature_stats()
    )

    print("‚úÖ Datasets created")

    train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32)

    # Calculate class weights
    counts = train_df["label"].value_counts().sort_index()
    total = len(train_df)

    raw_weights = []
    for i in [0, 1]:
        if i in counts:
            raw_weights.append(total / (2 * counts[i]))
        else:
            raw_weights.append(1.0)

    # Dampen extreme weights
    class_weights = torch.tensor(
        [(w ** 0.5) for w in raw_weights],
        dtype=torch.float
    ).to(device)

    print(f"\nClass weights (dampened): {class_weights.tolist()}")

    # Initialize model
    print("\n" + "=" * 60)
    print("INITIALIZING MODEL")
    print("=" * 60)

    model = RobertaWithFeatures(dropout_rate=0.6).to(device)
    print("‚úÖ Model initialized")

    # Count trainable parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Frozen parameters: {total_params - trainable_params:,}")

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-6, weight_decay=0.01)

    # Loss function
    loss_fn = nn.CrossEntropyLoss(
        weight=class_weights,
        label_smoothing=0.2
    )

    # Training
    print("\n" + "=" * 60)
    print("TRAINING (1 EPOCH)")
    print("=" * 60 + "\n")

    model.train()
    total_loss = 0
    num_batches = len(train_loader)

    for batch_idx, batch in enumerate(tqdm(train_loader, desc="Training")):
        optimizer.zero_grad()

        logits = model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
            batch["features"].to(device)
        )

        loss = loss_fn(logits, batch["labels"].to(device))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / num_batches
    print(f"\nTrain Loss: {avg_loss:.4f}")

    # Validation
    print("\n" + "=" * 60)
    print("VALIDATION")
    print("=" * 60 + "\n")

    model.eval()
    preds, labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            out = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                batch["features"].to(device)
            )
            preds.extend(out.argmax(1).cpu().numpy())
            labels.extend(batch["labels"].numpy())

    # Convert to numpy arrays
    labels_np = np.array(labels)
    preds_np = np.array(preds)

    # Calculate confusion matrix components
    # Class 0 = Human, Class 1 = AI
    true_negatives = int(np.sum((labels_np == 0) & (preds_np == 0)))
    false_positives = int(np.sum((labels_np == 0) & (preds_np == 1)))
    false_negatives = int(np.sum((labels_np == 1) & (preds_np == 0)))
    true_positives = int(np.sum((labels_np == 1) & (preds_np == 1)))

    # Calculate rates
    total_human = np.sum(labels_np == 0)
    total_ai = np.sum(labels_np == 1)

    true_positive_rate = float(true_positives / total_ai if total_ai > 0 else 0)
    false_positive_rate = float(false_positives / total_human if total_human > 0 else 0)
    true_negative_rate = float(true_negatives / total_human if total_human > 0 else 0)
    false_negative_rate = float(false_negatives / total_ai if total_ai > 0 else 0)

    # Calculate standard metrics
    f1 = float(f1_score(labels, preds, average='binary'))
    acc = float(accuracy_score(labels, preds))

    class0_mask = labels_np == 0
    class1_mask = labels_np == 1

    class0_acc = float(accuracy_score(labels_np[class0_mask], preds_np[class0_mask]))
    class1_acc = float(accuracy_score(labels_np[class1_mask], preds_np[class1_mask]))

    # Calculate precision and recall
    precision = float(true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0)
    recall = true_positive_rate

    print("\n" + "=" * 60)
    print("CONFUSION MATRIX")
    print("=" * 60)
    print(f"                    Predicted")
    print(f"                Human    AI")
    print(f"Actual Human    {true_negatives:5d}  {false_positives:5d}")
    print(f"Actual AI       {false_negatives:5d}  {true_positives:5d}")

    print("\n" + "=" * 60)
    print("DETAILED METRICS")
    print("=" * 60)
    print(f"\nüìä Count Metrics:")
    print(f"   True Negatives (TN):  {true_negatives:5d} (Human correctly identified)")
    print(f"   False Positives (FP): {false_positives:5d} (Human wrongly labeled as AI)")
    print(f"   False Negatives (FN): {false_negatives:5d} (AI wrongly labeled as Human)")
    print(f"   True Positives (TP):  {true_positives:5d} (AI correctly identified)")

    print(f"\nüìà Rate Metrics:")
    print(f"   True Positive Rate (TPR/Recall/Sensitivity): {true_positive_rate:.4f} ({true_positive_rate*100:.2f}%)")
    print(f"   False Positive Rate (FPR):                   {false_positive_rate:.4f} ({false_positive_rate*100:.2f}%)")
    print(f"   True Negative Rate (TNR/Specificity):        {true_negative_rate:.4f} ({true_negative_rate*100:.2f}%)")
    print(f"   False Negative Rate (FNR):                   {false_negative_rate:.4f} ({false_negative_rate*100:.2f}%)")

    print(f"\nüéØ Overall Metrics:")
    print(f"   Overall Accuracy:        {acc:.4f} ({acc*100:.2f}%)")
    print(f"   F1 Score:                {f1:.4f}")
    print(f"   Precision:               {precision:.4f} ({precision*100:.2f}%)")
    print(f"   Recall:                  {recall:.4f} ({recall*100:.2f}%)")

    print(f"\nüìã Per-Class Accuracy:")
    print(f"   Class 0 (Human): {class0_acc:.4f} ({class0_acc*100:.2f}%)")
    print(f"   Class 1 (AI):    {class1_acc:.4f} ({class1_acc*100:.2f}%)")

    print("=" * 60)

    # Save model with ALL metrics properly
    print("\n" + "=" * 60)
    print("SAVING MODEL")
    print("=" * 60)

    torch.save({
        'model_state': model.state_dict(),
        'feature_stats': train_ds.get_feature_stats(),
        'accuracy': acc,
        'f1_score': f1,
        'confusion_matrix': {
            'true_negatives': true_negatives,
            'false_positives': false_positives,
            'false_negatives': false_negatives,
            'true_positives': true_positives
        },
        'rates': {
            'true_positive_rate': true_positive_rate,
            'false_positive_rate': false_positive_rate,
            'true_negative_rate': true_negative_rate,
            'false_negative_rate': false_negative_rate
        },
        'metrics': {
            'precision': precision,
            'recall': recall
        }
    }, "/content/drive/MyDrive/TraceAI_Hybrid_Best.pt")

    tokenizer.save_pretrained("/content/drive/MyDrive/TraceAI_Hybrid_Best")

    print(f"‚úÖ Model saved to: /content/drive/MyDrive/TraceAI_Hybrid_Best.pt")
    print(f"‚úÖ Tokenizer saved to: /content/drive/MyDrive/TraceAI_Hybrid_Best/")
    print(f"‚úÖ All metrics saved in checkpoint")

    print("\n" + "=" * 60)
    print("üí° TIP: Run display_saved_metrics() to view these results anytime!")
    print("=" * 60)

In [19]:
train()

LOADING DATASET
‚úÖ Successfully loaded 20788 samples
Class distribution: {0: 12788, 1: 8000}

VALIDATING DATASET
Total samples: 20788
Class 0 (Human): 12788
Class 1 (AI): 8000
‚úÖ Dataset validation passed!

SPLITTING DATA
Training samples: 16630
  - Class 0: 10230
  - Class 1: 6400
Validation samples: 4158
  - Class 0: 2558
  - Class 1: 1600

LOADING TOKENIZER
‚úÖ Tokenizer loaded

CREATING DATASETS
‚úÖ Datasets created

Class weights (dampened): [0.9015572667121887, 1.1398327350616455]

INITIALIZING MODEL


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model initialized
Trainable parameters: 29,172,226
Frozen parameters: 95,703,552

TRAINING (1 EPOCH)



Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1040/1040 [06:37<00:00,  2.62it/s]



Train Loss: 0.5246

VALIDATION



Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 130/130 [00:58<00:00,  2.22it/s]



CONFUSION MATRIX
                    Predicted
                Human    AI
Actual Human     2534     24
Actual AI           8   1592

DETAILED METRICS

üìä Count Metrics:
   True Negatives (TN):   2534 (Human correctly identified)
   False Positives (FP):    24 (Human wrongly labeled as AI)
   False Negatives (FN):     8 (AI wrongly labeled as Human)
   True Positives (TP):   1592 (AI correctly identified)

üìà Rate Metrics:
   True Positive Rate (TPR/Recall/Sensitivity): 0.9950 (99.50%)
   False Positive Rate (FPR):                   0.0094 (0.94%)
   True Negative Rate (TNR/Specificity):        0.9906 (99.06%)
   False Negative Rate (FNR):                   0.0050 (0.50%)

üéØ Overall Metrics:
   Overall Accuracy:        0.9923 (99.23%)
   F1 Score:                0.9900
   Precision:               0.9851 (98.51%)
   Recall:                  0.9950 (99.50%)

üìã Per-Class Accuracy:
   Class 0 (Human): 0.9906 (99.06%)
   Class 1 (AI):    0.9950 (99.50%)

SAVING MODEL
‚úÖ Model sa

In [20]:
display_saved_metrics()

LOADING SAVED MODEL METRICS

CONFUSION MATRIX
                     Predicted
                          Human         AI
Actual Human               2534         24
Actual AI                     8       1592

COUNT METRICS
True Negatives (TN)                          2534  (Human correctly identified)
False Positives (FP)                           24  (Human wrongly labeled as AI)
False Negatives (FN)                            8  (AI wrongly labeled as Human)
True Positives (TP)                          1592  (AI correctly identified)

RATE METRICS (Percentages)
True Positive Rate (Sensitivity/Recall)   0.9950  ( 99.50%)
False Positive Rate                       0.0094  (  0.94%)
True Negative Rate (Specificity)          0.9906  ( 99.06%)
False Negative Rate                       0.0050  (  0.50%)

OVERALL PERFORMANCE METRICS
Overall Accuracy                          0.9923  ( 99.23%)
F1 Score                                  0.9900
Precision                                 0.9851  ( 98