*italicized text*# ModernBERT-based Manipulation Detection (Binary Classification)
This notebook uses `roberta-base` to classify dialogue as manipulative or not using the MentalManip dataset.

In [1]:
#!pip uninstall flash-attn -y
!pip install -q transformers
!pip install -q datasets
!pip install -q evaluate
## transformers upgrade
!pip install -q --upgrade transformers

## Datasets need upgrading to work
!pip install -q --upgrade datasets



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is inc

In [2]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForMaskedLM
from sklearn.metrics import classification_report


In [3]:
# Load the MentalManip dataset (binary classification)
# Load dataset
dataset = load_dataset("audreyeleven/MentalManip", name="mentalmanip_maj")

print(dataset)


README.md: 0.00B [00:00, ?B/s]

Some datasets params were ignored: ['license']. Make sure to use only valid params for the dataset builder and to have a up-to-date version of the `datasets` library.


mentalmanip_maj.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/4000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'manipulative', 'technique', 'vulnerability'],
        num_rows: 4000
    })
})


In [4]:
# Ensure the 'manipulative' column is class-labeled
dataset = dataset.class_encode_column("manipulative")




Stringifying the column:   0%|          | 0/4000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [5]:
model_ckpt = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize_fn(example):
    return tokenizer(example["dialogue"], truncation=True, padding="max_length", max_length=128)

tokenized = dataset.map(tokenize_fn, batched=True)
tokenized = tokenized.rename_column("manipulative", "labels")
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print(tokenized)
print(tokenized['train'][0])

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'labels', 'technique', 'vulnerability', 'input_ids', 'attention_mask'],
        num_rows: 4000
    })
})
{'labels': tensor(1), 'input_ids': tensor([50281, 19589,    18,    27,  7670,     2, 33851,   281,   436,   581,
           27,   346,  4045,   368,  4456,   479,    32, 18328, 30201,    13,
         3978,   818,   394,    15,  1422,    27, 13631,  2502,  2522,   342,
         8862,  7619,    13, 27887,    77, 44256,    13,  8516, 12682,    15,
          309,   369,   253,  1984,   763,  7715,   275,   253,  4759,  3120,
         8541,   665,  6518,   368,  1089,   634,  3057,  9655,    15,  3052,
          309, 10412,    13,   390,   858,   359,   452,   247,  2774,   865,
          187, 19589,    19,    27,  2656,    13,   326,   434,   594, 40479,
           15,   309,   701,   703,  1904,   626,  1014,  4366,   779,    15,
          187, 19589,    18,    27,   309,   871,    15,  1244,   344,   434,
        

In [6]:
# Split the dataset into training and testing sets
train_test_split = tokenized["train"].train_test_split(test_size=0.2) # Adjust the test_size as needed

# Update the tokenized dataset with the new splits
tokenized["train"] = train_test_split["train"]
tokenized["test"] = train_test_split["test"]

# Make the data work with the nomenclature
#tokenized = tokenized.rename_column("manipulative", "labels")

print(tokenized)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'labels', 'technique', 'vulnerability', 'input_ids', 'attention_mask'],
        num_rows: 3200
    })
    test: Dataset({
        features: ['id', 'dialogue', 'labels', 'technique', 'vulnerability', 'input_ids', 'attention_mask'],
        num_rows: 800
    })
})


In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=2,  # Binary classification
    problem_type="single_label_classification" # Specify problem type
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
## Training args

training_args = TrainingArguments(
    output_dir="./bert-binary-manip",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    run_name="bert-binary-manip",
    report_to="none",
)


In [9]:
## Evaluation metrics

import evaluate
import numpy as np

accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')

def compute_metrics(p):
    predictions, labels = p
    predictions_argmax = np.argmax(predictions, axis=1)

    return {
        "accuracy": accuracy.compute(predictions=predictions_argmax, references=labels)["accuracy"],
        "precision": precision.compute(predictions=predictions_argmax, references=labels, average='weighted')["precision"],
        "recall": recall.compute(predictions=predictions_argmax, references=labels, average='weighted')["recall"],
        "f1": f1.compute(predictions=predictions_argmax, references=labels, average='weighted')["f1"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()



  trainer = Trainer(
W0720 04:09:13.669000 765 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6,0.588288,0.6975,0.588753,0.6975,0.576516
2,0.4756,0.585164,0.7175,0.702576,0.7175,0.643238
3,0.3957,0.697047,0.685,0.67287,0.685,0.677695
4,0.0908,1.812361,0.66375,0.672227,0.66375,0.667551


TrainOutput(global_step=800, training_loss=0.38235014528036115, metrics={'train_runtime': 572.0201, 'train_samples_per_second': 22.377, 'train_steps_per_second': 1.399, 'total_flos': 1090425107251200.0, 'train_loss': 0.38235014528036115, 'epoch': 4.0})

In [11]:

##  Easy eval results...
##  Get the log history from the trainer state
log_history = trainer.state.log_history

# Initialize placeholders
last_train_loss = None
last_eval_loss = None

# Iterate through log history to find the last recorded train and eval loss
for log in reversed(log_history):
    if last_eval_loss is None and "eval_loss" in log:
        last_eval_loss = log["eval_loss"]
    if last_train_loss is None and "loss" in log:
        last_train_loss = log["loss"]
    if last_train_loss is not None and last_eval_loss is not None:
        break

# Calculate overfitting ratio
if last_train_loss is not None and last_eval_loss is not None:
    ratio = last_train_loss / last_eval_loss
    print(f"Training Loss: {last_train_loss:.5f}")
    print(f"Validation Loss: {last_eval_loss:.5f}")
    print(f"Overfitting Ratio: {ratio:.5f}")
    if ratio < 0.6:
        print("Overfitting detected!")
    else:
        print("No significant overfitting.")
else:
    print("Could not find both training and evaluation loss in log history.")


Training Loss: 0.09080
Validation Loss: 1.81236
Overfitting Ratio: 0.05010
Overfitting detected!


In [12]:
# Predict on test set
preds = trainer.predict(tokenized["test"])
y_pred = preds.predictions.argmax(-1)
y_true = preds.label_ids

# Detailed classification report
print(classification_report(y_true, y_pred, target_names=["non-manipulative", "manipulative"]))


                  precision    recall  f1-score   support

non-manipulative       0.66      0.13      0.22       241
    manipulative       0.72      0.97      0.83       559

        accuracy                           0.72       800
       macro avg       0.69      0.55      0.52       800
    weighted avg       0.70      0.72      0.64       800

