In [1]:
!pip install -q transformers datasets accelerate scikit-learn safetensors

In [2]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import classification_report, confusion_matrix

2026-01-20 12:55:10.269013: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768913710.292313     411 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768913710.299320     411 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768913710.317003     411 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768913710.317027     411 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768913710.317029     411 computation_placer.cc:177] computation placer alr

In [3]:
dataset = load_dataset("silentone0725/ai-human-text-detection-v1")

print(dataset)

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 36744
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 7874
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7874
    })
})


In [4]:
label2id = {"human": 0, "ai": 1}
id2label = {0: "human", 1: "ai"}

def encode_labels(example):
    example["label"] = label2id[example["label"]]
    return example

dataset = dataset.map(encode_labels)

Map:   0%|          | 0/36744 [00:00<?, ? examples/s]

Map:   0%|          | 0/7874 [00:00<?, ? examples/s]

Map:   0%|          | 0/7874 [00:00<?, ? examples/s]

In [5]:
def clean(example):
    example["text"] = "" if example["text"] is None else str(example["text"])
    return example

dataset = dataset.map(clean)

Map:   0%|          | 0/36744 [00:00<?, ? examples/s]

Map:   0%|          | 0/7874 [00:00<?, ? examples/s]

Map:   0%|          | 0/7874 [00:00<?, ? examples/s]

In [6]:
from transformers import AutoTokenizer

MODEL_NAME = "microsoft/deberta-v3-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

tokenized = dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"]
)




Map:   0%|          | 0/36744 [00:00<?, ? examples/s]

Map:   0%|          | 0/7874 [00:00<?, ? examples/s]

Map:   0%|          | 0/7874 [00:00<?, ? examples/s]

In [7]:
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

In [8]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    label2id=label2id,
    id2label=id2label
)

pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,   # DeBERTa is heavy
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none",
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer
)

  trainer = Trainer(


model.safetensors:   0%|          | 0.00/286M [00:00<?, ?B/s]

In [10]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Epoch,Training Loss,Validation Loss
1,0.0131,0.036565
2,0.0,0.011454
3,0.0042,0.027073


TrainOutput(global_step=27558, training_loss=0.01644947550833741, metrics={'train_runtime': 2636.5827, 'train_samples_per_second': 41.809, 'train_steps_per_second': 10.452, 'total_flos': 7301333214240768.0, 'train_loss': 0.01644947550833741, 'epoch': 3.0})

In [11]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

predictions = trainer.predict(tokenized["test"])
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = tokenized["test"]["labels"]

print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=["human", "ai"]))

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

Classification Report:
              precision    recall  f1-score   support

       human       1.00      0.99      1.00      3937
          ai       0.99      1.00      1.00      3937

    accuracy                           1.00      7874
   macro avg       1.00      1.00      1.00      7874
weighted avg       1.00      1.00      1.00      7874

Confusion Matrix:
[[3903   34]
 [   1 3936]]


In [12]:
MODEL_SAVE_PATH = "./ai_text_detector_small"
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

('./ai_text_detector_small/tokenizer_config.json',
 './ai_text_detector_small/special_tokens_map.json',
 './ai_text_detector_small/spm.model',
 './ai_text_detector_small/added_tokens.json',
 './ai_text_detector_small/tokenizer.json')

In [16]:
import torch

# detect device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # move model to GPU if available

def predict_text(text):
    # tokenize
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256
    )

    # move inputs to same device as model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # forward pass
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)

    ai_prob = probs[0][1].item()
    human_prob = probs[0][0].item()
    label = "AI-generated" if ai_prob > human_prob else "Human-written"

    return {"label": label, "ai_probability": round(ai_prob, 3), "human_probability": round(human_prob, 3)}

# test
sample = ""
print(predict_text(sample))


{'label': 'Human-written', 'ai_probability': 0.0, 'human_probability': 1.0}


In [17]:
import shutil

shutil.make_archive("ai_text_detector_small", 'zip', "ai_text_detector_small")

'/kaggle/working/ai_text_detector_small.zip'