In [1]:
# 0. ENV & IMPORTS
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

from utils.paths import FIG_EVAL, LOGS, MODELS, PROCESSED


In [2]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device


'cpu'

In [3]:
# Load the pre-split train / val / test CSVs
train_df, val_df, test_df = (
    pd.read_csv(PROCESSED / csv_name)
    for csv_name in ("doc_type_train.csv", "doc_type_val.csv", "doc_type_test.csv")
)

# Report each split's shape, then the class balance of the training split.
for split_tag, split_df in (("Train:", train_df), ("Val:", val_df), ("Test:", test_df)):
    print(split_tag, split_df.shape)
print(train_df["doc_type"].value_counts())


Train: (41780, 2)
Val: (8953, 3)
Test: (8954, 2)
doc_type
INVOICE             14000
EMAIL               14000
SCIENTIFIC_PAPER    13780
Name: count, dtype: int64


In [4]:
# Encode labels: give every document type found in the training split an
# integer id. Sorting the class names keeps the assignment deterministic.
labels = sorted(train_df["doc_type"].unique())
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}
label2id, id2label


({'EMAIL': 0, 'INVOICE': 1, 'SCIENTIFIC_PAPER': 2},
 {0: 'EMAIL', 1: 'INVOICE', 2: 'SCIENTIFIC_PAPER'})

In [5]:
# Apply the label mapping to every split.
# label2id is built from the training labels only, so a class that shows up
# solely in val/test would map to NaN and silently turn the whole "label"
# column into floats. Fail loudly here instead.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    encoded = split_df["doc_type"].map(label2id)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(split_df.loc[unmapped, "doc_type"].astype(str).unique())
        raise ValueError(
            f"{split_name} split has doc_type values missing from label2id: {unknown}"
        )
    split_df["label"] = encoded.astype("int64")


In [6]:
# Wrap each split in a Hugging Face Dataset, keeping only the text and label columns.
hf_columns = ["text", "label"]
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(split[hf_columns]) for split in (train_df, val_df, test_df)
)

train_ds, val_ds, test_ds


(Dataset({
     features: ['text', 'label'],
     num_rows: 41780
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 8953
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 8954
 }))

In [7]:
# Tokenizer
# The checkpoint name is kept in a variable because the model cell loads the
# same checkpoint.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)


In [8]:
# The model's position embeddings cap inputs at 512 tokens, but padding every
# example to 512 is too slow on CPU. Use 256, the length the results write-up
# reports; longer documents are truncated.
MAX_LEN = 256


def tokenize_batch(batch):
    """Tokenize a batch of documents into fixed-length model inputs.

    Args:
        batch: Mapping with a "text" entry holding the raw document strings
            (the form ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        Whatever ``tokenizer`` returns for the batch, with every sequence
        truncated or padded to exactly ``MAX_LEN`` tokens.
    """
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
    )


In [9]:
# Tokenize every split, drop the raw text and expose the rest as torch tensors.
cols_to_remove = ["text"]


def _tokenize_split(split_ds):
    """Tokenize one split, remove the raw-text column and switch it to torch format."""
    tokenized = split_ds.map(tokenize_batch, batched=True)
    tokenized = tokenized.remove_columns(cols_to_remove)
    tokenized.set_format("torch")
    return tokenized


train_tok = _tokenize_split(train_ds)
val_tok = _tokenize_split(val_ds)
test_tok = _tokenize_split(test_ds)

train_tok[0]


Map:   0%|          | 0/41780 [00:00<?, ? examples/s]

Map:   0%|          | 0/8953 [00:00<?, ? examples/s]

Map:   0%|          | 0/8954 [00:00<?, ? examples/s]

{'label': tensor(1),
 'input_ids': tensor([  101,  2727,  1011,  5511,  1011,  2410,  1999,  6767,  6610,  2005,
          3312, 12424, 10786, 10568,  1024,  1020, 26306, 12521,  2692,  2561,
         19410,  1012,  4090,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,   

In [10]:
# Model setup: load the checkpoint with a sequence-classification head that
# has one output per document type and carries the label names in its config.
num_labels = len(label2id)

head_config = {
    "num_labels": num_labels,
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **head_config)
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [11]:
# Training arguments
output_dir = MODELS / "transformer_baseline"
logging_dir = LOGS / "transformer_runs"

# Three epochs are too slow on CPU, and the reported results come from a
# single epoch, so keep the code in line with what was actually run.
NUM_EPOCHS = 1

training_args = TrainingArguments(
    output_dir=str(output_dir),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir=str(logging_dir),
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # the "accuracy" key returned by compute_metrics
    greater_is_better=True,
    save_total_limit=2,
)


In [12]:
# Metrics function
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of its logits over the last axis.

    Returns:
        dict with "accuracy" and "f1_macro" (macro-averaged F1).
    """
    scores, y_true = eval_pred
    y_pred = np.asarray(scores).argmax(axis=-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_macro": f1_score(y_true, y_pred, average="macro"),
    }


In [13]:
# Trainer & training
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer
# (the old spelling emits a FutureWarning and is scheduled for removal).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [15]:
# Sanity check: score the trained model on the validation split.
val_metrics = trainer.evaluate(val_tok)
val_metrics




{'eval_loss': 0.03892216458916664,
 'eval_accuracy': 0.9901639344262295,
 'eval_f1_macro': 0.9901939700611427,
 'eval_runtime': 107.8893,
 'eval_samples_per_second': 2.827,
 'eval_steps_per_second': 0.361,
 'epoch': 1.0}

In [16]:
# Get predictions on the test split and print a per-class report.
pred_output = trainer.predict(test_tok)
logits = pred_output.predictions
y_test_pred_ids = np.argmax(logits, axis=-1)

y_test_true = test_df["label"].to_numpy()
y_test_pred = y_test_pred_ids

# Pass the class ids explicitly: without `labels`, classification_report infers
# the classes from the data and fails against `target_names` whenever a class
# is absent from both the true and predicted test labels.
class_ids = list(range(num_labels))
print(classification_report(
    y_test_true,
    y_test_pred,
    labels=class_ids,
    target_names=[id2label[i] for i in class_ids],
    digits=4,
))




                  precision    recall  f1-score   support

           EMAIL     0.9867    0.9737    0.9801        76
         INVOICE     1.0000    1.0000    1.0000        76
  LEGAL_DOCUMENT     1.0000    0.9870    0.9935        77
SCIENTIFIC_PAPER     0.9620    0.9870    0.9744        77

        accuracy                         0.9869       306
       macro avg     0.9872    0.9869    0.9870       306
    weighted avg     0.9871    0.9869    0.9870       306



In [17]:
# Aggregate metrics on the test split, computed by the same Trainer.
test_metrics = trainer.evaluate(test_tok)
test_metrics




{'eval_loss': 0.053429193794727325,
 'eval_accuracy': 0.9869281045751634,
 'eval_f1_macro': 0.9869888692444205,
 'eval_runtime': 100.966,
 'eval_samples_per_second': 3.031,
 'eval_steps_per_second': 0.386,
 'epoch': 1.0}

In [18]:
# Confusion matrix
FIG_EVAL.mkdir(parents=True, exist_ok=True)

# Pin the label order so the matrix always lines up with display_labels,
# even if some class never occurs in the test labels or predictions.
class_ids = list(range(num_labels))
cm = confusion_matrix(y_test_true, y_test_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=[id2label[i] for i in class_ids],
)

# Draw on an explicit Axes: without `ax`, disp.plot() opens its own figure,
# which left the 6x6 figure empty (and open) and saved the plot at the
# default size instead.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, colorbar=False)
ax.set_title("Confusion matrix - DistilBERT baseline")
fig.tight_layout()
fig.savefig(FIG_EVAL / "confusion_matrix_transformer_baseline.png", dpi=150)
plt.close(fig)


<Figure size 600x600 with 0 Axes>

In [19]:
# Save metrics
# Create the log directory first, as is done for the figure and model folders.
LOGS.mkdir(parents=True, exist_ok=True)

# Explicit class ids keep target_names aligned even if a class is missing
# from the test labels and predictions.
class_ids = list(range(num_labels))
report_text = classification_report(
    y_test_true,
    y_test_pred,
    labels=class_ids,
    target_names=[id2label[i] for i in class_ids],
    digits=4,
)

with open(LOGS / "transformer_baseline_test.txt", "w", encoding="utf-8") as f:
    # Take the class count from num_labels rather than hard-coding it, so the
    # header cannot disagree with the label mapping.
    f.write(f"Model: DistilBERT ({num_labels}-class document type)\n")
    f.write(f"Test accuracy: {test_metrics['eval_accuracy']:.4f}\n")
    f.write(f"Test macro F1: {test_metrics['eval_f1_macro']:.4f}\n\n")
    f.write("Detailed classification report:\n\n")
    f.write(report_text)


In [20]:
# Persist the model and its tokenizer side by side in one folder.
final_dir = MODELS / "transformer_baseline_final"
final_dir.mkdir(parents=True, exist_ok=True)

for artifact in (model, tokenizer):
    artifact.save_pretrained(final_dir)

final_dir


WindowsPath('C:/Users/viach/Documents/doc_class/models/transformer_baseline_final')

# Transformer Baseline (DistilBERT)

As a neural baseline, we fine-tuned a **DistilBERT** model (`distilbert-base-uncased`) for 4-class document type classification. Inputs were truncated/padded to **256 tokens**, and the model was trained for **1 epoch** on CPU with a batch size of 8. The same train/validation/test split as the classical baseline was used.

---

### Performance

On the **validation set**, DistilBERT reaches an **accuracy of 0.990** and a **macro F1 of 0.990**.

On the **held-out test set**, it achieves:

* **Accuracy:** 0.9869
* **Macro F1:** 0.9870

Per-class performance is high across the board (F1 $\approx$ 0.97–1.00). The main errors occur between **EMAIL and SCIENTIFIC\_PAPER**, which is consistent with the classical baseline and reflects overlap between formal technical emails and scientific abstracts.

---

### Comparison & Interpretation

Compared to the TF-IDF + Logistic Regression baseline (accuracy $\approx$ 0.984, macro F1 $\approx$ 0.984), DistilBERT provides a **small but consistent improvement**.

This suggests that a transformer model captures slightly richer stylistic and semantic cues, although the margin over a well-tuned classical model is modest for this relatively easy task.

In [None]:
# Disabled snippet: reloads the saved model and tokenizer from final_dir.
# NOTE(review): it is kept as a bare string literal rather than as comments,
# so running this cell evaluates the string and echoes it as the cell output
# without loading anything.
"""from transformers import AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(final_dir)
tokenizer = AutoTokenizer.from_pretrained(final_dir)
"""