In [1]:
# 0. ENV & IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, accuracy_score, f1_score

import joblib
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from utils.paths import PROCESSED, MODELS, FIG_EVAL, LOGS


In [2]:
# Read the held-out test split and pull out the raw texts and string labels.
test_csv_path = PROCESSED / "doc_type_test.csv"
test_df = pd.read_csv(test_csv_path)

y_test_str = test_df["doc_type"].values
X_test = test_df["text"].values

# Quick look at the split size and the class balance.
print(test_df.shape)
print(test_df["doc_type"].value_counts())


(306, 2)
doc_type
SCIENTIFIC_PAPER    77
LEGAL_DOCUMENT      77
INVOICE             76
EMAIL               76
Name: count, dtype: int64


In [3]:
# Load the TF-IDF + Logistic Regression baseline from its joblib artifact.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts that come from a trusted source.
tfidf_model_path = MODELS / "tfidf_logreg_baseline.pkl"
clf_tfidf = joblib.load(tfidf_model_path)
clf_tfidf  # last expression of the cell: show the loaded object


In [None]:
# Load the fine-tuned DistilBERT checkpoint (transformer baseline).
transformer_dir = MODELS / "transformer_baseline_final"

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForSequenceClassification.from_pretrained(transformer_dir).to(device)
tokenizer = AutoTokenizer.from_pretrained(transformer_dir)

device


'cpu'

In [5]:
# Label mapping: class-id <-> class-name lookups taken from the model config.
id2label, label2id = model.config.id2label, model.config.label2id
(id2label, label2id)


({0: 'EMAIL', 1: 'INVOICE', 2: 'LEGAL_DOCUMENT', 3: 'SCIENTIFIC_PAPER'},
 {'EMAIL': 0, 'INVOICE': 1, 'LEGAL_DOCUMENT': 2, 'SCIENTIFIC_PAPER': 3})

In [6]:
# TF-IDF predictions
# The classifier is called directly on the raw test texts; presumably the saved
# artifact vectorizes internally (e.g. a pipeline) -- confirm against training.
y_pred_tfidf = clf_tfidf.predict(X_test)


In [7]:
# Transformer predictions
# Inference runs in mini-batches so memory use stays bounded on CPU.
def predict_transformer(texts, batch_size=16, max_length=256):
    """Predict a document-type label for every text with the loaded transformer.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``id2label`` objects; they must be defined before this is called.

    Parameters
    ----------
    texts : sequence of str
        Raw documents to classify. Must support ``len()`` and slicing.
    batch_size : int, default 16
        Number of texts tokenized and scored per forward pass. Must be >= 1.
    max_length : int, default 256
        Truncation length in tokens. The default is the value this notebook
        originally hard-coded (noted there as "same as training").

    Returns
    -------
    numpy.ndarray
        1-D array of predicted label strings, aligned with ``texts``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1. A negative step would make the
        batching loop empty and silently return no predictions.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Nothing to score: return an empty *string* array without touching the model.
    if len(texts) == 0:
        return np.array([], dtype=str)

    pred_ids = []
    model.eval()  # disable dropout etc. for deterministic inference
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = list(texts[start:start + batch_size])
            enc = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            # Move every encoded tensor to the same device as the model.
            enc = {k: v.to(device) for k, v in enc.items()}
            logits = model(**enc).logits
            pred_ids.extend(torch.argmax(logits, dim=-1).cpu().tolist())

    # Map class ids back to their label names.
    return np.array([id2label[i] for i in pred_ids], dtype=str)

# Predict labels for the whole test set using the function's default settings.
y_pred_transformer = predict_transformer(X_test)


In [8]:
# Sanity check: predicted-class counts for (TF-IDF, transformer), to compare
# against the true class balance of the test set.
tuple(pd.Series(preds).value_counts() for preds in (y_pred_tfidf, y_pred_transformer))


(SCIENTIFIC_PAPER    79
 LEGAL_DOCUMENT      78
 INVOICE             76
 EMAIL               73
 Name: count, dtype: int64,
 SCIENTIFIC_PAPER    79
 INVOICE             76
 LEGAL_DOCUMENT      76
 EMAIL               75
 Name: count, dtype: int64)

## Classification reports

### TF-IDF report

In [10]:
# Text report for reading in the notebook ...
print(classification_report(y_test_str, y_pred_tfidf, digits=4))

# ... and the same report as a dict, used later to build the comparison table.
report_tfidf = classification_report(y_test_str, y_pred_tfidf, digits=4, output_dict=True)


                  precision    recall  f1-score   support

           EMAIL     0.9863    0.9474    0.9664        76
         INVOICE     1.0000    1.0000    1.0000        76
  LEGAL_DOCUMENT     0.9744    0.9870    0.9806        77
SCIENTIFIC_PAPER     0.9747    1.0000    0.9872        77

        accuracy                         0.9837       306
       macro avg     0.9838    0.9836    0.9836       306
    weighted avg     0.9838    0.9837    0.9836       306



### Transformer report

In [11]:
# Text report for reading in the notebook ...
print(classification_report(y_test_str, y_pred_transformer, digits=4))

# ... and the same report as a dict, used later to build the comparison table.
report_trans = classification_report(y_test_str, y_pred_transformer, digits=4, output_dict=True)


                  precision    recall  f1-score   support

           EMAIL     0.9867    0.9737    0.9801        76
         INVOICE     1.0000    1.0000    1.0000        76
  LEGAL_DOCUMENT     1.0000    0.9870    0.9935        77
SCIENTIFIC_PAPER     0.9620    0.9870    0.9744        77

        accuracy                         0.9869       306
       macro avg     0.9872    0.9869    0.9870       306
    weighted avg     0.9871    0.9869    0.9870       306



In [12]:
# Turn both report dicts into tidy tables: one row per class plus the macro
# and weighted averages (the scalar "accuracy" row is dropped), and only the
# precision / recall / F1 columns.
metric_cols = ["precision", "recall", "f1-score"]
labels = sorted(test_df["doc_type"].unique())
rows_to_keep = labels + ["macro avg", "weighted avg"]

df_tfidf = (
    pd.DataFrame(report_tfidf).T
    .loc[rows_to_keep, metric_cols]
    .assign(model="TFIDF_LogReg")
)
df_trans = (
    pd.DataFrame(report_trans).T
    .loc[rows_to_keep, metric_cols]
    .assign(model="DistilBERT")
)

(df_tfidf, df_trans)


(                  precision    recall  f1-score         model
 EMAIL              0.986301  0.947368  0.966443  TFIDF_LogReg
 INVOICE            1.000000  1.000000  1.000000  TFIDF_LogReg
 LEGAL_DOCUMENT     0.974359  0.987013  0.980645  TFIDF_LogReg
 SCIENTIFIC_PAPER   0.974684  1.000000  0.987179  TFIDF_LogReg
 macro avg          0.983836  0.983595  0.983567  TFIDF_LogReg
 weighted avg       0.983775  0.983660  0.983569  TFIDF_LogReg,
                   precision    recall  f1-score       model
 EMAIL              0.986667  0.973684  0.980132  DistilBERT
 INVOICE            1.000000  1.000000  1.000000  DistilBERT
 LEGAL_DOCUMENT     1.000000  0.987013  0.993464  DistilBERT
 SCIENTIFIC_PAPER   0.962025  0.987013  0.974359  DistilBERT
 macro avg          0.987173  0.986928  0.986989  DistilBERT
 weighted avg       0.987133  0.986928  0.986969  DistilBERT)

In [13]:
# Stack the two per-model tables into one long frame; the row labels
# (class names and averages) become an ordinary "label" column.
comparison = (
    pd.concat([df_tfidf, df_trans])
    .rename_axis("label")
    .reset_index()
)

comparison


Unnamed: 0,label,precision,recall,f1-score,model
0,EMAIL,0.986301,0.947368,0.966443,TFIDF_LogReg
1,INVOICE,1.0,1.0,1.0,TFIDF_LogReg
2,LEGAL_DOCUMENT,0.974359,0.987013,0.980645,TFIDF_LogReg
3,SCIENTIFIC_PAPER,0.974684,1.0,0.987179,TFIDF_LogReg
4,macro avg,0.983836,0.983595,0.983567,TFIDF_LogReg
5,weighted avg,0.983775,0.98366,0.983569,TFIDF_LogReg
6,EMAIL,0.986667,0.973684,0.980132,DistilBERT
7,INVOICE,1.0,1.0,1.0,DistilBERT
8,LEGAL_DOCUMENT,1.0,0.987013,0.993464,DistilBERT
9,SCIENTIFIC_PAPER,0.962025,0.987013,0.974359,DistilBERT


In [14]:
# Persist the combined metrics table; create the log directory if needed.
metrics_csv_path = LOGS / "baseline_vs_transformer_metrics.csv"
LOGS.mkdir(parents=True, exist_ok=True)
comparison.to_csv(metrics_csv_path, index=False)


In [15]:
# Grouped bar chart of per-class F1, one pair of bars per document type.
FIG_EVAL.mkdir(parents=True, exist_ok=True)

labels = sorted(test_df["doc_type"].unique())
x = np.arange(len(labels))
width = 0.35

f1_tfidf = df_tfidf.loc[labels, "f1-score"].tolist()
f1_trans = df_trans.loc[labels, "f1-score"].tolist()

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(x - width / 2, f1_tfidf, width, label="TF-IDF + LogReg")
ax.bar(x + width / 2, f1_trans, width, label="DistilBERT")

ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=20)
# The y-axis starts at 0.9 so the small differences between models are visible.
ax.set_ylim(0.9, 1.01)
ax.set_ylabel("F1-score")
ax.set_title("Per-class F1 comparison")
ax.legend()

fig.tight_layout()
fig.savefig(FIG_EVAL / "f1_per_class_tfidf_vs_transformer.png", dpi=150)
# The figure is only written to disk; close it so it is not rendered inline.
plt.close(fig)


#  Observations and Conclusion

## Observations
* Both models **perfectly classify invoices (F1 = 1.00)**, reflecting the highly regular structure of the invoice texts.
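* **DistilBERT improves modestly** on **LEGAL\_DOCUMENT** (F1: 0.981 $\rightarrow$ 0.993) and on **EMAIL** (F1: 0.966 $\rightarrow$ 0.980). The EMAIL gain comes from higher recall (0.947 $\rightarrow$ 0.974, i.e. fewer missed emails); the LEGAL\_DOCUMENT gain comes from higher precision (0.974 $\rightarrow$ 1.000, i.e. no other documents wrongly labelled as legal), while its recall is unchanged at 0.987.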
* For **SCIENTIFIC\_PAPER**, **TF-IDF + LogReg slightly outperforms DistilBERT** in F1 (0.987 vs 0.974), although both remain very strong.
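* Most remaining errors involve **EMAIL**. The precision/recall figures above suggest that DistilBERT's errors are mainly confusion between **EMAIL and SCIENTIFIC\_PAPER**, whereas the emails missed by TF-IDF + LogReg appear to be split between **SCIENTIFIC\_PAPER** and **LEGAL\_DOCUMENT**; a confusion matrix would be needed to confirm this. A plausible cause is that some emails contain highly technical or formal language similar to abstracts.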

---

## Conclusion
A simple **TF-IDF + Logistic Regression** baseline already performs **extremely well** on this four-class document type task. **DistilBERT** achieves marginally better overall performance (accuracy 0.9869 vs 0.9837, roughly one additional correct document out of 306) and improves classification of legal documents and some emails, but the gain over a well-tuned classical model is modest. For this dataset, the choice between models is therefore a **trade-off** between slightly higher accuracy (DistilBERT) and lower computational cost and simpler deployment (TF-IDF + LogReg).