In [1]:
import joblib
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from utils.paths import MODELS

# Load both models
tfidf_model = joblib.load(MODELS / "tfidf_logreg_baseline.pkl")

transformer_dir = MODELS / "transformer_baseline_final"
tokenizer = AutoTokenizer.from_pretrained(transformer_dir)
transformer = AutoModelForSequenceClassification.from_pretrained(transformer_dir)
id2label = transformer.config.id2label
device = "cuda" if torch.cuda.is_available() else "cpu"
transformer.to(device)


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [2]:
# TF-IDF inference
def predict_tfidf(text):
    return tfidf_model.predict([text])[0]

In [3]:
# Transformer inference
def predict_transformer(text, max_length=256):
    transformer.eval()
    with torch.no_grad():
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length
        ).to(device)
        outputs = transformer(**inputs)
        pred_id = torch.argmax(outputs.logits, dim=-1).item()
    return id2label[pred_id]


In [4]:
samples = [
    "Dear John, please see attached the updated budget spreadsheet.",
    "INVOICE 12345 issued to client ACME Corp for consulting services.",
    "This Agreement is made between the Contractor and the Client...",
    "In this paper, we propose a novel neural architecture for image classification."
]

for text in samples:
    pred_tfidf = predict_tfidf(text)
    pred_trans = predict_transformer(text)
    print(f"\nText: {text[:60]}...")
    print(f"TF-IDF: {pred_tfidf} | Transformer: {pred_trans}")



Text: Dear John, please see attached the updated budget spreadshee...
TF-IDF: EMAIL | Transformer: EMAIL

Text: INVOICE 12345 issued to client ACME Corp for consulting serv...
TF-IDF: EMAIL | Transformer: EMAIL

Text: This Agreement is made between the Contractor and the Client...
TF-IDF: EMAIL | Transformer: EMAIL

Text: In this paper, we propose a novel neural architecture for im...
TF-IDF: SCIENTIFIC_PAPER | Transformer: SCIENTIFIC_PAPER



Models  behave best when the input looks like the training data.

Invoices were trained as structured pseudo-documents, not one-line headers.


Legal docs were huge contracts. A single sentence may not be enough signal.


The model is trained on full document-like representations. Very short snippets, such as ‘Invoice 12345, total CHF 299.95, VAT 7.7%’, may be misclassified, because they do not match the textual structure the model was trained on. For invoices, the intended input is a generated full-text representation based on invoice fields (client, date, total, tax, due date, status).”

In [7]:
samples = [
    "Dear John, please see attached the updated budget spreadsheet.",
    "INVOICE 12345 issued to client ACME Corp for consulting services.",
    "This Agreement is made between the Contractor and the Client...",
    "In this paper, we propose a novel neural architecture for image classification."
    "This case is about an apprenticeship test that had a disparate impact on Black apprenticeship applicants. The Equal Employment Opportunity Commission (EEOC) filed this lawsuit on December 27, 2004, in U.S. District Court for the Southern District of Ohio. Filing on behalf of thirteen Black individuals and a class of similarly situated Black apprenticeship test takers, the EEOC alleged that the individuals’ employer, the Ford Motor Company, as well as their union, the United Automobile, Aerospace, and Agricultural implement workers of America (the “UAW”), and the Ford-UAW Joint Apprenticeship Committee, violated Title VII of the Civil Rights Act, 42 U.S.C. § 1981, and Michigan state anti-discrimination law. At issue were the selection tests for apprenticeship training programs, whose disparate impact denied Black applicants eligibility and admission. The EEOC sought injunctive relief, as well as damages (including backpay) for the Black apprenticeship applicants. The case was assigned to Judge Susan J. Dlott.",
    """Crystal May
Philip Moody
ugoodman@example.com
151
9
24.66
23/03/1976
6389 Debbie Island Suite 470
Coxbury
27006726"""

]

for text in samples:
    pred_tfidf = predict_tfidf(text)
    pred_trans = predict_transformer(text)
    print(f"\nText: {text[:60]}...")
    print(f"TF-IDF: {pred_tfidf} | Transformer: {pred_trans}")



Text: Dear John, please see attached the updated budget spreadshee...
TF-IDF: EMAIL | Transformer: EMAIL

Text: INVOICE 12345 issued to client ACME Corp for consulting serv...
TF-IDF: EMAIL | Transformer: EMAIL

Text: This Agreement is made between the Contractor and the Client...
TF-IDF: EMAIL | Transformer: EMAIL

Text: In this paper, we propose a novel neural architecture for im...
TF-IDF: SCIENTIFIC_PAPER | Transformer: LEGAL_DOCUMENT

Text: Crystal May
Philip Moody
ugoodman@example.com
151
9
24.66
23...
TF-IDF: EMAIL | Transformer: EMAIL
