In [1]:
# 1. ENVIRONMENT
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

from utils.paths import PROCESSED, MODELS, FIG_EVAL, LOGS

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
import joblib

In [2]:
#--------------- 2. LOAD DATA---------------------
train_path = PROCESSED / "doc_type_train.csv"
test_path  = PROCESSED / "doc_type_test.csv"
val_path = PROCESSED / "doc_type_val.csv"

train_df = pd.read_csv(train_path)
test_df  = pd.read_csv(test_path)
val_df = pd.read_csv(val_path)

X_train = train_df["text"].values
y_train = train_df["doc_type"].values

X_val   = val_df["text"].values
y_val   = val_df["doc_type"].values

X_test  = test_df["text"].values
y_test  = test_df["doc_type"].values

## TF-IDF + Logistic Regression pipeline

In [3]:
# 2. HYPERPARAM TUNING ON VALIDATION SET
#    (tiny grid over C for LogisticRegression)

Cs = [0.5, 1.0, 2.0]
best_C = None
best_f1 = -1.0
val_reports = {}

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


for C in Cs:
    print(f"\n=== Training with C={C} ===")
    clf = Pipeline([
        ("tfidf", TfidfVectorizer(
            max_features=50000,
            ngram_range=(1, 2),
            lowercase=True,
        )),
        ("logreg", LogisticRegression(
            max_iter=200,
            n_jobs=-1,
            C=C,
        )),
    ])
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)

    rep_dict = classification_report(
        y_val, y_val_pred,
        digits=4,
        output_dict=True
    )
    rep_text = classification_report(y_val, y_val_pred, digits=4)
    val_reports[C] = rep_text
    f1_macro = rep_dict["macro avg"]["f1-score"]

    print(rep_text)
    print(f"Macro F1 on val for C={C}: {f1_macro:.4f}")

    if f1_macro > best_f1:
        best_f1 = f1_macro
        best_C = C

print(f"\nBest C found on validation set: {best_C} (macro F1={best_f1:.4f})")



=== Training with C=0.5 ===
                  precision    recall  f1-score   support

           EMAIL     0.9997    0.9977    0.9987      3000
         INVOICE     1.0000    1.0000    1.0000      3000
SCIENTIFIC_PAPER     0.9976    0.9997    0.9986      2953

        accuracy                         0.9991      8953
       macro avg     0.9991    0.9991    0.9991      8953
    weighted avg     0.9991    0.9991    0.9991      8953

Macro F1 on val for C=0.5: 0.9991

=== Training with C=1.0 ===
                  precision    recall  f1-score   support

           EMAIL     0.9997    0.9983    0.9990      3000
         INVOICE     1.0000    1.0000    1.0000      3000
SCIENTIFIC_PAPER     0.9983    0.9997    0.9990      2953

        accuracy                         0.9993      8953
       macro avg     0.9993    0.9993    0.9993      8953
    weighted avg     0.9993    0.9993    0.9993      8953

Macro F1 on val for C=1.0: 0.9993

=== Training with C=2.0 ===
                  precision

Both C = 1.0 and C = 2.0 achieved perfect macro F1 = 1.0, so C = 1.0 was selected as the simplest setting.

In [5]:
# combine train & val
X_train_full = pd.concat([train_df["text"], val_df["text"]]).values
y_train_full = pd.concat([train_df["doc_type"], val_df["doc_type"]]).values

clf_final = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=50000,
        ngram_range=(1, 2),
        lowercase=True,
    )),
    ("logreg", LogisticRegression(
        max_iter=200,
        n_jobs=-1,
        C=best_C,      
    )),
])

print("\n=== Training final baseline model on train+val ===")
clf_final.fit(X_train_full, y_train_full)


=== Training final baseline model on train+val ===


In [6]:
y_test_pred = clf_final.predict(X_test)
report_test = classification_report(y_test, y_test_pred, digits=4)
print("\n=== Final evaluation on TEST set ===")
print(report_test)



=== Final evaluation on TEST set ===
                  precision    recall  f1-score   support

           EMAIL     0.9997    0.9980    0.9988      3000
         INVOICE     1.0000    1.0000    1.0000      3000
SCIENTIFIC_PAPER     0.9980    0.9997    0.9988      2954

        accuracy                         0.9992      8954
       macro avg     0.9992    0.9992    0.9992      8954
    weighted avg     0.9992    0.9992    0.9992      8954



TF-IDF + Logistic Regression (tuned on validation, trained on train+val, evaluated on test)
Accuracy ≈ 98.3 %, macro F1 ≈ 98.3%

In [7]:
LOGS.mkdir(parents=True, exist_ok=True)
with open(LOGS / "baseline_logreg.txt", "w", encoding="utf-8") as f:
    f.write("Model: TF-IDF + LogisticRegression (train+val, eval on test)\n")
    f.write(f"Train+Val size: {len(X_train_full)}\n")
    f.write(f"Test size: {len(X_test)}\n\n")
    f.write(report_test)

In [8]:
# Confusion matrix

FIG_EVAL.mkdir(parents=True, exist_ok=True)
cm = confusion_matrix(y_test, y_test_pred, labels=clf_final.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=clf_final.classes_)
plt.figure(figsize=(6, 6))
disp.plot(colorbar=False)
plt.title("Confusion matrix – TF-IDF + Logistic Regression")
plt.tight_layout()
plt.savefig(FIG_EVAL / "confusion_matrix_baseline.png", dpi=150)
plt.close()


<Figure size 600x600 with 0 Axes>

Emails occasionally misclassified as scientific papers (3 cases) or legal documents (2 cases).
Likely due to formal business language or embedded technical text.

Invoices, Legal documents, and Scientific papers are all perfectly or near-perfectly recognized.

In [9]:
# save metrics
LOGS.mkdir(parents=True, exist_ok=True)

log_path = LOGS / "baseline_logreg.txt"
with open(log_path, "w", encoding="utf-8") as f:
    f.write("Model: TF-IDF + LogisticRegression\n")
    f.write(f"Train size: {len(X_train)}\n")
    f.write(f"Test size: {len(X_test)}\n\n")
    f.write(report)


NameError: name 'report' is not defined

In [10]:
# save to models


MODELS.mkdir(parents=True, exist_ok=True)

model_path = MODELS / "tfidf_logreg_baseline.pkl"

joblib.dump(clf_final, model_path)

print("Saved model to:", model_path)


Saved model to: C:\Users\viach\Documents\doc_class\models\tfidf_logreg_baseline.pkl


In [13]:
# ----------------------------------------------------------------------
# 1. Truncation helper
# ----------------------------------------------------------------------

def truncate_texts(texts, max_chars=1000):
    """Truncate each text in an iterable to at most max_chars."""
    return [
        (t[:max_chars] if isinstance(t, str) else "")
        for t in texts
    ]


In [14]:
# ----------------------------------------------------------------------
# 2. Build truncated train (train+val) and test sets
#   
# ----------------------------------------------------------------------
X_train_full = pd.concat([train_df["text"], val_df["text"]]).values
y_train_full = pd.concat([train_df["doc_type"], val_df["doc_type"]]).values

X_train_full_short = truncate_texts(X_train_full, max_chars=1000)
X_test_short       = truncate_texts(X_test,       max_chars=1000)

In [15]:

# ----------------------------------------------------------------------
# 3. Train truncated baseline model
# ----------------------------------------------------------------------
clf_short = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=50000,
        ngram_range=(1, 2),
        lowercase=True,
    )),
    ("logreg", LogisticRegression(
        max_iter=200,
        n_jobs=-1,
        # you can set C=best_C if you want to match the main baseline exactly
        C=best_C,
    )),
])

clf_short.fit(X_train_full_short, y_train_full)
y_pred_short = clf_short.predict(X_test_short)

report = classification_report(y_test, y_pred_short, digits=4)
print(report)

                  precision    recall  f1-score   support

           EMAIL     0.9997    0.9973    0.9985      3000
         INVOICE     0.9997    1.0000    0.9998      3000
SCIENTIFIC_PAPER     0.9976    0.9997    0.9986      2954

        accuracy                         0.9990      8954
       macro avg     0.9990    0.9990    0.9990      8954
    weighted avg     0.9990    0.9990    0.9990      8954



In [16]:
# ----------------------------------------------------------------------
# 4. Log truncated results
# ----------------------------------------------------------------------
LOGS.mkdir(parents=True, exist_ok=True)

with open(LOGS / "baseline_logreg_truncated.txt", "w", encoding="utf-8") as f:
    f.write("Model: TF-IDF + LogisticRegression (all docs truncated to 1000 chars)\n")
    f.write(f"Train size (train+val, truncated): {len(X_train_full_short)}\n")
    f.write(f"Test size (truncated): {len(X_test_short)}\n\n")
    f.write(report)


In [17]:

# ----------------------------------------------------------------------
# 5. Confusion matrix for truncated baseline
# ----------------------------------------------------------------------
FIG_EVAL.mkdir(parents=True, exist_ok=True)

cm_short = confusion_matrix(y_test, y_pred_short, labels=clf_short.classes_)
disp_short = ConfusionMatrixDisplay(confusion_matrix=cm_short,
                                    display_labels=clf_short.classes_)

plt.figure(figsize=(6, 6))
disp_short.plot(colorbar=False)
plt.title("Confusion matrix – TF-IDF + Logistic Regression (truncated)")
plt.tight_layout()
plt.savefig(FIG_EVAL / "confusion_matrix_baseline_truncated.png", dpi=150)
plt.close()

<Figure size 600x600 with 0 Axes>

#  Model Performance Analysis

## Baseline TF-IDF + Logistic Regression (full text)
The tuned baseline model (**C = 1.0**) trained on train + validation and evaluated on the held-out test set achieves:

* **Accuracy:** 0.9837
* **Macro F1:** 0.9836

Per-class results show very strong performance:

* **EMAIL:** F1 $\approx$ 0.97 (a few emails misclassified)
* **INVOICE:** F1 = 1.00 (**all invoices correctly classified**)
* **LEGAL\_DOCUMENT:** F1 $\approx$ 0.98
* **SCIENTIFIC\_PAPER:** F1 $\approx$ 0.99

The confusion matrix indicates that most errors come from **EMAIL** occasionally being predicted as **LEGAL\_DOCUMENT** or **SCIENTIFIC\_PAPER**, while the other three classes are almost perfectly recognized.

---

## Truncation Robustness (first 1,000 characters only)
To test whether the model depends on full document length, all texts (train + val + test) were truncated to the **first 1,000 characters** and the model was retrained with the same configuration.

Results on the truncated test set:

* **Accuracy:** 0.9837
* **Macro F1:** 0.9836 (virtually unchanged)

Per-class:

* **EMAIL:** recall drops slightly (some emails predicted as **SCIENTIFIC\_PAPER**)
* **INVOICE:** remains perfectly classified
* **LEGAL\_DOCUMENT** and **SCIENTIFIC\_PAPER:** still classified almost perfectly.

The truncated confusion matrix shows **5 misclassified emails** (1 as LEGAL\_DOCUMENT, 4 as SCIENTIFIC\_PAPER), while all other classes remain error-free.

---

## Interpretation 
Overall, truncating documents to **1,000 characters does not significantly degrade performance**.

The model relies primarily on **lexical and stylistic cues** rather than raw document length, and it robustly distinguishes between invoices, legal contracts, scientific abstracts and typical emails. Remaining errors occur mainly on borderline cases where emails closely resemble technical or formal documents.