# 02_train_model

In [1]:
from pathlib import Path
import json, re
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

from pathlib import Path
import json

HERE = Path.cwd().resolve()


def find_config(start: Path) -> Path:
    """Locate ``project_config.json`` in ``start`` or its closest ancestor.

    Args:
        start: Directory where the upward search begins.

    Returns:
        Path of the first ``project_config.json`` found, checking ``start``
        itself and then each parent directory in turn.

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor has the file.
    """
    # Lazily build candidate paths, nearest directory first.
    candidates = (folder / "project_config.json" for folder in (start, *start.parents))
    found = next((path for path in candidates if path.exists()), None)
    if found is None:
        raise FileNotFoundError("project_config.json not found. Run 00_config_and_checks.ipynb first.")
    return found

# Resolve the project configuration by searching upward from the working directory.
CONFIG_PATH = find_config(HERE)
cfg = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
print("Config:", CONFIG_PATH)

# Both directory locations are read from the config file.
PROCESSED_DIR, MODELS_DIR = (Path(cfg[key]) for key in ("PROCESSED_DIR", "MODELS_DIR"))

# Load the train / validation splits from the processed-data directory.
train_df, val_df = (pd.read_csv(PROCESSED_DIR / fname) for fname in ("train.csv", "val.csv"))

# Quick sanity check: split sizes and the class balance of the training split.
print("Train:", train_df.shape, "Val:", val_df.shape)
print(train_df["doc_type"].value_counts())

Config: C:\Users\viach\Downloads\document-classifier-portfolio-v2\project_config.json
Train: (42000, 4) Val: (9000, 4)
doc_type
SCIENTIFIC_PAPER    14000
EMAIL               14000
INVOICE             14000
Name: count, dtype: int64


In [2]:
# Invoice augmentation (train only)
RNG = np.random.default_rng(42)


def aug_invoice(text: str) -> str:
    """Return a randomly perturbed variant of an invoice text.

    Steps, in order:
      1. If the text has more than 3 non-blank lines, drop the blank lines and
         shuffle the remaining ones.
      2. With probability 0.35 collapse every whitespace run (newlines
         included) into a single space.
      3. With probability 0.25 surround each colon with spaces.
      4. With probability 0.25 append a payment-terms line.
      5. With probability 0.15 append a VAT line.

    All randomness comes from the module-level ``RNG``, so the result depends
    on how many draws were made before this call.

    Args:
        text: Original invoice text.

    Returns:
        The perturbed text with leading/trailing whitespace stripped.
    """
    result = text
    non_blank = [row for row in text.splitlines() if row.strip()]
    if len(non_blank) > 3:
        order = np.arange(len(non_blank))
        RNG.shuffle(order)
        result = "\n".join(non_blank[pos] for pos in order)

    # The four noise draws are always taken, in this fixed order, whether or
    # not the corresponding perturbation ends up being applied.
    collapse_draw, colon_draw, terms_draw, vat_draw = (RNG.random() for _ in range(4))

    if collapse_draw < 0.35:
        result = re.sub(r"\s+", " ", result).strip()
    if colon_draw < 0.25:
        result = result.replace(":", " : ")
    if terms_draw < 0.25:
        result += "\nPayment terms: Pay within 30 days."
    if vat_draw < 0.15:
        result += "\nVAT: 7.7%"
    return result.strip()

def augment_invoices(df: pd.DataFrame, factor: int = 2) -> pd.DataFrame:
    """Append synthetic invoice rows produced by ``aug_invoice``.

    Args:
        df: Frame with at least ``text`` and ``doc_type`` columns.
        factor: Target multiple of the invoice count; ``(factor - 1) * n``
            synthetic rows are generated from ``n`` original invoices.

    Returns:
        ``df`` itself when it has no ``INVOICE`` rows or ``factor <= 1``.
        Otherwise a new frame holding the original rows followed by the
        synthetic ones (``source`` set to ``"SYN_INVOICE"``), de-duplicated on
        ``(doc_type, text)`` and re-indexed from 0.
    """
    invoice_texts = df.loc[df["doc_type"] == "INVOICE", "text"]
    if factor <= 1 or invoice_texts.empty:
        return df

    pool = invoice_texts.astype(str).tolist()
    n_synthetic = (factor - 1) * len(pool)
    # Each synthetic row starts from an invoice drawn uniformly with replacement.
    synthetic = pd.DataFrame(
        [
            {
                "text": aug_invoice(pool[RNG.integers(0, len(pool))]),
                "doc_type": "INVOICE",
                "source": "SYN_INVOICE",
            }
            for _ in range(n_synthetic)
        ]
    )
    combined = pd.concat([df, synthetic], ignore_index=True)
    # De-duplication runs over the whole frame, so repeated original rows are removed too.
    return combined.drop_duplicates(subset=["doc_type", "text"]).reset_index(drop=True)

In [3]:
# Independent copy of train_df; the training cells below read train_aug.
# NOTE(review): augment_invoices() is not called here, so despite its name
# train_aug holds exactly the rows of train_df — confirm this is intended.
train_aug = train_df.copy()

In [4]:
import time
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

# 1) Keep training size under control (dev-friendly, reproducible)
MAX_PER_CLASS = None  # per-class row cap for the training set; None uses all rows


def _cap_per_class(df: pd.DataFrame, cap: int, seed: int = 42) -> pd.DataFrame:
    """Return at most ``cap`` randomly sampled rows for each ``doc_type`` value.

    Groups are sampled one at a time and concatenated, rather than going
    through ``groupby(...).apply(...)``. Newer pandas releases deprecate
    passing the grouping column into ``apply``; once it is excluded, the
    ``doc_type`` column would vanish from the result and break the cells that
    read it. Iterating the groups always yields complete rows and selects the
    same rows in the same order (groups sorted by key).

    Args:
        df: Frame with a ``doc_type`` column.
        cap: Maximum number of rows kept per class.
        seed: ``random_state`` used for every per-class sample.

    Returns:
        A new frame with a fresh 0..n-1 index.
    """
    sampled = [
        group.sample(n=min(len(group), cap), random_state=seed)
        for _, group in df.groupby("doc_type")
    ]
    if not sampled:
        # No groups at all (empty input): pd.concat([]) would raise.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled, ignore_index=True)


if MAX_PER_CLASS is not None:
    train_use = _cap_per_class(train_aug, MAX_PER_CLASS)
    # Validation is capped at a quarter of the per-class training cap.
    val_use = _cap_per_class(val_df, MAX_PER_CLASS // 4)
else:
    train_use = train_aug
    val_use = val_df

print("Train use:", train_use.shape)
print(train_use["doc_type"].value_counts())
print("Val use:", val_use.shape)

# 2) Data arrays
# Features come from the "text" column and labels from "doc_type"; both are
# cast to str before conversion to NumPy arrays.
X_train, y_train = (train_use[col].astype(str).to_numpy() for col in ("text", "doc_type"))
X_val, y_val = (val_use[col].astype(str).to_numpy() for col in ("text", "doc_type"))

# 3) One strong baseline first (word unigrams). Bigrams + char-ngrams can come later as an experiment.
# Pipeline: TF-IDF over word unigrams feeding a class-balanced logistic regression.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        max_features=30000,     # smaller vocab = much faster
        ngram_range=(1, 1),     # unigrams first (bigrams are expensive)
        min_df=5,               # ignore rare terms early
        max_df=0.95,            # drop ultra-common terms
        stop_words="english",
        sublinear_tf=True,
        dtype=np.float32
    )),
    ("clf", LogisticRegression(
        max_iter=1000,
        solver="saga",          # good for big sparse matrices
        n_jobs=-1,
        class_weight="balanced",
        C=2.0,
        random_state=42         # saga shuffles the data; fix the seed so re-runs give the same fit
    )),
])

# Time the full pipeline fit (vectorizer + classifier) on the training arrays.
t0 = time.time()
print("Fitting...")
pipe.fit(X_train, y_train)
fit_seconds = time.time() - t0
print("Fit time (s):", round(fit_seconds, 1))

# Evaluate on the validation arrays: macro-averaged F1 plus a per-class report.
pred = pipe.predict(X_val)
f1 = f1_score(y_true=y_val, y_pred=pred, average="macro")
print("macro_f1:", round(f1, 4))
print(classification_report(y_true=y_val, y_pred=pred, digits=3))


Train use: (42000, 4)
doc_type
SCIENTIFIC_PAPER    14000
EMAIL               14000
INVOICE             14000
Name: count, dtype: int64
Val use: (9000, 4)
Fitting...
Fit time (s): 10.6
macro_f1: 0.9991
                  precision    recall  f1-score   support

           EMAIL      1.000     0.998     0.999      3000
         INVOICE      1.000     1.000     1.000      3000
SCIENTIFIC_PAPER      0.998     1.000     0.999      3000

        accuracy                          0.999      9000
       macro avg      0.999     0.999     0.999      9000
    weighted avg      0.999     0.999     0.999      9000



In [5]:
# Rebuild the arrays from the full frames (train_aug / val_df), replacing the
# arrays created for the baseline cell.
X_train, y_train = (train_aug[col].astype(str).values for col in ("text", "doc_type"))
X_val, y_val = (val_df[col].astype(str).values for col in ("text", "doc_type"))

# Candidate pipelines keyed by name. They differ only in the TF-IDF settings
# (word uni/bigrams vs. character 3-5-grams within word boundaries); each gets
# its own classifier instance with identical settings.
candidates = {
    label: Pipeline([
        ("tfidf", TfidfVectorizer(**tfidf_kwargs)),
        ("clf", LogisticRegression(max_iter=2000, n_jobs=-1, class_weight="balanced", C=1.0)),
    ])
    for label, tfidf_kwargs in (
        ("tfidf_word_logreg", dict(max_features=50000, ngram_range=(1, 2), min_df=3, stop_words="english")),
        ("tfidf_char_logreg", dict(analyzer="char_wb", ngram_range=(3, 5), min_df=3)),
    )
}

# Fit every candidate on the training arrays and score it on validation.
# After the loop, `pipe` refers to the last candidate in `candidates`.
scores = []
for name in candidates:
    pipe = candidates[name]
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_val)
    f1 = f1_score(y_true=y_val, y_pred=pred, average="macro")
    scores += [(name, f1)]
    print(name, "macro_f1:", f1)

# Highest macro-F1 wins; on a tie the candidate evaluated first is kept.
best_name = max(scores, key=lambda pair: pair[1])[0]
best_model = candidates[best_name]
# Refit the winner on train + validation combined, so the validation split is
# no longer held out with respect to the refit model.
best_model.fit(np.concatenate((X_train, X_val)), np.concatenate((y_train, y_val)))

print("Best:", best_name)

tfidf_word_logreg macro_f1: 0.9985553475696106
tfidf_char_logreg macro_f1: 0.9985555507006009
Best: tfidf_char_logreg


In [6]:
import json, joblib
from pathlib import Path

VERSION = "v1"

OUT_DIR = MODELS_DIR / "text_model" / VERSION
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Persist the selected candidate explicitly. The previous code dumped `pipe`,
# which is whatever pipeline that name was last bound to (the loop variable of
# the model-selection cell). That is the best model only when the winning
# candidate happens to be evaluated last.
final_model = best_model
tfidf_step = final_model.named_steps["tfidf"]
clf_step = final_model.named_steps["clf"]

model_path = OUT_DIR / "model.pkl"
joblib.dump(final_model, model_path)

# Metadata written next to the pickle; values are read from the fitted steps
# so they always describe the model that was actually saved.
meta = {
    "model": "tfidf_logreg",
    "candidate": best_name,  # key of the winning entry in `candidates`
    "vectorizer": {
        "type": "tfidf",
        "analyzer": tfidf_step.analyzer,
        "max_features": tfidf_step.max_features,
        "ngram_range": tfidf_step.ngram_range,
        "min_df": tfidf_step.min_df,
        "max_df": tfidf_step.max_df,
        "stop_words": tfidf_step.stop_words,
    },
    "classifier": {
        "type": "logreg",
        "solver": clf_step.solver,
        "C": clf_step.C,
        "class_weight": clf_step.class_weight,
        "max_iter": clf_step.max_iter,
    },
    # Plain Python strings keep the JSON encoder independent of NumPy scalar types.
    "classes": [str(label) for label in getattr(clf_step, "classes_", [])],
    "version": VERSION,
}

(OUT_DIR / "meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
print("Saved:", model_path)


Saved: C:\Users\viach\Downloads\document-classifier-portfolio-v2\models\text_model\v1\model.pkl
