In [1]:
# ============================================================
# check_ocr_text_quality.py
# OCR text extraction quality check
#  - Checks missing files, empty-text ratio, average length, and sample contents
# ============================================================

import os, glob, pandas as pd
from tqdm import tqdm

BASE = "/data/ephemeral/home/data"
OCR_TRAIN_DIR = f"{BASE}/ocr/train_texts_v6"
OCR_TEST_DIR  = f"{BASE}/ocr/test_texts_v6"
TRAIN_META = f"{BASE}/meta_stage0_6_train_v6.csv"
TRAIN_CSV  = f"{BASE}/raw/train.csv"
SUB_CSV    = f"{BASE}/raw/sample_submission.csv"

# ----------------------------
# Helper
# ----------------------------
def load_text(path):
    """Return the stripped UTF-8 contents of ``path``.

    A path that does not exist yields ``""``. Bytes that cannot be decoded
    as UTF-8 are dropped (``errors="ignore"``) instead of raising.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            return handle.read().strip()
    return ""

def _find_ocr_candidates(ocr_dir, id_):
    """Return the sorted list of files in ``ocr_dir`` named ``<stem of id_>.*``."""
    stem = os.path.splitext(str(id_))[0]
    # glob.escape keeps "[", "]", "*" and "?" in the directory or ID literal;
    # sorting makes the choice of candidates[0] independent of filesystem order.
    pattern = os.path.join(glob.escape(ocr_dir), f"{glob.escape(stem)}.*")
    return sorted(glob.glob(pattern))


def analyze_folder(ocr_dir, expected_ids):
    """Print and return OCR coverage / text-length statistics for one folder.

    For every expected ID the first (alphabetically) file matching
    ``<stem>.*`` inside ``ocr_dir`` is read with ``load_text``. A summary,
    the texts of the first three IDs and up to five missing IDs are printed.

    Args:
        ocr_dir: Directory holding the OCR output files.
        expected_ids: Sliceable sequence of IDs / file names; any extension
            on an ID is ignored when looking for its OCR file.

    Returns:
        dict with the counts ``found``, ``missing`` and ``empty`` plus
        ``lengths``, the list of stripped text lengths of the found files.
    """
    stats = {"found": 0, "missing": 0, "empty": 0, "lengths": []}
    missing_files = []

    for id_ in tqdm(expected_ids, desc=f"Checking {os.path.basename(ocr_dir)}"):
        candidates = _find_ocr_candidates(ocr_dir, id_)
        if not candidates:
            stats["missing"] += 1
            missing_files.append(id_)
            continue

        txt = load_text(candidates[0])
        if not txt.strip():
            stats["empty"] += 1
        stats["found"] += 1
        stats["lengths"].append(len(txt))

    total = len(expected_ids)

    def share(count):
        # An empty ID list must report 0% instead of dividing by zero.
        return count / total if total else 0.0

    avg_len = sum(stats["lengths"]) / max(1, len(stats["lengths"]))
    print(f"\nüìÇ Folder: {ocr_dir}")
    print(f" - Total expected: {total}")
    print(f" - Found: {stats['found']} ({share(stats['found']):.2%})")
    print(f" - Missing: {stats['missing']} ({share(stats['missing']):.2%})")
    print(f" - Empty: {stats['empty']} ({share(stats['empty']):.2%})")
    print(f" - Avg text length: {avg_len:.1f}")
    print(f" - Min/Max length: {min(stats['lengths'] or [0])} / {max(stats['lengths'] or [0])}")

    # Print the OCR text of the first three expected IDs (skipping missing ones).
    print("\nü™∂ Sample texts:")
    for sid in expected_ids[:3]:
        cands = _find_ocr_candidates(ocr_dir, sid)
        if not cands:
            continue
        txt = load_text(cands[0]).replace("\n", " ")
        print(f" - {sid}: {txt[:150]}{'...' if len(txt) > 150 else ''}")

    if stats["missing"] > 0:
        print("\n‚ö†Ô∏è Missing file examples:")
        print(stats["missing"], "missing files, showing first 5 ‚Üí", missing_files[:5])

    return stats

# ----------------------------
# Train / Test check
# ----------------------------
train_meta = pd.read_csv(TRAIN_META)
train_csv = pd.read_csv(TRAIN_CSV)
# Image file name for every train ID: ".jpg" is appended only when absent.
train_csv["basename"] = [
    raw_id if str(raw_id).endswith(".jpg") else f"{raw_id}.jpg"
    for raw_id in train_csv["ID"]
]

# Expected IDs are taken from the raw CSVs (the meta file could be used instead).
train_ids = train_csv["ID"].tolist()
test_ids = pd.read_csv(SUB_CSV)["ID"].tolist()

print("üîç Checking OCR extraction quality...\n")
train_stats = analyze_folder(OCR_TRAIN_DIR, train_ids)
test_stats = analyze_folder(OCR_TEST_DIR, test_ids)

print("\n‚úÖ OCR check finished.")


üîç Checking OCR extraction quality...



Checking train_texts_v6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1570/1570 [00:01<00:00, 886.21it/s]



üìÇ Folder: /data/ephemeral/home/data/ocr/train_texts_v6
 - Total expected: 1570
 - Found: 1570 (100.00%)
 - Missing: 0 (0.00%)
 - Empty: 0 (0.00%)
 - Avg text length: 343.1
 - Min/Max length: 2 / 1308

ü™∂ Sample texts:
 - 002f99746285dfdd.jpg: 72Ïò§
 - 008ccd231e1fea5d.jpg: ÏßÑÎ£åÎπÑ(ÏïΩÏ†úÎπÑ) ÎÇ±ÏûÖ ÌôïÏù∏ÏÑú :ÏûêÎèôÎ¨¥Î≤àÌò∏ Ï£ºÏù∏ÌïòÎ¨¥Î©¥Ìò∏ Í¥Ä   Ïûê ÏòÅ E Ìè¨ÎØ∏Î≤ÑÎã§ Ïã†CUlleÎ¨¥Î∞∞) UÎ†à Ïä§ÌÉÄaÌÑ∞ Oe 8CJ ÎπÑÍ∑ºÏó¨ Î¨¥Íæ∏e (u+J Ïñ¥Ïù¥ÎÇ¥Îã§ ÎÇ¥C (U6>U_ 8e8 6646 ÏïÑ0Î¶¨ Ïöîa @Ìã∞yJ 70-2 ÏöîÌûà euG5 70 Ïöîa @Î∂Ä)) OSes - ...
 - 008f5911bfda7695.jpg: ÏßÑÎ£åÎπÑ(ÏïΩÏ†úÎπÑ) }ÏûÖ ÌôïÏù∏ÏÑú ÌïúÏûê ÏÑ¨ Î™Ö Ï£ºÎü∞Îì± ÎÇòÏûëÌò∏ ÏßÑ_ÎπÑ(ÏÑ†-Îãà) ÎÇ¥Ïô∏ ÏÜåÍ∑πÎ™πÍ∏∞ ÎåÄÏÉÅÎ†• ÎπÑÌÉÄÏù¥ Ï†Ñ: ; Ï£º Î¶¨ :P Î≥∏ÏÑ†Î∂ÄÎã§ (kD Í∞ÄÎìú Ïù¥Î¶¨) @3 ] (CSa) r8 3#ÏòÅ J4) 'o Jx 39430 39,103 (l V 47 v4o 4O VU ÏÜåÎìùÍ∑úÍ±∞ ÎçîÏÇ¨...


Checking test_texts_v6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3140/3140 [00:06<00:00, 483.99it/s]


üìÇ Folder: /data/ephemeral/home/data/ocr/test_texts_v6
 - Total expected: 3140
 - Found: 3140 (100.00%)
 - Missing: 0 (0.00%)
 - Empty: 192 (6.11%)
 - Avg text length: 101.6
 - Min/Max length: 0 / 1255

ü™∂ Sample texts:
 - 0008fdb22ddce0ce.jpg: Uoa OaDI ÌÜ†Ìä∏ Ìëπ %
 - 00091bffdffd83de.jpg: 
 - 00396fbc1f6cc21d.jpg: KO 'IU I Ïò¨ R0 5 i 5 Î∂ô Ïö© Ï§ë Î¨º 8 # Ïû•0 NIO ÏûêjE ÌçΩ 8 Ïö∏ Í≤® 16 a(ÏÑú Ïö© Íµ≠ ÌÜµ 1 Î≤Ñ Í∫º ÏÑúO C

‚úÖ OCR check finished.





In [2]:
# ============================================================
# ocr_prob_generator_train_v6.py
# Generate document-classification probabilities from OCR text (RoBERTa fine-tuning, stable settings)
#  - Train 5-Fold Stratified + EarlyStopping + Cosine Scheduler + fp16
#  - Outputs:
#       /interim/ocr_valid_probs/fold{fold}_ocr_valid.csv
#       /interim/ocr_test_probs.csv
#       ./ocr_model_folds/fold{fold}/best/ (best model of each fold)
# ============================================================

import os, glob, re, random, json
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
import torch
from torch.utils.data import Dataset, DataLoader


# ============================================================
# Config
# ============================================================
# Input locations: OCR text folders, train metadata and the raw CSVs.
BASE = "/data/ephemeral/home/data"
OCR_TEXT_DIR_TRAIN = f"{BASE}/ocr/train_texts_v6"   # *.txt / *.json (file names based on the image id)
OCR_TEXT_DIR_TEST  = f"{BASE}/ocr/test_texts_v6"
TRAIN_META  = f"{BASE}/meta_stage0_6_train_v6.csv"
TRAIN_CSV   = f"{BASE}/raw/train.csv"
SUB_CSV     = f"{BASE}/raw/sample_submission.csv"

# Output locations; the directories are created up front so later writes cannot fail on a missing folder.
SAVE_VALID_DIR = f"{BASE}/interim/ocr_valid_probs"
SAVE_TEST_PATH = f"{BASE}/interim/ocr_test_probs.csv"
os.makedirs(SAVE_VALID_DIR, exist_ok=True)
os.makedirs("./ocr_model_folds", exist_ok=True)

MODEL_NAME = "klue/roberta-base"
NUM_CLASSES = 17       # passed as num_labels to the classifier and used for the prob_{i} columns
NFOLDS = 5
MAX_LEN = 512          # default tokenizer max_length used by OCRDataset
SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# OCR-specific hyperparameters (stable settings for text fine-tuning)
LR = 5e-5              # unlike the image backbone (1e-4), 5e-5 is the safe choice for text
WEIGHT_DECAY = 1e-4
EPOCHS = 30
BATCH_SIZE = 16
WARMUP_RATIO = 0.1
PATIENCE = 5           # early-stopping patience (in evaluations)




In [3]:
# ============================================================
# Reproducibility
# ============================================================
def seed_everything(seed=SEED):
    """Seed the hash, Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


seed_everything()


# ============================================================
# Utils
# ============================================================
def load_ocr_text(filepath, max_chars=3000):
    """Load an OCR result file (txt/json) as one whitespace-normalised string.

    Supported inputs (matched on the case-insensitive file extension):
      * ``.txt``  - the whole file is the text.
      * ``.json`` - either ``{"text": "..."}`` or a list of such dicts, whose
        ``"text"`` values are joined with spaces.
    Any other extension, and JSON without a usable string ``"text"``, yields
    ``""``. Undecodable bytes are dropped rather than raising.

    Args:
        filepath: Path of the OCR file.
        max_chars: Maximum number of characters returned (``None`` = no cap).

    Returns:
        The text with every whitespace run collapsed to a single space,
        stripped and truncated to ``max_chars``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    lowered = str(filepath).lower()
    text = ""
    if lowered.endswith(".txt"):
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
    elif lowered.endswith(".json"):
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            js = json.load(f)
        if isinstance(js, dict):
            value = js.get("text")
            # A null / non-string "text" must not reach re.sub below.
            text = value if isinstance(value, str) else ""
        elif isinstance(js, list):
            # [{"text": "..."}, ...] form; entries without a string "text" are skipped.
            text = " ".join(
                entry["text"]
                for entry in js
                if isinstance(entry, dict) and isinstance(entry.get("text"), str)
            )
    text = re.sub(r"\s+", " ", text).strip()
    return text[:max_chars]


class OCRDataset(Dataset):
    """Torch dataset that tokenizes OCR texts on access.

    Each item is a dict of the tokenizer's output tensors with the leading
    batch dimension squeezed away, plus a ``labels`` long tensor when labels
    were supplied. Falsy texts are replaced by the literal ``"[EMPTY]"``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_len=MAX_LEN):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw = self.texts[idx]
        encoded = self.tokenizer(
            raw or "[EMPTY]",
            truncation=True,
            padding="max_length",  # every sample is padded to max_len tokens
            max_length=self.max_len,
            return_tensors="pt",
        )
        sample = {}
        for key, tensor in encoded.items():
            sample[key] = tensor.squeeze(0)
        if self.labels is None:
            return sample
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


@torch.no_grad()
def infer_probs(model, dataset, batch_size=BATCH_SIZE):
    """Return softmax class probabilities for every item of ``dataset``.

    The model is switched to eval mode, batches are fed in dataset order on
    ``DEVICE`` (any ``labels`` entry is dropped), and the per-batch results
    are stacked into one NumPy array with one row per item.
    """
    model.eval()
    chunks = []
    for batch in DataLoader(dataset, batch_size=batch_size, shuffle=False):
        inputs = {}
        for name, tensor in batch.items():
            if name == "labels":
                continue
            inputs[name] = tensor.to(DEVICE)
        logits = model(**inputs).logits
        chunks.append(torch.softmax(logits, dim=1).cpu().numpy())
    return np.vstack(chunks)




In [4]:
# ============================================================
# Load train meta and join targets
# ============================================================
meta = pd.read_csv(TRAIN_META)
train_csv = pd.read_csv(TRAIN_CSV)
# Image file name for every train ID: ".jpg" is appended only when absent.
train_csv["basename"] = train_csv["ID"].apply(lambda x: f"{x}.jpg" if not str(x).endswith(".jpg") else x)
meta["basename"] = meta["filepath"].apply(os.path.basename)

# Meta rows without a matching target in train.csv are dropped.
df = pd.merge(meta, train_csv[["basename", "target"]], on="basename", how="left").dropna(subset=["target"])
df["target"] = df["target"].astype(int)

# Load the OCR text of every training image ("" when no OCR file exists).
texts = []
for b in tqdm(df["basename"], desc="Load OCR train texts"):
    stem = os.path.splitext(b)[0]
    # glob order depends on the filesystem; sort so the same file is picked on
    # every run when several extensions exist, and escape glob metacharacters
    # so the directory and stem are matched literally.
    cands = sorted(glob.glob(os.path.join(glob.escape(OCR_TEXT_DIR_TRAIN), f"{glob.escape(stem)}.*")))
    texts.append(load_ocr_text(cands[0]) if cands else "")
df["ocr_text"] = texts

empty_ratio = (df["ocr_text"].str.len() == 0).mean()
print(f"[Info] Train OCR empty ratio: {empty_ratio:.3f} ({empty_ratio*100:.1f}%)")
if empty_ratio > 0.9:
    print("[Warn] Í±∞Ïùò Î™®Îì† OCR ÌÖçÏä§Ìä∏Í∞Ä ÎπÑÏñ¥ÏûàÏäµÎãàÎã§. /ocr/train_texts_v6 ÌååÏù¥ÌîÑÎùºÏù∏ÏùÑ Ï†êÍ≤ÄÌïòÏÑ∏Ïöî.")



Load OCR train texts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1570/1570 [00:01<00:00, 894.04it/s]

[Info] Train OCR empty ratio: 0.000 (0.0%)





In [5]:

# ============================================================
# Tokenizer
# ============================================================
# One tokenizer instance is shared by every fold's datasets.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# ============================================================
# 5-Fold Training
# ============================================================
# Stratified on the target so each fold keeps the class proportions;
# shuffling is seeded, so the split is the same on every run.
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

for fold, (tr_idx, val_idx) in enumerate(skf.split(df, df["target"])):
    print(f"\n===== Fold {fold+1}/{NFOLDS} =====")
    # Index is reset so val_df["basename"] aligns positionally with the
    # probability rows written at the end of the fold.
    tr_df, val_df = df.iloc[tr_idx].reset_index(drop=True), df.iloc[val_idx].reset_index(drop=True)

    tr_ds  = OCRDataset(tr_df["ocr_text"].tolist(), tr_df["target"].tolist(), tokenizer)
    val_ds = OCRDataset(val_df["ocr_text"].tolist(), val_df["target"].tolist(), tokenizer)

    # Every fold starts from the pretrained checkpoint with a fresh classifier head.
    # NOTE(review): the previous fold's model/trainer are not explicitly released
    # before this allocation — confirm GPU memory headroom.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=NUM_CLASSES
    ).to(DEVICE)

    # Evaluate and checkpoint once per epoch; the checkpoint with the lowest
    # eval_loss is reloaded when training ends. Mixed precision is enabled
    # only when running on CUDA.
    args = TrainingArguments(
        output_dir=f"./ocr_model_folds/fold{fold}",
        learning_rate=LR,
        weight_decay=WEIGHT_DECAY,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        warmup_ratio=WARMUP_RATIO,
        seed=SEED,
        report_to="none",
        lr_scheduler_type="cosine",
        logging_steps=100,
        fp16=(DEVICE == "cuda"),
    )

    def compute_metrics(eval_pred):
        """Return plain accuracy of the argmax predictions for the Trainer's eval log."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=1)
        acc = (preds == labels).mean()
        return {"accuracy": acc}

    # NOTE(review): the captured output shows a warning pointing at this
    # `Trainer(` call — presumably the `tokenizer=` argument being deprecated
    # in newer transformers releases; confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tr_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=PATIENCE)],
    )

    trainer.train()

    # --- Save the best weights ---
    best_dir = f"./ocr_model_folds/fold{fold}/best"
    os.makedirs(best_dir, exist_ok=True)

    # Store explicit "0".."16"-style label names in the saved config.
    model.config.id2label = {i: str(i) for i in range(NUM_CLASSES)}
    model.config.label2id = {str(i): i for i in range(NUM_CLASSES)}

    trainer.save_model(best_dir)

    # --- Save validation probs ---
    # The model is reloaded from best_dir, i.e. exactly the artefact that the
    # test-inference cell reads later.
    best_model = AutoModelForSequenceClassification.from_pretrained(
        best_dir, num_labels=NUM_CLASSES
    ).to(DEVICE)
    val_probs = infer_probs(best_model, val_ds, batch_size=BATCH_SIZE)
    out_df = pd.DataFrame(val_probs, columns=[f"prob_{i}" for i in range(NUM_CLASSES)])
    out_df.insert(0, "basename", val_df["basename"])
    out_path = os.path.join(SAVE_VALID_DIR, f"fold{fold}_ocr_valid.csv")
    out_df.to_csv(out_path, index=False)
    print(f"‚úÖ Saved valid probs: {out_path}")





===== Fold 1/5 =====


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.82429,0.582803
2,2.394000,0.759756,0.83121
3,0.954700,0.380793,0.926752
4,0.253700,0.184982,0.964968
5,0.253700,0.225895,0.958599
6,0.089200,0.187781,0.964968
7,0.052500,0.25272,0.961783
8,0.030200,0.247413,0.958599
9,0.027600,0.20201,0.971338


‚úÖ Saved valid probs: /data/ephemeral/home/data/interim/ocr_valid_probs/fold0_ocr_valid.csv

===== Fold 2/5 =====


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.864179,0.573248
2,2.392100,0.914994,0.840764
3,1.084100,0.219467,0.977707
4,0.277100,0.089285,0.984076
5,0.277100,0.102528,0.980892
6,0.083300,0.166799,0.974522
7,0.090600,0.184608,0.968153
8,0.053900,0.09459,0.984076
9,0.029800,0.190982,0.971338


‚úÖ Saved valid probs: /data/ephemeral/home/data/interim/ocr_valid_probs/fold1_ocr_valid.csv

===== Fold 3/5 =====


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.876112,0.582803
2,2.415200,0.820304,0.828025
3,1.019400,0.322751,0.942675
4,0.294500,0.132128,0.968153
5,0.294500,0.117007,0.977707
6,0.128900,0.152713,0.968153
7,0.065300,0.099823,0.984076
8,0.057600,0.143323,0.977707
9,0.032600,0.133823,0.974522
10,0.032600,0.088025,0.977707


‚úÖ Saved valid probs: /data/ephemeral/home/data/interim/ocr_valid_probs/fold2_ocr_valid.csv

===== Fold 4/5 =====


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.92295,0.484076
2,2.415700,1.096885,0.691083
3,1.148800,0.438108,0.917197
4,0.417200,0.172654,0.968153
5,0.417200,0.212659,0.958599
6,0.140000,0.231031,0.958599
7,0.058700,0.186444,0.968153
8,0.039600,0.18945,0.971338
9,0.021000,0.178226,0.974522


‚úÖ Saved valid probs: /data/ephemeral/home/data/interim/ocr_valid_probs/fold3_ocr_valid.csv

===== Fold 5/5 =====


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.870315,0.576433
2,2.412500,0.861485,0.780255
3,0.990700,0.407329,0.901274
4,0.243900,0.49747,0.88535
5,0.243900,0.526364,0.904459
6,0.132600,0.428583,0.929936
7,0.084800,0.359364,0.93949
8,0.022200,0.463556,0.933121
9,0.038300,0.289539,0.949045
10,0.038300,0.389464,0.93949


‚úÖ Saved valid probs: /data/ephemeral/home/data/interim/ocr_valid_probs/fold4_ocr_valid.csv


In [6]:

# ============================================================
# Test probs (fold-average)
# ============================================================
sub = pd.read_csv(SUB_CSV)

# Load the OCR text of every test image ("" when no OCR file exists).
test_texts = []
for id_ in tqdm(sub["ID"], desc="Load OCR test texts"):
    stem = os.path.splitext(id_)[0]
    # glob order depends on the filesystem; sort so the same file is picked on
    # every run when several extensions exist, and escape glob metacharacters
    # so the directory and stem are matched literally.
    cands = sorted(glob.glob(os.path.join(glob.escape(OCR_TEXT_DIR_TEST), f"{glob.escape(stem)}.*")))
    test_texts.append(load_ocr_text(cands[0]) if cands else "")

empty_ratio_t = (pd.Series(test_texts).str.len() == 0).mean()
print(f"[Info] Test OCR empty ratio: {empty_ratio_t:.3f} ({empty_ratio_t*100:.1f}%)")
if empty_ratio_t > 0.9:
    print("[Warn] ÌÖåÏä§Ìä∏ OCR ÌÖçÏä§Ìä∏Í∞Ä Í±∞Ïùò ÎπÑÏñ¥ÏûàÏäµÎãàÎã§. /ocr/test_texts_v6 ÌååÏù¥ÌîÑÎùºÏù∏ÏùÑ Ï†êÍ≤ÄÌïòÏÑ∏Ïöî.")

test_ds = OCRDataset(test_texts, labels=None, tokenizer=tokenizer)
test_probs_all = []

# Predict with every fold's saved model; the per-fold probabilities are
# averaged below.
for fold in range(NFOLDS):
    best_dir = f"./ocr_model_folds/fold{fold}/best"
    if not os.path.exists(best_dir):
        # NOTE(review): fallback to the fold's output_dir — presumably that
        # folder only holds checkpoint sub-directories; verify it is loadable.
        best_dir = f"./ocr_model_folds/fold{fold}"
    model = AutoModelForSequenceClassification.from_pretrained(
        best_dir, num_labels=NUM_CLASSES
    ).to(DEVICE)
    probs = infer_probs(model, test_ds, batch_size=BATCH_SIZE)
    test_probs_all.append(probs)

final_test_probs = np.mean(test_probs_all, axis=0)
out_df = pd.DataFrame(final_test_probs, columns=[f"prob_{i}" for i in range(NUM_CLASSES)])
out_df.insert(0, "ID", sub["ID"])
out_df.to_csv(SAVE_TEST_PATH, index=False)
print(f"‚úÖ Saved test OCR probs: {SAVE_TEST_PATH}")

Load OCR test texts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3140/3140 [00:06<00:00, 472.07it/s]


[Info] Test OCR empty ratio: 0.061 (6.1%)
‚úÖ Saved test OCR probs: /data/ephemeral/home/data/interim/ocr_test_probs.csv
