In [None]:
# Uninstall any existing transformers, then install a release in the >=4.39,<5
# range together with accelerate, sentencepiece and safetensors.
%pip uninstall -y transformers
%pip install -U "transformers>=4.39,<5" accelerate sentencepiece safetensors
# Example of a fully pinned install:
# pip install -U transformers==4.41.2 accelerate==0.33.0 sentencepiece==0.1.99 safetensors


In [None]:
# Report which transformers / torch versions the kernel actually imports.
import transformers, torch
print("transformers:", transformers.__version__)
print("torch:", torch.__version__)


In [None]:
# =========================================================
# DeBERTa-v3-Large
#  - name 用 / desc 用 を別々に学習
#  - train: OOF確率CSV,  test: 予測確率CSV を保存
# =========================================================
import os
import numpy as np
import pandas as pd
from dataclasses import dataclass
import re

import torch
from torch.utils.data import Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from scipy.special import softmax
import inspect
from transformers import TrainingArguments
from transformers.trainer_utils import EvalPrediction

from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)
import inspect
from transformers import TrainingArguments
import torch, os
import torch.nn as nn



In [None]:
# ---------------- Config ----------------
@dataclass
class CFG:
    """Default hyperparameters and output settings for the fine-tuning runs.

    The visible code reads these as class attributes (e.g. ``CFG.seed``)
    without instantiating the dataclass.
    """
    model_name: str = "microsoft/deberta-v3-large"
    seed: int = 42
    n_splits: int = 5                # number of StratifiedKFold splits
    max_len_default: int = 512       # intended for the desc column
    max_len_name: int = 128          # intended for the name column (short text)
    lr: float = 2e-5
    epochs: int = 4
    train_bs: int = 4
    eval_bs: int = 8
    weight_decay: float = 0.01
    warmup_ratio: float = 0.1
    grad_accum: int = 2
    out_root: str = "deberta_v3_large_runs"   # root directory for per-run outputs
    target_col: str = "final_status"          # label column in the training CSV
    fp16: bool = True


In [None]:
# Input CSV locations (Kaggle dataset mount).
TRAIN_PATH = "/kaggle/input/mufg-dataset-2025/train.csv" 
TEST_PATH  = "/kaggle/input/mufg-dataset-2025/test.csv"

# Create the output root and seed the NumPy and PyTorch global RNGs.
os.makedirs(CFG.out_root, exist_ok=True)
np.random.seed(CFG.seed)
torch.manual_seed(CFG.seed)

In [None]:
def removeHTML(x):
    """Strip HTML-like tags from the string ``x``.

    Every span from ``<`` to the nearest following ``>`` is removed
    (non-greedy match; ``.`` does not cross newlines, so a tag split over
    several lines is left in place).
    """
    html = re.compile(r'<.*?>')
    return html.sub(r'', x)


def dataPreprocessing(x):
    """Normalize one raw text string for tokenization.

    Steps, in order: lowercase, strip HTML tags, drop ``@mentions``, drop
    apostrophe-prefixed numbers (``'90``), drop remaining digits, drop URLs,
    collapse whitespace runs to one space, collapse repeated ``.`` and ``,``,
    and trim the ends.

    Args:
        x: Input text (must be a ``str``).

    Returns:
        The cleaned string.
    """
    x = x.lower()
    x = removeHTML(x)
    # Raw strings throughout: "\w" / "\d" in a plain literal are invalid escape
    # sequences and emit warnings on recent Python versions.
    x = re.sub(r"@\w+", '', x)
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    # Remove whole URLs. The former pattern "http\w+" stopped at the first
    # non-word character, so "http://..." was not matched at all and
    # "https://host/path" left "://host/path" behind. The second alternative
    # keeps the old behavior for bare "httpxyz" tokens.
    x = re.sub(r"https?://\S+|http\w+", '', x)
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = x.strip()
    return x

    
# Lookup table mapping English contractions to their expanded forms.
# prepro() passes it to expand_contractions() as the `cList` argument.
cList = {
    "ain't": "am not","aren't": "are not","can't": "cannot","can't've": "cannot have","'cause": "because",  "could've": "could have",
    "couldn't": "could not","couldn't've": "could not have","didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not",
    "hadn't've": "had not have","hasn't": "has not","haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will",
    "he'll've": "he will have","he's": "he is","how'd": "how did","how'd'y": "how do you","how'll": "how will","how's": "how is",
    "I'd": "I would","I'd've": "I would have","I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","isn't": "is not",
    "it'd": "it had","it'd've": "it would have","it'll": "it will", "it'll've": "it will have","it's": "it is","let's": "let us","ma'am": "madam",
    "mayn't": "may not","might've": "might have","mightn't": "might not","mightn't've": "might not have","must've": "must have","mustn't": "must not",
    "mustn't've": "must not have","needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
    "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not","shan't've": "shall not have","she'd": "she would",
    "she'd've": "she would have","she'll": "she will","she'll've": "she will have","she's": "she is","should've": "should have",
    "shouldn't": "should not","shouldn't've": "should not have","so've": "so have","so's": "so is","that'd": "that would","that'd've": "that would have",
    "that's": "that is","there'd": "there had","there'd've": "there would have","there's": "there is","they'd": "they would",
    "they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are","they've": "they have",
    "to've": "to have","wasn't": "was not","we'd": "we had","we'd've": "we would have","we'll": "we will","we'll've": "we will have",
    "we're": "we are","we've": "we have","weren't": "were not","what'll": "what will","what'll've": "what will have","what're": "what are",
    "what's": "what is","what've": "what have","when's": "when is","when've": "when have","where'd": "where did","where's": "where is",
    "where've": "where have","who'll": "who will","who'll've": "who will have","who's": "who is","who've": "who have","why's": "why is",
    "why've": "why have","will've": "will have","won't": "will not","won't've": "will not have","would've": "would have","wouldn't": "would not",
    "wouldn't've": "would not have","y'all": "you all","y'alls": "you alls","y'all'd": "you all would","y'all'd've": "you all would have",
    "y'all're": "you all are","y'all've": "you all have","you'd": "you had","you'd've": "you would have","you'll": "you will",
    "you'll've": "you will have","you're": "you are","you've": "you have"
}

# Compiled matchers keyed by the tuple of contraction keys, so the large
# alternation is built once per table instead of once per call (this function
# is applied row by row).
_CONTRACTION_MATCHERS = {}


def expand_contractions(text, cList):
    """Replace every contraction found in ``text`` with its expansion.

    Matching is case-insensitive and prefers the longest contraction, so
    ``"can't've"`` becomes ``"cannot have"`` rather than ``"cannot've"``, and
    capitalised keys such as ``"I'd"`` also match lowercased text.

    Args:
        text: Input string.
        cList: Mapping of contraction -> expansion (string keys).

    Returns:
        ``text`` with contractions expanded; returned unchanged when ``cList``
        is empty.
    """
    if not cList:
        # An empty table would otherwise compile to r'\b()\b', match the empty
        # string at every word boundary and fail on cList[''].
        return text

    cache_key = tuple(cList)
    matcher = _CONTRACTION_MATCHERS.get(cache_key)
    if matcher is None:
        # lowercase spelling -> key as written in cList (first spelling wins)
        canonical = {}
        for key in cList:
            if key:
                canonical.setdefault(key.lower(), key)
        if canonical:
            # Longest alternatives first: regex alternation takes the first
            # branch that matches, not the longest one.
            ordered = sorted(canonical, key=len, reverse=True)
            pattern = re.compile(
                r'\b(' + '|'.join(re.escape(key) for key in ordered) + r')\b',
                re.IGNORECASE,
            )
        else:
            pattern = None
        matcher = _CONTRACTION_MATCHERS[cache_key] = (pattern, canonical)

    pattern, canonical = matcher
    if pattern is None:
        return text

    def replace(match):
        token = match.group(0)
        if token in cList:
            # Exact-case hit: same result as a case-sensitive lookup.
            return cList[token]
        key = canonical.get(token.lower())
        return cList[key] if key is not None else token

    return pattern.sub(replace, text)

def prepro(df, vectorizer=None):
    """Clean the ``name`` / ``desc`` text columns of ``df`` in place and return it.

    Both columns are NaN-filled, cast to ``str``, normalized with
    ``dataPreprocessing`` and then passed through ``expand_contractions``.
    Afterwards ``desc`` is rebuilt as
    ``"Keywords: <keywords with '-' replaced by ' '> | Description: <desc>"``.

    Args:
        df: DataFrame with ``name``, ``desc`` and ``keywords`` columns; it is
            modified in place.
        vectorizer: Not used by this function.

    Returns:
        The same DataFrame object.
    """
    text_columns = ("name", "desc")

    # Step 1: make sure every cell is a plain string.
    for column in text_columns:
        df[column] = df[column].fillna("").astype(str)

    # Step 2: regex-based normalization.
    for column in text_columns:
        df[column] = [dataPreprocessing(raw) for raw in df[column]]

    # Step 3: contraction expansion.
    for column in text_columns:
        df[column] = df[column].apply(lambda cleaned: expand_contractions(cleaned, cList))

    # Prepend the (de-hyphenated) keywords to the description text.
    keyword_text = df["keywords"].fillna("").str.replace("-", " ")
    merged = "Keywords: " + keyword_text + " | Description: " + df["desc"]
    df["desc"] = merged.str.strip()

    return df

In [None]:
def _make_training_args(out_dir):
    """Build ``TrainingArguments`` from ``CFG``, passing only supported keywords.

    Each optional keyword is added only if the installed
    ``TrainingArguments.__init__`` accepts it, so the same code works across
    transformers releases.

    NOTE(review): a later cell in this file defines another
    ``_make_training_args`` with a different signature; once that cell runs,
    this definition is shadowed.

    Args:
        out_dir: Directory passed as ``output_dir``.

    Returns:
        A ``TrainingArguments`` instance.
    """
    import inspect, os, torch
    from transformers import TrainingArguments

    # Inspect the signature once instead of on every has() call.
    supported = inspect.signature(TrainingArguments.__init__).parameters

    def has(param_name):  # True if TrainingArguments accepts this keyword
        return param_name in supported

    # --- Basics ---
    kwargs = dict(
        output_dir=out_dir,
        learning_rate=CFG.lr,
        num_train_epochs=CFG.epochs,
        per_device_train_batch_size=CFG.train_bs,
        per_device_eval_batch_size=CFG.eval_bs,
        gradient_accumulation_steps=CFG.grad_accum,
        weight_decay=CFG.weight_decay,
        warmup_ratio=CFG.warmup_ratio,
        seed=CFG.seed,
        fp16=bool(CFG.fp16 and torch.cuda.is_available()),
        logging_steps=100,
    )

    # --- Evaluation / checkpoint strategy (new or old keyword name) ---
    # Prefer the newer `eval_strategy` name; fall back to `evaluation_strategy`.
    eval_key = "eval_strategy" if has("eval_strategy") else (
        "evaluation_strategy" if has("evaluation_strategy") else None
    )
    if eval_key:
        kwargs[eval_key] = "epoch"
    if has("save_strategy"):
        kwargs["save_strategy"] = "epoch"
    if eval_key and has("save_strategy") and has("load_best_model_at_end"):
        kwargs["load_best_model_at_end"] = True
    if has("metric_for_best_model"):
        kwargs["metric_for_best_model"] = "f1"
    if has("greater_is_better"):
        kwargs["greater_is_better"] = True
    if has("save_total_limit"):
        kwargs["save_total_limit"] = 1
    if has("report_to"):
        kwargs["report_to"] = "none"
    if has("logging_strategy"):
        kwargs["logging_strategy"] = "steps"

    # --- Speed options (applied only when supported) ---
    if has("group_by_length"):
        kwargs["group_by_length"] = True
    if has("dataloader_num_workers"):
        kwargs["dataloader_num_workers"] = max(1, min(4, (os.cpu_count() or 2) - 1))
    if has("dataloader_pin_memory"):
        kwargs["dataloader_pin_memory"] = True
    if has("optim"):
        # Request the fused AdamW only on PyTorch 2.x with CUDA available.
        # This function has no fallback of its own, so in other environments
        # the keyword is omitted and the library default optimizer is used.
        torch_major = int(torch.__version__.split(".")[0])
        if torch_major >= 2 and torch.cuda.is_available():
            kwargs["optim"] = "adamw_torch_fused"

    # --- Automatic bf16 (only when fp16 is off and compute capability >= 8) ---
    if has("bf16"):
        is_ampere = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
        kwargs["bf16"] = (not kwargs.get("fp16", False)) and is_ampere

    return TrainingArguments(**kwargs)


In [None]:
# Early-stopping callback list: patience of 1 with a 1e-4 improvement threshold.
# NOTE(review): `callbacks` is not passed to any Trainer in the code visible
# here — confirm whether it is still needed.
from transformers import EarlyStoppingCallback
callbacks = [EarlyStoppingCallback(early_stopping_patience=1,
                                   early_stopping_threshold=1e-4)]

In [None]:
# Added: Dataset that slices a pre-encoded dict by row index
class EncodedSubset(Dataset):
    """View over pre-tokenized features restricted to the rows in ``idx``.

    ``labels`` (if given) is indexed with ``idx`` once at construction, so the
    stored labels are aligned with positions in the subset.

    NOTE(review): a later cell redefines ``EncodedSubset`` with different
    label handling; that definition replaces this one once it runs.
    """

    def __init__(self, enc, idx, labels=None):
        self.enc = enc
        self.idx = np.asarray(idx)
        if labels is None:
            self.labels = None
        else:
            self.labels = labels[self.idx]

    def __len__(self):
        return len(self.idx)

    def __getitem__(self, i):
        row = self.idx[i]
        item = {}
        for key, column in self.enc.items():
            item[key] = torch.tensor(column[row])
        if self.labels is not None:
            # Labels were already sliced by idx, so index by subset position.
            item["labels"] = torch.tensor(self.labels[i], dtype=torch.long)
        return item


In [None]:
def maybe_prefix_for_e5(texts, model_name: str):
    """Return ``texts`` as a list of strings, prefixed for e5-family models.

    Non-string entries become the empty string. When ``model_name`` contains
    ``"e5"`` (case-insensitive) every entry is prefixed with ``"passage: "``;
    otherwise the cleaned strings are returned without a prefix.
    """
    use_prefix = "e5" in model_name.lower()
    prepared = []
    for entry in texts:
        if not isinstance(entry, str):
            entry = ""
        prepared.append("passage: " + entry if use_prefix else entry)
    return prepared


In [None]:
# =========================
# helpers & config
# =========================
import os, inspect, gc
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from scipy.special import softmax
from tqdm.auto import tqdm

from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer,
)
from transformers.trainer_utils import EvalPrediction

# Single place to control how progress bars are displayed / suppressed
def _tqdm(seq, **kw):
    """Wrap ``seq`` in a tqdm progress bar.

    Defaults to ``leave=False`` and ``ncols=100``; keywords supplied by the
    caller (including ``leave`` / ``ncols``) take precedence instead of
    raising a duplicate-keyword ``TypeError``.
    """
    kw.setdefault("leave", False)
    kw.setdefault("ncols", 100)
    return tqdm(seq, **kw)

def maybe_prefix_for_e5(texts, model_name):
    """Normalize ``texts`` to strings and add the e5 ``"passage: "`` prefix if needed.

    Non-string entries are replaced by ``""``. The prefix is added only when
    ``model_name`` contains ``"e5"`` (case-insensitive).
    """
    normalized = [entry if isinstance(entry, str) else "" for entry in texts]
    if "e5" not in model_name.lower():
        return normalized
    return ["passage: " + entry for entry in normalized]

# EncodedSubset: Dataset that picks rows out of a pre-tokenized dict (enc) by index
class EncodedSubset(torch.utils.data.Dataset):
    """Subset view over pre-tokenized features.

    ``indices`` selects rows of ``enc_dict``; ``labels`` (or ``None``) is kept
    in full and looked up with the same row index, so it must be indexable by
    the values in ``indices``.
    """

    def __init__(self, enc_dict, indices, labels):
        self.enc = enc_dict
        self.indices = np.array(indices)
        if labels is None:
            self.labels = None
        else:
            self.labels = np.array(labels)

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, i):
        row = self.indices[i]
        item = {}
        for key, column in self.enc.items():
            item[key] = torch.tensor(column[row])
        if self.labels is not None:
            # Labels are stored unsliced, so use the original row index.
            item["labels"] = torch.tensor(self.labels[row], dtype=torch.long)
        return item

# Class-weighted cross-entropy (keeps the intent of the original code)
class WeightedCETrainer(Trainer):
    """``Trainer`` whose loss is cross-entropy with optional per-class weights.

    Args:
        class_weights: ``None``, a tensor, or anything ``torch.tensor`` can
            convert; non-tensors are stored as a float tensor.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is None or torch.is_tensor(class_weights):
            self.class_weights = class_weights
        else:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float)

    def compute_loss(self, model, inputs, return_outputs: bool = False, **kwargs):
        """Compute weighted cross-entropy on the model logits.

        ``labels`` is popped from ``inputs`` before the forward pass. Extra
        keyword arguments are accepted and ignored. Returns ``loss`` or
        ``(loss, outputs)`` depending on ``return_outputs``.
        """
        targets = inputs.pop("labels")
        if targets.dtype != torch.long:
            targets = targets.to(torch.long)

        outputs = model(**inputs)
        if isinstance(outputs, dict):
            logits = outputs["logits"]
        else:
            logits = outputs.logits
        n_classes = logits.size(-1)

        # Move the weights next to the logits; None means unweighted CE.
        if self.class_weights is None:
            weight = None
        else:
            weight = self.class_weights.to(logits.device)

        smoothing = getattr(self.args, "label_smoothing_factor", 0.0)
        criterion = nn.CrossEntropyLoss(
            weight=weight,
            label_smoothing=float(smoothing) if smoothing else 0.0,
        )
        loss = criterion(logits.view(-1, n_classes), targets.view(-1))

        if return_outputs:
            return (loss, outputs)
        return loss

def compute_metrics(eval_pred):
    """Return ``{"f1": ...}`` for binary predictions thresholded at 0.5.

    ``eval_pred`` is either an ``EvalPrediction`` or a ``(logits, labels)``
    pair. The second softmax column is treated as the positive-class
    probability.
    """
    if not isinstance(eval_pred, EvalPrediction):
        logits, labels = eval_pred
    else:
        logits = eval_pred.predictions
        labels = eval_pred.label_ids

    positive_prob = softmax(logits, axis=1)[:, 1]
    predicted = (positive_prob > 0.5).astype(int)
    score = f1_score(labels, predicted)
    return {"f1": score}

# TrainingArguments をここで作る（進捗が見える設定）
import inspect
from transformers import TrainingArguments

def _make_training_args(output_dir,
                        per_device_train_batch_size=16,
                        per_device_eval_batch_size=64,
                        num_train_epochs=1.0,
                        fp16=True, bf16=False, seed=42,
                        learning_rate=2e-5, weight_decay=0.01,
                        gradient_accumulation_steps=1, warmup_ratio=0.06,
                        eval_steps=200, logging_steps=50):
    """
    Build ``TrainingArguments`` while absorbing keyword differences between
    transformers versions: optional keywords are passed only when the
    installed ``TrainingArguments.__init__`` accepts them.

    Args:
        output_dir: Directory passed as ``output_dir``.
        per_device_train_batch_size / per_device_eval_batch_size: Batch sizes.
        num_train_epochs: Number of training epochs.
        fp16 / bf16: Mixed-precision flags, forwarded as given.
        seed: Random seed.
        learning_rate, weight_decay, gradient_accumulation_steps,
        warmup_ratio, eval_steps, logging_steps: Optimisation and logging
            settings; the defaults equal the values that used to be
            hard-coded here.

    Returns:
        A ``TrainingArguments`` instance with step-based evaluation and
        logging, no checkpoint saving and no experiment reporting (wherever
        the corresponding keywords are supported).
    """
    sig = inspect.signature(TrainingArguments.__init__).parameters
    kw = dict(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        gradient_accumulation_steps=gradient_accumulation_steps,
        seed=seed,
        logging_steps=logging_steps,
    )

    # Pass only what this version understands.
    if "fp16" in sig: kw["fp16"] = fp16
    if "bf16" in sig: kw["bf16"] = bf16
    if "warmup_ratio" in sig:
        kw["warmup_ratio"] = warmup_ratio
    elif "warmup_steps" in sig:
        kw["warmup_steps"] = 100  # approximation when warmup_ratio is unavailable

    # Evaluation schedule, newest keyword first:
    #   eval_strategy -> evaluation_strategy -> evaluate_during_training.
    # Without the `eval_strategy` branch, releases that only expose the new
    # name would silently train with no periodic evaluation.
    periodic_eval = False
    eval_key = next(
        (name for name in ("eval_strategy", "evaluation_strategy") if name in sig),
        None,
    )
    if eval_key is not None:
        kw[eval_key] = "steps"
        periodic_eval = True
    elif "evaluate_during_training" in sig:
        kw["evaluate_during_training"] = True
        periodic_eval = True
    if periodic_eval and "eval_steps" in sig:
        kw["eval_steps"] = eval_steps

    if "logging_strategy" in sig: kw["logging_strategy"] = "steps"
    if "save_strategy" in sig:    kw["save_strategy"] = "no"
    if "report_to" in sig:        kw["report_to"] = "none"
    if "label_smoothing_factor" in sig: kw["label_smoothing_factor"] = 0.0
    if "dataloader_num_workers" in sig: kw["dataloader_num_workers"] = 2
    if "disable_tqdm" in sig:     kw["disable_tqdm"] = False
    if "load_best_model_at_end" in sig: kw["load_best_model_at_end"] = False

    # Never forward None values.
    kw = {k: v for k, v in kw.items() if v is not None}
    return TrainingArguments(**kw)

# =========================
# main function (差し替え)
# =========================
def run_one_text_column(text_col: str, max_len: int, model_name: str | None = None):
    """Fine-tune a binary sequence classifier on one text column with stratified K-fold CV.

    For every fold a fresh model is loaded from ``model_name``, trained with
    class-weighted cross-entropy, used to fill the out-of-fold (OOF)
    probabilities of its validation rows, and used to predict the test set.
    Two CSV files are written under ``CFG.out_root/<text_col>__<model tag>/``:
    the OOF probabilities with labels, and the fold-averaged test
    probabilities. OOF F1 at threshold 0.5 and at the best threshold of a
    0.05-0.95 grid is printed at the end.

    Args:
        text_col: Name of the text column to train on (must exist in both CSVs
            after ``prepro``).
        max_len: Maximum token length used for truncation.
        model_name: Hugging Face model id; defaults to ``CFG.model_name``.

    Raises:
        ValueError: If ``text_col`` is missing from the train or test frame.
    """
    if model_name is None:
        model_name = CFG.model_name
    print(f"\n========== RUN for column: {text_col} (max_len={max_len}, model={model_name}) ==========")

    # Output directory
    safe_model_tag = model_name.replace("/", "_").replace("-", "_")
    out_dir = os.path.join(CFG.out_root, f"{text_col}__{safe_model_tag}")
    os.makedirs(out_dir, exist_ok=True)

    # Load and preprocess the data
    df  = pd.read_csv(TRAIN_PATH)
    dft = pd.read_csv(TEST_PATH)
    df = prepro(df)
    dft = prepro(dft)
    if text_col not in df.columns or text_col not in dft.columns:
        raise ValueError(f"列 {text_col} が train/test の両方に存在しません。")
    df[CFG.target_col] = df[CFG.target_col].astype(int)
    print(f"Train size: {len(df)} | Test size: {len(dft)}")

    # Full label vector; fold subsets are taken by index below.
    y_all = df[CFG.target_col].values

    # Batch sizes / epochs from CFG. CFG defines `eval_bs`; the previously used
    # name `valid_bs` does not exist there, so the configured eval batch size
    # was silently ignored. `valid_bs` is still honoured first if someone adds it.
    train_bs = getattr(CFG, "train_bs", 16)
    eval_bs = getattr(CFG, "valid_bs", getattr(CFG, "eval_bs", 64))
    n_epochs = getattr(CFG, "epochs", 1.0)

    # Tokenizer / collator
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

    # Text preparation (e5 models get the 'passage: ' prefix)
    train_texts = maybe_prefix_for_e5(df[text_col].fillna("").astype(str).tolist(), model_name)
    test_texts  = maybe_prefix_for_e5(dft[text_col].fillna("").astype(str).tolist(), model_name)

    # Tokenize in chunks with a progress bar so large inputs show progress.
    def batch_tokenize(texts):
        enc = {"input_ids": [], "attention_mask": []}
        bs = 2048  # CPU-side chunk size
        for i in _tqdm(range(0, len(texts), bs), desc="Tokenizing"):
            batch = texts[i:i+bs]
            out = tokenizer(batch, truncation=True, max_length=max_len, padding=False)
            enc["input_ids"].extend(out["input_ids"])
            enc["attention_mask"].extend(out["attention_mask"])
        return enc

    enc_train = batch_tokenize(train_texts)
    enc_test  = batch_tokenize(test_texts)

    test_ds = EncodedSubset(enc_test, np.arange(len(dft)), None)

    skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)
    oof_prob = np.zeros(len(df), dtype=np.float32)
    test_prob_folds = []

    for fold, (trn_idx, val_idx) in enumerate(skf.split(df, y_all)):
        print(f"\n---- Fold {fold+1}/{CFG.n_splits} ----")

        train_ds = EncodedSubset(enc_train, trn_idx, y_all)
        valid_ds = EncodedSubset(enc_train, val_idx, y_all)

        config = AutoConfig.from_pretrained(model_name)
        config.num_labels = 2

        model  = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
        # model.gradient_checkpointing_enable()  # enable if VRAM is tight

        # Class weights from the training fold only. `classes` is passed as an
        # ndarray, which is the type scikit-learn documents for this parameter.
        cw = compute_class_weight("balanced", classes=np.array([0, 1]), y=y_all[trn_idx])

        # TrainingArguments with visible progress / logging
        args = _make_training_args(
            os.path.join(out_dir, f"fold{fold}"),
            per_device_train_batch_size=train_bs,
            per_device_eval_batch_size=eval_bs,
            num_train_epochs=n_epochs,
            fp16=torch.cuda.is_available(),  # fp16 only when a GPU is present
            bf16=False,
            seed=CFG.seed
        )

        # Trainer takes either `processing_class` or `tokenizer` depending on
        # the transformers version.
        extra_init = {}
        trainer_sig = inspect.signature(Trainer.__init__).parameters
        if "processing_class" in trainer_sig:
            extra_init["processing_class"] = tokenizer
        elif "tokenizer" in trainer_sig:
            extra_init["tokenizer"] = tokenizer

        trainer = WeightedCETrainer(
            class_weights=cw,
            model=model,
            args=args,
            train_dataset=train_ds,
            eval_dataset=valid_ds,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            **extra_init,
        )

        print("Training...")
        trainer.train()

        # OOF predictions for this fold's validation rows
        print("Predict (valid)...")
        logits_val = trainer.predict(valid_ds).predictions
        prob_val = softmax(logits_val, axis=1)[:, 1].astype(np.float32)
        oof_prob[val_idx] = prob_val
        f1_fold = f1_score(y_all[val_idx], (prob_val > 0.5).astype(int))
        # 1-based fold number, matching the fold header printed above.
        print(f"Fold {fold+1} F1 @0.5: {f1_fold:.4f}")

        # Test predictions (one set per fold)
        print("Predict (test)...")
        logits_te = trainer.predict(test_ds).predictions
        prob_te = softmax(logits_te, axis=1)[:, 1].astype(np.float32)
        test_prob_folds.append(prob_te)

        # Release memory before the next fold
        del trainer, model, train_ds, valid_ds, logits_val, logits_te
        torch.cuda.empty_cache(); gc.collect()

    # ------- Save: train (OOF) -------
    df_oof = pd.DataFrame({
        "id": df["id"] if "id" in df.columns else np.arange(len(df)),
        f"oof_prob_{text_col}": oof_prob,
        "label": y_all
    })
    oof_path = os.path.join(out_dir, f"oof_{safe_model_tag}_{text_col}.csv")
    df_oof.to_csv(oof_path, index=False)

    # ------- Save: test (mean over folds) -------
    test_prob = np.mean(np.vstack(test_prob_folds), axis=0).astype(np.float32)
    df_test_pred = pd.DataFrame({
        "id": dft["id"] if "id" in dft.columns else np.arange(len(dft)),
        f"prob_{text_col}": test_prob
    })
    test_path = os.path.join(out_dir, f"test_{safe_model_tag}_{text_col}.csv")
    df_test_pred.to_csv(test_path, index=False)

    # Report: OOF F1 at 0.5 and at the best threshold on a fixed grid
    ths = np.linspace(0.05, 0.95, 37)
    f1s = [f1_score(df_oof["label"], (df_oof[f"oof_prob_{text_col}"] > t).astype(int)) for t in ths]
    best_t = ths[int(np.argmax(f1s))]
    print(f"\n[{text_col}] OOF F1 @0.5 = {f1_score(df_oof['label'], (df_oof[f'oof_prob_{text_col}']>0.5).astype(int)):.4f}")
    print(f"[{text_col}] OOF F1 @best={best_t:.2f} = {max(f1s):.4f}")
    print(f"Saved:\n - {oof_path}\n - {test_path}")


In [None]:
# Fastest option: MiniLM-L6
run_one_text_column("desc", max_len=256, model_name="sentence-transformers/all-MiniLM-L6-v2")

# Fast with good accuracy: GTE small
# run_one_text_column("desc", max_len=256, model_name="thenlper/gte-small")

# Small e5 variant (the 'passage: ' prefix is applied as usual)
# run_one_text_column("desc", max_len=256, model_name="intfloat/e5-small-v2")

# Lightweight DeBERTa
# run_one_text_column("desc", max_len=256, model_name="microsoft/deberta-v3-small")


In [None]:
# 1) all-mpnet-base-v2
# run_one_text_column("desc", max_len=256, model_name="sentence-transformers/all-mpnet-base-v2")

# 2) e5-large-v2（自動で 'passage: ' が前置されます）
# run_one_text_column("desc", max_len=256, model_name="intfloat/e5-large-v2")

# 3) deberta-v3-large
# run_one_text_column("desc", max_len=256, model_name="microsoft/deberta-v3-large")


In [None]:
# run_one_text_column("name", max_len=128, model_name="microsoft/MiniLM-L12-H384-uncased")
# run_one_text_column("desc", max_len=256, model_name="microsoft/deberta-v3-base")