In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# ===== Colab-ready: Code Similarity Cross-Encoder Fine-tuning =====
# - CSV columns: ['code_a_norm','code_b_norm','similar'] (0/1)
# - Pretrained backbone(.pt) 로드 → 이진 판별 헤드
# - AMP, Scheduler, F1기반 threshold 튜닝, Best save
# ---------------------------------------------------------------

import os, math, random, warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

# --------- [1] Paths ----------
PRETRAINED_CKPT = "/content/drive/MyDrive/models/rolebert/rolebert_scratch.pt"
TOKENIZER_FILE  = "/content/drive/MyDrive/models/rolebert/tokenizer.json"
TOKENIZER_DIR   = os.path.dirname(TOKENIZER_FILE)
TRAIN_CSV       = "/content/drive/MyDrive/dacon_preprocess_data/train_pairs_real_final_180000.csv"
OUT_DIR         = "/content/drive/MyDrive/rolebert_finetune_out"
os.makedirs(OUT_DIR, exist_ok=True)

# --------- [2] Basic config ----------
SEED         = 42
MAX_LEN      = 512
BATCH_SIZE   = 16
EPOCHS       = 3
LR_BACKBONE  = 2e-5
LR_HEAD      = 1e-4
WARMUP_RATIO = 0.05
VAL_RATIO    = 0.1
GRAD_ACCUM   = 1
WEIGHT_DECAY = 0.01
USE_AMP      = True
BACKBONE_SKELETON = "bert-base-uncased"   # 구조 스켈레톤(가중치는 .pt에서 주입)

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --------- [3] Tokenizer ----------
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)

# --------- [4] Dataset ----------
REQUIRED_COLS = {"code_a_norm","code_b_norm","similar"}

def load_pairs_csv(path):
    df = pd.read_csv(path)
    cols_lower = {c.lower(): c for c in df.columns}
    # 유연 매핑: 대소문자 혼재 대비
    need = {}
    for k in REQUIRED_COLS:
        if k in cols_lower:
            need[k] = cols_lower[k]
        else:
            raise ValueError(f"CSV에 '{k}' 컬럼이 필요합니다. 현재: {list(df.columns)}")
    df = df.rename(columns={
        need["code_a_norm"]: "code_a_norm",
        need["code_b_norm"]: "code_b_norm",
        need["similar"]:     "similar",
    })
    df["similar"] = df["similar"].astype(int)
    return df

df_all = load_pairs_csv(TRAIN_CSV)

# --------- [5] Split ----------
n_total = len(df_all)
n_val   = max(1, int(n_total * VAL_RATIO))
n_train = n_total - n_val
train_df, val_df = random_split(df_all, [n_train, n_val], generator=torch.Generator().manual_seed(SEED))
train_df = df_all.iloc[train_df.indices].reset_index(drop=True)
val_df   = df_all.iloc[val_df.indices].reset_index(drop=True)

# --------- [6] Datasets / Loaders ----------
class PairDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.df, self.tk, self.max_len = df, tokenizer, max_len
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        a = str(self.df.loc[i, "code_a_norm"])
        b = str(self.df.loc[i, "code_b_norm"])
        y = float(self.df.loc[i, "similar"])
        enc = self.tk(a, b, padding="max_length", truncation=True, max_length=self.max_len, return_tensors="pt")
        # 허용 키만 추려서 예기치 않은 키 전달 방지
        allowed = {k: v.squeeze(0) for k, v in enc.items() if k in ["input_ids","attention_mask","token_type_ids"]}
        allowed["labels"] = torch.tensor(y, dtype=torch.float)
        return allowed

train_ds = PairDataset(train_df, tokenizer, MAX_LEN)
val_ds   = PairDataset(val_df, tokenizer, MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

# --------- [7] Model ----------
class CrossEncoder(nn.Module):
    def __init__(self, backbone_name_or_dir):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(backbone_name_or_dir)
        hidden = self.backbone.config.hidden_size
        self.head = nn.Linear(hidden, 1)
    def forward(self, input_ids, attention_mask, token_type_ids=None, labels=None):
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls = out.last_hidden_state[:, 0]
        logit = self.head(cls).squeeze(-1)
        loss = None
        if labels is not None:
            loss = nn.BCEWithLogitsLoss()(logit, labels)
        return {"loss": loss, "logits": logit}

model = CrossEncoder(BACKBONE_SKELETON)
# 토크나이저 크기 맞추기(필요 시)
try:
    model.backbone.resize_token_embeddings(len(tokenizer))
except Exception:
    pass

# --------- [8] Load pretrained checkpoint (.pt) ----------
ckpt = torch.load(PRETRAINED_CKPT, map_location="cpu")
state_dict = ckpt.get("state_dict", ckpt)
new_sd = model.state_dict()
loaded = 0
for k, v in state_dict.items():
    mk = k[7:] if k.startswith("module.") else k
    if mk in new_sd and isinstance(v, torch.Tensor) and new_sd[mk].shape == v.shape:
        new_sd[mk] = v; loaded += 1
model.load_state_dict(new_sd, strict=False)
print(f"[Checkpoint] Loaded backbone params: {loaded}")

model.to(device)

# --------- [9] Optim / Scheduler ----------
head_params = list(model.head.parameters())
backbone_params = [p for n,p in model.named_parameters() if p.requires_grad and not n.startswith("head.")]
optimizer = AdamW(
    [{"params": backbone_params, "lr": LR_BACKBONE},
     {"params": head_params, "lr": LR_HEAD}],
    weight_decay=WEIGHT_DECAY
)
total_steps = math.ceil(len(train_loader)/GRAD_ACCUM) * EPOCHS
warmup_steps = int(total_steps * WARMUP_RATIO)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)
scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)

# --------- [10] Train & Eval ----------
def sigmoid_np(x): return 1/(1+np.exp(-x))

def evaluate(model, loader, best_thr=None):
    model.eval()
    probs, labels = [], []
    with torch.no_grad():
        for batch in loader:
            # 안전: 허용 키만 모델에 전달
            for k in list(batch.keys()):
                if hasattr(batch[k], "to"): batch[k] = batch[k].to(device)
            inputs = {k: batch[k] for k in ["input_ids","attention_mask","token_type_ids"] if k in batch}
            out = model(**inputs)
            logits = out["logits"].detach().cpu().numpy()
            probs.append(sigmoid_np(logits))
            labels.append(batch["labels"].detach().cpu().numpy())
    probs = np.concatenate(probs)
    labels = np.concatenate(labels).astype(int)
    thr = 0.5 if best_thr is None else best_thr
    if best_thr is None:
        cand = np.linspace(0.05, 0.95, 19)
        f1s = [f1_score(labels, (probs>=t).astype(int)) for t in cand]
        thr = float(cand[int(np.argmax(f1s))])
    preds = (probs >= thr).astype(int)
    return {
        "acc": float(accuracy_score(labels, preds)),
        "f1":  float(f1_score(labels, preds)),
        "auc": float(roc_auc_score(labels, probs)) if len(np.unique(labels))>1 else float("nan"),
        "thr": float(thr),
    }

best_f1, best_thr = -1.0, 0.5
best_state = None

for epoch in range(1, EPOCHS+1):
    model.train()
    running = 0.0
    optimizer.zero_grad(set_to_none=True)
    for step, batch in enumerate(train_loader, 1):
        for k in list(batch.keys()):
            if hasattr(batch[k], "to"): batch[k] = batch[k].to(device)
        inputs = {k: batch[k] for k in ["input_ids","attention_mask","token_type_ids"] if k in batch}
        with torch.cuda.amp.autocast(enabled=USE_AMP):
            out = model(**inputs)
            loss = nn.BCEWithLogitsLoss()(out["logits"], batch["labels"])
        scaler.scale(loss).backward()
        if step % GRAD_ACCUM == 0:
            scaler.step(optimizer); scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()
        running += loss.item()
        if step % 200 == 0:
            print(f"Epoch {epoch} Step {step}/{len(train_loader)} | loss={running/step:.4f}")

    val_metrics = evaluate(model, val_loader, best_thr=None)
    print(f"[Val] epoch={epoch} | acc={val_metrics['acc']:.4f} f1={val_metrics['f1']:.4f} auc={val_metrics['auc']:.4f} thr={val_metrics['thr']:.2f}")

    if val_metrics["f1"] > best_f1:
        best_f1, best_thr = val_metrics["f1"], val_metrics["thr"]
        best_state = {
            "model": model.state_dict(),
            "config": {"max_len": MAX_LEN, "thr": best_thr, "backbone_skeleton": BACKBONE_SKELETON},
        }
        torch.save(best_state, os.path.join(OUT_DIR, "best_crossencoder.pt"))
        # 편의용 HF 저장
        try:
            model.backbone.save_pretrained(os.path.join(OUT_DIR, "hf_backbone"))
            tokenizer.save_pretrained(os.path.join(OUT_DIR, "hf_tokenizer"))
        except Exception:
            pass

print(f"[Done] Best F1={best_f1:.4f}, thr={best_thr:.2f}")

[Checkpoint] Loaded backbone params: 0
Epoch 1 Step 200/10125 | loss=0.6840
Epoch 1 Step 400/10125 | loss=0.6473
Epoch 1 Step 600/10125 | loss=0.5894
Epoch 1 Step 800/10125 | loss=0.5445
Epoch 1 Step 1000/10125 | loss=0.5053
Epoch 1 Step 1200/10125 | loss=0.4748
Epoch 1 Step 1400/10125 | loss=0.4536
Epoch 1 Step 1600/10125 | loss=0.4323
Epoch 1 Step 1800/10125 | loss=0.4142
Epoch 1 Step 2000/10125 | loss=0.3980
Epoch 1 Step 2200/10125 | loss=0.3847
Epoch 1 Step 2400/10125 | loss=0.3744
Epoch 1 Step 2600/10125 | loss=0.3642
Epoch 1 Step 2800/10125 | loss=0.3548
Epoch 1 Step 3000/10125 | loss=0.3461
Epoch 1 Step 3200/10125 | loss=0.3384
Epoch 1 Step 3400/10125 | loss=0.3314
Epoch 1 Step 3600/10125 | loss=0.3238
Epoch 1 Step 3800/10125 | loss=0.3166
Epoch 1 Step 4000/10125 | loss=0.3113
Epoch 1 Step 4200/10125 | loss=0.3064
Epoch 1 Step 4400/10125 | loss=0.3008
Epoch 1 Step 4600/10125 | loss=0.2957
Epoch 1 Step 4800/10125 | loss=0.2913
Epoch 1 Step 5000/10125 | loss=0.2869
Epoch 1 Step 52

In [16]:
# --------- Paths ---------
BEST_PT        = "/content/drive/MyDrive/rolebert_finetune_out/best_crossencoder.pt"  # fine tuning best
TOKENIZER_DIR  = "/content/drive/MyDrive/models/rolebert"
TEST_CSV       = "/content/drive/MyDrive/dacon_preprocess_data/test_norm_final.csv"
OUT_DIR        = "/content/drive/MyDrive/rolebert_finetune_out"
OUT_CSV        = os.path.join(OUT_DIR, "preds_test.csv")
os.makedirs(OUT_DIR, exist_ok=True)

# --------- Model skeleton (학습 때와 동일) ---------
BACKBONE_SKELETON = "bert-base-uncased"

class CrossEncoder(nn.Module):
    def __init__(self, backbone_name_or_dir):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(backbone_name_or_dir)
        hidden = self.backbone.config.hidden_size
        self.head = nn.Linear(hidden, 1)
    def forward(self, input_ids, attention_mask, token_type_ids=None):
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls = out.last_hidden_state[:, 0]
        logit = self.head(cls).squeeze(-1)
        return logit  # (B,)

# --------- Load tokenizer ---------
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)

# --------- Rebuild model & load checkpoint ---------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CrossEncoder(BACKBONE_SKELETON)
try:
    model.backbone.resize_token_embeddings(len(tokenizer))
except Exception:
    pass

ckpt = torch.load(BEST_PT, map_location="cpu")
state_dict = ckpt.get("model", ckpt)  # {'model': state_dict, 'config': {...}} 형태 대비
missing, unexpected = model.load_state_dict(state_dict, strict=False)
print("[Load] missing:", missing, "| unexpected:", unexpected)

cfg = ckpt.get("config", {})
MAX_LEN = int(cfg.get("max_len", 512))
BEST_THR = float(cfg.get("thr", 0.5))
print(f"[Config] max_len={MAX_LEN}, threshold={BEST_THR:.3f}")

model.to(device).eval()

# --------- Load test CSV ---------
def load_test(path):
    df = pd.read_csv(path)
    cols = {c.lower(): c for c in df.columns}
    need = {}
    for k in ["pair_id","code1_norm","code2_norm"]:
        if k in cols: need[k] = cols[k]
        else: raise ValueError(f"테스트 CSV에 '{k}' 컬럼이 필요합니다. 현재 컬럼: {list(df.columns)}")
    df = df.rename(columns={
        need["pair_id"]: "pair_id",
        need["code1_norm"]: "code1_norm",
        need["code2_norm"]: "code2_norm",
    })
    return df

test_df = load_test(TEST_CSV)

# --------- Dataset / DataLoader ---------
class TestDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.df, self.tk, self.max_len = df, tokenizer, max_len
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        a = str(self.df.loc[i, "code1_norm"])
        b = str(self.df.loc[i, "code2_norm"])
        enc = self.tk(a, b, padding="max_length", truncation=True, max_length=self.max_len, return_tensors="pt")
        # 허용 키만 추출
        item = {k: v.squeeze(0) for k, v in enc.items() if k in ["input_ids","attention_mask","token_type_ids"]}
        item["pair_id"] = str(self.df.loc[i, "pair_id"])
        return item

test_ds = TestDataset(test_df, tokenizer, MAX_LEN)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False, num_workers=2, pin_memory=True)

# --------- Inference ---------
all_pair_id, all_prob = [], []
sigmoid = lambda x: 1/(1+np.exp(-x))

with torch.no_grad():
    for batch in test_loader:
        pair_ids = batch.pop("pair_id")
        # to(device)
        for k in list(batch.keys()):
            if hasattr(batch[k], "to"):
                batch[k] = batch[k].to(device)
        # 모델 입력 (허용키만)
        inputs = {k: batch[k] for k in ["input_ids","attention_mask","token_type_ids"] if k in batch}
        logits = model(**inputs)              # (B,)
        probs = torch.sigmoid(logits).cpu().numpy()
        all_pair_id.extend(pair_ids)
        all_prob.extend(probs.tolist())

# --------- Threshold → 0/1 ---------
preds = (np.array(all_prob) >= BEST_THR).astype(int)

# --------- Save CSV: pair_id, similar ---------
out_df = pd.DataFrame({"pair_id": all_pair_id, "similar": preds})
out_df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"[Saved] {OUT_CSV}  rows={len(out_df)}")

[Load] missing: [] | unexpected: []
[Config] max_len=512, threshold=0.200
[Saved] /content/drive/MyDrive/rolebert_finetune_out/preds_test.csv  rows=179700
