In [None]:
%pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install -q transformers datasets accelerate tqdm

SyntaxError: invalid syntax (ipython-input-984273395.py, line 1)

In [None]:
import os
import random
import math
from typing import List, Tuple

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForMaskedLM, get_linear_schedule_with_warmup


In [None]:
# ================================
# 0. Settings (tune the run by editing only this section)
# ================================
MODEL_NAME = "InstaDeepAI/nucleotide-transformer-v2-50m-multi-species"  # name of the DNA language-model checkpoint

TEST_PATH = "test.csv"                  # path to the competition-provided test.csv
SAMPLE_SUB_PATH = "sample_submission.csv"
OUTPUT_PATH = "submission_ntv2_contrastive.csv"

MAX_SEQ_LEN = 512                         # max number of tokens per sequence (keep below the model's max_length)
EMBED_DIM = 512                           # embedding width; must equal the model hidden_size (512 for v2-50m)

DO_TRAIN = True                           # whether to fine-tune before extracting embeddings
MAX_TRAIN_SEQS = 20000                    # max number of test sequences used for fine-tuning
EPOCHS = 1                                # 1-2 epochs recommended on Colab
BATCH_SIZE = 8
LR = 2e-5
WARMUP_RATIO = 0.05

# Mutation levels, expressed as a fraction of the sequence length
MUTATION_LEVELS = [0.002, 0.005, 0.01]    # 0.2%, 0.5%, 1% SNVs: how lightly the DNA is perturbed
BASE_MARGIN = 0.1                         # minimum margin
ALPHA_MARGIN = 5.0                        # how fast the margin grows with the mutation ratio

SEED = 2025


In [None]:
# ================================
# 1. 유틸 함수들
# ================================
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch (CPU and every CUDA device) RNGs with `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


#DNA 일부를 조금 바꿔 모델이 이 차이를 느끼는지 훈련
def mutate_sequence_snvs(seq: str, mutation_ratio: float) -> Tuple[str, int]:
    """
    Build a slightly different variant of a DNA sequence using only random
    SNVs (single nucleotide variants).

    The sequence is upper-cased first. The number of substituted positions is
    ``max(1, int(len(seq) * mutation_ratio))``; if that would cover the whole
    sequence it is reduced to half the length (or 1 for a single character).
    Every chosen position is replaced by a base from A/C/G/T that differs
    from the character currently there.

    Args:
        seq: DNA sequence.
        mutation_ratio: fraction of positions to substitute.

    Returns:
        (mutated_seq, num_mutations). An empty input yields ("", 0).
    """
    bases = ("A", "C", "G", "T")
    seq = seq.upper()
    length = len(seq)

    # Nothing can be substituted in an empty sequence; without this guard
    # random.sample(range(0), 1) raises ValueError.
    if length == 0:
        return seq, 0

    num_mutations = max(1, int(length * mutation_ratio))
    if num_mutations >= length:
        num_mutations = length // 2 if length >= 2 else 1

    seq_list = list(seq)
    for pos in random.sample(range(length), num_mutations):
        original = seq_list[pos]
        # There are four bases, so at least three candidates always remain.
        seq_list[pos] = random.choice([b for b in bases if b != original])

    return "".join(seq_list), num_mutations


#DNA전체를 하나의 숫자벡터로 요약
def mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """
    Average the token vectors along dim 1, ignoring positions whose
    attention-mask entry is 0, to obtain one [batch, hidden] embedding.
    """
    weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    token_sum = torch.sum(last_hidden_state * weights, dim=1)
    # Clamp so a fully masked row divides by a tiny number instead of zero.
    token_count = torch.sum(weights, dim=1).clamp(min=1e-6)
    return token_sum / token_count

#두 벡터가 얼마나 다른지 점수내기: 값이 클수록 좋다
def cosine_distance(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """
    Cosine distance between `a` and `b`, i.e. 1 - cosine similarity.
    """
    similarity = F.cosine_similarity(a, b)
    return 1.0 - similarity

In [None]:
# ================================
# 2. Dataset 정의
# ================================

#“얼마나 바뀌었는지에 따라 점수 차이를 느끼게” 학습
class MutationContrastiveDataset(Dataset):
    """
    Pairs each original sequence with one SNV-mutated variant per mutation
    level (the levels differ only in the SNV ratio).

    Each item is a dict with:
    - "original":     the stored sequence
    - "mutated_list": one mutated sequence per level
    - "num_mut_list": the mutation count reported for each level
    """

    def __init__(self, seq_list: List[str], mutation_levels: List[float]):
        self.seqs = seq_list
        self.mutation_levels = mutation_levels

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        source = self.seqs[idx]
        # A fresh random variant is drawn for every level on every access.
        variants = [mutate_sequence_snvs(source, ratio) for ratio in self.mutation_levels]
        return {
            "original": source,
            "mutated_list": [variant for variant, _ in variants],
            "num_mut_list": [count for _, count in variants],
        }


#DataLoader가 여러 문제를 한 묶음(batch)으로 만들 때 쓰는 함수
'''
DNA 문자열을 **AI가 이해하는 숫자(ID)**로 변환

원본용 토큰

변이 1,2,3용 토큰

“몇 글자 바뀌었는지” 텐서로 만듦
'''
def collate_fn(batch, tokenizer, max_len: int):
    """
    Turn a list of dataset items into model inputs.

    Returns (orig_enc, mutated_tokenized, num_mut_tensor):
    - orig_enc: tokenizer output for the original sequences
    - mutated_tokenized: one tokenizer output per mutation level, each
      covering the whole batch
    - num_mut_tensor: float32 tensor of per-sample, per-level mutation counts
    """

    def encode(texts):
        return tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
        )

    orig_enc = encode([item["original"] for item in batch])

    per_sample_variants = [item["mutated_list"] for item in batch]
    level_count = len(per_sample_variants[0])

    # Tokenize level by level so each level is padded as one batch.
    mutated_tokenized = [
        encode([variants[level] for variants in per_sample_variants])
        for level in range(level_count)
    ]

    num_mut_tensor = torch.tensor(
        [item["num_mut_list"] for item in batch], dtype=torch.float32
    )  # [B, num_levels]

    return orig_enc, mutated_tokenized, num_mut_tensor


In [None]:
# ================================
# 3. 모델 로드
# ================================
def load_glm_model(model_name: str, device: torch.device):
    """Load the tokenizer and masked-LM model for `model_name` and move the model to `device`."""
    hub_options = {"trust_remote_code": True}
    tokenizer = AutoTokenizer.from_pretrained(model_name, **hub_options)
    model = AutoModelForMaskedLM.from_pretrained(model_name, **hub_options)
    model.to(device)
    return tokenizer, model

In [None]:
# ================================
# 4. 파인튜닝 루프
# ================================

#1.원본 DNA 임베딩 만들기, 2.돌연변이 DNA 임베딩 만들기, 3.거리 계산
def train_variant_sensitive_glm(
    model,
    tokenizer,
    train_seqs: List[str],
    device: torch.device,
):
    """
    Fine-tune `model` with a hinge loss that pushes the embedding of each
    SNV-mutated sequence at least `margin` (cosine distance) away from the
    embedding of its original sequence.

    Per batch: embed the originals, embed the mutated variants of every level
    in MUTATION_LEVELS, and average the per-level hinge losses. The margin is
    BASE_MARGIN + ALPHA_MARGIN * (mutation count / attended-token count).

    Args:
        model: model called with output_hidden_states=True; its last hidden
            state is mean-pooled into the embedding.
        tokenizer: tokenizer handed to collate_fn.
        train_seqs: sequences used for fine-tuning.
        device: device the batches are moved to.

    Returns:
        The same `model` object, updated in place and left in train mode.
    """
    dataset = MutationContrastiveDataset(train_seqs, MUTATION_LEVELS)
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, MAX_SEQ_LEN),
    )

    # One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
    num_training_steps = EPOCHS * len(dataloader)
    warmup_steps = int(num_training_steps * WARMUP_RATIO)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps,
    )

    model.train()

    for epoch in range(EPOCHS):
        epoch_loss = 0.0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}")

        for step, (orig_enc, mutated_tokenized, num_mut_tensor) in enumerate(progress_bar):
            optimizer.zero_grad()

            # The counts come off the collate function on CPU; move them next to the masks.
            num_mut_tensor = num_mut_tensor.to(device)

            orig_input_ids = orig_enc["input_ids"].to(device)
            orig_attn_mask = orig_enc["attention_mask"].to(device)

            # Embedding of the original sequences
            outputs_orig = model(
                input_ids=orig_input_ids,
                attention_mask=orig_attn_mask,
                output_hidden_states=True,
            )
            last_hidden_orig = outputs_orig.hidden_states[-1]  # [B, L, H]
            emb_orig = mean_pool(last_hidden_orig, orig_attn_mask)  # [B, H] embedding of the original DNA

            # Per-level embeddings and loss
            batch_size = orig_input_ids.size(0)  # not used below
            num_levels = len(mutated_tokenized)

            loss_all_levels = 0.0

            # Convert the integer mutation count into a ratio to size the margin.
            # NOTE(review): the denominator is the number of attended tokens, while the
            # numerator counts mutated characters; if the tokenizer emits multi-base
            # tokens this is not a per-base ratio. Confirm this is intended.
            seq_len_est = orig_attn_mask.sum(dim=1, keepdim=True)  # [B, 1]
            mut_ratio = num_mut_tensor / seq_len_est  # [B, num_levels]

            for level_idx in range(num_levels):
                enc_level = mutated_tokenized[level_idx]
                m_input_ids = enc_level["input_ids"].to(device)
                m_attn_mask = enc_level["attention_mask"].to(device)

                outputs_mut = model(
                    input_ids=m_input_ids,
                    attention_mask=m_attn_mask,
                    output_hidden_states=True,
                )
                last_hidden_mut = outputs_mut.hidden_states[-1]
                emb_mut = mean_pool(last_hidden_mut, m_attn_mask)  # [B, H] embedding of the mutated DNA

                dist = cosine_distance(emb_orig, emb_mut)  # [B] distance between original and variant

                # margin: BASE_MARGIN + ALPHA * mutation_ratio
                margin = BASE_MARGIN + ALPHA_MARGIN * mut_ratio[:, level_idx]  # more mutations -> larger required distance
                margin = margin.to(device)

                # hinge loss: zero once dist >= margin
                loss_level = F.relu(margin - dist).mean()
                loss_all_levels = loss_all_levels + loss_level

            loss = loss_all_levels / num_levels
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            epoch_loss += loss.item()
            progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

        print(f"[Epoch {epoch+1}] mean loss = {epoch_loss / len(dataloader):.4f}")

    return model

In [None]:
# ================================
# 5. test.csv 전체 임베딩 추출
# ================================
class TestSeqDataset(Dataset):
    """Dataset of (ID, sequence) pairs read from the "ID" and "seq" columns of a DataFrame."""

    def __init__(self, df: pd.DataFrame):
        id_column = df["ID"]
        seq_column = df["seq"].astype(str)  # every sequence is stored as str
        self.ids = id_column.tolist()
        self.seqs = seq_column.tolist()

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        sample_id = self.ids[idx]
        sequence = self.seqs[idx]
        return sample_id, sequence


def collate_test(batch, tokenizer, max_len: int):
    """Split (id, seq) pairs into an id list and one padded, truncated tokenizer output."""
    ids = [sample[0] for sample in batch]
    texts = [sample[1] for sample in batch]
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_len,
        "return_tensors": "pt",
    }
    return ids, tokenizer(texts, **tokenizer_options)


def extract_embeddings(
    model,
    tokenizer,
    test_df: pd.DataFrame,
    device: torch.device,
) -> pd.DataFrame:
    """
    Embed every row of `test_df` with `model` and return a submission frame.

    The model is switched to eval mode, each batch's last hidden state is
    mean-pooled, and the rows are re-ordered to follow the ID column of the
    CSV at SAMPLE_SUB_PATH. The result has an "ID" column followed by
    EMBED_DIM columns named emb_0000, emb_0001, ...
    """
    model.eval()
    loader = DataLoader(
        TestSeqDataset(test_df),
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=lambda batch: collate_test(batch, tokenizer, MAX_SEQ_LEN),
    )

    seen_ids = []
    emb_chunks = []

    with torch.no_grad():
        for batch_ids, enc in tqdm(loader, desc="Extracting embeddings"):
            input_ids = enc["input_ids"].to(device)
            attn_mask = enc["attention_mask"].to(device)

            hidden_states = model(
                input_ids=input_ids,
                attention_mask=attn_mask,
                output_hidden_states=True,
            ).hidden_states
            pooled = mean_pool(hidden_states[-1], attn_mask)  # [B, H]

            emb_chunks.append(pooled.detach().cpu().numpy())
            seen_ids.extend(batch_ids)

    all_embs = np.vstack(emb_chunks)  # [N, EMBED_DIM]

    # Re-order the rows to match the ID order of the sample submission file.
    sub = pd.read_csv(SAMPLE_SUB_PATH)
    row_of_id = {sample_id: row for row, sample_id in enumerate(seen_ids)}
    ordered_embs = np.zeros((len(sub), EMBED_DIM), dtype=np.float32)
    for out_row, sample_id in enumerate(sub["ID"].tolist()):
        ordered_embs[out_row] = all_embs[row_of_id[sample_id]]

    emb_df = pd.DataFrame(
        ordered_embs,
        columns=[f"emb_{i:04d}" for i in range(EMBED_DIM)],
    )
    return pd.concat([sub[["ID"]], emb_df], axis=1)


In [None]:
# ================================
# 6. 메인 실행부
# ================================
def main():
    """Run the pipeline: seed, load data and model, optionally fine-tune, extract embeddings, save the CSV."""
    set_seed(SEED)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)

    # Load the data
    test_df = pd.read_csv(TEST_PATH)
    print("test shape:", test_df.shape)

    # Load the genomic language model
    tokenizer, model = load_glm_model(MODEL_NAME, device)

    # ---------- fine-tuning ----------
    if DO_TRAIN:
        # Shuffle the unique sequences and keep at most MAX_TRAIN_SEQS of them to save time.
        uniq_seqs = test_df["seq"].astype(str).unique().tolist()
        random.shuffle(uniq_seqs)
        train_seqs = uniq_seqs[:MAX_TRAIN_SEQS]

        print(f"Train sequences: {len(train_seqs)}")
        model = train_variant_sensitive_glm(
            model=model,
            tokenizer=tokenizer,
            train_seqs=train_seqs,
            device=device,
        )

    # ---------- embedding extraction ----------
    submission_df = extract_embeddings(
        model=model,
        tokenizer=tokenizer,
        test_df=test_df,
        device=device,
    )

    # Save
    submission_df.to_csv(OUTPUT_PATH, index=False)
    print("Saved submission to:", OUTPUT_PATH)


if __name__ == "__main__":
    main()

Device: cuda
test shape: (13711, 2)
Train sequences: 13711


Epoch 1/1:   0%|          | 0/1714 [00:00<?, ?it/s]

[Epoch 1] mean loss = 0.0651


Extracting embeddings:   0%|          | 0/1714 [00:00<?, ?it/s]

Saved submission to: submission_ntv2_contrastive.csv


#코드 개선

In [None]:
import os
import random
import math
from typing import List, Tuple

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel, get_cosine_schedule_with_warmup


In [None]:
# ================================
# 0. Settings
# ================================
MODEL_NAME = "InstaDeepAI/nucleotide-transformer-v2-50m-multi-species"

TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"
OUTPUT_PATH = "submission_v2_fixed.csv"

MAX_SEQ_LEN = 512  # used both as the character crop length and as the tokenizer max_length
EMBED_DIM = 512   # final embedding dimension

DO_TRAIN = True
MAX_TRAIN_SEQS = 20000
EPOCHS = 1
BATCH_SIZE = 8
LR = 2e-5
WARMUP_RATIO = 0.05

MUTATION_LEVELS = [0.002, 0.005, 0.01]
BASE_MARGIN = 0.1
ALPHA_MARGIN = 5.0

SEED = 2025



In [None]:
# ================================
# 1. 유틸
# ================================
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch (CPU and every CUDA device) RNGs with `seed`."""
    for apply_seed in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        apply_seed(seed)


# ---- Center Crop ----
def center_crop(seq, max_len):
    """Return the middle `max_len` items of `seq`, or `seq` itself when it is not longer than `max_len`."""
    overflow = len(seq) - max_len
    if overflow <= 0:
        return seq
    left = overflow // 2  # any odd leftover item is dropped from the right side
    return seq[left:left + max_len]


# ---- Reverse Complement ----
def reverse_complement(seq):
    """Reverse `seq` and swap A<->T and C<->G; every other character is kept as is."""
    complement_table = str.maketrans("ATCG", "TAGC")
    return seq.translate(complement_table)[::-1]


# ---- SNV ----
def mutate_snvs(seq: str, mutation_ratio: float) -> Tuple[str, int]:
    """
    Substitute random positions of a DNA sequence with a different base.

    The sequence is upper-cased first. The number of substituted positions is
    ``max(1, int(len(seq) * mutation_ratio))``, capped at the sequence length
    so an oversized ratio no longer makes random.sample raise.

    Args:
        seq: DNA sequence.
        mutation_ratio: fraction of positions to substitute.

    Returns:
        (mutated_seq, num_mutations). An empty input yields ("", 0).
    """
    bases = ("A", "C", "G", "T")
    seq = seq.upper()
    length = len(seq)

    # random.sample cannot draw from an empty range.
    if length == 0:
        return seq, 0

    num_mut = min(length, max(1, int(length * mutation_ratio)))

    seq_list = list(seq)
    for pos in random.sample(range(length), num_mut):
        orig = seq_list[pos]
        seq_list[pos] = random.choice([b for b in bases if b != orig])

    return "".join(seq_list), num_mut


# ---- Deletion ----
def mutate_delete(seq: str, ratio: float):
    """
    Delete ``max(1, int(len(seq) * ratio))`` randomly chosen characters,
    stopping early so that at least one character is always left.
    """
    chars = list(seq)
    pending = max(1, int(len(chars) * ratio))
    while pending > 0 and len(chars) > 1:
        chars.pop(random.randrange(len(chars)))
        pending -= 1
    return "".join(chars)


# ---- Insertion ----
def mutate_insert(seq: str, ratio: float):
    """
    Insert ``max(1, int(len(seq) * ratio))`` random bases (A/C/G/T) at random
    positions of `seq`.

    Positions are drawn from 0..len inclusive, so a base can also be appended
    at the very end, and an empty input no longer makes randrange raise (it
    receives a single random base).

    Args:
        seq: input sequence (not upper-cased here).
        ratio: fraction of the original length to insert.

    Returns:
        The sequence with the inserted bases.
    """
    bases = ["A", "C", "G", "T"]
    chars = list(seq)
    # The count is fixed from the original length, before the sequence grows.
    k = max(1, int(len(chars) * ratio))
    for _ in range(k):
        pos = random.randrange(len(chars) + 1)
        chars.insert(pos, random.choice(bases))
    return "".join(chars)


# ---- Advanced Pooling: mean + max concat ----
def advanced_pool(last_hidden, attention_mask):
    """
    Concatenate masked mean pooling and masked max pooling over dim 1.

    Positions whose attention-mask entry is 0 are left out of the mean and
    replaced by -1e9 before taking the max.
    """
    weights = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
    kept = last_hidden * weights

    mean_part = kept.sum(dim=1) / weights.sum(dim=1).clamp(min=1e-6)
    max_part, _ = kept.masked_fill(weights == 0, -1e9).max(dim=1)

    return torch.cat([mean_part, max_part], dim=1)   # [B, hidden*2]


# ---- Cosine distance ----
def cosine_distance(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Cosine distance between `a` and `b`, i.e. 1 - cosine similarity."""
    similarity = F.cosine_similarity(a, b)
    return 1.0 - similarity



In [None]:
# ================================
# 2. Dataset
# ================================
class MutationContrastiveDataset(Dataset):
    """
    Pairs each centre-cropped sequence with four augmented views per mutation
    level: SNV substitutions, deletions, insertions and the reverse complement.

    Each item is a dict with:
    - "orig":      the cropped sequence
    - "mut":       [level][view] list of augmented sequences
    - "mut_count": [level][view] list with the edit count attributed to each view
    """

    def __init__(self, seq_list: List[str], mutation_levels: List[float]):
        self.seqs = seq_list
        self.mutation_levels = mutation_levels

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        seq = center_crop(self.seqs[idx], MAX_SEQ_LEN)

        # The reverse complement does not depend on the level: compute it once.
        rev_comp = reverse_complement(seq)

        mutated = []
        mut_counts = []

        for r in self.mutation_levels:
            m1, c1 = mutate_snvs(seq, r)
            m2 = mutate_delete(seq, r)
            m3 = mutate_insert(seq, r)

            # mutate_delete / mutate_insert perform max(1, int(len * r)) edits,
            # so report that number instead of the raw float len * r.
            indel_count = max(1, int(len(seq) * r))

            mutated.append([m1, m2, m3, rev_comp])
            # The reverse complement is counted as a change of the whole sequence.
            mut_counts.append([c1, indel_count, indel_count, len(seq)])

        return {"orig": seq, "mut": mutated, "mut_count": mut_counts}


def collate_fn(batch, tokenizer):
    """
    Tokenize a batch of contrastive items.

    Returns (orig_enc, mut_token, mut_counts), where mut_token[level][view] is
    the tokenizer output for that level/view across the whole batch and
    mut_counts is a float32 tensor built from the items' "mut_count" lists.
    """

    def encode(texts):
        return tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=MAX_SEQ_LEN,
            return_tensors="pt"
        )

    orig_enc = encode([sample["orig"] for sample in batch])

    variants = [sample["mut"] for sample in batch]      # indexed [sample][level][view]
    mut_counts = torch.tensor(
        [sample["mut_count"] for sample in batch], dtype=torch.float32
    )

    level_count = len(variants[0])
    view_count = len(variants[0][0])

    # Each (level, view) pair is tokenized, and therefore padded, as its own batch.
    mut_token = [
        [
            encode([per_sample[level][view] for per_sample in variants])
            for view in range(view_count)
        ]
        for level in range(level_count)
    ]

    return orig_enc, mut_token, mut_counts


In [None]:
# ================================
# 3. Model Loader (Backbone + External Projection Head)
# ================================
def load_glm_model(model_name, device):
    """
    Load the tokenizer, the backbone and a fresh projection head.

    The backbone is loaded through AutoModelForMaskedLM with
    trust_remote_code=True, the same way the other pipelines in this notebook
    load this checkpoint. The previous AutoModel + ignore_mismatched_sizes=True
    call produced the "Some weights of EsmModel were not initialized ...
    newly initialized" warning in the recorded run, i.e. part of the encoder
    started from random weights. forward_embedding only reads
    `hidden_states[-1]`, which the masked-LM output also carries.

    Args:
        model_name: Hugging Face model identifier.
        device: device the backbone and head are moved to.

    Returns:
        (tokenizer, backbone, proj_head); proj_head maps the concatenated
        mean+max pooled vector (2 * hidden_size) to EMBED_DIM.
    """
    # Local import: this section's import cell only brings in AutoModel.
    from transformers import AutoModelForMaskedLM

    tokenizer = AutoTokenizer.from_pretrained(
        model_name, trust_remote_code=True
    )

    backbone = AutoModelForMaskedLM.from_pretrained(
        model_name,
        trust_remote_code=True,
    ).to(device)

    # Read the width from the config instead of assuming a value.
    hidden = backbone.config.hidden_size

    # The head's output width must equal EMBED_DIM because extract_embeddings
    # allocates EMBED_DIM columns per row.
    proj_head = nn.Sequential(
        nn.Linear(hidden * 2, EMBED_DIM),
        nn.ReLU(),
        nn.Linear(EMBED_DIM, EMBED_DIM)
    ).to(device)

    return tokenizer, backbone, proj_head


# ---- Forward Embedding ----
def forward_embedding(backbone, proj_head, input_ids, attn_mask):
    """Run the backbone, pool its last hidden state, project it and L2-normalise the result along dim 1."""
    hidden_states = backbone(
        input_ids=input_ids,
        attention_mask=attn_mask,
        output_hidden_states=True,
    ).hidden_states

    pooled = advanced_pool(hidden_states[-1], attn_mask)
    projected = proj_head(pooled)
    return F.normalize(projected, dim=1)


In [None]:
# ================================
# 4. Training (Contrastive v2)
# ================================
def train_variant_sensitive_glm(backbone, proj_head, tokenizer, seqs, device):
    """
    Jointly fine-tune `backbone` and `proj_head` with a hinge loss that pushes
    every augmented view at least `margin` (cosine distance) away from the
    embedding of its original sequence.

    The margin for a view is
    BASE_MARGIN + ALPHA_MARGIN * log1p(5 * count / attended-token count),
    and the batch loss is the mean over all level/view pairs.

    Args:
        backbone: encoder passed to forward_embedding.
        proj_head: projection head passed to forward_embedding.
        tokenizer: tokenizer handed to collate_fn.
        seqs: training sequences.
        device: device the batches are moved to.

    Returns:
        (backbone, proj_head): the same objects, updated in place and left in train mode.
    """

    dataset = MutationContrastiveDataset(seqs, MUTATION_LEVELS)
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=lambda b: collate_fn(b, tokenizer)
    )

    # One scheduler step is taken per batch.
    num_steps = EPOCHS * len(dataloader)
    warmup_steps = int(num_steps * WARMUP_RATIO)

    # Train both the backbone and the head
    params = list(backbone.parameters()) + list(proj_head.parameters())
    optimizer = torch.optim.AdamW(params, lr=LR)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, warmup_steps, num_steps
    )

    backbone.train()
    proj_head.train()

    for epoch in range(EPOCHS):

        total_loss = 0.0

        for orig_enc, mut_token, mut_counts in tqdm(dataloader):

            optimizer.zero_grad()

            orig_ids = orig_enc["input_ids"].to(device)
            orig_mask = orig_enc["attention_mask"].to(device)

            emb_orig = forward_embedding(backbone, proj_head, orig_ids, orig_mask)

            mut_counts = mut_counts.to(device)
            # B = batch size, L = number of mutation levels, A = augmented views per level
            B, L, A = mut_counts.shape
            # NOTE(review): the denominator is the number of attended tokens, while the
            # counts are expressed in characters; confirm this ratio is the intended one.
            seq_len_est = orig_mask.sum(dim=1).unsqueeze(1)

            loss_sum = 0.0

            for i in range(L):
                for j in range(A):
                    enc = mut_token[i][j]
                    m_ids = enc["input_ids"].to(device)
                    m_mask = enc["attention_mask"].to(device)

                    emb_mut = forward_embedding(backbone, proj_head, m_ids, m_mask)
                    dist = cosine_distance(emb_orig, emb_mut)

                    mut_ratio = mut_counts[:, i, j] / seq_len_est.squeeze(1)
                    # NOTE(review): cosine distance is at most 2, so a margin above 2
                    # (possible for large counts) keeps this hinge term permanently
                    # active. Confirm that is intended.
                    margin = BASE_MARGIN + ALPHA_MARGIN * torch.log1p(5 * mut_ratio)
                    loss_level = F.relu(margin - dist).mean()

                    loss_sum += loss_level

            loss = loss_sum / (L * A)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(params, 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        print(f"[Epoch {epoch+1}] Loss = {total_loss / len(dataloader):.4f}")

    return backbone, proj_head


In [None]:
# ================================
# 5. Embedding Extraction
# ================================
class TestSeqDataset(Dataset):
    """Dataset of (ID, centre-cropped sequence) pairs read from the "ID" and "seq" columns."""

    def __init__(self, df):
        id_column = df["ID"]
        seq_column = df["seq"].astype(str)
        self.ids = id_column.tolist()
        self.seqs = seq_column.tolist()

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        # Crop at read time so the stored sequences stay untouched.
        cropped = center_crop(self.seqs[idx], MAX_SEQ_LEN)
        return self.ids[idx], cropped


def collate_test(batch, tokenizer):
    """Split (id, seq) pairs into an id list and one padded, truncated tokenizer output."""
    ids = [sample[0] for sample in batch]
    texts = [sample[1] for sample in batch]

    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": MAX_SEQ_LEN,
        "return_tensors": "pt",
    }

    return ids, tokenizer(texts, **tokenizer_options)


def extract_embeddings(backbone, proj_head, tokenizer, df, device):
    """
    Embed every row of `df` and return a submission frame.

    Both modules are switched to eval mode. The rows follow the ID order of
    the CSV at SAMPLE_SUB_PATH, and the result has an "ID" column followed by
    EMBED_DIM columns named emb_0000, emb_0001, ...
    """
    backbone.eval()
    proj_head.eval()

    loader = DataLoader(
        TestSeqDataset(df),
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=lambda b: collate_test(b, tokenizer)
    )

    seen_ids = []
    emb_chunks = []

    with torch.no_grad():
        for batch_ids, enc in tqdm(loader):
            input_ids = enc["input_ids"].to(device)
            attn_mask = enc["attention_mask"].to(device)

            batch_emb = forward_embedding(backbone, proj_head, input_ids, attn_mask)

            seen_ids.extend(batch_ids)
            emb_chunks.append(batch_emb.cpu().numpy())

    all_embs = np.vstack(emb_chunks)

    # Re-order the rows to match the ID order of the sample submission file.
    sub = pd.read_csv(SAMPLE_SUB_PATH)
    row_of_id = {sample_id: row for row, sample_id in enumerate(seen_ids)}

    ordered = np.zeros((len(sub), EMBED_DIM), dtype=np.float32)
    for out_row, sample_id in enumerate(sub["ID"].tolist()):
        ordered[out_row] = all_embs[row_of_id[sample_id]]

    emb_df = pd.DataFrame(
        ordered,
        columns=[f"emb_{i:04d}" for i in range(EMBED_DIM)],
    )
    return pd.concat([sub[["ID"]], emb_df], axis=1)

In [None]:
# ================================
# 6. Main
# ================================
def main():
    """Run the pipeline: seed, load data and model, optionally fine-tune, extract embeddings, save the CSV."""
    set_seed(SEED)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("Device:", device)

    test_df = pd.read_csv(TEST_PATH)
    tokenizer, backbone, proj_head = load_glm_model(MODEL_NAME, device)

    if DO_TRAIN:
        # Shuffle the unique sequences and keep at most MAX_TRAIN_SEQS of them.
        candidates = test_df["seq"].astype(str).unique().tolist()
        random.shuffle(candidates)
        backbone, proj_head = train_variant_sensitive_glm(
            backbone, proj_head, tokenizer, candidates[:MAX_TRAIN_SEQS], device
        )

    submission_df = extract_embeddings(
        backbone, proj_head, tokenizer, test_df, device
    )
    submission_df.to_csv(OUTPUT_PATH, index=False)
    print("Saved:", OUTPUT_PATH)


if __name__ == "__main__":
    main()

Device: cuda


Some weights of EsmModel were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-v2-50m-multi-species and are newly initialized: ['encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.output.dense.bias', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.output.dense.bias', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.10.output.dense.bias', 'encoder.layer.11.intermediate.dense.bias', 'encoder.layer.11.output.dense.bias', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.output.dense.bias', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.output.dense.bias', 'encoder.layer.4.intermediate.dense.bias', 'encoder.layer.4.output.dense.bias', 'encoder.layer.5.intermediate.dense.bias', 'encoder.layer.5.output.dense.bias', 'encoder.layer.6.intermediate.dense.bias', 'encoder.layer.6.output.dense.bias', 'encoder.layer.7.intermediate.dense.bias', 'encoder.layer.7.output.dense.bias', 'encoder.layer.8.intermediate.dense.b

  0%|          | 0/1714 [00:00<?, ?it/s]

[Epoch 1] Loss = 4.3056


  0%|          | 0/1714 [00:00<?, ?it/s]

Saved: submission_v2_fixed.csv


#너무 실행시간이 길어서 다른코드

In [None]:
pip install -q transformers tqdm

In [None]:
import os
import random
from typing import List, Tuple

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForMaskedLM


In [None]:
# ================================
# 0. Settings (edit only this section if needed)
# ================================
MODEL_NAME = "InstaDeepAI/nucleotide-transformer-v2-50m-multi-species"

TEST_PATH = "test.csv"                  # path to the competition-provided test.csv
SAMPLE_SUB_PATH = "sample_submission.csv"
OUTPUT_PATH = "submission_ntv2_fast.csv"

MAX_SEQ_LEN = 256                         # max number of tokens (reduced from 512 for speed)
BATCH_SIZE = 16                           # can be raised to 32 if the GPU has room
SEED = 2025


In [None]:
# ================================
# 1. 유틸 함수들
# ================================
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch (CPU and every CUDA device) RNGs with `seed`."""
    for apply_seed in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        apply_seed(seed)


def mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """
    Average the token vectors along dim 1, ignoring positions whose
    attention-mask entry is 0, to obtain one [batch, hidden] embedding.
    """
    weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    token_sum = torch.sum(last_hidden_state * weights, dim=1)
    # Clamp so a fully masked row divides by a tiny number instead of zero.
    token_count = torch.sum(weights, dim=1).clamp(min=1e-6)
    return token_sum / token_count


In [None]:
# ================================
# 2. Dataset & collate
# ================================
class TestSeqDataset(Dataset):
    """Dataset of (ID, sequence) pairs read from the "ID" and "seq" columns of a DataFrame."""

    def __init__(self, df: pd.DataFrame):
        id_column = df["ID"]
        seq_column = df["seq"].astype(str)  # every sequence is stored as str
        self.ids = id_column.tolist()
        self.seqs = seq_column.tolist()

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        sample_id = self.ids[idx]
        sequence = self.seqs[idx]
        return sample_id, sequence


def collate_test(batch, tokenizer, max_len: int):
    """Split (id, seq) pairs into an id list and one padded, truncated tokenizer output."""
    ids = [sample[0] for sample in batch]
    texts = [sample[1] for sample in batch]
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_len,
        "return_tensors": "pt",
    }
    return ids, tokenizer(texts, **tokenizer_options)


In [None]:
# ================================
# 3. 모델 로드
# ================================
def load_glm_model(model_name: str, device: torch.device):
    """Load the tokenizer and masked-LM model, move the model to `device` and put it in eval mode."""
    hub_options = {"trust_remote_code": True}
    tokenizer = AutoTokenizer.from_pretrained(model_name, **hub_options)

    # To load in fp16 (GPU with spare memory), torch_dtype=torch.float16 can be
    # passed to from_pretrained below.
    model = AutoModelForMaskedLM.from_pretrained(model_name, **hub_options)
    model.to(device)
    model.eval()
    return tokenizer, model


In [None]:
# ================================
# 4. 임베딩 추출
# ================================
def extract_embeddings(
    model,
    tokenizer,
    test_df: pd.DataFrame,
    sample_sub: pd.DataFrame,
    device: torch.device,
) -> pd.DataFrame:
    """
    Embed every row of `test_df` by mean-pooling the model's last hidden
    state, and return the rows in the ID order of `sample_sub`.

    The result has an "ID" column followed by `model.config.hidden_size`
    columns named emb_0000, emb_0001, ...
    """
    loader = DataLoader(
        TestSeqDataset(test_df),
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=lambda batch: collate_test(batch, tokenizer, MAX_SEQ_LEN),
    )

    seen_ids = []
    emb_chunks = []

    hidden_size = model.config.hidden_size  # embedding width is taken from the model config
    print("hidden_size (embed_dim):", hidden_size)

    with torch.no_grad():
        for batch_ids, enc in tqdm(loader, desc="Extracting embeddings"):
            input_ids = enc["input_ids"].to(device)
            attn_mask = enc["attention_mask"].to(device)

            hidden_states = model(
                input_ids=input_ids,
                attention_mask=attn_mask,
                output_hidden_states=True,
            ).hidden_states
            pooled = mean_pool(hidden_states[-1], attn_mask)  # [B, H]

            emb_chunks.append(pooled.detach().cpu().numpy().astype(np.float32))
            seen_ids.extend(batch_ids)

    all_embs = np.vstack(emb_chunks)  # [N, H]

    # Re-order the rows to match the ID order of sample_sub.
    row_of_id = {sample_id: row for row, sample_id in enumerate(seen_ids)}
    ordered_embs = np.zeros((len(sample_sub), hidden_size), dtype=np.float32)
    for out_row, sample_id in enumerate(sample_sub["ID"].tolist()):
        ordered_embs[out_row] = all_embs[row_of_id[sample_id]]

    emb_df = pd.DataFrame(
        ordered_embs,
        columns=[f"emb_{i:04d}" for i in range(hidden_size)],
    )
    return pd.concat([sample_sub[["ID"]], emb_df], axis=1)


In [None]:
# ================================
# 5. main
# ================================
def main():
    """Run the inference-only pipeline: seed, load data and model, extract embeddings, save the CSV."""
    set_seed(SEED)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("Device:", device)

    test_df = pd.read_csv(TEST_PATH)
    sample_sub = pd.read_csv(SAMPLE_SUB_PATH)
    print("test shape:", test_df.shape)

    tokenizer, model = load_glm_model(MODEL_NAME, device)

    extraction_args = {
        "model": model,
        "tokenizer": tokenizer,
        "test_df": test_df,
        "sample_sub": sample_sub,
        "device": device,
    }
    submission_df = extract_embeddings(**extraction_args)

    submission_df.to_csv(OUTPUT_PATH, index=False)
    print("Saved submission to:", OUTPUT_PATH)


if __name__ == "__main__":
    main()

Device: cpu
test shape: (13711, 2)
hidden_size (embed_dim): 512


Extracting embeddings:   0%|          | 0/857 [00:00<?, ?it/s]

Saved submission to: ./submission_ntv2_fast.csv


#제미나이 코드

In [None]:
# -*- coding: utf-8 -*-

# =========================================================
# 1. (Colab일 때만) 필요한 라이브러리 설치
#    - 이미 설치되어 있으면 이 셀은 건너뛰어도 됩니다.
# =========================================================
# !pip install transformers datasets accelerate pandas numpy torch tqdm


In [None]:
# =========================================================
# 2. 라이브러리 임포트
# =========================================================
import os
import random
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
# ---------------------------------------------------------
# Device selection (use the GPU when one is available)
# ---------------------------------------------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"사용 장치: {DEVICE}")

DATA_PATH = Path("./")  # base directory expected to hold test.csv and sample_submission.csv


사용 장치: cuda


In [None]:
# =========================================================
# 3. 재현성 위한 시드 고정
# =========================================================
def set_seed(seed: int = 2025):
    """Seed the Python, NumPy and PyTorch (CPU and every CUDA device) RNGs with `seed`."""
    for apply_seed in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        apply_seed(seed)

In [None]:
# =========================================================
# 4. 데이터셋 클래스
#    - 개별 샘플은 토큰화하지 않고 seq 문자열만 넘기고,
#      collate_fn에서 한 번에 배치 토큰화 -> 더 빠름
# =========================================================
class SequenceDataset(Dataset):
    """Map-style dataset over the "ID" and "seq" columns of a DataFrame.

    Items are returned untokenized as ``{"id": ..., "seq": ...}`` so that
    tokenization can be performed once per batch by the consumer.
    """

    def __init__(self, data: pd.DataFrame):
        id_column, seq_column = data["ID"], data["seq"]
        self.ids = id_column.tolist()
        self.sequences = seq_column.tolist()

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx: int) -> dict:
        return dict(id=self.ids[idx], seq=self.sequences[idx])

In [None]:
# =========================================================
# 5. Pooling 함수 (Mean + Max Concatenation)
#    - pad 토큰은 mask로 제거
# =========================================================
def concat_mean_max_pooling(
    last_hidden_state: torch.Tensor,  # [B, L, H]
    attention_mask: torch.Tensor,     # [B, L]
) -> torch.Tensor:
    """Pool token states into one vector per sequence: concat(mean, max).

    Positions whose attention-mask entry is 0 are excluded from both the
    mean and the max. Returns a tensor of shape [B, 2H].
    """
    pad_positions = attention_mask.unsqueeze(-1) == 0        # [B, L, 1], True at masked tokens
    keep = attention_mask.unsqueeze(-1).float()              # [B, L, 1], 1.0 at kept tokens

    # Masked mean: zero out masked tokens, then divide by the kept-token count
    # (clamped so an all-masked row does not divide by zero).
    n_tokens = keep.sum(dim=1).clamp(min=1e-6)               # [B, 1]
    mean_emb = (last_hidden_state * keep).sum(dim=1) / n_tokens

    # Masked max: push masked tokens to the dtype's lowest value so they never win.
    floor = torch.finfo(last_hidden_state.dtype).min
    max_emb = last_hidden_state.masked_fill(pad_positions, floor).max(dim=1).values

    return torch.cat([mean_emb, max_emb], dim=1)             # [B, 2H]


In [None]:
# =========================================================
# 6. 임베딩 추출 함수
# =========================================================
def get_embeddings(
    model: AutoModelForMaskedLM,
    tokenizer: AutoTokenizer,
    dataloader: DataLoader,
    max_len: int,
    device: torch.device,
) -> Tuple[np.ndarray, List[str]]:
    """Compute one pooled embedding per sequence yielded by `dataloader`.

    Args:
        model: masked-LM model; it is called with ``output_hidden_states=True``
            and the last entry of ``hidden_states`` is pooled.
        tokenizer: tokenizer applied to each batch of raw sequences.
        dataloader: yields dict batches with keys ``"id"`` and ``"seq"``.
        max_len: truncation length (in tokens) passed to the tokenizer.
        device: device the token tensors are moved to before the forward pass.

    Returns:
        ``(embeddings, ids)`` where ``embeddings`` is a float32 array of shape
        [N, 2H] (mean + max pooling concatenated) and ``ids`` lists the batch
        IDs in iteration order.

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    model.eval()
    all_embs = []
    all_ids = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Calculating Embeddings"):
            ids = batch["id"]
            seqs = batch["seq"]

            # Tokenize the whole batch at once. Pad only to the longest sequence
            # in the batch: padded positions are masked out of the pooling, so
            # padding every batch up to max_len would only add wasted compute.
            tokenized = tokenizer(
                seqs,
                padding=True,
                truncation=True,
                max_length=max_len,
                return_tensors="pt",
            )
            input_ids = tokenized["input_ids"].to(device)
            attention_mask = tokenized["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True,
                return_dict=True,
            )

            # The masked-LM output has no `last_hidden_state` field, so take the
            # final entry of the hidden_states tuple instead.  [B, L, H]
            last_hidden = outputs.hidden_states[-1]

            # Mean + max pooling, concatenated.  [B, 2H]
            seq_emb = concat_mean_max_pooling(last_hidden, attention_mask)

            all_embs.append(seq_emb.cpu())
            all_ids.extend(ids)

    if not all_embs:
        raise ValueError("dataloader yielded no batches; nothing to embed")

    emb_np = torch.vstack(all_embs).float().numpy()
    N, H = emb_np.shape
    print(f"✅ 최종 임베딩 차원: {N} x {H}")
    return emb_np, all_ids



In [None]:
# =========================================================
# 7. 메인 실행 블록
# =========================================================
def main():
    """Embed every sequence of test.csv with NT-v2 and write the submission CSV.

    Both the input (test.csv) and the output file are resolved against
    DATA_PATH, so changing that one constant relocates the whole run.
    """
    set_seed(2025)

    # ---------- Hyperparameters ----------
    # v2-50m: hidden_size=512, so mean+max concatenation yields 1024 dims
    MODEL_NAME = "InstaDeepAI/nucleotide-transformer-v2-50m-multi-species"
    MAX_LENGTH = 256      # token truncation length; tune (e.g. 256-512) to the sequence lengths
    BATCH_SIZE = 32       # reduce (e.g. to 16) if GPU memory is tight
    SUBMISSION_FILE_NAME = "submission_ntv2_concat_pooling.csv"

    # ---------- Load data ----------
    # Read through DATA_PATH (not the bare file name) so input and output
    # locations stay consistent.
    test_df = pd.read_csv(DATA_PATH / "test.csv")
    print(f"테스트 데이터셋 크기: {len(test_df)}")

    # ---------- Load model / tokenizer ----------
    print("모델 및 토크나이저 로드 시작...")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
    )
    model = AutoModelForMaskedLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        # Optional: pass torch_dtype=torch.float16 here for half precision on GPU.
    )
    model.to(DEVICE)
    print("모델 로드 완료.")

    # ---------- Dataset / loader ----------
    test_dataset = SequenceDataset(test_df)
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,      # keep the test.csv row order
        num_workers=2,      # 0-2 is a reasonable range on Colab CPUs
        pin_memory=DEVICE.type == "cuda",
    )

    # ---------- Extract embeddings ----------
    emb_np, all_ids = get_embeddings(
        model=model,
        tokenizer=tokenizer,
        dataloader=test_dataloader,
        max_len=MAX_LENGTH,
        device=DEVICE,
    )

    # ---------- Build the submission ----------
    H = emb_np.shape[1]
    emb_cols = [f"emb_{i:04d}" for i in range(H)]
    emb_df = pd.DataFrame(emb_np, columns=emb_cols)

    # Attach the IDs to their embeddings.
    tmp_df = pd.DataFrame({"ID": all_ids})
    tmp_df = pd.concat([tmp_df, emb_df], axis=1)

    # Left-merge on ID so the output follows the ID order of test_df.
    submission_df = test_df[["ID"]].merge(tmp_df, on="ID", how="left")

    submission_df.to_csv(DATA_PATH / SUBMISSION_FILE_NAME, index=False, encoding="utf-8")
    print(f"🎉 제출 파일 생성 완료: {SUBMISSION_FILE_NAME}")
    print(submission_df.head())


if __name__ == "__main__":
    main()

테스트 데이터셋 크기: 13711
모델 및 토크나이저 로드 시작...
모델 로드 완료.


Calculating Embeddings:   0%|          | 0/429 [00:00<?, ?it/s]

✅ 최종 임베딩 차원: 13711 x 1024
🎉 제출 파일 생성 완료: submission_ntv2_concat_pooling.csv
            ID  emb_0000  emb_0001  emb_0002  emb_0003  emb_0004  emb_0005  \
0  TEST_000000 -0.076687  0.532987  0.250104  0.095763  0.156266  0.105156   
1  TEST_000001 -0.085685  0.429889  0.162488  0.018770  0.180672  0.030552   
2  TEST_000002 -0.132982  0.442900  0.154394  0.054666  0.087602  0.050151   
3  TEST_000003 -0.067654  0.335053  0.164019  0.050935  0.122121  0.061331   
4  TEST_000004  0.001792  0.429978  0.129435  0.022114  0.160501  0.128668   

   emb_0006  emb_0007  emb_0008  ...  emb_1014  emb_1015  emb_1016  emb_1017  \
0 -0.098269  0.237808 -0.060832  ...  1.130631  0.956810  1.064700  0.892267   
1 -0.173395  0.071261  0.031437  ...  1.194993  1.177694  1.312644  0.752649   
2 -0.168084  0.469866  0.008924  ...  1.287498  1.098501  1.016127  0.657049   
3 -0.241115  0.446619  0.027432  ...  1.207277  0.906364  0.979558  0.993381   
4 -0.150665  0.328996 -0.111540  ...  0.912517  1.12070

# 첫번째 코드 개선 다른버전

In [None]:
# -*- coding: utf-8 -*-
# 구글 코랩에서 실행 가능하며, `test.csv`와 `sample_submission.csv`가 현재 디렉토리에 있어야 합니다.
# ----------------------------------------------------------------------
# 1. 필수 라이브러리 설치 (Colab용)
# ----------------------------------------------------------------------
!pip install transformers datasets accelerate pandas numpy torch tqdm

# ----------------------------------------------------------------------
# 2. 라이브러리 임포트 및 유틸리티
# ----------------------------------------------------------------------
import os
import random
import math
from typing import List, Tuple

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup



In [None]:
# ================================
# 0. Configuration (hyperparameter tuning area)
# ================================
# Use the v2-50m model
MODEL_NAME = "InstaDeepAI/nucleotide-transformer-v2-50m-multi-species"

TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"
OUTPUT_PATH = "submission_ntv2_boosted_contrastive.csv"

MAX_SEQ_LEN = 512                                # maximum token length passed to the tokenizer
BASE_HIDDEN_DIM = 512                            # base hidden_size of the v2-50m model
EMBED_DIM = BASE_HIDDEN_DIM * 2                  # mean + max pooling concatenated -> 1024 dims

DO_TRAIN = True                                  # whether to fine-tune (author recommends True for the competition)
MAX_TRAIN_SEQS = 20000                           # max number of test sequences used for training (speed trade-off)
EPOCHS = 1                                       # kept small for the Colab environment
BATCH_SIZE = 8
LR = 1e-5                                        # learning rate (author's suggested range: 1e-5 ~ 3e-5)
WARMUP_RATIO = 0.05

# Triplet contrastive learning settings
# Mut_A: small mutation (acts as the positive), Mut_B: large mutation (acts as the negative)
MUTATION_LEVEL_A = 0.002                         # 0.2% SNV (small mutation)
MUTATION_LEVEL_B = 0.01                          # 1.0% SNV (large mutation)

# Triplet loss objective: dist(Anchor, Positive) + margin < dist(Anchor, Negative)
# Margin used by training: TRIPLET_MARGIN + ALPHA_MARGIN_SCALE * (MUTATION_LEVEL_B - MUTATION_LEVEL_A)
TRIPLET_MARGIN = 0.2
ALPHA_MARGIN_SCALE = 1.0 # scale of the mutation-gap term in the margin (intended by the author to improve PCC)

SEED = 2025

In [None]:
# ================================
# 1. 유틸 함수들
# ================================
def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch (CPU + CUDA) random generators with `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

def mutate_sequence_snvs(seq: str, mutation_ratio: float) -> Tuple[str, int]:
    """
    Build a slightly different variant of a DNA sequence using random SNVs
    (single nucleotide variants) only.

    The sequence is upper-cased, then ``max(1, int(len(seq) * mutation_ratio))``
    distinct positions are drawn at random and each one is replaced by a
    different base from A/C/G/T. If that count would reach the sequence
    length, it is capped at half the length (or 1 for a single-base sequence).

    Args:
        seq: DNA sequence; characters outside A/C/G/T are replaced by a random
            base when their position is selected.
        mutation_ratio: fraction of positions to mutate.

    Returns:
        ``(mutated_sequence, num_mutations)``. An empty input returns
        ``("", 0)`` instead of failing while sampling positions.
    """
    bases = ["A", "C", "G", "T"]
    seq = seq.upper()
    length = len(seq)

    # Nothing to mutate; random.sample would raise on an empty range.
    if length == 0:
        return seq, 0

    # Guarantee at least one mutation.
    num_mutations = max(1, int(length * mutation_ratio))
    if num_mutations >= length:
        num_mutations = length // 2 if length >= 2 else 1

    seq_list = list(seq)
    for pos in random.sample(range(length), num_mutations):
        original = seq_list[pos]
        # At least three candidates always remain, so the choice never fails.
        seq_list[pos] = random.choice([b for b in bases if b != original])

    return "".join(seq_list), num_mutations

def get_pooled_embedding(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """
    Concatenate mean pooling and max pooling over the token axis.
    [B, L, H] -> [B, 2*H]

    Positions whose attention-mask entry is 0 are excluded from both pools.

    Args:
        last_hidden_state: token representations, [B, L, H].
        attention_mask: [B, L], non-zero at tokens to keep.

    Returns:
        Tensor of shape [B, 2*H]: mean embedding followed by max embedding.
    """
    pad_positions = attention_mask.unsqueeze(-1) == 0          # [B, L, 1]
    mask = attention_mask.unsqueeze(-1).float()                # broadcasts over H

    # 1. Mean pooling (padding excluded); the clamp avoids division by zero
    #    for a row with no kept tokens.
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-6)
    mean_emb = summed / counts                                 # [B, H]

    # 2. Max pooling (padding excluded). Fill with the lowest value the
    #    tensor's own dtype can represent: a fixed -1e9 is out of range for
    #    float16 hidden states.
    fill_value = torch.finfo(last_hidden_state.dtype).min
    max_emb = last_hidden_state.masked_fill(pad_positions, fill_value).max(dim=1).values  # [B, H]

    # 3. Concatenate
    return torch.cat((mean_emb, max_emb), dim=1)               # [B, 2*H]

def cosine_distance(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """
    Cosine distance between paired rows of `a` and `b`: 1 - cosine similarity.
    """
    similarity = F.cosine_similarity(a, b)
    return 1.0 - similarity

In [None]:
# ================================
# 2. Dataset 정의 (Triplet Loss 구조)
# ================================
class MutationContrastiveDataset(Dataset):
    """
    Produces (anchor, positive, negative) triplets from raw sequences.

    The anchor is the stored sequence, the positive is a variant mutated at
    the small ratio (MUTATION_LEVEL_A), and the negative is a variant mutated
    at the large ratio (MUTATION_LEVEL_B). Variants are regenerated on every
    access.
    """

    def __init__(self, seq_list: List[str]):
        self.seqs = seq_list
        # Small mutation ratio -> positive sample (should stay close to the anchor)
        self.mutation_ratio_P = MUTATION_LEVEL_A
        # Large mutation ratio -> negative sample (should end up farther away)
        self.mutation_ratio_N = MUTATION_LEVEL_B

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, idx):
        anchor = self.seqs[idx]

        # Positive is generated before the negative, as in the original order
        # of random draws.
        positive, positive_count = mutate_sequence_snvs(anchor, self.mutation_ratio_P)
        negative, negative_count = mutate_sequence_snvs(anchor, self.mutation_ratio_N)

        return dict(
            anchor=anchor,
            positive=positive,
            negative=negative,
            num_mut_P=positive_count,
            num_mut_N=negative_count,
        )

def collate_fn(batch, tokenizer, max_len: int):
    """Tokenize a batch of triplets in a single tokenizer call.

    Args:
        batch: list of dicts with keys "anchor", "positive", "negative",
            "num_mut_P" and "num_mut_N".
        tokenizer: callable applied once to anchors + positives + negatives.
        max_len: truncation length passed to the tokenizer.

    Returns:
        ``(enc_A, enc_P, enc_N, num_mut_tensor)``: three dicts holding the
        tokenizer outputs sliced per role, and a float32 tensor of shape
        [B, 2] with the (positive, negative) mutation counts per sample.
    """
    def column(key):
        return [item[key] for item in batch]

    anchors = column("anchor")
    positives = column("positive")
    negatives = column("negative")
    counts_P = column("num_mut_P")
    counts_N = column("num_mut_N")

    # One tokenizer call for all three roles so they share the same padding.
    enc = tokenizer(
        anchors + positives + negatives,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )

    B = len(anchors)
    role_spans = (slice(0, B), slice(B, 2 * B), slice(2 * B, None))
    enc_A, enc_P, enc_N = (
        {key: value[span] for key, value in enc.items()} for span in role_spans
    )

    num_mut_tensor = torch.tensor([counts_P, counts_N], dtype=torch.float32).T  # [B, 2]

    return enc_A, enc_P, enc_N, num_mut_tensor

In [None]:
# ================================
# 3. 모델 아키텍처 및 로드 (Dropout 추가)
# ================================
class VariantSensitiveGLM(nn.Module):
    """
    Pretrained genomic language model followed by dropout and mean+max pooling.

    The backbone is loaded with ``AutoModelForMaskedLM`` and the final entry
    of ``hidden_states`` is pooled. Loading through ``AutoModel`` with
    ``ignore_mismatched_sizes=True`` is avoided on purpose: the run log of
    that variant reports encoder weights "newly initialized", i.e. parts of
    the pretrained checkpoint were silently replaced by random values.

    Args:
        model_name: Hugging Face model identifier of the backbone.
        hidden_dropout_prob: dropout probability applied to the last hidden
            states before pooling (active in train mode only).
    """

    def __init__(self, model_name: str, hidden_dropout_prob: float = 0.1):
        super().__init__()
        # Local import: the surrounding cell's import line only brings in AutoModel.
        from transformers import AutoModelForMaskedLM

        # No ignore_mismatched_sizes here: a shape mismatch must fail loudly
        # instead of leaving randomly initialised layers in the encoder.
        self.base_model = AutoModelForMaskedLM.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        self.dropout = nn.Dropout(hidden_dropout_prob)

    def forward(self, input_ids, attention_mask):
        """Return pooled sequence embeddings of shape [B, 2*H]."""
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )

        # The masked-LM output exposes no `last_hidden_state`; the last element
        # of the hidden_states tuple is the final layer's token representation.
        last_hidden_state = self.dropout(outputs.hidden_states[-1])

        # Concatenated mean + max pooling over non-padding tokens.
        return get_pooled_embedding(last_hidden_state, attention_mask)

def load_glm_model(model_name: str, device: torch.device):
    """Load the tokenizer and a VariantSensitiveGLM placed on `device`.

    Returns:
        ``(tokenizer, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = VariantSensitiveGLM(model_name).to(device)
    return tokenizer, model

In [None]:
# ================================
# 4. 파인튜닝 루프 (Triplet Loss 적용)
# ================================
def train_variant_sensitive_glm(
    model: VariantSensitiveGLM,
    tokenizer: AutoTokenizer,
    train_seqs: List[str],
    device: torch.device,
):
    """Fine-tune `model` with a triplet margin loss on cosine distances.

    For every sequence the dataset supplies an anchor, a lightly mutated
    positive and a more heavily mutated negative; the loss is
    ``relu(dist(A, P) - dist(A, N) + margin)`` averaged over the batch.

    Args:
        model: network returning pooled embeddings for (input_ids, attention_mask).
        tokenizer: tokenizer used by the collate function.
        train_seqs: raw sequences to build triplets from.
        device: device the batches and the margin tensor are moved to.

    Returns:
        The same `model` object after training (left in train mode).
    """
    dataset = MutationContrastiveDataset(train_seqs)
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=lambda batch: collate_fn(batch, tokenizer, MAX_SEQ_LEN),
    )

    # Linear warmup over the first WARMUP_RATIO share of all optimizer steps.
    num_training_steps = EPOCHS * len(dataloader)
    warmup_steps = int(num_training_steps * WARMUP_RATIO)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_training_steps,
    )

    model.train()

    # Margin = TRIPLET_MARGIN plus the mutation-ratio gap scaled by ALPHA_MARGIN_SCALE.
    # Despite its name it is computed once from module constants, so it is the
    # same value for every batch.
    dynamic_margin = TRIPLET_MARGIN + ALPHA_MARGIN_SCALE * (MUTATION_LEVEL_B - MUTATION_LEVEL_A)
    dynamic_margin = torch.tensor(dynamic_margin, dtype=torch.float32).to(device)
    print(f"Triplet Loss Margin: {dynamic_margin.item():.4f}")


    for epoch in range(EPOCHS):
        epoch_loss = 0.0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}")

        # NOTE(review): num_mut_tensor is unpacked but not used in this loop.
        for step, (enc_A, enc_P, enc_N, num_mut_tensor) in enumerate(progress_bar):
            optimizer.zero_grad()

            # 1. Anchor / positive / negative embeddings (pooling happens inside
            #    the model); each is [B, 2*H].
            emb_A = model(enc_A["input_ids"].to(device), enc_A["attention_mask"].to(device))
            emb_P = model(enc_P["input_ids"].to(device), enc_P["attention_mask"].to(device))
            emb_N = model(enc_N["input_ids"].to(device), enc_N["attention_mask"].to(device))

            # 2. Cosine distances
            dist_AP = cosine_distance(emb_A, emb_P) # anchor-positive distance (should be small) [B]
            dist_AN = cosine_distance(emb_A, emb_N) # anchor-negative distance (should be large) [B]

            # 3. Triplet margin loss: max(0, dist(A, P) - dist(A, N) + margin).
            #    The goal is dist(A, P) < dist(A, N), i.e. a negative difference;
            #    the margin is the precomputed constant from above.
            loss_triplet = F.relu(dist_AP - dist_AN + dynamic_margin)
            loss = loss_triplet.mean()

            loss.backward()

            # Clip the global gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            epoch_loss += loss.item()
            progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

        print(f"[Epoch {epoch+1}] mean loss = {epoch_loss / len(dataloader):.4f}")

    return model

In [None]:
# ================================
# 5. test.csv 전체 임베딩 추출 (추론)
# ================================
class TestSeqDataset(Dataset):
    """Dataset of (ID, sequence) pairs read from the "ID" and "seq" columns.

    Sequences are converted to `str` on construction.
    """

    def __init__(self, df: pd.DataFrame):
        seq_column = df["seq"].astype(str)
        self.ids = df["ID"].tolist()
        self.seqs = seq_column.tolist()

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        sample_id = self.ids[idx]
        sequence = self.seqs[idx]
        return sample_id, sequence


def collate_test(batch, tokenizer, max_len: int):
    """Split a batch of (id, sequence) items and tokenize the sequences together.

    Returns:
        ``(ids, enc)``: the list of IDs and the tokenizer output for the
        sequences (padded, truncated to `max_len`, PyTorch tensors requested).
    """
    ids, seqs = [], []
    for sample in batch:
        ids.append(sample[0])
        seqs.append(sample[1])

    tokenizer_options = dict(
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt",
    )
    return ids, tokenizer(seqs, **tokenizer_options)


def extract_embeddings(
    model: VariantSensitiveGLM,
    tokenizer: AutoTokenizer,
    test_df: pd.DataFrame,
    device: torch.device,
) -> pd.DataFrame:
    """Embed every row of `test_df` and return a submission-shaped DataFrame.

    Args:
        model: network returning pooled embeddings for (input_ids, attention_mask).
        tokenizer: tokenizer used to encode the "seq" column.
        test_df: frame with "ID" and "seq" columns.
        device: device the token tensors are moved to.

    Returns:
        DataFrame with an "ID" column followed by ``emb_0000 ... emb_{D-1}``
        float32 columns, one row per row of `test_df`, in the same order. D is
        the width the model actually produced.

    Raises:
        ValueError: from ``np.vstack`` when `test_df` has no rows.
        RuntimeError: if the IDs seen during inference do not match the "ID"
            column row for row.
    """
    model.eval()
    dataset = TestSeqDataset(test_df)
    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE * 4,  # larger batches are fine without gradients
        shuffle=False,              # keeps the row order of test_df
        collate_fn=lambda batch: collate_test(batch, tokenizer, MAX_SEQ_LEN),
    )

    all_ids = []
    all_embs = []

    with torch.no_grad():
        for ids, enc in tqdm(dataloader, desc="Extracting embeddings"):
            input_ids = enc["input_ids"].to(device)
            attn_mask = enc["attention_mask"].to(device)

            # The model's forward already returns the pooled [B, 2*H] embedding.
            emb = model(input_ids, attn_mask)

            all_ids.extend(ids)
            all_embs.append(emb.detach().cpu().numpy())

    all_embs = np.vstack(all_embs).astype(np.float32, copy=False)  # [N, D]

    # With shuffle=False the embeddings are already in test_df order, so no
    # ID -> index remapping is needed (a dict remap would also assign the wrong
    # row to duplicate IDs). Verify that assumption instead of trusting it.
    if all_ids != test_df["ID"].tolist():
        raise RuntimeError("embedding order does not match the ID column of test_df")

    # NOTE: sample_submission.csv is not read here. According to the original
    # author's notes it has 768 embedding columns while this model emits 1024,
    # so the columns are generated from the real embedding width.
    emb_cols = [f"emb_{i:04d}" for i in range(all_embs.shape[1])]
    emb_df = pd.DataFrame(all_embs, columns=emb_cols)

    # Align positionally, regardless of the index test_df carries.
    id_df = test_df[["ID"]].reset_index(drop=True)
    out_df = pd.concat([id_df, emb_df], axis=1)

    return out_df

# ================================
# 6. 메인 실행부
# ================================
def main():
    """Run the full pipeline: optional contrastive fine-tuning on the test
    sequences, then embedding extraction and CSV export to OUTPUT_PATH."""
    set_seed(SEED)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"Device: {device}")
    print(f"Final Embedding Dimension: {EMBED_DIM}")

    # 1. Load the data
    test_df = pd.read_csv(TEST_PATH)
    print("test shape:", test_df.shape)

    # 2. Load the tokenizer and the model
    tokenizer, model = load_glm_model(MODEL_NAME, device)

    # 3. Optional self-supervised contrastive fine-tuning
    if DO_TRAIN:
        # Unique test sequences in random order, capped at MAX_TRAIN_SEQS
        candidate_seqs = test_df["seq"].astype(str).unique().tolist()
        random.shuffle(candidate_seqs)
        if len(candidate_seqs) > MAX_TRAIN_SEQS:
            train_seqs = candidate_seqs[:MAX_TRAIN_SEQS]
        else:
            train_seqs = candidate_seqs

        print(f"Train sequences (randomly selected from test.csv): {len(train_seqs)}")
        model = train_variant_sensitive_glm(model, tokenizer, train_seqs, device)

    # 4. Extract embeddings for every test row
    submission_df = extract_embeddings(model, tokenizer, test_df, device)

    # 5. Save
    submission_df.to_csv(OUTPUT_PATH, index=False)
    print("Saved submission to:", OUTPUT_PATH)
    print("\n--- Final Submission Head ---")
    print(submission_df.head())


if __name__ == "__main__":
    main()

Device: cuda
Final Embedding Dimension: 1024
test shape: (13711, 2)


Some weights of EsmModel were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-v2-50m-multi-species and are newly initialized: ['encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.output.dense.bias', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.output.dense.bias', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.10.output.dense.bias', 'encoder.layer.11.intermediate.dense.bias', 'encoder.layer.11.output.dense.bias', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.output.dense.bias', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.output.dense.bias', 'encoder.layer.4.intermediate.dense.bias', 'encoder.layer.4.output.dense.bias', 'encoder.layer.5.intermediate.dense.bias', 'encoder.layer.5.output.dense.bias', 'encoder.layer.6.intermediate.dense.bias', 'encoder.layer.6.output.dense.bias', 'encoder.layer.7.intermediate.dense.bias', 'encoder.layer.7.output.dense.bias', 'encoder.layer.8.intermediate.dense.b

Train sequences (randomly selected from test.csv): 13711
Triplet Loss Margin: 0.2080


Epoch 1/1:   0%|          | 0/1714 [00:00<?, ?it/s]

[Epoch 1] mean loss = 0.1646


Extracting embeddings:   0%|          | 0/429 [00:00<?, ?it/s]

Saved submission to: submission_ntv2_boosted_contrastive.csv

--- Final Submission Head ---
            ID  emb_0000  emb_0001  emb_0002  emb_0003  emb_0004  emb_0005  \
0  TEST_000000 -1.655428  0.013831  0.865508 -0.918311  0.052765  0.464090   
1  TEST_000001  0.136888  0.475598 -0.155153 -0.598040  1.100501  0.013453   
2  TEST_000002 -0.767427 -0.067005  0.392438 -0.826294  1.395029 -0.028359   
3  TEST_000003  0.877122  0.195837 -0.759369  0.170704 -0.235813  0.337687   
4  TEST_000004 -1.185780 -0.695641 -0.033980 -1.010404 -0.552789 -0.045639   

   emb_0006  emb_0007  emb_0008  ...  emb_1014  emb_1015  emb_1016  emb_1017  \
0  0.202034  0.604572 -1.006131  ...  0.395031 -0.450418  0.206423  0.715348   
1 -0.505302  0.812696 -0.676294  ... -0.174828 -0.475667  0.503319  0.863121   
2 -0.618389  1.069441 -1.302056  ...  0.388670 -0.910601  0.411169  0.892094   
3 -0.148771  0.692563 -0.118860  ...  0.615766  0.861263  0.070031  0.233345   
4  1.093248  0.117325 -0.514936  ...  0