<a href="https://colab.research.google.com/github/Hanbin-git/DNA/blob/main/high_score_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U bitsandbytes peft accelerate transformers datasets




In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# 압축풀기
import zipfile
import os

zip_path = "/content/drive/MyDrive/DNA/open.zip"
extract_dir = "/content/open"

os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("✅ 압축 해제 완료!")
print("압축 풀린 파일 목록:", os.listdir(extract_dir))

✅ 압축 해제 완료!
압축 풀린 파일 목록: ['test.csv', 'sample_submission.csv']


In [3]:
# 데이터 불러오기
import pandas as pd

data_dir = "/content/open"

test_df = pd.read_csv(os.path.join(data_dir, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"))

print("✅ test_df:", test_df.shape)
print("✅ sub_df:", sub_df.shape)
test_df.head()


✅ test_df: (13711, 2)
✅ sub_df: (13711, 769)


Unnamed: 0,ID,seq
0,TEST_000000,ATCATTTTTATTTTTTAGTTTTATGAGACGCTGCCTTGCTATGTCA...
1,TEST_000001,CGACGTCCCCGTAGCGGCCGAAGTCGAGGGGCAGCAGGCGATCGTG...
2,TEST_000002,GGTAGTAAGAAGGAAAATGACAGCATGGAAGCAGCAATACCAGTAA...
3,TEST_000003,CAGCGCATATACTCAGGGCCATGGTGGGTACTGTTCCCATGGCCAG...
4,TEST_000004,TTCATAATTGCTATCAGTCTATGGGCTAATATTTTATACATCAATG...


In [None]:
# ============================================
#  제2회 MAI 대회 최종 제출 Inference (Stage2 LoRA 우선) - 고GPU 활용 버전
#  Strategy: Weighted last4 + Multi-crop(view-batch) + Reverse complement
#            + LayerNorm + PCA Whitening
#  Notes:
#    - 샘플×뷰 동시 배치로 forward → GPU 사용률↑
#    - FP16(권장, VRAM 여유시) or 8bit(메모리 절약)
#    - TF32, cuDNN benchmark, AMP 적용
# ============================================

import os, numpy as np, pandas as pd, torch
from tqdm import tqdm
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModelForMaskedLM, BitsAndBytesConfig

# (옵션) PEFT 로더: LoRA가 있을 때 사용
try:
    from peft import AutoPeftModelForMaskedLM
    HAS_PEFT = True
except Exception:
    HAS_PEFT = False

# -----------------------------
# 0) 환경설정
# -----------------------------
SEED = 42
np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("✅ Device:", DEVICE)

# GPU 커널/정밀도 최적화
if DEVICE == "cuda":
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
try:
    torch.set_float32_matmul_precision("high")
except Exception:
    pass

def _amp_dtype():
    if DEVICE == "cuda" and torch.cuda.is_available():
        cap = torch.cuda.get_device_capability()
        return torch.bfloat16 if cap[0] >= 8 else torch.float16  # Ampere+ → bf16
    return None

AMP_DTYPE = _amp_dtype()

# -----------------------------
# 1) 경로/하이퍼파라미터
# -----------------------------
data_path = "/content/open"  # test.csv, sample_submission.csv
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
STAGE2_CKPT = "/content/stage2_contrastive_lora8bit"  # LoRA 경로(없으면 폴백)

# === 핵심 성능 파라미터 ===
USE_8BIT   = False   # ✅ VRAM 여유되면 False로 두고 FP16 추론 권장(속도↑/점유율↑)
BATCH_SEQ  = 32      # 한 번에 처리할 샘플 수 (VRAM 따라 16~64 튜닝)
N_VIEWS    = 4       # per strand views (원본/역상보 각각)
MAX_LEN    = 1024    # crop 길이 (512→1024로 늘리면 계산량↑)
N_PCA      = 512     # 최종 출력 차원(≤2048)

LAYER_WEIGHTS = torch.tensor([0.1, 0.2, 0.3, 0.4])  # 마지막 4개 레이어 가중치

# -----------------------------
# 2) 데이터 로드
# -----------------------------
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))
print("✅ Loaded:", test_df.shape, sub_df.shape)

# -----------------------------
# 3) reverse complement
# -----------------------------
def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

# -----------------------------
# 4) 모델/토크나이저 로드 (Stage2 우선)
# -----------------------------
if USE_8BIT:
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    if os.path.isdir(STAGE2_CKPT) and HAS_PEFT:
        print("✅ Loading Stage2 LoRA (8bit):", STAGE2_CKPT)
        tokenizer = AutoTokenizer.from_pretrained(STAGE2_CKPT, trust_remote_code=True)
        model = AutoPeftModelForMaskedLM.from_pretrained(
            STAGE2_CKPT, trust_remote_code=True,
            quantization_config=bnb_config, device_map={"": 0}
        )
    else:
        print("⚠️ Stage2 ckpt not found or PEFT unavailable. Using base (8bit).")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
        model = AutoModelForMaskedLM.from_pretrained(
            MODEL_ID, trust_remote_code=True,
            quantization_config=bnb_config, device_map={"": 0}
        )
else:
    # FP16 로드 (권장)
    if os.path.isdir(STAGE2_CKPT) and HAS_PEFT:
        print("✅ Loading Stage2 LoRA (FP16):", STAGE2_CKPT)
        tokenizer = AutoTokenizer.from_pretrained(STAGE2_CKPT, trust_remote_code=True)
        model = AutoPeftModelForMaskedLM.from_pretrained(
            STAGE2_CKPT, trust_remote_code=True,
            torch_dtype=torch.float16, device_map={"": 0}
        )
    else:
        print("⚠️ Stage2 ckpt not found or PEFT unavailable. Using base (FP16).")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
        model = AutoModelForMaskedLM.from_pretrained(
            MODEL_ID, trust_remote_code=True,
            torch_dtype=torch.float16, device_map={"": 0}
        )

model.eval(); model.config.use_cache = False
LAYER_WEIGHTS = LAYER_WEIGHTS.to(model.device)

# -----------------------------
# 5) 뷰 생성 유틸
# -----------------------------
def make_crops(seq, n_views, max_len):
    out = []
    for s in (seq, reverse_complement(seq)):
        L = len(s)
        for _ in range(n_views):
            if L <= max_len:
                out.append(s)
            else:
                st = np.random.randint(0, L - max_len + 1)
                out.append(s[st:st+max_len])
    return out  # len = 2*n_views

# -----------------------------
# 6) 샘플×뷰 동시 배치 임베딩
# -----------------------------
from contextlib import nullcontext

@torch.no_grad()
def embed_batch(seq_list, n_views=N_VIEWS, max_len=MAX_LEN):
    """
    seq_list: 길이 B의 시퀀스 리스트
    - 각 시퀀스에 대해 (원본/역상보)×n_views crop 생성
    - 전체(2*n_views*B) 를 한 방에 토크나이즈 → 1 forward
    - 마지막 4개 레이어 가중합 → attention mask 평균 → 시퀀스별 view 평균
    반환: (B, H) torch.FloatTensor(cpu)
    """
    # 1) 모든 샘플의 crop 생성 후 평탄화
    batch_crops = []
    starts = [0]
    for seq in seq_list:
        cs = make_crops(seq, n_views, max_len)
        batch_crops.extend(cs)
        starts.append(starts[-1] + len(cs))   # 인덱스 경계 기록

    # 2) 한 번에 토크나이즈
    tok = tokenizer(
        batch_crops, return_tensors="pt",
        truncation=True, padding=True, max_length=max_len
    ).to(model.device)

    # 3) AMP (8bit일 땐 실효 적을 수 있어 nullcontext)
    use_amp = (not USE_8BIT) and (DEVICE == "cuda")
    autocast_ctx = torch.autocast("cuda", dtype=AMP_DTYPE) if use_amp else nullcontext()

    with autocast_ctx:
        out = model(**tok, output_hidden_states=True)
        hs  = torch.stack(out.hidden_states[-4:], dim=0)                 # (4,B,T,H)
        w   = (hs * LAYER_WEIGHTS.view(4,1,1,1)).sum(0)                  # (B,T,H)
        mask = tok["attention_mask"].unsqueeze(-1)                       # (B,T,1)
        emb  = (w * mask).sum(1) / mask.sum(1).clamp(min=1)              # (B,H)

    # 4) 원 시퀀스 단위로 view 평균
    outs = []
    for i in range(len(seq_list)):
        a, b = starts[i], starts[i+1]
        outs.append(emb[a:b].mean(0, keepdim=True))
    return torch.vstack(outs).cpu()                                      # (B, H)

# -----------------------------
# 7) 전체 추론 루프 (샘플 배치 단위)
# -----------------------------
all_ids, all_embs = [], []
seqs = test_df["seq"].tolist()
ids  = test_df["ID"].tolist()

for s in tqdm(range(0, len(seqs), BATCH_SEQ), desc="Embedding (samples×views batched)"):
    chunk = seqs[s:s+BATCH_SEQ]
    embs  = embed_batch(chunk)           # (B, H)
    all_embs.append(embs)
    all_ids.extend(ids[s:s+BATCH_SEQ])

emb_tensor = torch.vstack(all_embs)      # (N, H)
print("✅ Raw embedding shape:", emb_tensor.shape)

# -----------------------------
# 8) 정규화 + Whitening (PCA)
# -----------------------------
from torch.nn.functional import layer_norm
emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])   # (N, H)
emb_np = emb_normed.numpy()

# PCA 차원 축소 + 화이트닝
n_comp = min(N_PCA, emb_np.shape[1])
pca = PCA(n_components=n_comp, whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb_np)                          # (N, n_comp)

# L2 normalize (cosine 안정화)
emb_final = emb_pca / (np.linalg.norm(emb_pca, axis=1, keepdims=True) + 1e-9)
print("✅ Final embedding shape:", emb_final.shape)

# -----------------------------
# 9) 제출 파일 생성
# -----------------------------
assert emb_final.shape[1] <= 2048, "임베딩 차원 2048 초과 금지!"
emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
emb_df = pd.DataFrame(emb_final, columns=emb_cols)
submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

out_path = "/content/submission_final_stage2_or_base.csv"
submission.to_csv(out_path, index=False)
print("🎯 Saved:", out_path)

# (선택) Colab 다운로드
try:
    from google.colab import files
    files.download(out_path)
except Exception as e:
    print("ℹ️ files.download 실패 시, 왼쪽 Files 패널에서 직접 다운로드하세요.")
    print("Error:", e)


✅ Device: cuda
✅ Loaded: (13711, 2) (13711, 769)
⚠️ Stage2 ckpt not found or PEFT unavailable. Using base (FP16).


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Embedding (samples×views batched): 100%|██████████| 429/429 [05:33<00:00,  1.29it/s]


✅ Raw embedding shape: torch.Size([13711, 1024])
✅ Final embedding shape: (13711, 512)
🎯 Saved: /content/submission_final_stage2_or_base.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# ============================================
#  MAI 최종 제출 Inference (규칙준수: 단일 추론, 비앙상블)
#  Strategy: Canonical strand + Center crop(1회) + Weighted last4 → LayerNorm → PCA(512) → L2
#  Notes: 배치만으로 GPU 활용 ↑, 결과는 완전 결정적
# ============================================

import os, numpy as np, pandas as pd, torch
from tqdm import tqdm
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModelForMaskedLM

# -----------------------------
# 0) 환경/경로
# -----------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
np.random.seed(SEED); torch.manual_seed(SEED)
print("✅ Device:", DEVICE)

data_path = "/content/open"
MODEL_ID  = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"  # (마감 전 공개/오픈 라이선스 확인 제출서에 명시)

# -----------------------------
# 1) 데이터
# -----------------------------
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))
print("✅ Loaded:", test_df.shape)

# -----------------------------
# 2) 유틸 (결정적 전처리)
# -----------------------------
def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

def canonical_strand(seq: str) -> str:
    """RC 중 사전식으로 작은 쪽을 택해 방향을 고정(평균 없음)."""
    rc = reverse_complement(seq)
    return seq if seq <= rc else rc

def center_crop(seq: str, max_len: int) -> str:
    L = len(seq)
    if L <= max_len:
        return seq
    start = (L - max_len) // 2
    return seq[start:start+max_len]

# -----------------------------
# 3) 모델 (FP32, 단일 추론)
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model.to(DEVICE).eval()
model.config.use_cache = False
print("✅ Model loaded.")

# -----------------------------
# 4) 하이퍼파라미터
# -----------------------------
BATCH_SEQ  = 32      # 샘플 배치 크기(메모리 따라 16~64 조절)
MAX_LEN    = 1024    # 중앙 고정 crop 길이(1회)
N_PCA      = 512     # 제출차원(≤2048)
LAYER_W    = torch.tensor([0.1, 0.2, 0.3, 0.4], dtype=torch.float32, device=DEVICE)

# -----------------------------
# 5) 배치 임베딩 (단일 추론)
# -----------------------------
@torch.no_grad()
def embed_batch(seq_list):
    """(원본→canonical→center crop) 1회 토큰화/1회 forward/결정적 집계."""
    prepared = [center_crop(canonical_strand(s), MAX_LEN) for s in seq_list]
    tok = tokenizer(prepared, return_tensors="pt", truncation=True, padding=True, max_length=MAX_LEN).to(DEVICE)

    out = model(**tok, output_hidden_states=True)
    hs  = torch.stack(out.hidden_states[-4:], dim=0)                 # (4,B,T,H) FP32
    w   = (hs * LAYER_W.view(4,1,1,1)).sum(0)                        # (B,T,H)
    mask= tok["attention_mask"].unsqueeze(-1)                        # (B,T,1)
    emb = (w * mask).sum(1) / mask.sum(1).clamp(min=1)               # (B,H)
    return emb.cpu()                                                 # (B,H)

# -----------------------------
# 6) 전체 추론 (배치만 사용)
# -----------------------------
seqs = test_df["seq"].tolist()
ids  = test_df["ID"].tolist()

parts = []
for i in tqdm(range(0, len(seqs), BATCH_SEQ), desc="Embedding (deterministic)"):
    batch = seqs[i:i+BATCH_SEQ]
    parts.append(embed_batch(batch))

emb_tensor = torch.vstack(parts)                                     # (N,H)
print("✅ Raw embedding shape:", emb_tensor.shape)

# -----------------------------
# 7) LayerNorm → PCA(512) → L2  (단일 적용)
# -----------------------------
from torch.nn.functional import layer_norm
emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
emb_np = emb_normed.numpy()

pca = PCA(n_components=min(N_PCA, emb_np.shape[1]), whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb_np)
emb_final = emb_pca / (np.linalg.norm(emb_pca, axis=1, keepdims=True) + 1e-9)
print("✅ Final embedding shape:", emb_final.shape)

# -----------------------------
# 8) 제출
# -----------------------------
cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
submission = pd.concat([pd.Series(ids, name="ID"), pd.DataFrame(emb_final, columns=cols)], axis=1)
out_path = "/content/submission_final_singlepass.csv"
submission.to_csv(out_path, index=False)
print("🎯 Saved:", out_path)

# (선택) Colab 다운로드
try:
    from google.colab import files
    files.download(out_path)
except Exception as e:
    print("ℹ️ files.download 실패 시, 왼쪽 Files 패널에서 직접 다운로드하세요.")
    print("Error:", e)


✅ Device: cuda
✅ Loaded: (13711, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

✅ Model loaded.


Embedding (deterministic): 100%|██████████| 429/429 [02:54<00:00,  2.46it/s]


✅ Raw embedding shape: torch.Size([13711, 1024])
✅ Final embedding shape: (13711, 512)
🎯 Saved: /content/submission_final_singlepass.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# ============================================
#  MAI 대회 최종 제출 (점수 안정화 + 규칙 준수)
#  Strategy: Deterministic multi-crop + RC 평균 + Weighted last4 + PCA Whitening
# ============================================

import os, numpy as np, pandas as pd, torch
from tqdm import tqdm
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModelForMaskedLM

# ---------------------------------------------------
# 1️⃣ 환경 설정
# ---------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
print(f"✅ Device: {DEVICE}")

# ---------------------------------------------------
# 2️⃣ 데이터 로드
# ---------------------------------------------------
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
print("✅ Loaded:", test_df.shape)

# ---------------------------------------------------
# 3️⃣ 유틸
# ---------------------------------------------------
def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

def deterministic_offsets(L, max_len, n, seed):
    rng = np.random.RandomState(seed)
    offsets = []
    for _ in range(n):
        offsets.append(rng.randint(0, max(1, L - max_len)))
    return offsets

# ---------------------------------------------------
# 4️⃣ 모델 로드 (FP32)
# ---------------------------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model.to(DEVICE).eval()
model.config.use_cache = False
print("✅ Model loaded.")

# ---------------------------------------------------
# 5️⃣ Multi-crop + Weighted layer embedding (deterministic)
# ---------------------------------------------------
@torch.no_grad()
def get_seq_embedding(seq, model, tokenizer, n_views=6, max_len=1024, seed=SEED):
    embs = []
    seqs = [seq, reverse_complement(seq)]  # 원본 + RC
    layer_weights = torch.tensor([0.1, 0.2, 0.3, 0.4]).to(DEVICE)

    for s in seqs:
        L = len(s)
        offsets = deterministic_offsets(L, max_len, n_views, seed)
        for off in offsets:
            sub_seq = s[off:off + max_len]
            tok = tokenizer(
                sub_seq, return_tensors="pt", truncation=True, padding=True, max_length=max_len
            ).to(DEVICE)
            out = model(**tok, output_hidden_states=True)
            hs = torch.stack(out.hidden_states[-4:], dim=0)  # (4,B,L,H)
            weighted = (hs * layer_weights.view(4,1,1,1)).sum(0)
            mask = tok["attention_mask"].unsqueeze(-1)
            emb = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)
            embs.append(emb.cpu())

    return torch.stack(embs).mean(0)  # 모든 view 평균

# ---------------------------------------------------
# 6️⃣ 전체 임베딩 추출
# ---------------------------------------------------
all_ids, all_embs = [], []
for i, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Embedding"):
    emb = get_seq_embedding(row["seq"], model, tokenizer, n_views=6, max_len=1024, seed=SEED)
    all_ids.append(row["ID"])
    all_embs.append(emb)

emb_tensor = torch.vstack(all_embs)
print("✅ Embedding tensor shape:", emb_tensor.shape)

# ---------------------------------------------------
# 7️⃣ LayerNorm + PCA Whitening
# ---------------------------------------------------
from torch.nn.functional import layer_norm
emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
emb_np = emb_normed.numpy()

pca = PCA(n_components=min(512, emb_np.shape[1]), whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb_np)
emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)
print("✅ Final embedding shape:", emb_final.shape)

# ---------------------------------------------------
# 8️⃣ 제출 파일 생성
# ---------------------------------------------------
emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
emb_df = pd.DataFrame(emb_final, columns=emb_cols)
submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

out_path = "/content/submission_final_deterministic.csv"
submission.to_csv(out_path, index=False)
print("✅ Saved:", out_path)

try:
    from google.colab import files
    files.download(out_path)
except:
    pass


✅ Device: cuda
✅ Loaded: (13711, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

✅ Model loaded.


Embedding:  19%|█▉        | 2652/13711 [16:56<1:10:38,  2.61it/s]


KeyboardInterrupt: 

In [None]:
# ============================================
#  MAI Inference (Deterministic multi-crop + Batched)
#  기존: 샘플 1개씩 forward
#  수정: 샘플×뷰를 한 번에 forward → GPU 사용률/속도 ↑
# ============================================

import os, numpy as np, pandas as pd, torch
from tqdm import tqdm
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModelForMaskedLM

# ---------------------------------------------------
# 1️⃣ 환경 설정
# ---------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
print(f"✅ Device: {DEVICE}")

# ---------------------------------------------------
# 2️⃣ 데이터 로드
# ---------------------------------------------------
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
print("✅ Loaded:", test_df.shape)

# ---------------------------------------------------
# 3️⃣ 유틸 함수
# ---------------------------------------------------
def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

def deterministic_offsets(L, max_len, n, seed):
    """
    길이 L인 시퀀스에서 max_len 길이의 crop을
    랜덤이지만 'seed'로 고정된 방식으로 n개 선택.
    """
    rng = np.random.RandomState(seed)
    offs = []
    for _ in range(n):
        offs.append(rng.randint(0, max(1, L - max_len)))
    return offs

# ---------------------------------------------------
# 4️⃣ 모델 및 토크나이저 로드 (FP32)
# ---------------------------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.to(DEVICE).eval()
model.config.use_cache = False
print("✅ Model loaded.")

layer_weights = torch.tensor([0.1, 0.2, 0.3, 0.4], dtype=torch.float32, device=DEVICE)

# ---------------------------------------------------
# 5️⃣ Batched multi-crop embedding 함수
# ---------------------------------------------------
@torch.no_grad()
def embed_batch(seqs, n_views=6, max_len=1024, base_seed=SEED):
    """
    seqs: 길이 B 리스트
    - 각 시퀀스에 대해 (원본+RC) × n_views crop 생성
    - 전체 crop을 한 번에 토크나이즈/forward
    - 시퀀스별로 view 평균해서 (B, H) 반환
    """
    all_crops = []
    bounds = [0]    # 각 시퀀스의 crop 시작 인덱스 기록

    # 1) crop 문자열들 모두 모으기
    for idx, seq in enumerate(seqs):
        local_crops = []
        for strand in (seq, reverse_complement(seq)):
            L = len(strand)
            # 시퀀스마다 seed 다르게 해서 완전 고정
            offs = deterministic_offsets(L, max_len, n_views, seed=base_seed + idx)
            for off in offs:
                local_crops.append(strand[off:off + max_len])
        all_crops.extend(local_crops)
        bounds.append(len(all_crops))   # 다음 시퀀스 시작 위치

    # 2) 한 번에 토크나이즈 & 모델 추론
    tok = tokenizer(
        all_crops,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len
    ).to(DEVICE)

    out = model(**tok, output_hidden_states=True)
    hs = torch.stack(out.hidden_states[-4:], dim=0)              # (4, Btot, L, H)
    weighted = (hs * layer_weights.view(4,1,1,1)).sum(0)        # (Btot, L, H)
    mask = tok["attention_mask"].unsqueeze(-1)                  # (Btot, L, 1)
    embs = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)  # (Btot, H)
    embs = embs.cpu()

    # 3) 시퀀스별 view 평균
    seq_embs = []
    for i in range(len(seqs)):
        a, b = bounds[i], bounds[i+1]   # 이 시퀀스의 crop 범위
        seq_embs.append(embs[a:b].mean(0, keepdim=True))
    return torch.vstack(seq_embs)       # (B, H)

# ---------------------------------------------------
# 6️⃣ 전체 임베딩 추출 (배치 단위)
# ---------------------------------------------------
BATCH_SIZE = 32   # A100이면 32~64까지 올려도 충분함

all_ids = []
all_embs = []

ids = test_df["ID"].tolist()
seqs = test_df["seq"].tolist()

for start in tqdm(range(0, len(seqs), BATCH_SIZE), desc="Embedding (batched)"):
    batch_seqs = seqs[start:start+BATCH_SIZE]
    batch_ids  = ids[start:start+BATCH_SIZE]
    batch_embs = embed_batch(batch_seqs, n_views=6, max_len=1024, base_seed=SEED)

    all_ids.extend(batch_ids)
    all_embs.append(batch_embs)

emb_tensor = torch.vstack(all_embs)
print("✅ Embedding tensor shape:", emb_tensor.shape)

# ---------------------------------------------------
# 7️⃣ LayerNorm + PCA Whitening
# ---------------------------------------------------
from torch.nn.functional import layer_norm

emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
emb_np = emb_normed.numpy()

pca = PCA(n_components=min(512, emb_np.shape[1]), whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb_np)
emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)
print("✅ Final embedding shape:", emb_final.shape)

# ---------------------------------------------------
# 8️⃣ 제출 파일 생성
# ---------------------------------------------------
emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
emb_df = pd.DataFrame(emb_final, columns=emb_cols)
submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

out_path = "/content/submission_final_batched.csv"
submission.to_csv(out_path, index=False)
print("✅ Saved submission file:", out_path)

try:
    from google.colab import files
    files.download(out_path)
except:
    pass


✅ Device: cuda
✅ Loaded: (13711, 2)
✅ Model loaded.


Embedding (batched): 100%|██████████| 429/429 [31:42<00:00,  4.44s/it]


✅ Embedding tensor shape: torch.Size([13711, 1024])
✅ Final embedding shape: (13711, 512)
✅ Saved submission file: /content/submission_final_batched.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
# ============================================
#  제2회 MAI 대회 최종 제출 Inference + Small Experiments
#  Author: Hanbin (GPT-5 assisted)
#  Base Strategy: Weighted last4 mean + Multi-crop(batch) + Reverse complement + Whitening
#  Experiments:
#   - Base: n_views=6,  max_len=1024, N_PCA=512
#   - Exp1: n_views=8,  max_len=1024, N_PCA=512
#   - Exp2: n_views=6,  max_len=896,  N_PCA=512
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm

# ---------------------------------------------------
# 1️⃣ 공통 환경 설정
# ---------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
print(f"✅ Device: {DEVICE}")

# GPU/커널 최적화 스위치 (gLM2 버전과 동일)
if DEVICE == "cuda":
    torch.backends.cuda.matmul.allow_tf32 = True  # TF32 on (Ampere+)
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

try:
    torch.set_float32_matmul_precision("high")
except Exception:
    pass

def _amp_dtype():
    # bf16 지원(AMPERE+)이면 bf16, 아니면 fp16 사용
    if DEVICE == "cuda" and torch.cuda.is_available():
        cap = torch.cuda.get_device_capability()
        if cap[0] >= 8:
            return torch.bfloat16
        else:
            return torch.float16
    return None

AMP_DTYPE = _amp_dtype()

# ---------------------------------------------------
# 2️⃣ 데이터 로드 (공통)
# ---------------------------------------------------
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
print("✅ Loaded:", test_df.shape)

# ---------------------------------------------------
# 3️⃣ Reverse Complement
# ---------------------------------------------------
def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

# ---------------------------------------------------
# 4️⃣ 모델 및 토크나이저 로드 (한 번만)
# ---------------------------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

model = AutoModelForMaskedLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=(AMP_DTYPE if AMP_DTYPE is not None else None)
)
model = model.to(DEVICE).eval()
model.config.use_cache = False

# torch.compile 시도 (안 되면 무시)
try:
    model = torch.compile(model, mode="max-autotune")
    print("✅ torch.compile: enabled")
except Exception as e:
    print("⚠️ torch.compile failed, continue without it:", str(e))

print("✅ Model loaded.")

# AMP context helper
from contextlib import nullcontext
def get_autocast_ctx():
    if AMP_DTYPE is not None and DEVICE == "cuda":
        return torch.autocast(device_type="cuda", dtype=AMP_DTYPE)
    else:
        return nullcontext()

# ---------------------------------------------------
# 5️⃣ Multi-crop (배치 전송) + Weighted last4 embedding
#     - n_views, max_len 을 실험마다 바꿀 수 있도록 파라미터화
# ---------------------------------------------------
@torch.no_grad()
def get_seq_embedding(seq, n_views=6, max_len=1024):
    """
    - 원본 + 역상보 각각 n_views개를 한 번의 forward로 처리 (총 2*n_views)
    - AMP(bf16/fp16)로 throughput 향상
    - 마지막 4개 히든스테이트 가중합(Weighted last4)
    """
    srcs = [seq, reverse_complement(seq)]
    subseqs = []
    for s in srcs:
        L = len(s)
        for _ in range(n_views):
            if L <= max_len:
                sub_seq = s
            else:
                # gLM2 기준: 0 ~ (L-max_len) 전체 범위에서 균일 샘플
                offset = np.random.randint(0, L - max_len + 1)
                sub_seq = s[offset:offset + max_len]
            subseqs.append(sub_seq)

    tok = tokenizer(
        subseqs,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len,
    ).to(DEVICE)

    layer_weights = torch.tensor([0.1, 0.2, 0.3, 0.4], device=DEVICE).view(4, 1, 1, 1)

    with get_autocast_ctx():
        out = model(**tok, output_hidden_states=True)
        hs = torch.stack(out.hidden_states[-4:], dim=0)  # (4, B, T, H)
        weighted = (hs * layer_weights).sum(0)           # (B, T, H)

        mask = tok["attention_mask"].unsqueeze(-1)       # (B, T, 1)
        emb = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)  # (B, H)

    emb_mean = emb.mean(0, keepdim=True).cpu()  # (1, H)
    return emb_mean

# ---------------------------------------------------
# 6️⃣ 실험 실행 함수 (한 번에 한 설정만)
# ---------------------------------------------------
from torch.nn.functional import layer_norm

def run_experiment(tag, n_views, max_len, n_pca):
    """
    tag   : 파일 이름 구분용 문자열
    n_views, max_len, n_pca : 실험 파라미터
    """
    print(f"\n===============================")
    print(f"  Running experiment: {tag}")
    print(f"  n_views={n_views}, max_len={max_len}, N_PCA={n_pca}")
    print(f"===============================\n")

    # 실험마다 난수 고정 (실험 간 일관성 확보)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    all_ids = []
    all_embs = []

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc=f"Embedding ({tag})"):
        emb = get_seq_embedding(row["seq"], n_views=n_views, max_len=max_len)
        all_ids.append(row["ID"])
        all_embs.append(emb)

    emb_tensor = torch.vstack(all_embs)  # (N, H)
    print("✅ Embedding tensor shape:", emb_tensor.shape)

    # LayerNorm
    emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
    emb_np = emb_normed.numpy()

    # PCA Whitening
    n_comp = min(n_pca, emb_np.shape[1])
    pca = PCA(n_components=n_comp, whiten=True, random_state=SEED)
    emb_pca = pca.fit_transform(emb_np)

    # L2 normalize
    emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)
    print("✅ Final embedding shape:", emb_final.shape)

    # 제출 파일 생성
    emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
    emb_df = pd.DataFrame(emb_final, columns=emb_cols)
    submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

    out_path = f"/content/submission_{tag}.csv"
    submission.to_csv(out_path, index=False)
    print(f"✅ Saved submission file: {out_path}")

    # (선택) 자동 다운로드
    try:
        from google.colab import files
        files.download(out_path)
    except Exception as e:
        print("ℹ️ files.download 실패 시, 왼쪽 Files 패널에서 직접 다운로드하세요.")
        print("Error:", e)

# ---------------------------------------------------
# 7️⃣ 실제로 돌릴 실험 목록 정의
#     - 필요 없는 건 주석 처리하고 돌려도 됨
# ---------------------------------------------------
EXPERIMENTS = [
    # (tag, n_views, max_len, N_PCA)

    # ✅ 지금 최고점 설정 (baseline 재실행)
    ("gLM2_nv6_len1024_p512", 6, 1024, 512),

    # 🔬 실험 1: 뷰 수 늘리기 (연산↑, 잠재적으로 안정성↑)
    ("gLM2_nv8_len1024_p512", 8, 1024, 512),

    # 🔬 실험 2: max_len 조금 줄이기 (다른 구간 강조)
    ("gLM2_nv6_len896_p512", 6, 896, 512),

    # 원하면 여기 추가: PCA 차원만 바꾸는 실험 등
    # ("gLM2_nv6_len1024_p384", 6, 1024, 384),
    # ("gLM2_nv6_len1024_p768", 6, 1024, 768),
]

# ---------------------------------------------------
# 8️⃣ 실험 실행 루프
# ---------------------------------------------------
for tag, n_views, max_len, n_pca in EXPERIMENTS:
    run_experiment(tag, n_views, max_len, n_pca)


✅ Device: cuda
✅ Loaded: (13711, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

✅ torch.compile: enabled
✅ Model loaded.

  Running experiment: gLM2_nv6_len1024_p512
  n_views=6, max_len=1024, N_PCA=512



Embedding (gLM2_nv6_len1024_p512):   0%|          | 0/13711 [00:00<?, ?it/s]AUTOTUNE bmm(192x175x175, 192x175x64)
strides: [30656, 175, 1], [11200, 64, 1]
dtypes: torch.bfloat16, torch.bfloat16
  triton_bmm_82 0.0317 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=32, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=2, num_warps=4
  triton_bmm_87 0.0317 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4
  triton_bmm_81 0.0328 ms 96.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=16, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=2, num_warps=4
  triton_bmm_83 0.0328 ms 96.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K=64, BLOCK_M=64, BLOCK_N=64, EVEN_K=False, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=8
  triton_bmm_85 0.0328 ms 96.9% ACC_TYPE='tl.float32', ALLOW_TF32=True, BLOCK_K

RuntimeError: Error: accessing tensor output of CUDAGraphs that has been overwritten by a subsequent run. Stack trace: File "/root/.cache/huggingface/modules/transformers_modules/InstaDeepAI/nucleotide_hyphen_transformer_hyphen_v2_hyphen_500m_hyphen_multi_hyphen_species/06615c1660c892fc199840c18123f8385b3542a8/modeling_esm.py", line 1164, in forward
    outputs = self.esm(
  File "/root/.cache/huggingface/modules/transformers_modules/InstaDeepAI/nucleotide_hyphen_transformer_hyphen_v2_hyphen_500m_hyphen_multi_hyphen_species/06615c1660c892fc199840c18123f8385b3542a8/modeling_esm.py", line 1057, in forward
    encoder_outputs = self.encoder(
  File "/root/.cache/huggingface/modules/transformers_modules/InstaDeepAI/nucleotide_hyphen_transformer_hyphen_v2_hyphen_500m_hyphen_multi_hyphen_species/06615c1660c892fc199840c18123f8385b3542a8/modeling_esm.py", line 734, in forward
    layer_outputs = layer_module(
  File "/root/.cache/huggingface/modules/transformers_modules/InstaDeepAI/nucleotide_hyphen_transformer_hyphen_v2_hyphen_500m_hyphen_multi_hyphen_species/06615c1660c892fc199840c18123f8385b3542a8/modeling_esm.py", line 607, in forward
    self_attention_outputs = self.attention(
  File "/root/.cache/huggingface/modules/transformers_modules/InstaDeepAI/nucleotide_hyphen_transformer_hyphen_v2_hyphen_500m_hyphen_multi_hyphen_species/06615c1660c892fc199840c18123f8385b3542a8/modeling_esm.py", line 523, in forward
    self_outputs = self.self(
  File "/root/.cache/huggingface/modules/transformers_modules/InstaDeepAI/nucleotide_hyphen_transformer_hyphen_v2_hyphen_500m_hyphen_multi_hyphen_species/06615c1660c892fc199840c18123f8385b3542a8/modeling_esm.py", line 396, in forward
    query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)
  File "/root/.cache/huggingface/modules/transformers_modules/InstaDeepAI/nucleotide_hyphen_transformer_hyphen_v2_hyphen_500m_hyphen_multi_hyphen_species/06615c1660c892fc199840c18123f8385b3542a8/modeling_esm.py", line 130, in forward
    self._cos_cached, self._sin_cached = self._update_cos_sin_tables(
  File "/root/.cache/huggingface/modules/transformers_modules/InstaDeepAI/nucleotide_hyphen_transformer_hyphen_v2_hyphen_500m_hyphen_multi_hyphen_species/06615c1660c892fc199840c18123f8385b3542a8/modeling_esm.py", line 122, in _update_cos_sin_tables
    self._cos_cached = emb.cos()[None, None, :, :]. To prevent overwriting, clone the tensor outside of torch.compile() or call torch.compiler.cudagraph_mark_step_begin() before each model invocation.

In [7]:
# ============================================
#  MAI Inference + Small Experiments (Batched, NO torch.compile)
#  - gLM2 전략 유지
#  - 여러 시퀀스를 한 번에 forward → GPU 활용도↑
#  - torch.compile 제거 → CUDAGraphs 에러 방지
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm
from torch.nn.functional import layer_norm
from contextlib import nullcontext

# ---------------------------------------------------
# 1️⃣ 공통 설정
# ---------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
print(f"✅ Device: {DEVICE}")

if DEVICE == "cuda":
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
    try:
        torch.set_float32_matmul_precision("high")
    except Exception:
        pass

def _amp_dtype():
    if DEVICE == "cuda" and torch.cuda.is_available():
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            return torch.bfloat16
        else:
            return torch.float16
    return None

AMP_DTYPE = _amp_dtype()

def get_autocast_ctx():
    if AMP_DTYPE is not None and DEVICE == "cuda":
        return torch.autocast(device_type="cuda", dtype=AMP_DTYPE)
    else:
        return nullcontext()

# ---------------------------------------------------
# 2️⃣ 데이터 로드
# ---------------------------------------------------
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
print("✅ Loaded:", test_df.shape)
ids  = test_df["ID"].tolist()
seqs = test_df["seq"].tolist()

# ---------------------------------------------------
# 3️⃣ Reverse complement
# ---------------------------------------------------
def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

# ---------------------------------------------------
# 4️⃣ 모델 / 토크나이저 로드  (⚠️ torch.compile 사용 안 함)
# ---------------------------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

model = AutoModelForMaskedLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=(AMP_DTYPE if AMP_DTYPE is not None else None),
)
model = model.to(DEVICE).eval()
model.config.use_cache = False

print("✅ Model loaded (no torch.compile).")

layer_weights = torch.tensor([0.1, 0.2, 0.3, 0.4], device=DEVICE).view(4, 1, 1, 1)

# ---------------------------------------------------
# 5️⃣ Batched multi-crop embedding
#     - gLM2와 같은 random 로직 유지
# ---------------------------------------------------
@torch.no_grad()
def embed_batch(seq_list, n_views=6, max_len=1024):
    """
    seq_list: 길이 B 리스트
    - 각 seq에 대해 (원본+RC) × n_views crop 생성
    - 전체 crop을 한 번에 토크나이즈 + forward
    - 시퀀스별 view 평균해서 (B, H) 반환
    """
    all_crops = []
    bounds = [0]

    for seq in seq_list:
        for s in (seq, reverse_complement(seq)):
            L = len(s)
            for _ in range(n_views):
                if L <= max_len:
                    sub_seq = s
                else:
                    offset = np.random.randint(0, L - max_len + 1)
                    sub_seq = s[offset:offset + max_len]
                all_crops.append(sub_seq)
        bounds.append(len(all_crops))

    tok = tokenizer(
        all_crops,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len,
    ).to(DEVICE)

    with get_autocast_ctx():
        out = model(**tok, output_hidden_states=True)
        hs = torch.stack(out.hidden_states[-4:], dim=0)      # (4, Btot, T, H)
        weighted = (hs * layer_weights).sum(0)               # (Btot, T, H)
        mask = tok["attention_mask"].unsqueeze(-1)           # (Btot, T, 1)
        emb = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)  # (Btot, H)

    emb = emb.cpu()
    seq_embs = []
    for i in range(len(seq_list)):
        a, b = bounds[i], bounds[i+1]
        seq_embs.append(emb[a:b].mean(0, keepdim=True))
    return torch.vstack(seq_embs)   # (B, H)

# ---------------------------------------------------
# 6️⃣ 실험 실행 함수 (배치 단위)
# ---------------------------------------------------
def run_experiment(tag, n_views, max_len, n_pca, batch_size=64):
    print(f"\n===============================")
    print(f"  Running experiment: {tag}")
    print(f"  n_views={n_views}, max_len={max_len}, N_PCA={n_pca}, BATCH={batch_size}")
    print(f"===============================\n")

    np.random.seed(SEED)
    torch.manual_seed(SEED)

    all_embs = []
    all_ids  = []

    for start in tqdm(range(0, len(seqs), batch_size), desc=f"Embedding ({tag})"):
        batch_seqs = seqs[start:start+batch_size]
        batch_ids  = ids[start:start+batch_size]
        batch_embs = embed_batch(batch_seqs, n_views=n_views, max_len=max_len)
        all_embs.append(batch_embs)
        all_ids.extend(batch_ids)

    emb_tensor = torch.vstack(all_embs)   # (N, H)
    print("✅ Embedding tensor shape:", emb_tensor.shape)

    emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
    emb_np = emb_normed.numpy()

    n_comp = min(n_pca, emb_np.shape[1])
    pca = PCA(n_components=n_comp, whiten=True, random_state=SEED)
    emb_pca = pca.fit_transform(emb_np)

    emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)
    print("✅ Final embedding shape:", emb_final.shape)

    emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
    emb_df = pd.DataFrame(emb_final, columns=emb_cols)
    submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

    out_path = f"/content/submission_{tag}.csv"
    submission.to_csv(out_path, index=False)
    print(f"✅ Saved submission file: {out_path}")

    try:
        from google.colab import files
        files.download(out_path)
    except Exception as e:
        print("ℹ️ files.download 실패 시, 왼쪽 Files 패널에서 직접 다운로드하세요.")
        print("Error:", e)

# ---------------------------------------------------
# 7️⃣ 실험 목록 (우선 baseline만)
# ---------------------------------------------------
EXPERIMENTS = [
    ("gLM2_nv6_len1024_p512_batch", 6, 1024, 512, 64),
    # 원하면 아래처럼 추가:
    # ("gLM2_nv8_len1024_p512_batch", 8, 1024, 512, 64),
    # ("gLM2_nv6_len896_p512_batch", 6, 896, 512, 64),
]

# ---------------------------------------------------
# 8️⃣ 실행
# ---------------------------------------------------
for tag, nv, ml, npca, bs in EXPERIMENTS:
    run_experiment(tag, nv, ml, npca, batch_size=bs)


✅ Device: cuda
✅ Loaded: (13711, 2)
✅ Model loaded (no torch.compile).

  Running experiment: gLM2_nv6_len1024_p512_batch
  n_views=6, max_len=1024, N_PCA=512, BATCH=64



Embedding (gLM2_nv6_len1024_p512_batch): 100%|██████████| 215/215 [08:25<00:00,  2.35s/it]


✅ Embedding tensor shape: torch.Size([13711, 1024])
✅ Final embedding shape: (13711, 512)
✅ Saved submission file: /content/submission_gLM2_nv6_len1024_p512_batch.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
# ============================================
#  MAI Inference (FP32, AMP OFF, Batched)
#  - gLM2 전략 유지
#  - AMP/autocast 사용 안 함 (순수 FP32)
#  - torch.compile 도 사용 안 함 (CUDAGraph 에러 방지)
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm
from torch.nn.functional import layer_norm

# ---------------------------------------------------
# 1️⃣ 공통 설정
# ---------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
print(f"✅ Device: {DEVICE}")

if DEVICE == "cuda":
    # TF32는 허용 (속도↑, 하지만 여전히 FP32 경로)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
    try:
        torch.set_float32_matmul_precision("high")
    except Exception:
        pass

# ---------------------------------------------------
# 2️⃣ 데이터 로드
# ---------------------------------------------------
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
print("✅ Loaded:", test_df.shape)
ids  = test_df["ID"].tolist()
seqs = test_df["seq"].tolist()

# ---------------------------------------------------
# 3️⃣ Reverse complement
# ---------------------------------------------------
def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

# ---------------------------------------------------
# 4️⃣ 모델 / 토크나이저 로드 (FP32)
# ---------------------------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# torch_dtype 지정 안 함 → 기본 FP32
model = AutoModelForMaskedLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)
model = model.to(DEVICE).eval()
model.config.use_cache = False

print("✅ Model loaded (FP32, no AMP, no torch.compile).")

layer_weights = torch.tensor([0.1, 0.2, 0.3, 0.4], device=DEVICE).view(4, 1, 1, 1)

# ---------------------------------------------------
# 5️⃣ Batched multi-crop embedding (gLM2 동일 로직)
# ---------------------------------------------------
@torch.no_grad()
def embed_batch(seq_list, n_views=6, max_len=1024):
    """
    seq_list: 길이 B 리스트
    - 각 seq에 대해 (원본+RC) × n_views crop 생성
    - 전체 crop을 한 번에 토크나이즈 + forward
    - 시퀀스별 view 평균해서 (B, H) 반환
    """
    all_crops = []
    bounds = [0]

    for seq in seq_list:
        for s in (seq, reverse_complement(seq)):
            L = len(s)
            for _ in range(n_views):
                if L <= max_len:
                    sub_seq = s
                else:
                    offset = np.random.randint(0, L - max_len + 1)
                    sub_seq = s[offset:offset + max_len]
                all_crops.append(sub_seq)
        bounds.append(len(all_crops))

    tok = tokenizer(
        all_crops,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len,
    ).to(DEVICE)

    # ❌ AMP/autocast 없이 순수 FP32
    out = model(**tok, output_hidden_states=True)
    hs = torch.stack(out.hidden_states[-4:], dim=0)      # (4, Btot, T, H)
    weighted = (hs * layer_weights).sum(0)               # (Btot, T, H)
    mask = tok["attention_mask"].unsqueeze(-1)           # (Btot, T, 1)
    emb = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)  # (Btot, H)

    emb = emb.cpu()
    seq_embs = []
    for i in range(len(seq_list)):
        a, b = bounds[i], bounds[i+1]
        seq_embs.append(emb[a:b].mean(0, keepdim=True))
    return torch.vstack(seq_embs)   # (B, H)

# ---------------------------------------------------
# 6️⃣ 실험 실행 함수 (배치 단위)
# ---------------------------------------------------
def run_experiment(tag, n_views, max_len, n_pca, batch_size=32):
    """
    tag   : 파일 이름 suffix
    n_views, max_len, n_pca : 실험 파라미터
    batch_size : FP32라 메모리 좀 더 쓰므로 32~64 정도 추천
    """
    print(f"\n===============================")
    print(f"  Running experiment: {tag}")
    print(f"  n_views={n_views}, max_len={max_len}, N_PCA={n_pca}, BATCH={batch_size}")
    print(f"===============================\n")

    np.random.seed(SEED)
    torch.manual_seed(SEED)

    all_embs = []
    all_ids  = []

    for start in tqdm(range(0, len(seqs), batch_size), desc=f"Embedding ({tag})"):
        batch_seqs = seqs[start:start+batch_size]
        batch_ids  = ids[start:start+batch_size]
        batch_embs = embed_batch(batch_seqs, n_views=n_views, max_len=max_len)
        all_embs.append(batch_embs)
        all_ids.extend(batch_ids)

    emb_tensor = torch.vstack(all_embs)   # (N, H)
    print("✅ Embedding tensor shape:", emb_tensor.shape)

    # LayerNorm → PCA → L2
    emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
    emb_np = emb_normed.numpy()

    n_comp = min(n_pca, emb_np.shape[1])
    pca = PCA(n_components=n_comp, whiten=True, random_state=SEED)
    emb_pca = pca.fit_transform(emb_np)

    emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)
    print("✅ Final embedding shape:", emb_final.shape)

    emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
    emb_df = pd.DataFrame(emb_final, columns=emb_cols)
    submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

    out_path = f"/content/submission_{tag}.csv"
    submission.to_csv(out_path, index=False)
    print(f"✅ Saved submission file: {out_path}")

    try:
        from google.colab import files
        files.download(out_path)
    except Exception as e:
        print("ℹ️ files.download 실패 시, 왼쪽 Files 패널에서 직접 다운로드하세요.")
        print("Error:", e)

# ---------------------------------------------------
# 7️⃣ 실행할 실험 (FP32 최종 검증용)
# ---------------------------------------------------
EXPERIMENTS = [
    # gLM2 기본 설정과 동일한 파라미터
    ("gLM2_fp32_nv6_len1024_p512_batch", 6, 1024, 512, 32),
]

for tag, nv, ml, npca, bs in EXPERIMENTS:
    run_experiment(tag, nv, ml, npca, batch_size=bs)


✅ Device: cuda
✅ Loaded: (13711, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Model loaded (FP32, no AMP, no torch.compile).

  Running experiment: gLM2_fp32_nv6_len1024_p512_batch
  n_views=6, max_len=1024, N_PCA=512, BATCH=32



Embedding (gLM2_fp32_nv6_len1024_p512_batch): 100%|██████████| 429/429 [10:09<00:00,  1.42s/it]


✅ Embedding tensor shape: torch.Size([13711, 1024])
✅ Final embedding shape: (13711, 512)
✅ Saved submission file: /content/submission_gLM2_fp32_nv6_len1024_p512_batch.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>