<a href="https://colab.research.google.com/github/Hanbin-git/DNA/blob/main/high_score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U bitsandbytes peft accelerate transformers datasets


Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Unpack the dataset archive stored on Drive into the local working directory
import zipfile
import os

zip_path = "/content/drive/MyDrive/DNA/open.zip"
extract_dir = "/content/open"

# Make sure the destination exists (no error if it is already there)
os.makedirs(extract_dir, exist_ok=True)

# The context manager closes the archive once extraction has finished
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_dir)

extracted_names = os.listdir(extract_dir)
print("✅ 압축 해제 완료!")
print("압축 풀린 파일 목록:", extracted_names)

✅ 압축 해제 완료!
압축 풀린 파일 목록: ['test.csv', 'sample_submission.csv']


In [3]:
# Read the test sequences and the sample submission into DataFrames
import pandas as pd

data_dir = "/content/open"

test_csv = os.path.join(data_dir, "test.csv")
sample_csv = os.path.join(data_dir, "sample_submission.csv")

test_df = pd.read_csv(test_csv)
sub_df = pd.read_csv(sample_csv)

# Report the (rows, columns) of both frames, then preview the test set
print("✅ test_df:", test_df.shape)
print("✅ sub_df:", sub_df.shape)
test_df.head()


✅ test_df: (13711, 2)
✅ sub_df: (13711, 769)


Unnamed: 0,ID,seq
0,TEST_000000,ATCATTTTTATTTTTTAGTTTTATGAGACGCTGCCTTGCTATGTCA...
1,TEST_000001,CGACGTCCCCGTAGCGGCCGAAGTCGAGGGGCAGCAGGCGATCGTG...
2,TEST_000002,GGTAGTAAGAAGGAAAATGACAGCATGGAAGCAGCAATACCAGTAA...
3,TEST_000003,CAGCGCATATACTCAGGGCCATGGTGGGTACTGTTCCCATGGCCAG...
4,TEST_000004,TTCATAATTGCTATCAGTCTATGGGCTAATATTTTATACATCAATG...


In [4]:
# ============================================
#  제2회 MAI 대회 최종 제출 Inference (Stage2 LoRA 우선) - 고GPU 활용 버전
#  Strategy: Weighted last4 + Multi-crop(view-batch) + Reverse complement
#            + LayerNorm + PCA Whitening
#  Notes:
#    - 샘플×뷰 동시 배치로 forward → GPU 사용률↑
#    - FP16(권장, VRAM 여유시) or 8bit(메모리 절약)
#    - TF32, cuDNN benchmark, AMP 적용
# ============================================

import os, numpy as np, pandas as pd, torch
from tqdm import tqdm
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModelForMaskedLM, BitsAndBytesConfig

# (옵션) PEFT 로더: LoRA가 있을 때 사용
try:
    from peft import AutoPeftModelForMaskedLM
    HAS_PEFT = True
except Exception:
    HAS_PEFT = False

# -----------------------------
# 0) Environment setup
# -----------------------------
SEED = 42

# Seed each RNG separately: NumPy's global generator, torch on CPU, torch on every CUDA device
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
print("✅ Device:", DEVICE)

# GPU kernel / precision tuning: enable cuDNN autotuning and TF32 for cuDNN and matmul
if DEVICE == "cuda":
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cuda.matmul.allow_tf32 = True

# Best effort: any failure of this call is deliberately ignored
try:
    torch.set_float32_matmul_precision("high")
except Exception:
    pass

def _amp_dtype():
    """Choose the dtype used for autocast on the current device.

    Returns:
        ``None`` when ``DEVICE`` is not ``"cuda"`` or CUDA is unavailable;
        ``torch.bfloat16`` when the GPU's compute-capability major version
        is at least 8; ``torch.float16`` otherwise.
    """
    cuda_in_use = DEVICE == "cuda" and torch.cuda.is_available()
    if not cuda_in_use:
        return None

    major = torch.cuda.get_device_capability()[0]
    if major >= 8:
        return torch.bfloat16
    return torch.float16

# Autocast dtype, resolved once from _amp_dtype() (None when CUDA is not in use).
AMP_DTYPE = _amp_dtype()

# -----------------------------
# 1) Paths / hyperparameters
# -----------------------------
data_path = "/content/open"  # test.csv, sample_submission.csv
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
STAGE2_CKPT = "/content/stage2_contrastive_lora8bit"  # LoRA checkpoint path (base model is used if missing)

# === Key performance parameters ===
USE_8BIT   = False   # keep False for FP16 inference when VRAM allows (faster, higher GPU utilization)
BATCH_SEQ  = 32      # number of samples per batch (tune 16~64 depending on VRAM)
N_VIEWS    = 4       # views per strand (applied to the original and to the reverse complement)
MAX_LEN    = 1024    # crop length (going 512 -> 1024 increases compute); also passed to the tokenizer as max_length
N_PCA      = 512     # final output dimensionality (<= 2048)

# Weights for the last 4 hidden-state layers; they are applied in layer order,
# so the final layer receives the largest weight (0.4).
LAYER_WEIGHTS = torch.tensor([0.1, 0.2, 0.3, 0.4])  # weights of the last 4 layers

# -----------------------------
# 2) Data loading
# -----------------------------
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
# NOTE(review): sub_df is only printed here and is not referenced again in the visible code.
sub_df  = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))
print("✅ Loaded:", test_df.shape, sub_df.shape)

# -----------------------------
# 3) reverse complement
# -----------------------------
# Complement table, built once: Watson-Crick pairs plus the IUPAC ambiguity codes
# that have a distinct complement (R<->Y, K<->M, B<->V, D<->H), in both cases.
# Self-complementary codes (S, W, N) and any other character map to themselves.
_RC_SRC = "ACGTRYKMBVDH"
_RC_DST = "TGCAYRMKVBHD"
_RC_TABLE = str.maketrans(_RC_SRC + _RC_SRC.lower(), _RC_DST + _RC_DST.lower())


def reverse_complement(seq: str) -> str:
    """Return the reverse complement of a nucleotide string.

    Upper-case A/C/G/T behave as before. Lower-case bases (e.g. soft-masked
    sequence) and IUPAC ambiguity codes are now complemented as well, where
    they used to be reversed but left uncomplemented. Letter case is preserved.

    Args:
        seq: nucleotide sequence; may be empty.

    Returns:
        The complemented sequence in reverse order.
    """
    return seq.translate(_RC_TABLE)[::-1]

# -----------------------------
# 4) 모델/토크나이저 로드 (Stage2 우선)
# -----------------------------
# The Stage2 LoRA adapter is used only when its directory exists AND peft imported
# successfully; otherwise the base checkpoint is loaded.
use_stage2 = os.path.isdir(STAGE2_CKPT) and HAS_PEFT
on_gpu = (DEVICE == "cuda")

if USE_8BIT:
    # bitsandbytes 8-bit quantization (memory saving).
    # NOTE(review): 8-bit loading presumably still needs a GPU - confirm before using on CPU.
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    precision_tag = "8bit"
    load_kwargs = {"quantization_config": bnb_config}
else:
    # Half precision on GPU (recommended). On CPU fall back to float32.
    # NOTE(review): the captured run warns that `torch_dtype` is deprecated in favour of
    # `dtype`; the old keyword is kept here for compatibility with older transformers.
    precision_tag = "FP16" if on_gpu else "FP32"
    load_kwargs = {"torch_dtype": torch.float16 if on_gpu else torch.float32}

# Previously GPU 0 was hard-coded, which cannot work when DEVICE == "cpu".
load_kwargs["device_map"] = {"": 0} if on_gpu else {"": "cpu"}

if use_stage2:
    print(f"✅ Loading Stage2 LoRA ({precision_tag}):", STAGE2_CKPT)
    model_source, model_cls = STAGE2_CKPT, AutoPeftModelForMaskedLM
else:
    print(f"⚠️ Stage2 ckpt not found or PEFT unavailable. Using base ({precision_tag}).")
    model_source, model_cls = MODEL_ID, AutoModelForMaskedLM

# NOTE(review): trust_remote_code executes code shipped with the checkpoint; the captured
# run recommends pinning a revision to avoid silently picking up new code.
tokenizer = AutoTokenizer.from_pretrained(model_source, trust_remote_code=True)
model = model_cls.from_pretrained(model_source, trust_remote_code=True, **load_kwargs)

# Inference only: eval mode, no KV cache.
model.eval(); model.config.use_cache = False
# Keep the layer weights on the same device as the model's hidden states.
LAYER_WEIGHTS = LAYER_WEIGHTS.to(model.device)

# -----------------------------
# 5) 뷰 생성 유틸
# -----------------------------
def make_crops(seq, n_views, max_len):
    """Build the augmented views of one sequence.

    The forward strand is processed first, then its reverse complement. Each
    strand contributes ``n_views`` entries: the whole strand when it is at most
    ``max_len`` characters long, otherwise a ``max_len``-character window whose
    start offset is drawn with ``np.random.randint``.

    Args:
        seq: input sequence string.
        n_views: number of views to produce per strand.
        max_len: maximum view length, in characters.

    Returns:
        A list of ``2 * n_views`` strings.
    """
    views = []
    for strand in (seq, reverse_complement(seq)):
        strand_len = len(strand)
        for _ in range(n_views):
            if strand_len > max_len:
                # upper bound is exclusive, so the last valid start is strand_len - max_len
                begin = np.random.randint(0, strand_len - max_len + 1)
                views.append(strand[begin:begin + max_len])
            else:
                views.append(strand)
    return views

# -----------------------------
# 6) 샘플×뷰 동시 배치 임베딩
# -----------------------------
from contextlib import nullcontext

@torch.no_grad()
def embed_batch(seq_list, n_views=N_VIEWS, max_len=MAX_LEN):
    """Embed a batch of sequences with multi-crop + reverse-complement averaging.

    For every sequence, ``make_crops`` produces the forward/reverse-complement
    views; all views of all sequences are tokenized together and pushed through
    the model in a single forward pass. The last ``len(LAYER_WEIGHTS)`` hidden
    states are combined with ``LAYER_WEIGHTS``, mean-pooled over tokens using
    the attention mask, and finally averaged over the views of each sequence.

    Args:
        seq_list: list of B sequence strings.
        n_views: views per strand passed to ``make_crops``.
        max_len: crop length in characters for ``make_crops``; the same value
            is also given to the tokenizer as ``max_length`` (a token count).

    Returns:
        CPU float32 tensor of shape (B, H); (0, H) for an empty ``seq_list``.

    Raises:
        ValueError: if the model returns fewer hidden states than there are
            entries in ``LAYER_WEIGHTS``.
    """
    # Empty batch: the tokenizer and vstack below cannot handle zero items.
    if len(seq_list) == 0:
        return torch.empty((0, model.config.hidden_size), dtype=torch.float32)

    # 1) Build the crops of every sample and flatten them into one list,
    #    remembering where each sample's views start and end.
    batch_crops = []
    starts = [0]
    for seq in seq_list:
        cs = make_crops(seq, n_views, max_len)
        batch_crops.extend(cs)
        starts.append(starts[-1] + len(cs))

    # 2) Tokenize all V = len(batch_crops) views at once.
    tok = tokenizer(
        batch_crops, return_tensors="pt",
        truncation=True, padding=True, max_length=max_len
    ).to(model.device)

    # 3) AMP is skipped for 8-bit models and on CPU.
    # NOTE(review): the non-8bit model is loaded as float16 while AMP_DTYPE may be
    # bfloat16 on newer GPUs; confirm that this dtype mix is intended.
    use_amp = (not USE_8BIT) and (DEVICE == "cuda")
    autocast_ctx = torch.autocast("cuda", dtype=AMP_DTYPE) if use_amp else nullcontext()

    # The number of layers follows LAYER_WEIGHTS instead of a hard-coded 4.
    n_layers = LAYER_WEIGHTS.numel()

    with autocast_ctx:
        out = model(**tok, output_hidden_states=True)
        layers = out.hidden_states[-n_layers:]
        if len(layers) != n_layers:
            raise ValueError(
                f"LAYER_WEIGHTS has {n_layers} entries but the model returned "
                f"only {len(layers)} hidden states"
            )

        # Weighted layer sum, accumulated layer by layer in float32: avoids
        # materializing a stacked (n_layers, V, T, H) tensor and makes the
        # pooling precision explicit rather than relying on type promotion.
        w = None
        for weight, layer_h in zip(LAYER_WEIGHTS.float().unbind(0), layers):
            term = layer_h.float() * weight                              # (V,T,H)
            w = term if w is None else w + term

        mask = tok["attention_mask"].unsqueeze(-1).to(w.dtype)           # (V,T,1)
        # Masked mean over tokens; clamp guards against an all-zero mask.
        emb = (w * mask).sum(1) / mask.sum(1).clamp(min=1)               # (V,H)

    # 4) Average the views that belong to each original sequence.
    outs = [
        emb[starts[i]:starts[i + 1]].mean(0, keepdim=True)
        for i in range(len(seq_list))
    ]
    return torch.vstack(outs).cpu()                                      # (B, H)

# -----------------------------
# 7) 전체 추론 루프 (샘플 배치 단위)
# -----------------------------
seqs = test_df["seq"].tolist()
ids = test_df["ID"].tolist()

all_ids = []
all_embs = []

# Walk the test set in windows of BATCH_SEQ samples; IDs are collected in the
# same order as the embeddings so the rows stay aligned.
batch_starts = range(0, len(seqs), BATCH_SEQ)
for lo in tqdm(batch_starts, desc="Embedding (samples×views batched)"):
    hi = lo + BATCH_SEQ
    all_embs.append(embed_batch(seqs[lo:hi]))   # one (B, H) tensor per window
    all_ids.extend(ids[lo:hi])

# Stack the per-window tensors into a single (N, H) tensor
emb_tensor = torch.vstack(all_embs)
print("✅ Raw embedding shape:", emb_tensor.shape)

# -----------------------------
# 8) 정규화 + Whitening (PCA)
# -----------------------------
from torch.nn.functional import layer_norm

# Per-row LayerNorm over the feature axis. The explicit float32 cast is a no-op
# for float32 input and keeps this step working if a half-precision tensor arrives.
emb_normed = layer_norm(emb_tensor.float(), emb_tensor.shape[1:])   # (N, H)
emb_np = emb_normed.numpy()

# PCA dimensionality reduction + whitening.
# n_components must not exceed min(n_samples, n_features); after centering the
# data has rank at most n_samples - 1, and whitening a zero-variance component
# would blow up, so clamp by the sample count too (at least 1 component).
n_samples, n_features = emb_np.shape
n_comp = max(1, min(N_PCA, n_features, n_samples - 1))
pca = PCA(n_components=n_comp, whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb_np)                          # (N, n_comp)

# L2-normalize each row (stabilizes cosine similarity); epsilon avoids division by zero
emb_final = emb_pca / (np.linalg.norm(emb_pca, axis=1, keepdims=True) + 1e-9)
print("✅ Final embedding shape:", emb_final.shape)

# -----------------------------
# 9) 제출 파일 생성
# -----------------------------
# Embedding width must not exceed 2048
assert emb_final.shape[1] <= 2048, "임베딩 차원 2048 초과 금지!"

# One zero-padded column name per embedding dimension: emb_0000, emb_0001, ...
n_dims = emb_final.shape[1]
emb_cols = [f"emb_{i:04d}" for i in range(n_dims)]
emb_df = pd.DataFrame(emb_final, columns=emb_cols)

# ID column first, embedding columns after it
id_column = pd.Series(all_ids, name="ID")
submission = pd.concat([id_column, emb_df], axis=1)

out_path = "/content/submission_final_stage2_or_base.csv"
submission.to_csv(out_path, index=False)
print("🎯 Saved:", out_path)

# (Optional) trigger a browser download in Colab; any failure is reported, not raised
try:
    from google.colab import files
    files.download(out_path)
except Exception as err:
    print("ℹ️ files.download 실패 시, 왼쪽 Files 패널에서 직접 다운로드하세요.")
    print("Error:", err)


✅ Device: cuda
✅ Loaded: (13711, 2) (13711, 769)
⚠️ Stage2 ckpt not found or PEFT unavailable. Using base (FP16).


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Embedding (samples×views batched): 100%|██████████| 429/429 [05:33<00:00,  1.29it/s]


✅ Raw embedding shape: torch.Size([13711, 1024])
✅ Final embedding shape: (13711, 512)
🎯 Saved: /content/submission_final_stage2_or_base.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>