<a href="https://colab.research.google.com/github/Hanbin-git/DNA/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -U bitsandbytes peft accelerate transformers datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.18.0-py3-none-any.whl (556 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manyl

In [3]:
# 압축풀기
import zipfile
import os

zip_path = "/content/drive/MyDrive/DNA/open.zip"
extract_dir = "/content/open"

os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("✅ 압축 해제 완료!")
print("압축 풀린 파일 목록:", os.listdir(extract_dir))

✅ 압축 해제 완료!
압축 풀린 파일 목록: ['test.csv', 'sample_submission.csv']


In [None]:
# ============================================
#  Advanced MAI Inference Model (No Train CSV Required)
#  - SAP Attention Pooling
#  - Projection Head (random init)
#  - Asymmetric RC Weighting
#  - Multi-crop (Orig 12 + RC 4)
#  - Weighted Last4 Hidden Mean
#  Author: Hanbin + GPT-5
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm

# ---------------------------------------------------
# Device / Seed
# ---------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
print("Device:", DEVICE)

# ---------------------------------------------------
# Load test.csv only
# ---------------------------------------------------
data_path = "/content/open"

test_df  = pd.read_csv(os.path.join(data_path, "test.csv"))
sub_df   = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))

print("Loaded test:", test_df.shape)


# ---------------------------------------------------
# Reverse Complement
# ---------------------------------------------------
def reverse_complement(seq):
    table = str.maketrans("ACGT", "TGCA")
    return seq.translate(table)[::-1]


# ---------------------------------------------------
# SAP Attention Pooler
# ---------------------------------------------------
class SAP(torch.nn.Module):
    def __init__(self, hidden_dim=1280, heads=8):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(hidden_dim, heads, batch_first=True)
        self.norm = torch.nn.LayerNorm(hidden_dim)

    def forward(self, x, mask):
        out, _ = self.attn(x, x, x, key_padding_mask=~mask.bool())
        out = out.mean(1)
        return self.norm(out)


# ---------------------------------------------------
# Projection head (random weights since train.csv 없음)
# ---------------------------------------------------
class ProjectionHead(torch.nn.Module):
    def __init__(self, in_dim=1280, out_dim=512):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(in_dim, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, out_dim)
        )

    def forward(self, x):
        return self.net(x)


# ---------------------------------------------------
# Load Base Model
# ---------------------------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model.to(DEVICE).eval()
model.config.use_cache = False

sap = SAP(hidden_dim=model.config.hidden_size).to(DEVICE)
proj = ProjectionHead(in_dim=model.config.hidden_size, out_dim=512).to(DEVICE)

print("Model loaded.")


# ---------------------------------------------------
# Embedding function
# ---------------------------------------------------
@torch.no_grad()
def embed_sequence(seq, n_orig=12, n_rc=4, max_len=1024):
    layer_w = torch.tensor([0.1, 0.2, 0.3, 0.4]).to(DEVICE)

    def _embed_once(s, n_views):
        embs = []
        for _ in range(n_views):
            offset = np.random.randint(0, max(1, len(s)-max_len))
            sub = s[offset:offset+max_len]

            tok = tokenizer(sub, return_tensors="pt",
                            truncation=True, padding=True,
                            max_length=max_len).to(DEVICE)

            out = model(**tok, output_hidden_states=True)
            hs = torch.stack(out.hidden_states[-4:], dim=0)
            weighted = (hs * layer_w.view(4,1,1,1)).sum(0)

            mask = tok["attention_mask"]

            pooled = sap(weighted, mask)      # (B,H)
            embs.append(pooled)

        return torch.stack(embs, 0).mean(0)  # (H)

    # Original + RC
    e_orig = _embed_once(seq, n_orig)
    e_rc   = _embed_once(reverse_complement(seq), n_rc)

    emb = 0.7 * e_orig + 0.3 * e_rc
    return proj(emb).cpu()   # projection head 적용


# ---------------------------------------------------
# All test embedding
# ---------------------------------------------------
all_ids = []
all_embs = []

print("Embedding test...")

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    all_ids.append(row["ID"])
    all_embs.append(embed_sequence(row["seq"]))

emb = torch.vstack(all_embs).numpy()
print("Embedding shape:", emb.shape)


# ---------------------------------------------------
# PCA Whitening
# ---------------------------------------------------
pca_dim = 512
pca = PCA(n_components=pca_dim, whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb)

emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)


# ---------------------------------------------------
# Build submission
# ---------------------------------------------------
cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
submission = pd.DataFrame({"ID": all_ids})

for i, c in enumerate(cols):
    submission[c] = emb_final[:, i]

out_path = "/content/submission_attention_proj.csv"
submission.to_csv(out_path, index=False)
print("Saved:", out_path)


# ---------------------------------------------------
# Download
# ---------------------------------------------------
from google.colab import files
files.download(out_path)


Device: cuda
Loaded test: (13711, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Model loaded.
Embedding test...


100%|██████████| 13711/13711 [2:00:03<00:00,  1.90it/s]


Embedding shape: (13711, 512)


  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c

Saved: /content/submission_attention_proj.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# ============================================
#  A형: Original Best (PCA 512 + Whitening)
#  -> 0.58694 나왔던 구조 그대로
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm

# ---------------------------------------------------
# 1) 환경 설정
# ---------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
print(f"✅ Device: {DEVICE}")

# ---------------------------------------------------
# 2) 데이터 로드
# ---------------------------------------------------
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))
print("✅ Loaded:", test_df.shape)

# ---------------------------------------------------
# 3) Reverse Complement
# ---------------------------------------------------
def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

# ---------------------------------------------------
# 4) gLM 로드
# ---------------------------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.to(DEVICE).eval()
model.config.use_cache = False

print("✅ Model loaded.")

# ---------------------------------------------------
# 5) Multi-crop + Weighted last4
# ---------------------------------------------------
@torch.no_grad()
def get_seq_embedding(seq, model, tokenizer, n_views=6, max_len=1024):
    """랜덤 crop + reverse complement 평균 포함 (원본 그대로)"""
    embs = []
    seqs = [seq, reverse_complement(seq)]  # 원본 + 역상보
    layer_weights = torch.tensor([0.1, 0.2, 0.3, 0.4]).to(DEVICE)

    for s in seqs:
        for _ in range(n_views):
            offset = np.random.randint(0, max(1, len(s) - max_len))
            sub_seq = s[offset:offset + max_len]
            tok = tokenizer(
                sub_seq, return_tensors="pt", truncation=True, padding=True, max_length=max_len
            ).to(DEVICE)
            out = model(**tok, output_hidden_states=True)
            hs = torch.stack(out.hidden_states[-4:], dim=0)  # (4, B, L, H)
            weighted = (hs * layer_weights.view(4,1,1,1)).sum(0)
            mask = tok["attention_mask"].unsqueeze(-1)
            emb = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)
            embs.append(emb.cpu())

    return torch.stack(embs).mean(0)  # 모든 view 평균

# ---------------------------------------------------
# 6) 전체 임베딩
# ---------------------------------------------------
all_ids = []
all_embs = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    emb = get_seq_embedding(row["seq"], model, tokenizer, n_views=6, max_len=1024)
    all_ids.append(row["ID"])
    all_embs.append(emb)

emb_tensor = torch.vstack(all_embs)
print("✅ Embedding tensor shape:", emb_tensor.shape)

# ---------------------------------------------------
# 7) LayerNorm + PCA(512) + Whitening + L2
# ---------------------------------------------------
from torch.nn.functional import layer_norm

emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
emb_np = emb_normed.numpy()

pca = PCA(n_components=min(512, emb_np.shape[1]), whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb_np)

emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)
print("✅ Final embedding shape:", emb_final.shape)

# ---------------------------------------------------
# 8) 제출 파일 생성
# ---------------------------------------------------
emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
emb_df = pd.DataFrame(emb_final, columns=emb_cols)
submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

out_path = "/content/submission_A_base512.csv"
submission.to_csv(out_path, index=False)
print("✅ Saved submission file:", out_path)

# ---------------------------------------------------
# 9) 다운로드
# ---------------------------------------------------
from google.colab import files
files.download(out_path)


✅ Device: cpu
✅ Loaded: (13711, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

✅ Model loaded.


  0%|          | 2/13711 [00:25<47:57:58, 12.60s/it]

In [4]:
# ============================================
#  E 전략 (새로운 방향): Forward-only + Center-focused Multi-crop
#  - RC 전혀 사용 안 함
#  - 중앙 근처 window 여러 개
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
print("✅ Device:", DEVICE)

data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))
print("✅ Loaded:", test_df.shape)

MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.to(DEVICE).eval()
model.config.use_cache = False
print("✅ Model loaded.")

# Forward-only, center-focused crops
@torch.no_grad()
def get_seq_embedding_forward_center(seq, max_len=1024):
    """
    - RC 무시, forward strand만 사용
    - center 근처에 deterministic + 약간 random crop
    """
    layer_weights = torch.tensor([0.1, 0.2, 0.3, 0.4], device=DEVICE)
    embs = []

    s = seq
    L = len(s)
    # 길이 동일(1024)이므로 center 기반 shift 정의
    center = (L - max_len) // 2  # 보통 0이지만 일반화
    candidates = [center - 50, center - 25, center, center + 25, center + 50]
    offsets = []
    for off in candidates:
        offsets.append(max(0, min(off, L - max_len)))

    # 여기에 random view 두 개 추가
    for _ in range(3):
        offsets.append(np.random.randint(0, max(1, L - max_len)))

    for off in offsets:
        sub_seq = s[off:off + max_len]
        tok = tokenizer(
            sub_seq, return_tensors="pt",
            truncation=True, padding=True, max_length=max_len
        ).to(DEVICE)
        out = model(**tok, output_hidden_states=True)
        hs = torch.stack(out.hidden_states[-4:], dim=0)
        weighted = (hs * layer_weights.view(4,1,1,1)).sum(0)
        mask = tok["attention_mask"].unsqueeze(-1)
        emb = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)
        embs.append(emb.squeeze(0).cpu())

    return torch.stack(embs).mean(0)

all_ids = []
all_embs = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Embedding E (forward-center)"):
    emb = get_seq_embedding_forward_center(row["seq"])
    all_ids.append(row["ID"])
    all_embs.append(emb)

emb_tensor = torch.vstack(all_embs)
print("✅ Embedding tensor shape:", emb_tensor.shape)

from torch.nn.functional import layer_norm
emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
emb_np = emb_normed.numpy()

pca = PCA(n_components=min(512, emb_np.shape[1]), whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb_np)
emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)
print("✅ Final embedding shape:", emb_final.shape)

emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
emb_df = pd.DataFrame(emb_final, columns=emb_cols)
submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

out_path = "/content/submission_E_forwardCenter.csv"
submission.to_csv(out_path, index=False)
print("✅ Saved submission file:", out_path)

from google.colab import files
files.download(out_path)


✅ Device: cuda
✅ Loaded: (13711, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

✅ Model loaded.


Embedding E (forward-center): 100%|██████████| 13711/13711 [58:51<00:00,  3.88it/s]


✅ Embedding tensor shape: torch.Size([13711, 1024])
✅ Final embedding shape: (13711, 512)
✅ Saved submission file: /content/submission_E_forwardCenter.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# ============================================
#  제2회 MAI 대회 Inference (배치 + 고GPU 사용 버전)
#  - Base 전략: Weighted last4 + Multi-crop + RC + PCA512 + Whitening
#  - 변경점:
#      * 여러 시퀀스를 한 번에 배치로 처리 (BATCH_SEQS)
#      * 각 시퀀스당 view 수 유지(또는 증가)하면서도 GPU 한 번에 많이 사용
#  - FP32 유지 (AMP, bf16/FP16 사용 X) → 수치 안정성 유지
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm

# ---------------------------------------------------
# 1️⃣ 환경 설정
# ---------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
print(f"✅ Device: {DEVICE}")

# ⚙️ GPU 메모리/연산량 조절 파라미터
BATCH_SEQS = 8    # 한 번에 처리할 시퀀스 개수 (GPU 여유되면 8→12→16까지 올려봐도 됨)
N_VIEWS    = 6    # 각 strand당 view 개수 (원래 코드와 동일; 8로 살짝 올려볼 수도 있음)
MAX_LEN    = 1024 # 시퀀스 길이 (데이터가 1024로 고정이니 그대로)

# ---------------------------------------------------
# 2️⃣ 데이터 로드
# ---------------------------------------------------
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))
print("✅ Loaded:", test_df.shape)

# ---------------------------------------------------
# 3️⃣ Reverse Complement
# ---------------------------------------------------
def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

# ---------------------------------------------------
# 4️⃣ gLM 로드 (FP32, no AMP)
# ---------------------------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.to(DEVICE).eval()
model.config.use_cache = False

print("✅ Model loaded.")

# ---------------------------------------------------
# 5️⃣ 배치 단위 embedding 함수
#     - 여러 개 시퀀스를 한 번에 subseq들로 펼쳐서 model에 넣고
#       다시 시퀀스별로 평균 내는 구조
#     - last4 layer weighted sum, RC 포함, random crop 유지
# ---------------------------------------------------
@torch.no_grad()
def embed_batch(seqs, n_views=N_VIEWS, max_len=MAX_LEN):
    """
    seqs: 리스트[str], 길이 = batch_size
    반환: (batch_size, H) PyTorch tensor (CPU 상)
    """
    batch_size = len(seqs)
    layer_weights = torch.tensor([0.1, 0.2, 0.3, 0.4], device=DEVICE)

    subseqs = []  # 모든 view에 대한 subseq 문자열
    owners  = []  # subseq가 어떤 seq index에 속하는지 기록 (0 ~ batch_size-1)

    # 1) subseq들 만들기 (forward + RC, 각 n_views개)
    for idx, seq in enumerate(seqs):
        s_fwd = seq
        s_rc  = reverse_complement(seq)
        Lf = len(s_fwd)
        Lr = len(s_rc)

        # forward views
        for _ in range(n_views):
            if Lf <= max_len:
                off = 0
            else:
                off = np.random.randint(0, Lf - max_len + 1)
            subseq = s_fwd[off:off + max_len]
            subseqs.append(subseq)
            owners.append(idx)

        # reverse complement views
        for _ in range(n_views):
            if Lr <= max_len:
                off = 0
            else:
                off = np.random.randint(0, Lr - max_len + 1)
            subseq = s_rc[off:off + max_len]
            subseqs.append(subseq)
            owners.append(idx)

    # 2) 토크나이즈 + 모델 forward (한 번에)
    tok = tokenizer(
        subseqs,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len,
    ).to(DEVICE)

    out = model(**tok, output_hidden_states=True)
    # hidden_states: tuple(len_layers+1), 각 (B_total, T, H)
    hs = torch.stack(out.hidden_states[-4:], dim=0)          # (4, B_total, T, H)
    weighted = (hs * layer_weights.view(4,1,1,1)).sum(0)     # (B_total, T, H)

    mask = tok["attention_mask"].unsqueeze(-1)               # (B_total, T, 1)
    emb_views = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)  # (B_total, H)

    # 3) 시퀀스별로 view 평균
    B_total, H = emb_views.shape
    emb_views_cpu = emb_views.detach().cpu()

    sums = torch.zeros((batch_size, H), dtype=emb_views_cpu.dtype)
    counts = torch.zeros(batch_size, dtype=torch.long)

    for i, owner in enumerate(owners):
        sums[owner] += emb_views_cpu[i]
        counts[owner] += 1

    # 혹시라도 count가 0인 경우 방지 (이론상 발생하지 않음)
    counts = counts.clamp(min=1)
    embs_batch = sums / counts.unsqueeze(1)   # (batch_size, H)

    return embs_batch  # CPU tensor

# ---------------------------------------------------
# 6️⃣ 전체 test에 대해 배치 추론
# ---------------------------------------------------
all_ids = []
all_embs = []

num_samples = len(test_df)
for start in tqdm(range(0, num_samples, BATCH_SEQS), desc="Embedding (batched)"):
    end = min(start + BATCH_SEQS, num_samples)
    batch_df = test_df.iloc[start:end]
    seqs = batch_df["seq"].tolist()
    ids  = batch_df["ID"].tolist()

    emb_batch = embed_batch(seqs, n_views=N_VIEWS, max_len=MAX_LEN)  # (batch_size, H)

    all_ids.extend(ids)
    all_embs.append(emb_batch)

emb_tensor = torch.vstack(all_embs)  # (N, H), CPU tensor
print("✅ Embedding tensor shape:", emb_tensor.shape)

# ---------------------------------------------------
# 7️⃣ LayerNorm + PCA(512, whiten=True) + L2 normalize
# ---------------------------------------------------
from torch.nn.functional import layer_norm

emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
emb_np = emb_normed.numpy()

pca = PCA(n_components=min(512, emb_np.shape[1]), whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb_np)

emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)
print("✅ Final embedding shape:", emb_final.shape)

# ---------------------------------------------------
# 8️⃣ 제출 파일 생성
# ---------------------------------------------------
emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
emb_df = pd.DataFrame(emb_final, columns=emb_cols)
submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

out_path = "/content/submission_batched_highGPU.csv"
submission.to_csv(out_path, index=False)
print("✅ Saved submission file:", out_path)

# ---------------------------------------------------
# 9️⃣ 로컬 다운로드
# ---------------------------------------------------
from google.colab import files
files.download(out_path)


✅ Device: cuda
✅ Loaded: (13711, 2)
✅ Model loaded.


Embedding (batched):   2%|▏         | 38/1714 [00:42<31:24,  1.12s/it]