<a href="https://colab.research.google.com/github/Hanbin-git/DNA/blob/main/20251114.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (Colab-only API) so files stored on
# Drive can be read through the local filesystem by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -U bitsandbytes peft accelerate transformers datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.18.0-py3-none-any.whl (556 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manyl

In [2]:
# Extract the competition archive stored on Drive into a local working folder.
import zipfile
import os

zip_path = "/content/drive/MyDrive/DNA/open.zip"
extract_dir = "/content/open"

# exist_ok=True makes the cell safe to re-run after the folder already exists.
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Report completion and list what was extracted.
print("✅ 압축 해제 완료!")
print("압축 풀린 파일 목록:", os.listdir(extract_dir))

✅ 압축 해제 완료!
압축 풀린 파일 목록: ['sample_submission.csv', 'test.csv']


In [3]:
# Load the test sequences and the sample submission, then preview them.
# Relies on `os` having been imported by the extraction cell above.
import pandas as pd

data_dir = "/content/open"

test_df = pd.read_csv(os.path.join(data_dir, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"))

print("✅ test_df:", test_df.shape)
print("✅ sub_df:", sub_df.shape)
test_df.head()


✅ test_df: (13711, 2)
✅ sub_df: (13711, 769)


Unnamed: 0,ID,seq
0,TEST_000000,ATCATTTTTATTTTTTAGTTTTATGAGACGCTGCCTTGCTATGTCA...
1,TEST_000001,CGACGTCCCCGTAGCGGCCGAAGTCGAGGGGCAGCAGGCGATCGTG...
2,TEST_000002,GGTAGTAAGAAGGAAAATGACAGCATGGAAGCAGCAATACCAGTAA...
3,TEST_000003,CAGCGCATATACTCAGGGCCATGGTGGGTACTGTTCCCATGGCCAG...
4,TEST_000004,TTCATAATTGCTATCAGTCTATGGGCTAATATTTTATACATCAATG...


In [4]:
# ============================================
# 공통: 모델/데이터 로드 + 기본 설정
# ============================================
import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm

from collections import Counter
from math import log2

# -----------------------------
# Environment setup
# -----------------------------
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Seed both torch and the global NumPy RNG; the random crop offsets used by
# the embedding functions are drawn from np.random.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

print("✅ Device:", DEVICE)

if DEVICE == "cuda":
    # Allow TF32 in matmul / cuDNN and enable the cuDNN autotuner.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

# -----------------------------
# Data loading
# -----------------------------
# Re-reads the same CSVs as the preview cell so this cell can be run on its own.
data_dir = "/content/open"
test_df = pd.read_csv(os.path.join(data_dir, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"))
print("✅ test_df:", test_df.shape)
print("✅ sub_df:", sub_df.shape)

# -----------------------------
# Reverse Complement 함수
# -----------------------------
def reverse_complement(seq: str) -> str:
    """Return the reverse complement of a DNA sequence.

    A<->T and C<->G are swapped and the result is reversed. Both upper- and
    lower-case bases are complemented (case is preserved); any other
    character, such as ``N``, is kept as-is and only changes position.

    Args:
        seq: DNA sequence string.

    Returns:
        The reverse-complemented sequence, same length as ``seq``.
    """
    # Include lower-case bases: with an upper-case-only table a soft-masked
    # sequence would be reversed but silently left uncomplemented.
    tr = str.maketrans("ACGTacgt", "TGCAtgca")
    return seq.translate(tr)[::-1]

# -----------------------------
# Load model / tokenizer (FP32, no AMP)
# -----------------------------
# trust_remote_code=True executes Python files shipped in the model repository.
# NOTE(review): no `revision` is pinned; the download log of this cell itself
# recommends pinning one so that new remote code is not picked up silently.
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
# Move to the selected device and switch to eval mode (inference only).
model = model.to(DEVICE).eval()
model.config.use_cache = False
print("✅ Model loaded.")

# Mixing weights for the last four hidden layers (the final layer gets 0.4).
# Shaped (4, 1, 1, 1) so it broadcasts against the stacked hidden states of
# shape (4, B, T, H) built in the embedding functions.
LAYER_WEIGHTS = torch.tensor([0.1, 0.2, 0.3, 0.4], device=DEVICE).view(4, 1, 1, 1)

# 서열 파생변수 함수들 (gc, entropy, poly_len, cg_count, kmer_div)
def gc_content(seq: str) -> float:
    """Fraction of G and C bases in ``seq`` (case-insensitive).

    Returns 0.0 for an empty sequence.
    """
    upper = seq.upper()
    if not upper:
        return 0.0
    gc_total = upper.count("G") + upper.count("C")
    return gc_total / len(upper)

def seq_entropy(seq: str) -> float:
    """Shannon entropy (in bits) of the character distribution of ``seq``.

    The sequence is upper-cased first, matching ``gc_content``, so that "a"
    and "A" are counted as the same base instead of inflating the entropy.

    Args:
        seq: DNA sequence string.

    Returns:
        Entropy in bits; 0.0 for an empty or single-symbol sequence.
    """
    seq = seq.upper()
    total = len(seq)
    if total == 0:
        return 0.0
    counts = Counter(seq)
    # Every count is >= 1, so each probability is strictly positive and log2
    # is always defined. Negating per term keeps a single-symbol sequence at
    # +0.0 rather than -0.0.
    return sum(-(v / total) * log2(v / total) for v in counts.values())

def longest_repeat(seq: str) -> int:
    """Length of the longest run of one repeated character in ``seq``.

    Returns 0 for an empty sequence and 1 when no character repeats.
    """
    if not seq:
        return 0
    best = run = 1
    # Walk adjacent character pairs; a mismatch starts a new run.
    for prev, cur in zip(seq, seq[1:]):
        run = run + 1 if cur == prev else 1
        if run > best:
            best = run
    return best

def cg_count(seq: str) -> int:
    """Count non-overlapping occurrences of the dinucleotide "CG" in ``seq``.

    The sequence is upper-cased first, matching ``gc_content``, so lower-case
    input is not silently counted as zero.
    """
    return seq.upper().count("CG")

def kmer_diversity(seq: str, k: int = 3) -> float:
    """Ratio of distinct k-mers to the number of k-mer windows in ``seq``.

    The sequence is upper-cased first, matching ``gc_content``, so "acg" and
    "ACG" are counted as the same k-mer.

    Args:
        seq: DNA sequence string.
        k: k-mer length; must be at least 1.

    Returns:
        A value in (0, 1], or 0.0 when ``seq`` is shorter than ``k``.

    Raises:
        ValueError: if ``k`` is smaller than 1 (the ratio is undefined there).
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")
    seq = seq.upper()
    n_windows = len(seq) - k + 1
    if n_windows <= 0:
        return 0.0
    kmers = {seq[i:i + k] for i in range(n_windows)}
    return len(kmers) / n_windows

print("✅ Common setup done.")


✅ Device: cuda
✅ test_df: (13711, 2)
✅ sub_df: (13711, 769)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

✅ Model loaded.
✅ Common setup done.


In [5]:
# ============================================
# Experiment 1.
#  gLM basic pooling + seq features concat → PCA512
# ============================================
from torch.nn.functional import layer_norm

# Number of crops taken from each strand (original and reverse complement).
N_VIEWS = 6
# Used both as the crop length in characters and as the tokenizer's max_length.
MAX_LEN = 1024
# Target dimensionality of the PCA-whitened output.
N_PCA   = 512

@torch.no_grad()
def get_seq_embedding_exp1(seq: str, n_views=N_VIEWS, max_len=MAX_LEN):
    """
    Embed one DNA sequence by mean-pooling model hidden states over several
    views of both strands.

    - Views come from the original sequence and its reverse complement.
    - A strand longer than ``max_len`` characters contributes ``n_views``
      random crops of ``max_len`` characters (offsets from ``np.random``).
    - A strand that already fits contributes a single full-length view. All
      ``n_views`` crops would be identical, so one copy gives the same view
      average without running duplicate rows through the model.
    - All views are run as one batch; the last four hidden layers are mixed
      with ``LAYER_WEIGHTS``, mean-pooled over non-padding tokens, and the
      per-view vectors are averaged.

    Args:
        seq: DNA sequence string.
        n_views: number of random crops per strand for long sequences (>= 1).
        max_len: crop length in characters; also passed to the tokenizer as
            ``max_length``.

    Returns:
        CPU tensor of shape (1, H).

    Raises:
        ValueError: if ``n_views`` is smaller than 1.
    """
    if n_views < 1:
        raise ValueError(f"n_views must be >= 1, got {n_views}")

    srcs = [seq, reverse_complement(seq)]
    subseqs = []
    for s in srcs:
        L = len(s)
        if L <= max_len:
            # Both strands have the same length, so each contributes exactly
            # one view here and they stay equally weighted in the average.
            subseqs.append(s)
            continue
        for _ in range(n_views):
            start = np.random.randint(0, L - max_len + 1)
            subseqs.append(s[start:start + max_len])

    tok = tokenizer(
        subseqs,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len
    ).to(DEVICE)

    out = model(**tok, output_hidden_states=True)
    hs  = torch.stack(out.hidden_states[-4:], dim=0)   # (4, B, T, H)
    weighted = (hs * LAYER_WEIGHTS).sum(0)             # (B, T, H)
    mask = tok["attention_mask"].unsqueeze(-1)         # (B, T, 1)

    # Masked mean over tokens; the clamp guards against an all-padding row.
    emb = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)  # (B, H)
    emb_mean = emb.mean(0, keepdim=True).cpu()                  # (1, H)
    return emb_mean

# 1) Extract gLM embeddings, one sequence at a time
all_ids = []
all_embs = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Exp1 Embedding"):
    emb = get_seq_embedding_exp1(row["seq"])  # (1, H)
    all_ids.append(row["ID"])
    all_embs.append(emb)

emb_tensor = torch.vstack(all_embs)  # (N, H)
print("✅ Raw embedding shape (Exp1):", emb_tensor.shape)

# 2) LayerNorm over the feature dimension of each row (no affine parameters)
emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
emb_np = emb_normed.numpy()  # (N, H)

# 3) Sequence-derived features computed on the test set
#    NOTE: this adds columns to test_df in place.
test_df["gc"]        = test_df["seq"].apply(gc_content)
test_df["entropy"]   = test_df["seq"].apply(seq_entropy)
test_df["poly_len"]  = test_df["seq"].apply(longest_repeat)
test_df["cg_count"]  = test_df["seq"].apply(cg_count)
test_df["kmer_div"]  = test_df["seq"].apply(kmer_diversity)

feat_cols = ["gc", "entropy", "poly_len", "cg_count", "kmer_div"]
feat_np = test_df[feat_cols].values.astype(np.float32)  # (N, 5)

# 4) Standardize the derived features (z-score per column), then concatenate;
#    the 1e-9 avoids division by zero for a constant column.
feat_mean = feat_np.mean(axis=0, keepdims=True)
feat_std  = feat_np.std(axis=0, keepdims=True) + 1e-9
feat_std_np = (feat_np - feat_mean) / feat_std

X_np = np.concatenate([emb_np, feat_std_np], axis=1)  # (N, H+5)
print("✅ Combined feature shape (Exp1):", X_np.shape)

# 5) PCA whitening (up to 512 dimensions), fitted on the test set itself
n_comp = min(N_PCA, X_np.shape[1])
pca = PCA(n_components=n_comp, whiten=True, random_state=SEED)
X_pca = pca.fit_transform(X_np)  # (N, n_comp)

# 6) L2-normalize each row; the epsilon guards against a zero-norm row
X_final = X_pca / (np.linalg.norm(X_pca, axis=1, keepdims=True) + 1e-9)
print("✅ Final embedding shape (Exp1):", X_final.shape)

# 7) Build the submission file: ID column followed by emb_0000, emb_0001, ...
emb_cols = [f"emb_{i:04d}" for i in range(X_final.shape[1])]
emb_df = pd.DataFrame(X_final, columns=emb_cols)
submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

out_path = "/content/submission_exp1_gLM_seqfeat_p512.csv"
submission.to_csv(out_path, index=False)
print("✅ Saved (Exp1):", out_path)

# Trigger a browser download of the CSV (Colab-only API).
from google.colab import files
files.download(out_path)


Exp1 Embedding: 100%|██████████| 13711/13711 [12:10<00:00, 18.77it/s]


✅ Raw embedding shape (Exp1): torch.Size([13711, 1024])
✅ Combined feature shape (Exp1): (13711, 1029)
✅ Final embedding shape (Exp1): (13711, 512)
✅ Saved (Exp1): /content/submission_exp1_gLM_seqfeat_p512.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
# ============================================
# Experiment 2.
#  Position-weighted pooling (center focus) → PCA512
# ============================================
from torch.nn.functional import layer_norm

# Number of crops taken from each strand (original and reverse complement).
N_VIEWS = 6
# Used both as the crop length in characters and as the tokenizer's max_length.
MAX_LEN = 1024
# Target dimensionality of the PCA-whitened output.
N_PCA   = 512

@torch.no_grad()
def get_seq_embedding_exp2(seq: str, n_views=N_VIEWS, max_len=MAX_LEN, sigma_ratio=0.2):
    """
    Embed one DNA sequence with Gaussian position-weighted pooling that
    emphasises the centre of each view.

    - Views come from the original sequence and its reverse complement.
    - A strand longer than ``max_len`` characters contributes ``n_views``
      random crops; a strand that already fits contributes a single
      full-length view, because repeated identical crops would not change
      the view average.
    - The last four hidden layers are mixed with ``LAYER_WEIGHTS``; tokens are
      then averaged with a Gaussian weight centred on the middle of the
      padded token axis, and the per-view vectors are averaged.

    Args:
        seq: DNA sequence string.
        n_views: number of random crops per strand for long sequences (>= 1).
        max_len: crop length in characters; also passed to the tokenizer as
            ``max_length``.
        sigma_ratio: Gaussian standard deviation as a fraction of the padded
            token length T of the batch.

    Returns:
        CPU tensor of shape (1, H).

    Raises:
        ValueError: if ``n_views`` is smaller than 1.
    """
    if n_views < 1:
        raise ValueError(f"n_views must be >= 1, got {n_views}")

    srcs = [seq, reverse_complement(seq)]
    subseqs = []
    for s in srcs:
        L = len(s)
        if L <= max_len:
            # Both strands have the same length, so each contributes exactly
            # one view here and they stay equally weighted in the average.
            subseqs.append(s)
            continue
        for _ in range(n_views):
            start = np.random.randint(0, L - max_len + 1)
            subseqs.append(s[start:start + max_len])

    tok = tokenizer(
        subseqs,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len
    ).to(DEVICE)

    out = model(**tok, output_hidden_states=True)
    hs  = torch.stack(out.hidden_states[-4:], dim=0)   # (4, B, T, H)
    weighted = (hs * LAYER_WEIGHTS).sum(0)             # (B, T, H)
    mask = tok["attention_mask"].unsqueeze(-1)         # (B, T, 1)

    B, T, H = weighted.shape
    idx = torch.arange(T, device=DEVICE)
    center = T // 2
    sigma = T * sigma_ratio  # width scales with the token length of this batch
    pos_w = torch.exp(- (idx - center)**2 / (2 * sigma**2))  # (T,)
    pos_w = pos_w[None, :, None]                             # (1, T, 1)

    w = pos_w * mask                                         # (B, T, 1)
    # Normalise by the true weight sum. Gaussian weights are fractional, so a
    # floor of 1 would shrink the vector whenever the sum falls below 1; a
    # tiny epsilon only protects against division by zero.
    emb = (weighted * w).sum(1) / w.sum(1).clamp(min=1e-6)   # (B, H)

    emb_mean = emb.mean(0, keepdim=True).cpu()               # (1, H)
    return emb_mean

# 1) Extract embeddings, one sequence at a time
all_ids = []
all_embs = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Exp2 Embedding"):
    emb = get_seq_embedding_exp2(row["seq"])
    all_ids.append(row["ID"])
    all_embs.append(emb)

emb_tensor = torch.vstack(all_embs)  # (N, H)
print("✅ Raw embedding shape (Exp2):", emb_tensor.shape)

# 2) LayerNorm (per row, no affine parameters) + PCA whitening + L2 normalize
emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
emb_np = emb_normed.numpy()

n_comp = min(N_PCA, emb_np.shape[1])
pca = PCA(n_components=n_comp, whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb_np)

# The epsilon guards against a zero-norm row.
emb_final = emb_pca / (np.linalg.norm(emb_pca, axis=1, keepdims=True) + 1e-9)
print("✅ Final embedding shape (Exp2):", emb_final.shape)

# 3) Build the submission file: ID column followed by emb_0000, emb_0001, ...
emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
emb_df = pd.DataFrame(emb_final, columns=emb_cols)
submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

out_path = "/content/submission_exp2_posweighted_p512.csv"
submission.to_csv(out_path, index=False)
print("✅ Saved (Exp2):", out_path)

# Trigger a browser download of the CSV (Colab-only API).
from google.colab import files
files.download(out_path)


Exp2 Embedding: 100%|██████████| 13711/13711 [12:09<00:00, 18.79it/s]


✅ Raw embedding shape (Exp2): torch.Size([13711, 1024])
✅ Final embedding shape (Exp2): (13711, 512)
✅ Saved (Exp2): /content/submission_exp2_posweighted_p512.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
# ============================================
# Experiment 3.
#  Multi-resolution pooling (global + center-window + max)
#  + seq features concat → PCA512
# ============================================
from torch.nn.functional import layer_norm

# Number of crops taken from each strand (original and reverse complement).
N_VIEWS   = 6
# Used both as the crop length in characters and as the tokenizer's max_length.
MAX_LEN   = 1024
# Target dimensionality of the PCA-whitened output.
N_PCA     = 512
WINDOW    = 128  # centre window: ±64 tokens around the middle of the token axis

@torch.no_grad()
def get_seq_embedding_exp3(seq: str, n_views=N_VIEWS, max_len=MAX_LEN, window=WINDOW):
    """
    Embed one DNA sequence with multi-resolution pooling.

    - Views come from the original sequence and its reverse complement.
    - A strand longer than ``max_len`` characters contributes ``n_views``
      random crops; a strand that already fits contributes a single
      full-length view, because repeated identical crops would not change
      the view average.
    - The last four hidden layers are mixed with ``LAYER_WEIGHTS``, then three
      poolings are computed per view over the token axis:
        1) global masked mean
        2) masked mean inside a centre window of ``window`` tokens
        3) masked max
    - The three vectors are concatenated into a (3H) vector per view and the
      views are averaged.

    Args:
        seq: DNA sequence string.
        n_views: number of random crops per strand for long sequences (>= 1).
        max_len: crop length in characters; also passed to the tokenizer as
            ``max_length``.
        window: width, in tokens, of the centre window.

    Returns:
        CPU tensor of shape (1, 3H).

    Raises:
        ValueError: if ``n_views`` is smaller than 1.
    """
    if n_views < 1:
        raise ValueError(f"n_views must be >= 1, got {n_views}")

    srcs = [seq, reverse_complement(seq)]
    subseqs = []
    for s in srcs:
        L = len(s)
        if L <= max_len:
            # Both strands have the same length, so each contributes exactly
            # one view here and they stay equally weighted in the average.
            subseqs.append(s)
            continue
        for _ in range(n_views):
            start = np.random.randint(0, L - max_len + 1)
            subseqs.append(s[start:start + max_len])

    tok = tokenizer(
        subseqs,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len
    ).to(DEVICE)

    out = model(**tok, output_hidden_states=True)
    hs  = torch.stack(out.hidden_states[-4:], dim=0)  # (4, B, T, H)
    weighted = (hs * LAYER_WEIGHTS).sum(0)            # (B, T, H)
    mask = tok["attention_mask"].unsqueeze(-1)        # (B, T, 1)

    B, T, H = weighted.shape

    # 1) global mean over non-padding tokens
    mean_global = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)  # (B, H)

    # 2) centre-window mean: keep only tokens inside [start, end)
    center = T // 2
    half = window // 2
    start = max(0, center - half)
    end   = min(T, center + half)
    win_mask = mask.clone()
    # zero out everything outside the window
    if start > 0:
        win_mask[:, :start, :] = 0
    if end < T:
        win_mask[:, end:, :] = 0
    mean_window = (weighted * win_mask).sum(1) / win_mask.sum(1).clamp(min=1)  # (B, H)

    # 3) max pooling; padding positions are pushed to -1e9 so they never win
    weighted_masked = weighted.clone()
    weighted_masked[mask.squeeze(-1) == 0] = -1e9
    max_pool, _ = weighted_masked.max(dim=1)  # (B, H)

    # concat: (B, 3H)
    multi = torch.cat([mean_global, mean_window, max_pool], dim=1)  # (B, 3H)

    # average over views
    multi_mean = multi.mean(0, keepdim=True).cpu()                  # (1, 3H)
    return multi_mean

# 1) Extract embeddings, one sequence at a time
all_ids = []
all_embs = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Exp3 Embedding"):
    emb = get_seq_embedding_exp3(row["seq"])
    all_ids.append(row["ID"])
    all_embs.append(emb)

emb_tensor = torch.vstack(all_embs)  # (N, 3H)
print("✅ Raw embedding shape (Exp3):", emb_tensor.shape)

# 2) LayerNorm over the feature dimension of each row (no affine parameters)
emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
emb_np = emb_normed.numpy()

# 3) Sequence-derived features (recomputed here, to be safe, even if an
#    earlier cell already added these columns to test_df)
test_df["gc"]        = test_df["seq"].apply(gc_content)
test_df["entropy"]   = test_df["seq"].apply(seq_entropy)
test_df["poly_len"]  = test_df["seq"].apply(longest_repeat)
test_df["cg_count"]  = test_df["seq"].apply(cg_count)
test_df["kmer_div"]  = test_df["seq"].apply(kmer_diversity)

feat_cols = ["gc", "entropy", "poly_len", "cg_count", "kmer_div"]
feat_np = test_df[feat_cols].values.astype(np.float32)  # (N, 5)

# 4) Standardize the derived features (z-score per column), then concatenate;
#    the 1e-9 avoids division by zero for a constant column.
feat_mean = feat_np.mean(axis=0, keepdims=True)
feat_std  = feat_np.std(axis=0, keepdims=True) + 1e-9
feat_std_np = (feat_np - feat_mean) / feat_std

X_np = np.concatenate([emb_np, feat_std_np], axis=1)   # (N, 3H+5)
print("✅ Combined feature shape (Exp3):", X_np.shape)

# 5) PCA whitening (up to 512 dimensions) + L2 normalize
n_comp = min(N_PCA, X_np.shape[1])
pca = PCA(n_components=n_comp, whiten=True, random_state=SEED)
X_pca = pca.fit_transform(X_np)

# The epsilon guards against a zero-norm row.
X_final = X_pca / (np.linalg.norm(X_pca, axis=1, keepdims=True) + 1e-9)
print("✅ Final embedding shape (Exp3):", X_final.shape)

# 6) Build the submission file: ID column followed by emb_0000, emb_0001, ...
emb_cols = [f"emb_{i:04d}" for i in range(X_final.shape[1])]
emb_df = pd.DataFrame(X_final, columns=emb_cols)
submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

out_path = "/content/submission_exp3_multires_seqfeat_p512.csv"
submission.to_csv(out_path, index=False)
print("✅ Saved (Exp3):", out_path)

# Trigger a browser download of the CSV (Colab-only API).
from google.colab import files
files.download(out_path)


Exp3 Embedding: 100%|██████████| 13711/13711 [12:11<00:00, 18.76it/s]


✅ Raw embedding shape (Exp3): torch.Size([13711, 3072])
✅ Combined feature shape (Exp3): (13711, 3077)
✅ Final embedding shape (Exp3): (13711, 512)
✅ Saved (Exp3): /content/submission_exp3_multires_seqfeat_p512.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
# ============================================
#  제2회 MAI 대회 Inference (Deterministic + 16 views + Gaussian pooling)
#  Author: Hanbin (GPT-5 assisted)
#  Base: Weighted last4 + RC + Multi-crop + PCA Whitening
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm

# ---------------------------------------------------
# 1) Environment setup
# ---------------------------------------------------
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Re-seed torch and the global NumPy RNG (the latter drives the random crop
# offsets drawn in get_seq_embedding).
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
print(f"Device: {DEVICE}")

# ---------------------------------------------------
# 2) Data loading
# ---------------------------------------------------
# Re-reads test.csv, replacing the test_df that earlier cells extended with
# derived feature columns.
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
print("Loaded:", test_df.shape)

# ---------------------------------------------------
# 3) Reverse Complement
# ---------------------------------------------------
def reverse_complement(seq: str) -> str:
    """Return the reverse complement of a DNA sequence.

    A<->T and C<->G are swapped and the result is reversed. Both upper- and
    lower-case bases are complemented (case is preserved); any other
    character, such as ``N``, is kept as-is and only changes position.

    Args:
        seq: DNA sequence string.

    Returns:
        The reverse-complemented sequence, same length as ``seq``.
    """
    # Include lower-case bases: with an upper-case-only table a soft-masked
    # sequence would be reversed but silently left uncomplemented.
    tr = str.maketrans("ACGTacgt", "TGCAtgca")
    return seq.translate(tr)[::-1]

# ---------------------------------------------------
# 4) Model loading
# ---------------------------------------------------
# NOTE(review): this loads the same MODEL_ID a second time; if the common
# setup cell already ran in this kernel, `tokenizer` and `model` could be
# reused instead of being re-created.
# trust_remote_code=True executes Python files shipped in the model
# repository; no `revision` is pinned here.
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
# Move to the selected device and switch to eval mode (inference only).
model = model.to(DEVICE).eval()
model.config.use_cache = False

print("Model loaded.")

# Mixing weights for the last four hidden layers (the final layer gets 0.4).
# Shaped (4,1,1,1) so it broadcasts against the stacked hidden states of
# shape (4, B, T, H) built in get_seq_embedding.
LAYER_WEIGHTS = torch.tensor([0.1,0.2,0.3,0.4], device=DEVICE).view(4,1,1,1)

# ---------------------------------------------------
# 5) Multi-view embedding (Deterministic + Random + Gaussian pooling)
# ---------------------------------------------------
# Views per strand: 4 evenly spaced offsets plus (N_VIEWS_PER_STRAND - 4)
# random offsets; both strands together give 16 views.
N_VIEWS_PER_STRAND = 8
# Used both as the crop length in characters and as the tokenizer's max_length.
MAX_LEN = 1024
SIGMA_RATIO = 0.20     # Gaussian std-dev as a fraction of the token length T

@torch.no_grad()
def get_seq_embedding(seq: str):
    """
    Embed one DNA sequence from deterministic and random crops of both
    strands, using Gaussian position-weighted pooling.

    - Views come from the original sequence and its reverse complement.
    - A strand longer than ``MAX_LEN`` characters contributes 4 crops at
      evenly spaced offsets plus ``N_VIEWS_PER_STRAND - 4`` crops at random
      offsets (drawn from ``np.random``).
    - A strand that already fits contributes a single full-length view; every
      offset would be 0, so repeating the same row would not change the
      view average.
    - The last four hidden layers are mixed with ``LAYER_WEIGHTS``; tokens are
      averaged with a Gaussian weight centred on the middle of the padded
      token axis (std-dev ``T * SIGMA_RATIO``); the views are then averaged.

    Args:
        seq: DNA sequence string.

    Returns:
        CPU tensor of shape (1, H).
    """
    strands = [seq, reverse_complement(seq)]
    subseqs = []

    for s in strands:
        L = len(s)
        if L <= MAX_LEN:
            # Both strands have the same length, so each contributes exactly
            # one view here and they stay equally weighted in the average.
            offsets = [0]
        else:
            max_offset = L - MAX_LEN

            # deterministic offsets, evenly spaced over the valid range
            det_offsets = np.linspace(0, max_offset, num=4, dtype=int).tolist()

            # random offsets
            rand_offsets = np.random.randint(0, max_offset+1, size=N_VIEWS_PER_STRAND-4).tolist()

            offsets = det_offsets + rand_offsets

        for off in offsets:
            subseqs.append(s[off:off+MAX_LEN])

    # Batch tokenize
    tok = tokenizer(
        subseqs,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LEN
    ).to(DEVICE)

    # Forward
    out = model(**tok, output_hidden_states=True)
    hs = torch.stack(out.hidden_states[-4:], dim=0)      # (4, B, T, H)
    weighted = (hs * LAYER_WEIGHTS).sum(0)              # (B, T, H)
    mask = tok["attention_mask"].unsqueeze(-1)          # (B, T, 1)

    B, T, H = weighted.shape

    # Gaussian weight emphasising the centre of the token axis
    idx = torch.arange(T, device=DEVICE)
    center = T // 2
    sigma = T * SIGMA_RATIO

    pos_w = torch.exp(- (idx - center)**2 / (2*sigma*sigma))
    # Reshape to (1, T, 1) so it broadcasts against the (B, T, 1) mask.
    pos_w = pos_w.view(1, T, 1)

    # combine with mask
    w = pos_w * mask
    # Normalise by the true weight sum. Gaussian weights are fractional, so a
    # floor of 1 would shrink the vector whenever the sum falls below 1; a
    # tiny epsilon only protects against division by zero.
    denom = w.sum(1).clamp(min=1e-6)
    emb = (weighted * w).sum(1) / denom

    return emb.mean(0, keepdim=True).cpu()


# ---------------------------------------------------
# 6) Compute embeddings for the whole test set
# ---------------------------------------------------
all_ids = []
all_embs = []

# One forward pass per sequence; each call returns a (1, H) CPU tensor.
for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Embedding (Det+16+Gauss)"):
    emb = get_seq_embedding(row["seq"])
    all_ids.append(row["ID"])
    all_embs.append(emb)

emb_tensor = torch.vstack(all_embs)
print("Raw embedding shape:", emb_tensor.shape)

# ---------------------------------------------------
# 7) LayerNorm + PCA whitening + L2 normalize
# ---------------------------------------------------
from torch.nn.functional import layer_norm
# Normalise each row over its feature dimension (no affine parameters).
emb_norm = layer_norm(emb_tensor, emb_tensor.shape[1:])
emb_np = emb_norm.numpy()

# PCA cannot return more components than min(n_samples, n_features), so cap
# by both; this keeps the cell usable when run on a small subset of rows.
n_comp = min(512, *emb_np.shape)
pca = PCA(n_components=n_comp, whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb_np)

# Same epsilon as the other experiments, so a zero-norm row yields zeros
# instead of NaN in the submission.
emb_final = emb_pca / (np.linalg.norm(emb_pca, axis=1, keepdims=True) + 1e-9)

# ---------------------------------------------------
# 8) Build the submission file
# ---------------------------------------------------
# Columns: ID followed by emb_0000, emb_0001, ...
emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
emb_df = pd.DataFrame(emb_final, columns=emb_cols)
submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

out_path = "/content/submission_det16_gauss_fixed.csv"
submission.to_csv(out_path, index=False)

print("Saved:", out_path)

# Trigger a browser download of the CSV (Colab-only API).
from google.colab import files
files.download(out_path)


Device: cuda
Loaded: (13711, 2)
Model loaded.


Embedding (Det+16+Gauss): 100%|██████████| 13711/13711 [15:07<00:00, 15.11it/s]


Raw embedding shape: torch.Size([13711, 1024])
Saved: /content/submission_det16_gauss_fixed.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>