<a href="https://colab.research.google.com/github/Hanbin-git/DNA/blob/main/20251115.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so later cells can read files from it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install/upgrade the Hugging Face tooling in the Colab runtime.
# NOTE(review): versions are unpinned (-U pulls whatever is latest), so a re-run
# may resolve to different releases — consider pinning. bitsandbytes/peft/
# accelerate/datasets are not imported by the cells visible here; confirm they
# are still needed.
!pip install -U bitsandbytes peft accelerate transformers datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.18.0-py3-none-any.whl (556 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manyl

In [3]:
# Unpack the dataset archive stored on Google Drive into local Colab storage
import zipfile
import os

zip_path = "/content/drive/MyDrive/DNA/open.zip"
extract_dir = "/content/open"

# Create the destination folder up front (no error if it is already there)
os.makedirs(extract_dir, exist_ok=True)

# Read-only is ZipFile's default mode; the context manager closes the archive
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_dir)

print("✅ 압축 해제 완료!")
print("압축 풀린 파일 목록:", os.listdir(extract_dir))

✅ 압축 해제 완료!
압축 풀린 파일 목록: ['sample_submission.csv', 'test.csv']


In [4]:
# Load the test sequences and the sample submission into DataFrames
import pandas as pd

data_dir = "/content/open"

# Both CSVs live in the same folder, so read them in one pass
test_df, sub_df = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ("test.csv", "sample_submission.csv")
)

print("✅ test_df:", test_df.shape)
print("✅ sub_df:", sub_df.shape)
test_df.head()


✅ test_df: (13711, 2)
✅ sub_df: (13711, 769)


Unnamed: 0,ID,seq
0,TEST_000000,ATCATTTTTATTTTTTAGTTTTATGAGACGCTGCCTTGCTATGTCA...
1,TEST_000001,CGACGTCCCCGTAGCGGCCGAAGTCGAGGGGCAGCAGGCGATCGTG...
2,TEST_000002,GGTAGTAAGAAGGAAAATGACAGCATGGAAGCAGCAATACCAGTAA...
3,TEST_000003,CAGCGCATATACTCAGGGCCATGGTGGGTACTGTTCCCATGGCCAG...
4,TEST_000004,TTCATAATTGCTATCAGTCTATGGGCTAATATTTTATACATCAATG...


In [None]:
# ============================================
#  MAI 대회: 6가지 Inference 실험 자동 실행 스크립트
#  - 공통 gLM: nucleotide-transformer-v2-500m-multi-species
#  - 각 실험별:
#      * deterministic + random multi-crop
#      * reverse complement 포함
#      * pooling (mean / Gaussian) 변경
#      * PCA(512) + L2 normalize
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm
from torch.nn.functional import layer_norm

# ---------------------------------------------------
# 1) 환경 설정
# ---------------------------------------------------
SEED = 42

# Prefer the GPU when one is visible to torch, otherwise run on CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Seed both RNGs used by this notebook (torch and NumPy's global generator)
torch.manual_seed(SEED)
np.random.seed(SEED)
print(f"✅ Device: {DEVICE}")

# ---------------------------------------------------
# 2) Load the data
# ---------------------------------------------------
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
print("✅ Loaded test_df:", test_df.shape)

# ---------------------------------------------------
# 3) Reverse Complement
# ---------------------------------------------------
# Complement lookup, built once instead of on every call.
# Covers upper- and lower-case bases plus the two-/three-base IUPAC ambiguity
# codes. Symbols that are their own complement (N, S, W) and any character not
# listed here pass through translate() unchanged.
_COMPLEMENT_TABLE = str.maketrans(
    "ACGTRYKMBVDHacgtrykmbvdh",
    "TGCAYRMKVBHDtgcayrmkvbhd",
)


def reverse_complement(seq: str) -> str:
    """Return the reverse complement of a DNA sequence.

    Each base is replaced by its complement (A<->T, C<->G, and the IUPAC
    pairs R<->Y, K<->M, B<->V, D<->H) with letter case preserved, then the
    string is reversed. Characters without a mapping (e.g. ``N``) are kept
    as they are.

    Args:
        seq: Nucleotide sequence; may be empty.

    Returns:
        The reverse-complemented sequence, same length as ``seq``.
    """
    return seq.translate(_COMPLEMENT_TABLE)[::-1]

# ---------------------------------------------------
# 4) 모델 로드 (FP32, AMP/compile 사용 X)
# ---------------------------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"

# NOTE(review): trust_remote_code=True executes Python files shipped in the
# model repository; no `revision` is pinned, so the code that runs can change
# between sessions. Consider pinning a commit hash.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load the masked-LM, move it to the target device and switch to inference mode
model = (
    AutoModelForMaskedLM
    .from_pretrained(MODEL_ID, trust_remote_code=True)
    .to(DEVICE)
    .eval()
)
model.config.use_cache = False
print("✅ Model loaded.")

# Weights for the last 4 layers, shaped (4, 1, 1, 1) so they broadcast over a
# stack of four hidden-state tensors
LAYER_WEIGHTS = torch.tensor([0.1, 0.2, 0.3, 0.4], device=DEVICE)[:, None, None, None]

# ---------------------------------------------------
# 5) Multi-view Embedding 함수
#     - config: dict
#       { "n_views", "max_len", "pooling", "sigma_ratio" }
# ---------------------------------------------------
def _crop_offsets(seq_len: int, n_views: int, max_len: int) -> list:
    """Return the crop start positions for one strand of length ``seq_len``."""
    if seq_len <= max_len:
        # The whole strand fits in one crop. Every view would be the same
        # string, so a single copy yields the same view-average without
        # repeating identical forward passes.
        return [0]

    max_offset = seq_len - max_len

    # Up to 4 evenly spaced deterministic crops (start, ~1/3, ~2/3, end) ...
    det_count = min(4, n_views)
    det_offsets = np.linspace(0, max_offset, num=det_count, dtype=int).tolist()

    # ... and the remaining views are drawn from NumPy's global RNG.
    rand_count = n_views - det_count
    rand_offsets = np.random.randint(0, max_offset + 1, size=rand_count).tolist()
    return det_offsets + rand_offsets


def _pool_tokens(weighted, attention_mask, pooling: str, sigma_ratio: float):
    """Collapse (B, T, H) token states into (B, H) with mean or Gaussian weights."""
    mask = attention_mask.unsqueeze(-1).to(weighted.dtype)      # (B, T, 1)

    if pooling == "mean":
        weights = mask
    elif pooling == "gauss":
        n_tokens = weighted.shape[1]
        # Centre and width follow each crop's own token count, so padding does
        # not shift the emphasis away from the middle of the real tokens.
        # Assumes padding is appended on the right — TODO confirm for this tokenizer.
        lengths = mask.sum(1, keepdim=True)                     # (B, 1, 1)
        center = torch.floor(lengths / 2)
        sigma = (lengths * sigma_ratio).clamp(min=1e-6)         # avoid division by zero
        idx = torch.arange(
            n_tokens, device=weighted.device, dtype=weighted.dtype
        ).view(1, n_tokens, 1)
        weights = torch.exp(-((idx - center) ** 2) / (2 * sigma ** 2)) * mask
    else:
        raise ValueError(f"Unknown pooling mode: {pooling}")

    # Normalise by the actual weight mass. The clamp only guards against a
    # zero denominator; it must not floor legitimate sums below 1.
    denom = weights.sum(1).clamp(min=1e-9)                      # (B, 1)
    return (weighted * weights).sum(1) / denom                  # (B, H)


@torch.no_grad()
def get_seq_embedding(seq: str, config: dict):
    """Embed one sequence by averaging pooled embeddings over multiple crops.

    Crops are taken from both the forward strand and its reverse complement.
    For each crop the last four hidden layers are combined with
    ``LAYER_WEIGHTS``, pooled over tokens, and the pooled vectors are then
    averaged across all crops.

    Args:
        seq: Input nucleotide sequence.
        config: Experiment settings with keys
            - ``n_views``: number of crops per strand (>= 1),
            - ``max_len``: crop length in characters; also passed to the
              tokenizer as ``max_length``,
            - ``pooling``: ``"mean"`` or ``"gauss"``,
            - ``sigma_ratio``: Gaussian width as a fraction of the token
              count; only used for ``"gauss"``. Missing or ``None`` falls
              back to 0.2.

    Returns:
        A CPU tensor of shape (1, H).

    Raises:
        ValueError: if ``n_views`` is below 1 or ``pooling`` is unknown.
    """
    n_views = config["n_views"]
    max_len = config["max_len"]
    pooling = config["pooling"]
    # The "mean" configs store sigma_ratio=None explicitly, which .get() would
    # return as-is, so treat None the same as a missing key.
    sigma_ratio = config.get("sigma_ratio")
    if sigma_ratio is None:
        sigma_ratio = 0.2

    # Fail fast, before paying for a forward pass.
    if n_views < 1:
        raise ValueError(f"n_views must be >= 1, got {n_views}")
    if pooling not in ("mean", "gauss"):
        raise ValueError(f"Unknown pooling mode: {pooling}")

    subseqs = []
    for strand in (seq, reverse_complement(seq)):
        for off in _crop_offsets(len(strand), n_views, max_len):
            subseqs.append(strand[off:off + max_len])

    tok = tokenizer(
        subseqs,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len,
    ).to(DEVICE)

    out = model(**tok, output_hidden_states=True)
    hs = torch.stack(out.hidden_states[-4:], dim=0)      # (4, B, T, H)
    weighted = (hs * LAYER_WEIGHTS).sum(0)               # (B, T, H)

    emb = _pool_tokens(weighted, tok["attention_mask"], pooling, sigma_ratio)

    # Average over views
    emb_mean = emb.mean(0, keepdim=True).cpu()           # (1, H)
    return emb_mean

# ---------------------------------------------------
# 6) 실험 설정 6개 정의
# ---------------------------------------------------
def _make_experiment(tag, n_views, max_len, pooling, sigma_ratio=None):
    """Build one experiment config dict (key order: tag, n_views, max_len, pooling, sigma_ratio)."""
    return {
        "tag": tag,
        "n_views": n_views,
        "max_len": max_len,
        "pooling": pooling,
        "sigma_ratio": sigma_ratio,
    }


# Two mean-pooling runs followed by four Gaussian-pooling runs
EXPERIMENTS = [
    _make_experiment("exp1_mean_nv6_len1024", 6, 1024, "mean"),
    _make_experiment("exp2_mean_nv8_len1024", 8, 1024, "mean"),
    _make_experiment("exp3_gauss_nv8_len1024_s015", 8, 1024, "gauss", 0.15),
    _make_experiment("exp4_gauss_nv8_len1024_s020", 8, 1024, "gauss", 0.20),
    _make_experiment("exp5_gauss_nv8_len896_s020", 8, 896, "gauss", 0.20),
    _make_experiment("exp6_gauss_nv10_len1024_s020", 10, 1024, "gauss", 0.20),
]

# Echo every configuration so the run log records what was executed
print("✅ Experiments:")
for cfg in EXPERIMENTS:
    print(f"  - {cfg['tag']} {cfg}")

# ---------------------------------------------------
# 7) 실험 루프: 각 설정별로 embedding → PCA512 → CSV 저장
# ---------------------------------------------------
results_paths = []

for cfg in EXPERIMENTS:
    tag = cfg["tag"]
    print("\n" + "="*80)
    print(f"🚀 Running experiment: {tag}")
    print("="*80)

    # Re-seed per experiment so the random crops of one experiment do not
    # depend on which experiments ran before it (or on their n_views).
    np.random.seed(SEED)

    # Plain column access: only ID and seq are needed, so iterrows() is overkill.
    all_ids = test_df["ID"].tolist()
    all_embs = [
        get_seq_embedding(seq, cfg)                # (1, H)
        for seq in tqdm(test_df["seq"], total=len(test_df), desc=f"Embedding ({tag})")
    ]

    emb_tensor = torch.vstack(all_embs)           # (N, H)
    print(f"  ➤ Raw embedding shape [{tag}]:", emb_tensor.shape)

    # LayerNorm over the feature dimension of each row (no learned scale/shift)
    emb_norm = layer_norm(emb_tensor, emb_tensor.shape[1:])
    emb_np = emb_norm.numpy()

    # PCA whitening (up to 512 dims). PCA cannot return more components than
    # min(n_samples, n_features), so bound by both to keep small test sets working.
    n_comp = min(512, emb_np.shape[0], emb_np.shape[1])
    pca = PCA(n_components=n_comp, whiten=True, random_state=SEED)
    emb_pca = pca.fit_transform(emb_np)

    # L2 normalize each row; the epsilon guards against an all-zero row
    emb_final = emb_pca / (np.linalg.norm(emb_pca, axis=1, keepdims=True) + 1e-9)
    print(f"  ➤ Final embedding shape [{tag}]:", emb_final.shape)

    # Save as CSV: ID column followed by emb_0000, emb_0001, ...
    emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
    emb_df = pd.DataFrame(emb_final, columns=emb_cols)
    submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

    out_path = f"/content/submission_{tag}.csv"
    submission.to_csv(out_path, index=False)
    results_paths.append(out_path)
    print(f"  ✅ Saved: {out_path}")

print("\n✅ All experiments finished.")
print("Generated files:")
for p in results_paths:
    print("  -", p)

# (Optional: uncomment below to download each file individually)
# from google.colab import files
# for p in results_paths:
#     files.download(p)


✅ Device: cuda
✅ Loaded test_df: (13711, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

✅ Model loaded.
✅ Experiments:
  - exp1_mean_nv6_len1024 {'tag': 'exp1_mean_nv6_len1024', 'n_views': 6, 'max_len': 1024, 'pooling': 'mean', 'sigma_ratio': None}
  - exp2_mean_nv8_len1024 {'tag': 'exp2_mean_nv8_len1024', 'n_views': 8, 'max_len': 1024, 'pooling': 'mean', 'sigma_ratio': None}
  - exp3_gauss_nv8_len1024_s015 {'tag': 'exp3_gauss_nv8_len1024_s015', 'n_views': 8, 'max_len': 1024, 'pooling': 'gauss', 'sigma_ratio': 0.15}
  - exp4_gauss_nv8_len1024_s020 {'tag': 'exp4_gauss_nv8_len1024_s020', 'n_views': 8, 'max_len': 1024, 'pooling': 'gauss', 'sigma_ratio': 0.2}
  - exp5_gauss_nv8_len896_s020 {'tag': 'exp5_gauss_nv8_len896_s020', 'n_views': 8, 'max_len': 896, 'pooling': 'gauss', 'sigma_ratio': 0.2}
  - exp6_gauss_nv10_len1024_s020 {'tag': 'exp6_gauss_nv10_len1024_s020', 'n_views': 10, 'max_len': 1024, 'pooling': 'gauss', 'sigma_ratio': 0.2}

🚀 Running experiment: exp1_mean_nv6_len1024


Embedding (exp1_mean_nv6_len1024):  28%|██▊       | 3882/13711 [22:54<58:00,  2.82it/s]