<a href="https://colab.research.google.com/github/Hanbin-git/DNA/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -U bitsandbytes peft accelerate transformers datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.18.0-py3-none-any.whl (556 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manyl

In [3]:
# 압축풀기
import zipfile
import os

zip_path = "/content/drive/MyDrive/DNA/open.zip"
extract_dir = "/content/open"

os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("✅ 압축 해제 완료!")
print("압축 풀린 파일 목록:", os.listdir(extract_dir))

✅ 압축 해제 완료!
압축 풀린 파일 목록: ['sample_submission.csv', 'test.csv']


In [5]:
# ============================================
#  Advanced MAI Inference Model (No Train CSV Required)
#  - SAP Attention Pooling
#  - Projection Head (random init)
#  - Asymmetric RC Weighting
#  - Multi-crop (Orig 12 + RC 4)
#  - Weighted Last4 Hidden Mean
#  Author: Hanbin + GPT-5
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm

# ---------------------------------------------------
# Device / Seed
# ---------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
print("Device:", DEVICE)

# ---------------------------------------------------
# Load test.csv only
# ---------------------------------------------------
data_path = "/content/open"

test_df  = pd.read_csv(os.path.join(data_path, "test.csv"))
sub_df   = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))

print("Loaded test:", test_df.shape)


# ---------------------------------------------------
# Reverse Complement
# ---------------------------------------------------
def reverse_complement(seq):
    table = str.maketrans("ACGT", "TGCA")
    return seq.translate(table)[::-1]


# ---------------------------------------------------
# SAP Attention Pooler
# ---------------------------------------------------
class SAP(torch.nn.Module):
    def __init__(self, hidden_dim=1280, heads=8):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(hidden_dim, heads, batch_first=True)
        self.norm = torch.nn.LayerNorm(hidden_dim)

    def forward(self, x, mask):
        out, _ = self.attn(x, x, x, key_padding_mask=~mask.bool())
        out = out.mean(1)
        return self.norm(out)


# ---------------------------------------------------
# Projection head (random weights since train.csv 없음)
# ---------------------------------------------------
class ProjectionHead(torch.nn.Module):
    def __init__(self, in_dim=1280, out_dim=512):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(in_dim, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, out_dim)
        )

    def forward(self, x):
        return self.net(x)


# ---------------------------------------------------
# Load Base Model
# ---------------------------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model.to(DEVICE).eval()
model.config.use_cache = False

sap = SAP(hidden_dim=model.config.hidden_size).to(DEVICE)
proj = ProjectionHead(in_dim=model.config.hidden_size, out_dim=512).to(DEVICE)

print("Model loaded.")


# ---------------------------------------------------
# Embedding function
# ---------------------------------------------------
@torch.no_grad()
def embed_sequence(seq, n_orig=12, n_rc=4, max_len=1024):
    layer_w = torch.tensor([0.1, 0.2, 0.3, 0.4]).to(DEVICE)

    def _embed_once(s, n_views):
        embs = []
        for _ in range(n_views):
            offset = np.random.randint(0, max(1, len(s)-max_len))
            sub = s[offset:offset+max_len]

            tok = tokenizer(sub, return_tensors="pt",
                            truncation=True, padding=True,
                            max_length=max_len).to(DEVICE)

            out = model(**tok, output_hidden_states=True)
            hs = torch.stack(out.hidden_states[-4:], dim=0)
            weighted = (hs * layer_w.view(4,1,1,1)).sum(0)

            mask = tok["attention_mask"]

            pooled = sap(weighted, mask)      # (B,H)
            embs.append(pooled)

        return torch.stack(embs, 0).mean(0)  # (H)

    # Original + RC
    e_orig = _embed_once(seq, n_orig)
    e_rc   = _embed_once(reverse_complement(seq), n_rc)

    emb = 0.7 * e_orig + 0.3 * e_rc
    return proj(emb).cpu()   # projection head 적용


# ---------------------------------------------------
# All test embedding
# ---------------------------------------------------
all_ids = []
all_embs = []

print("Embedding test...")

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    all_ids.append(row["ID"])
    all_embs.append(embed_sequence(row["seq"]))

emb = torch.vstack(all_embs).numpy()
print("Embedding shape:", emb.shape)


# ---------------------------------------------------
# PCA Whitening
# ---------------------------------------------------
pca_dim = 512
pca = PCA(n_components=pca_dim, whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb)

emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)


# ---------------------------------------------------
# Build submission
# ---------------------------------------------------
cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
submission = pd.DataFrame({"ID": all_ids})

for i, c in enumerate(cols):
    submission[c] = emb_final[:, i]

out_path = "/content/submission_attention_proj.csv"
submission.to_csv(out_path, index=False)
print("Saved:", out_path)


# ---------------------------------------------------
# Download
# ---------------------------------------------------
from google.colab import files
files.download(out_path)


Device: cuda
Loaded test: (13711, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Model loaded.
Embedding test...


100%|██████████| 13711/13711 [2:00:03<00:00,  1.90it/s]


Embedding shape: (13711, 512)


  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c] = emb_final[:, i]
  submission[c

Saved: /content/submission_attention_proj.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>