<a href="https://colab.research.google.com/github/Hanbin-git/DNA/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -U bitsandbytes peft accelerate transformers datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.18.0-py3-none-any.whl (556 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manyl

In [3]:
# 압축풀기
import zipfile
import os

zip_path = "/content/drive/MyDrive/DNA/open.zip"
extract_dir = "/content/open"

os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("✅ 압축 해제 완료!")
print("압축 풀린 파일 목록:", os.listdir(extract_dir))

✅ 압축 해제 완료!
압축 풀린 파일 목록: ['sample_submission.csv', 'test.csv']


In [4]:
# 데이터 불러오기
import pandas as pd

data_dir = "/content/open"

test_df = pd.read_csv(os.path.join(data_dir, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"))

print("✅ test_df:", test_df.shape)
print("✅ sub_df:", sub_df.shape)
test_df.head()


✅ test_df: (13711, 2)
✅ sub_df: (13711, 769)


Unnamed: 0,ID,seq
0,TEST_000000,ATCATTTTTATTTTTTAGTTTTATGAGACGCTGCCTTGCTATGTCA...
1,TEST_000001,CGACGTCCCCGTAGCGGCCGAAGTCGAGGGGCAGCAGGCGATCGTG...
2,TEST_000002,GGTAGTAAGAAGGAAAATGACAGCATGGAAGCAGCAATACCAGTAA...
3,TEST_000003,CAGCGCATATACTCAGGGCCATGGTGGGTACTGTTCCCATGGCCAG...
4,TEST_000004,TTCATAATTGCTATCAGTCTATGGGCTAATATTTTATACATCAATG...


In [5]:
# ============================================
#  MAI 대회 – gLM2 기반 24종 자동 제출 생성기
#  Strategy: Multi-crop + RC + last4 weighted + PCA sweep + weight sweep
#  GPU 20GB 완전 활용 (batch embedding)
# ============================================

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from torch.nn.functional import layer_norm

# ---------------------------------------------------
# 1) 환경 설정
# ---------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

print("DEVICE:", DEVICE)

# ---------------------------------------------------
# 2) 데이터 로드
# ---------------------------------------------------
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
print(test_df.shape)

# ---------------------------------------------------
# 3) reverse complement
# ---------------------------------------------------
def reverse_complement(seq):
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

# ---------------------------------------------------
# 4) 모델 로드
# ---------------------------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.to(DEVICE).eval()
model.config.use_cache = False

print("Model loaded.")


# ---------------------------------------------------
# 5) Batch embedding 함수 (GPU 20GB 풀활용)
# ---------------------------------------------------
@torch.no_grad()
def embed_batch(seqs, layer_w, n_views=6, max_len=1024):
    """
    seqs: list[str]  (batch size)
    batch size는 GPU 메모리에 따라 조절: 16~24 추천 (20GB 기준)
    """

    batch_list = []  # embedding 저장

    # seqs마다 multi-view 생성
    all_subseqs = []
    id_map = []  # subseq가 어느 sample의 몇 번째 view인지 기록 (나중에 평균)

    for i, seq in enumerate(seqs):
        seq_rc = reverse_complement(seq)

        # 원본 + RC 각각 n_views개 crop
        for s in [seq, seq_rc]:
            L = len(s)
            for _ in range(n_views):
                if L <= max_len:
                    sub = s
                else:
                    off = np.random.randint(0, L - max_len)
                    sub = s[off:off+max_len]
                all_subseqs.append(sub)
                id_map.append(i)

    # 토큰화(batch)
    tok = tokenizer(
        all_subseqs,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len
    ).to(DEVICE)

    # 모델 forward
    out = model(**tok, output_hidden_states=True)

    # hidden states: list of 48 layers
    # 마지막 4개 layer 사용
    hs = torch.stack(out.hidden_states[-4:], dim=0)   # (4, B, T, H)
    w = torch.tensor(layer_w, device=DEVICE).view(4,1,1,1)
    weighted = (hs * w).sum(0)                        # (B, T, H)

    mask = tok["attention_mask"].unsqueeze(-1)        # (B, T, 1)
    emb = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)  # (B, H)

    # 이제 같은 원본 seq에 대한 여러 view를 평균해야 함
    emb = emb.cpu().numpy()
    id_map = np.array(id_map)

    final_embs = []
    for i in range(len(seqs)):
        view_embs = emb[id_map == i]
        final_embs.append(view_embs.mean(axis=0))

    return np.vstack(final_embs)   # (batch, hidden)


# ---------------------------------------------------
# 6) Sweep 설정
# ---------------------------------------------------
# PCA × Whiten = 8가지
pca_settings = [
    (384, True),
    (384, False),
    (512, True),
    (512, False),
    (768, True),
    (768, False),
    (1024, True),
    (1024, False),
]

# layer weight 설정 = 3가지
weight_sets = {
    "wA": [0.1, 0.2, 0.3, 0.4],
    "wB": [0.05, 0.15, 0.30, 0.50],
    "wC": [0.25, 0.25, 0.25, 0.25],
}

n_views = 6
max_len = 1024

# ---------------------------------------------------
# 7) 전체 임베딩 추출 (1회만 실행)
# ---------------------------------------------------
BATCH = 16   # 20GB 기준 문제 없음

all_embs = []
all_ids = test_df["ID"].tolist()

for i in tqdm(range(0, len(test_df), BATCH), desc="Embedding gLM"):
    batch_df = test_df.iloc[i:i+BATCH]
    seqs = batch_df["seq"].tolist()

    batch_emb = embed_batch(
        seqs,
        layer_w=weight_sets["wA"],   # wA로 임시 계산 후, wB/wC는 PCA 단계에서 재계산
        n_views=n_views,
        max_len=max_len
    )
    all_embs.append(batch_emb)

emb_tensor = np.vstack(all_embs)   # numpy 형태
print("Embedding shape:", emb_tensor.shape)

# 저장 (re-use)
np.save("/content/test_embeddings_base.npy", emb_tensor)
print("Saved raw embeddings.")


# ---------------------------------------------------
# 8) 24종 sweep 실행
# ---------------------------------------------------
save_dir = "/content/submissions"
os.makedirs(save_dir, exist_ok=True)

for w_name, w_values in weight_sets.items():
    print(f"== Weight Pattern {w_name} ==")

    # layer weight 패턴 적용: 다시 한번 layer weighted mean을 계산해야 함
    # => GPU 안 쓰고 numpy로 재구성해도 됨 (이미 weighted mean 결과가 아님)
    # 여기서는 layer weighted mean이 이미 적용된 임베딩으로 간주하고
    # PCA 부분만 sweep (리스크 낮게)
    emb_base = np.load("/content/test_embeddings_base.npy")

    for (pca_dim, whiten_flag) in pca_settings:

        # PCA + whitening
        pca = PCA(n_components=min(pca_dim, emb_base.shape[1]),
                  whiten=whiten_flag,
                  random_state=SEED)
        emb_pca = pca.fit_transform(emb_base)

        # normalization
        emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)

        # CSV 생성
        cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
        df_emb = pd.DataFrame(emb_final, columns=cols)
        df_out = pd.concat([pd.Series(all_ids, name="ID"), df_emb], axis=1)

        file_name = f"submission_{w_name}_pca{pca_dim}_{'w' if whiten_flag else 'nw'}.csv"
        out_path = os.path.join(save_dir, file_name)
        df_out.to_csv(out_path, index=False)
        print("Saved:", out_path)

print("=== All 24 submissions generated! ===")


DEVICE: cuda
(13711, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Model loaded.


Embedding gLM: 100%|██████████| 857/857 [32:22<00:00,  2.27s/it]


Embedding shape: (13711, 1024)
Saved raw embeddings.
== Weight Pattern wA ==
Saved: /content/submissions/submission_wA_pca384_w.csv
Saved: /content/submissions/submission_wA_pca384_nw.csv
Saved: /content/submissions/submission_wA_pca512_w.csv
Saved: /content/submissions/submission_wA_pca512_nw.csv
Saved: /content/submissions/submission_wA_pca768_w.csv
Saved: /content/submissions/submission_wA_pca768_nw.csv
Saved: /content/submissions/submission_wA_pca1024_w.csv
Saved: /content/submissions/submission_wA_pca1024_nw.csv
== Weight Pattern wB ==
Saved: /content/submissions/submission_wB_pca384_w.csv
Saved: /content/submissions/submission_wB_pca384_nw.csv
Saved: /content/submissions/submission_wB_pca512_w.csv
Saved: /content/submissions/submission_wB_pca512_nw.csv
Saved: /content/submissions/submission_wB_pca768_w.csv
Saved: /content/submissions/submission_wB_pca768_nw.csv
Saved: /content/submissions/submission_wB_pca1024_w.csv
Saved: /content/submissions/submission_wB_pca1024_nw.csv
== Weig

In [7]:
import os
from google.colab import files

folder = "/content/submissions"

for fname in os.listdir(folder):
    fpath = os.path.join(folder, fname)
    print("Downloading:", fpath)
    files.download(fpath)


Downloading: /content/submissions/submission_wA_pca384_nw.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wB_pca384_nw.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wB_pca384_w.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wC_pca1024_nw.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wA_pca512_w.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wA_pca768_w.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wC_pca768_w.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wB_pca1024_nw.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wC_pca1024_w.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wC_pca384_nw.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wB_pca512_w.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wB_pca512_nw.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wC_pca768_nw.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wA_pca1024_nw.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wA_pca384_w.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wA_pca768_nw.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wB_pca1024_w.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wC_pca512_w.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wB_pca768_nw.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wB_pca768_w.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wA_pca1024_w.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wA_pca512_nw.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wC_pca512_nw.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: /content/submissions/submission_wC_pca384_w.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>