<a href="https://colab.research.google.com/github/Hanbin-git/DNA/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -U bitsandbytes peft accelerate transformers datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.18.0-py3-none-any.whl (556 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manyl

In [2]:
# 압축풀기
import zipfile
import os

zip_path = "/content/drive/MyDrive/DNA/open.zip"
extract_dir = "/content/open"

os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("✅ 압축 해제 완료!")
print("압축 풀린 파일 목록:", os.listdir(extract_dir))

✅ 압축 해제 완료!
압축 풀린 파일 목록: ['sample_submission.csv', 'test.csv']


In [3]:
# 데이터 불러오기
import pandas as pd

data_dir = "/content/open"

test_df = pd.read_csv(os.path.join(data_dir, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"))

print("✅ test_df:", test_df.shape)
print("✅ sub_df:", sub_df.shape)
test_df.head()


✅ test_df: (13711, 2)
✅ sub_df: (13711, 769)


Unnamed: 0,ID,seq
0,TEST_000000,ATCATTTTTATTTTTTAGTTTTATGAGACGCTGCCTTGCTATGTCA...
1,TEST_000001,CGACGTCCCCGTAGCGGCCGAAGTCGAGGGGCAGCAGGCGATCGTG...
2,TEST_000002,GGTAGTAAGAAGGAAAATGACAGCATGGAAGCAGCAATACCAGTAA...
3,TEST_000003,CAGCGCATATACTCAGGGCCATGGTGGGTACTGTTCCCATGGCCAG...
4,TEST_000004,TTCATAATTGCTATCAGTCTATGGGCTAATATTTTATACATCAATG...


In [5]:
# ============================================
#  MAI 대회 – 기존 0.5869 파이프라인을
#  GPU 배치 + LayerNorm 포함으로 재구현
# ============================================

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from torch.nn.functional import layer_norm

# -----------------------------
# 1) 환경 설정
# -----------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
print("DEVICE:", DEVICE)

data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
print("test_df:", test_df.shape)

# -----------------------------
# 2) reverse complement
# -----------------------------
def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

# -----------------------------
# 3) 모델 & 토크나이저
# -----------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.to(DEVICE).eval()
model.config.use_cache = False
print("Model loaded.")

# 원본과 동일한 레이어 가중치
LAYER_WEIGHTS = torch.tensor([0.1, 0.2, 0.3, 0.4], device=DEVICE)

N_VIEWS = 8
USE_RC   = True  # 원본은 RC 사용
MAX_LEN  = 1024
BATCH    = 24    # VRAM 20GB면 이 정도는 여유, 부족하면 16으로 줄이기

# -----------------------------
# 4) 배치 임베딩 함수
# -----------------------------
@torch.no_grad()
def embed_batch(seqs):
    """
    seqs: list[str] (batch_size)
    원본 코드와 동일한 수식:
      - 각 seq에 대해 (원본 + RC) * N_VIEWS crop
      - last4 weighted mean
      - view 평균
    """
    subseqs = []
    owner   = []  # subseq가 어느 sample에 속하는지 기록

    for idx, seq in enumerate(seqs):
        strands = [seq, reverse_complement(seq)] if USE_RC else [seq]
        for s in strands:
            L = len(s)
            for _ in range(N_VIEWS):
                if L <= MAX_LEN:
                    sub = s
                else:
                    offset = np.random.randint(0, L - MAX_LEN + 1)
                    sub = s[offset:offset + MAX_LEN]
                subseqs.append(sub)
                owner.append(idx)

    tok = tokenizer(
        subseqs,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LEN
    ).to(DEVICE)

    out = model(**tok, output_hidden_states=True)
    hs  = torch.stack(out.hidden_states[-4:], dim=0)          # (4, B, T, H)
    w   = LAYER_WEIGHTS.view(4,1,1,1)
    weighted = (hs * w).sum(0)                                # (B, T, H)
    mask = tok["attention_mask"].unsqueeze(-1)                # (B, T, 1)
    emb  = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)# (B, H)

    emb = emb.cpu().numpy()
    owner = np.array(owner)

    # 같은 seq에 해당하는 view들을 평균
    out_list = []
    for i in range(len(seqs)):
        views = emb[owner == i]
        out_list.append(views.mean(axis=0))

    return np.vstack(out_list)   # (batch, H)

# -----------------------------
# 5) 전체 test 임베딩 추출
# -----------------------------
all_embs = []
all_ids  = test_df["ID"].tolist()

for start in tqdm(range(0, len(test_df), BATCH), desc="Embedding (batched)"):
    batch_df = test_df.iloc[start:start+BATCH]
    seqs = batch_df["seq"].tolist()
    batch_emb = embed_batch(seqs)          # numpy (B, H)
    all_embs.append(torch.from_numpy(batch_emb))

emb_tensor = torch.cat(all_embs, dim=0)    # (N, H) torch tensor
print("Raw emb_tensor:", emb_tensor.shape)

# -----------------------------
# 6) LayerNorm + PCA(512, whiten=True) + L2 norm
#    → 원본 0.5869 파이프라인과 동일
# -----------------------------
emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])  # (N, H)
emb_np = emb_normed.numpy()

pca = PCA(n_components=min(512, emb_np.shape[1]),
          whiten=True,
          random_state=SEED)
emb_pca = pca.fit_transform(emb_np)

emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)
print("Final emb shape:", emb_final.shape)

# -----------------------------
# 7) 제출 파일 생성
# -----------------------------
cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
emb_df = pd.DataFrame(emb_final, columns=cols)
submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

out_path = "/content/submission_highscore_batched.csv"
submission.to_csv(out_path, index=False)
print("Saved:", out_path)


DEVICE: cuda
test_df: (13711, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

Model loaded.


Embedding (batched): 100%|██████████| 572/572 [1:41:59<00:00, 10.70s/it]


Raw emb_tensor: torch.Size([13711, 1024])
Final emb shape: (13711, 512)
Saved: /content/submission_highscore_batched.csv


In [6]:
from google.colab import files
files.download("/content/submission_highscore_batched.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
# ============================================
# 1) High-score batched version (원본과 수식 동일)
#    output: /content/submission_highscore_batched.csv
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm

# -------------------------------
# 1. 환경 설정
# -------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
print("✅ Device:", DEVICE)

# 배치 크기 (GPU 여유에 따라 16~64 사이로 조절)
BATCH_SIZE = 32

# -------------------------------
# 2. 데이터 로드
# -------------------------------
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))
print("✅ Loaded:", test_df.shape, sub_df.shape)

# -------------------------------
# 3. Reverse complement
# -------------------------------
def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

# -------------------------------
# 4. 모델 / 토크나이저 로드 (FP32)
# -------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.to(DEVICE).eval()
model.config.use_cache = False
print("✅ Model loaded.")

# 마지막 4개 레이어 가중치 (원본과 동일)
layer_weights = torch.tensor([0.1, 0.2, 0.3, 0.4], device=DEVICE).view(4, 1, 1, 1)

# -------------------------------
# 5. 배치 단위 임베딩 함수
# -------------------------------
@torch.no_grad()
def embed_batch(seqs, n_views=6, use_rc=True, max_len=1024):
    """
    seqs: 길이 B인 시퀀스 리스트
    - 각 시퀀스를 원본/RC 각각 n_views번씩 그대로 모델에 넣고
    - 마지막 4개 히든스테이트 가중합 + token mean pooling
    - view 평균까지 반환 (B, H)
    """
    views = []
    for s in seqs:
        # 원본
        for _ in range(n_views):
            views.append(s[:max_len])  # 길이 1024라 crop 없음
        # 역상보
        if use_rc:
            rc = reverse_complement(s)
            for _ in range(n_views):
                views.append(rc[:max_len])

    # 토크나이즈
    tok = tokenizer(
        views,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len,
    ).to(DEVICE)

    out = model(**tok, output_hidden_states=True)
    hs = torch.stack(out.hidden_states[-4:], dim=0)        # (4, B_total, T, H)
    weighted = (hs * layer_weights).sum(0)                 # (B_total, T, H)

    mask = tok["attention_mask"].unsqueeze(-1)             # (B_total, T, 1)
    emb_views = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)  # (B_total, H)

    # view 평균: B_total = B * (2*n_views)
    factor = (2 * n_views) if use_rc else n_views
    B_total, H = emb_views.shape
    B = B_total // factor
    emb_seq = emb_views.view(B, factor, H).mean(1)         # (B, H)

    return emb_seq.cpu()

# -------------------------------
# 6. 전체 test 임베딩 추출
# -------------------------------
all_ids = []
all_embs = []

for start in tqdm(range(0, len(test_df), BATCH_SIZE), desc="Embedding (batched)"):
    end = min(start + BATCH_SIZE, len(test_df))
    batch_df = test_df.iloc[start:end]
    seqs = batch_df["seq"].tolist()
    ids  = batch_df["ID"].tolist()

    emb_batch = embed_batch(seqs, n_views=6, use_rc=True, max_len=1024)  # 원본 설정
    all_ids.extend(ids)
    all_embs.append(emb_batch)

emb_tensor = torch.vstack(all_embs)   # (N, H)
print("✅ Raw embedding shape:", emb_tensor.shape)

# -------------------------------
# 7. LayerNorm + PCA(512, whiten) + L2 (원본과 동일)
# -------------------------------
from torch.nn.functional import layer_norm

emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])   # (N, H)
emb_np = emb_normed.numpy()

pca = PCA(n_components=min(512, emb_np.shape[1]), whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb_np)                          # (N, 512)

emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)
print("✅ Final embedding shape:", emb_final.shape)

# -------------------------------
# 8. 제출 파일 생성
# -------------------------------
emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
emb_df = pd.DataFrame(emb_final, columns=emb_cols)
submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

out_path = "/content/submission_highscore_batched.csv"
submission.to_csv(out_path, index=False)
print("🎯 Saved:", out_path)

# -------------------------------
# 9. 다운로드
# -------------------------------
from google.colab import files
files.download(out_path)


✅ Device: cuda
✅ Loaded: (13711, 2) (13711, 769)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

✅ Model loaded.


Embedding (batched): 100%|██████████| 429/429 [31:32<00:00,  4.41s/it]


✅ Raw embedding shape: torch.Size([13711, 1024])
✅ Final embedding shape: (13711, 512)
🎯 Saved: /content/submission_highscore_batched.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
# ============================================
# 2) N_VIEWS & RC sweep (8개 제출 파일 자동 생성)
#    output dir: /content/submissions
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm

# -------------------------------
# 1. 환경 설정
# -------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
print("✅ Device:", DEVICE)

BATCH_SIZE = 32   # GPU 여유에 따라 조절

# -------------------------------
# 2. 데이터 로드
# -------------------------------
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))
print("✅ Loaded:", test_df.shape, sub_df.shape)

# -------------------------------
# 3. Reverse complement
# -------------------------------
def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

# -------------------------------
# 4. 모델 / 토크나이저 로드 (FP32)
# -------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.to(DEVICE).eval()
model.config.use_cache = False
print("✅ Model loaded.")

layer_weights = torch.tensor([0.1, 0.2, 0.3, 0.4], device=DEVICE).view(4, 1, 1, 1)

# -------------------------------
# 5. 배치 임베딩 함수 (1️⃣과 동일)
# -------------------------------
@torch.no_grad()
def embed_batch(seqs, n_views=6, use_rc=True, max_len=1024):
    views = []
    for s in seqs:
        for _ in range(n_views):
            views.append(s[:max_len])
        if use_rc:
            rc = reverse_complement(s)
            for _ in range(n_views):
                views.append(rc[:max_len])

    tok = tokenizer(
        views,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len,
    ).to(DEVICE)

    out = model(**tok, output_hidden_states=True)
    hs = torch.stack(out.hidden_states[-4:], dim=0)        # (4, B_total, T, H)
    weighted = (hs * layer_weights).sum(0)                 # (B_total, T, H)

    mask = tok["attention_mask"].unsqueeze(-1)
    emb_views = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)  # (B_total, H)

    factor = (2 * n_views) if use_rc else n_views
    B_total, H = emb_views.shape
    B = B_total // factor
    emb_seq = emb_views.view(B, factor, H).mean(1)         # (B, H)

    return emb_seq.cpu()

# -------------------------------
# 6. Sweep 설정
# -------------------------------
configs = []
for n_views in [6, 8, 10, 12]:
    for use_rc in [True, False]:
        configs.append((n_views, use_rc))

os.makedirs("/content/submissions", exist_ok=True)

# -------------------------------
# 7. 각 설정별로 전체 임베딩 + PCA + CSV 생성
# -------------------------------
from torch.nn.functional import layer_norm

for n_views, use_rc in configs:
    tag = f"nv{n_views}_" + ("rc" if use_rc else "nor")
    print(f"\n==============================")
    print(f"Running config: {tag}")
    print(f"==============================")

    all_ids = []
    all_embs = []

    for start in tqdm(range(0, len(test_df), BATCH_SIZE), desc=f"Embedding ({tag})"):
        end = min(start + BATCH_SIZE, len(test_df))
        batch_df = test_df.iloc[start:end]
        seqs = batch_df["seq"].tolist()
        ids  = batch_df["ID"].tolist()

        emb_batch = embed_batch(seqs, n_views=n_views, use_rc=use_rc, max_len=1024)
        all_ids.extend(ids)
        all_embs.append(emb_batch)

    emb_tensor = torch.vstack(all_embs)
    print("  Raw embedding shape:", emb_tensor.shape)

    emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
    emb_np = emb_normed.numpy()

    pca = PCA(n_components=min(512, emb_np.shape[1]), whiten=True, random_state=SEED)
    emb_pca = pca.fit_transform(emb_np)

    emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)
    print("  Final embedding shape:", emb_final.shape)

    emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
    emb_df = pd.DataFrame(emb_final, columns=emb_cols)
    submission = pd.concat([pd.Series(all_ids, name="ID"), emb_df], axis=1)

    out_path = f"/content/submissions/submission_{tag}_pca512.csv"
    submission.to_csv(out_path, index=False)
    print("  Saved:", out_path)

print("\n✅ All sweep submissions saved in /content/submissions")

# -------------------------------
# 8. 전체를 zip으로 묶어서 다운로드
# -------------------------------
import shutil
zip_path = "/content/submissions_nv_rc_sweep"
shutil.make_archive(zip_path, "zip", "/content/submissions")

from google.colab import files
files.download(zip_path + ".zip")


✅ Device: cuda
✅ Loaded: (13711, 2) (13711, 769)
✅ Model loaded.

Running config: nv6_rc


Embedding (nv6_rc):   2%|▏         | 8/429 [00:35<31:11,  4.45s/it]


KeyboardInterrupt: 

In [None]:
# ============================================
# 3) 파생변수 기반 스마트 뷰 전략
#    - GC / entropy / poly_len 으로 n_views 결정
#    - 임베딩 수식/후처리는 동일
#    output: /content/submission_smartviews_features.csv
# ============================================

import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.decomposition import PCA
from tqdm import tqdm
from collections import Counter
from math import log2

# -------------------------------
# 1. 환경 설정
# -------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
print("✅ Device:", DEVICE)

BATCH_SIZE = 24   # n_views 10까지 고려해서 조금 보수적으로

# -------------------------------
# 2. 데이터 로드
# -------------------------------
data_path = "/content/open"
test_df = pd.read_csv(os.path.join(data_path, "test.csv"))
sub_df  = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))
print("✅ Loaded:", test_df.shape, sub_df.shape)

# -------------------------------
# 3. 파생변수 함수들
# -------------------------------
def gc_content(seq: str) -> float:
    seq = seq.upper()
    g = seq.count("G")
    c = seq.count("C")
    length = len(seq)
    return (g + c) / length if length > 0 else 0.0

def seq_entropy(seq: str) -> float:
    counts = Counter(seq)
    total = len(seq)
    if total == 0:
        return 0.0
    probs = [v / total for v in counts.values()]
    return -sum(p * log2(p) for p in probs if p > 0)

def longest_repeat(seq: str) -> int:
    if not seq:
        return 0
    max_len = 1
    cur_len = 1
    for i in range(1, len(seq)):
        if seq[i] == seq[i-1]:
            cur_len += 1
            max_len = max(max_len, cur_len)
        else:
            cur_len = 1
    return max_len

def reverse_complement(seq: str) -> str:
    tr = str.maketrans("ACGT", "TGCA")
    return seq.translate(tr)[::-1]

# -------------------------------
# 4. 파생변수 계산 + n_views 결정
# -------------------------------
print("🔍 Computing sequence features...")
test_df["gc"]       = test_df["seq"].apply(gc_content)
test_df["entropy"]  = test_df["seq"].apply(seq_entropy)
test_df["poly_len"] = test_df["seq"].apply(longest_repeat)

def decide_n_views(gc, entropy, poly_len):
    # 고 GC & 고 복잡도 → 더 많이 본다
    if gc >= 0.55 and entropy >= 1.98:
        return 10
    # 반복 구간/저 복잡도 → 적게 본다
    elif poly_len >= 15 or entropy < 1.94:
        return 4
    # 일반 케이스
    else:
        return 6

test_df["n_views"] = test_df[["gc", "entropy", "poly_len"]].apply(
    lambda row: decide_n_views(row["gc"], row["entropy"], row["poly_len"]),
    axis=1
)
print(test_df["n_views"].value_counts())

# -------------------------------
# 5. 모델 / 토크나이저 로드
# -------------------------------
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
model = model.to(DEVICE).eval()
model.config.use_cache = False
print("✅ Model loaded.")

layer_weights = torch.tensor([0.1, 0.2, 0.3, 0.4], device=DEVICE).view(4, 1, 1, 1)

# -------------------------------
# 6. 배치 임베딩 함수 (n_views 인자로 받음)
# -------------------------------
@torch.no_grad()
def embed_batch(seqs, n_views=6, max_len=1024):
    """
    - 항상 reverse complement 사용
    - 각 시퀀스마다 n_views 설정만 다르게 호출
    """
    views = []
    for s in seqs:
        for _ in range(n_views):
            views.append(s[:max_len])
        rc = reverse_complement(s)
        for _ in range(n_views):
            views.append(rc[:max_len])

    tok = tokenizer(
        views,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_len,
    ).to(DEVICE)

    out = model(**tok, output_hidden_states=True)
    hs = torch.stack(out.hidden_states[-4:], dim=0)
    weighted = (hs * layer_weights).sum(0)

    mask = tok["attention_mask"].unsqueeze(-1)
    emb_views = (weighted * mask).sum(1) / mask.sum(1).clamp(min=1)

    factor = 2 * n_views
    B_total, H = emb_views.shape
    B = B_total // factor
    emb_seq = emb_views.view(B, factor, H).mean(1)

    return emb_seq.cpu()

# -------------------------------
# 7. n_views 그룹별로 나눠서 임베딩, 원래 순서에 맞게 재조립
# -------------------------------
N = len(test_df)
emb_list = [None] * N
id_list  = test_df["ID"].tolist()

unique_views = sorted(test_df["n_views"].unique().tolist())
print("🔧 Unique n_views:", unique_views)

for nv in unique_views:
    idxs = np.where(test_df["n_views"].values == nv)[0]
    print(f"\n=== Processing n_views={nv}, count={len(idxs)} ===")

    for start_pos in tqdm(range(0, len(idxs), BATCH_SIZE), desc=f"n_views={nv}"):
        batch_indices = idxs[start_pos:start_pos + BATCH_SIZE]
        seqs = test_df.loc[batch_indices, "seq"].tolist()

        emb_batch = embed_batch(seqs, n_views=nv, max_len=1024)  # (B, H)

        for k, idx in enumerate(batch_indices):
            emb_list[idx] = emb_batch[k]

# 리스트 합치기
emb_tensor = torch.vstack(emb_list)
print("✅ Raw embedding shape:", emb_tensor.shape)

# -------------------------------
# 8. LayerNorm + PCA + L2
# -------------------------------
from torch.nn.functional import layer_norm

emb_normed = layer_norm(emb_tensor, emb_tensor.shape[1:])
emb_np = emb_normed.numpy()

pca = PCA(n_components=min(512, emb_np.shape[1]), whiten=True, random_state=SEED)
emb_pca = pca.fit_transform(emb_np)

emb_final = emb_pca / np.linalg.norm(emb_pca, axis=1, keepdims=True)
print("✅ Final embedding shape:", emb_final.shape)

# -------------------------------
# 9. 제출 파일 생성 + 다운로드
# -------------------------------
emb_cols = [f"emb_{i:04d}" for i in range(emb_final.shape[1])]
emb_df = pd.DataFrame(emb_final, columns=emb_cols)
submission = pd.concat([pd.Series(id_list, name="ID"), emb_df], axis=1)

out_path = "/content/submission_smartviews_features.csv"
submission.to_csv(out_path, index=False)
print("🎯 Saved:", out_path)

from google.colab import files
files.download(out_path)
