<a href="https://colab.research.google.com/github/Hanbin-git/kaggle/blob/main/kaggle_event20250512.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Log in through kagglehub before any data source is requested; per the note
# above this is what makes private sources reachable.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# One competition download plus three datasets from the `biniroun` account.
# Each call's return value is kept in a `*_path` variable (presumably the local
# download location -- confirm against the kagglehub documentation).
# NOTE(review): the cells in this notebook read from hard-coded
# '/kaggle/input/...' paths instead of these variables; confirm both refer to
# the same location in the environment being used.
stanford_rna_3d_folding_path = kagglehub.competition_download('stanford-rna-3d-folding')
biniroun_protenix_checkpoints_path = kagglehub.dataset_download('biniroun/protenix-checkpoints')
biniroun_usalign_path = kagglehub.dataset_download('biniroun/usalign')
biniroun_protenix_src_path = kagglehub.dataset_download('biniroun/protenix-src')

print('Data source import complete.')


In [None]:
# Install ml-collections quietly before the protenix imports that follow
# (presumably required by the protenix config module -- confirm).
!pip install ml-collections --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.7/76.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Step 1: register the source path (must happen first)
# Inserting at index 0 makes this directory take precedence over any other
# location when `protenix` is imported below.
import sys
sys.path.insert(0, '/kaggle/input/protenix-src')

# Step 2: import exactly the model components that will be used
from protenix.model.modules.transformer import AtomAttentionEncoder
from protenix.config.config import parse_configs

# Step 3: print the package location to confirm which copy was imported
import protenix
print(protenix.__file__)


/kaggle/input/protenix-src/protenix/__init__.py


In [None]:
import pandas as pd

# Load the training labels and list their columns to see which fields are
# available before building a dataset on top of them.
df = pd.read_csv('/kaggle/input/stanford-rna-3d-folding/train_labels.csv')
print(df.columns)


Index(['ID', 'resname', 'resid', 'x_1', 'y_1', 'z_1'], dtype='object')


In [None]:
# Step 1: path setup
import sys
sys.path.insert(0, '/kaggle/input/protenix-src')  # register the protenix source directory

# Step 2: import the required modules
from protenix.model.modules.transformer import AtomAttentionEncoder
from protenix.config.config import parse_configs

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

# Step 3: dataset definition
class ProtenixDataset(Dataset):
    """Feature dictionaries and coordinate labels built from a labels CSV.

    The CSV must contain the columns ``ID``, ``resname``, ``resid``, ``x_1``,
    ``y_1`` and ``z_1``.  Rows are grouped by ``ID`` and ordered by ``resid``;
    each group becomes one sample made of a feature dict and an ``[N, 3]``
    float32 tensor holding the group's coordinates.

    Residue names are one-hot encoded with a single vocabulary built from the
    whole file, so a given name always occupies the same ``ref_element``
    column in every sample.  Rows with a missing ``resname`` get an all-zero
    ``ref_element`` row.

    NOTE(review): grouping by ``ID`` yields one sample per distinct ``ID``
    value.  If ``ID`` identifies a single residue rather than a whole
    structure, every sample has N == 1 -- confirm against the data.

    Raises:
        ValueError: if the file has more distinct residue names than the
            one-hot width (128).
    """

    # Width of the one-hot ``ref_element`` feature.
    _N_ELEMENT_CLASSES = 128
    # Width of the all-zero ``ref_atom_name_chars`` placeholder feature.
    _N_NAME_CHAR_FEATURES = 4 * 64

    def __init__(self, csv_path):
        """Read ``csv_path`` and precompute every sample in memory."""
        df = pd.read_csv(csv_path)

        # Build the vocabulary once for the whole file.  Deriving category
        # codes inside each group would number the names relative to whatever
        # happens to be present in that group, giving the same residue name
        # different one-hot columns in different samples.
        resname_categories = sorted(df["resname"].dropna().unique())
        if len(resname_categories) > self._N_ELEMENT_CLASSES:
            raise ValueError(
                f"{len(resname_categories)} distinct resname values exceed the "
                f"one-hot width of {self._N_ELEMENT_CLASSES}"
            )

        self.input_feature_dicts = []
        self.labels = []

        for _, group in df.groupby("ID"):
            group = group.sort_values("resid")
            n_atoms = len(group)

            ref_pos = group[["x_1", "y_1", "z_1"]].values.astype(np.float32)  # [N, 3]

            # Codes index into the file-wide vocabulary; -1 marks a missing name.
            codes = pd.Categorical(
                group["resname"], categories=resname_categories
            ).codes.astype(np.int64)  # [N]
            ref_element = np.zeros(
                (n_atoms, self._N_ELEMENT_CLASSES), dtype=np.float32
            )  # [N, 128]
            known = codes >= 0
            ref_element[np.flatnonzero(known), codes[known]] = 1.0

            # The remaining features are constant placeholders.
            # NOTE(review): atom_to_token_idx is all zeros, i.e. every row
            # points at token index 0 -- confirm this is what the encoder
            # expects for one-row-per-residue input.
            input_dict = {
                "ref_pos": torch.tensor(ref_pos, dtype=torch.float32),
                "ref_element": torch.tensor(ref_element, dtype=torch.float32),
                "ref_charge": torch.zeros((n_atoms, 1), dtype=torch.float32),  # [N, 1]
                "ref_mask": torch.ones((n_atoms, 1), dtype=torch.float32),  # [N, 1]
                "ref_atom_name_chars": torch.zeros(
                    (n_atoms, self._N_NAME_CHAR_FEATURES), dtype=torch.float32
                ),  # [N, 256]
                "atom_to_token_idx": torch.zeros(n_atoms, dtype=torch.long),  # [N]
                "ref_space_uid": torch.zeros(n_atoms, dtype=torch.long),  # [N]
            }

            self.input_feature_dicts.append(input_dict)
            # The full [N, 3] coordinate array doubles as the regression label.
            self.labels.append(torch.tensor(ref_pos, dtype=torch.float32))

    def __len__(self):
        """Return the number of samples (distinct ``ID`` values)."""
        return len(self.input_feature_dicts)

    def __getitem__(self, idx):
        """Return ``(feature_dict, labels)`` for sample ``idx``."""
        return self.input_feature_dicts[idx], self.labels[idx]

# Step 4: model definition (encoder followed by a coordinate-regression head)
class ProtenixRegressionModel(nn.Module):
    """AtomAttentionEncoder followed by a linear layer with three outputs.

    The encoder is built without coordinate input (``has_coords=False``); the
    linear head maps its 384-wide output to 3 values interpreted as x, y, z.
    """

    def __init__(self):
        super().__init__()
        # The encoder's token width and the head's input width must agree, so
        # both are taken from this one value.
        token_width = 384
        encoder_kwargs = dict(
            has_coords=False,
            c_token=token_width,
            c_atom=128,
            c_atompair=16,
            c_s=384,
            c_z=128,
            n_blocks=3,
            n_heads=4,
            n_queries=32,
            n_keys=128,
            blocks_per_ckpt=None,
        )
        self.encoder = AtomAttentionEncoder(**encoder_kwargs)
        self.output_layer = nn.Linear(token_width, 3)  # 3D coordinate regression

    def forward(self, input_feature_dict):
        """Return the regression head applied to the encoder's first output.

        Only the first element of the encoder's return value is used; the rest
        is discarded.  Presumably that element holds per-token embeddings,
        making the result ``[B, N_token, 3]`` -- confirm against the Protenix
        source.
        """
        token_features, *_ = self.encoder(input_feature_dict)
        return self.output_layer(token_features)

# Step 5: build the model and load the training data
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ProtenixRegressionModel().to(device)
dataset = ProtenixDataset('/kaggle/input/stanford-rna-3d-folding/train_labels.csv')
# batch_size=1 with the default collate function: each batch is one sample with
# a leading dimension of 1 added (presumably chosen because samples can differ
# in length and nothing pads them -- confirm).
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Step 6: training loop
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(3):
    # Accumulates the per-batch losses; the value printed at the end of the
    # epoch is therefore a sum over batches, not a mean.
    running_loss = 0.0
    for inputs, labels in dataloader:
        # Move every feature tensor of the sample to the training device.
        input_feature_dict = {k: v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)  # [1, N, 3]

        optimizer.zero_grad()
        outputs = model(input_feature_dict)  # expected [1, N, 3] -- TODO confirm

        # Truncate both tensors to the shorter length along dim 1 so the loss
        # can always be computed.
        # NOTE(review): this silently hides any length mismatch between the
        # model output and the labels; rows beyond N contribute nothing.
        N = min(outputs.shape[1], labels.shape[1])  # align lengths for safety
        loss = criterion(outputs[:, :N, :], labels[:, :N, :])
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss:.4f}")


In [None]:
# protenix itself is installed with --no-deps, so pip does not pull in its
# declared dependencies; the packages below are then installed one by one
# (presumably the subset of its dependencies this notebook needs -- confirm).
!pip install --no-deps protenix
!pip install biopython
# ml-collections is also installed by an earlier cell of this notebook.
!pip install ml-collections
# biotite is the only package pinned to an exact version here.
!pip install biotite==1.0.1
!pip install rdkit
