<a href="https://colab.research.google.com/github/Hanbin-git/kaggle/blob/main/restart2025.05.13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the competition data and three auxiliary datasets through kagglehub.
# NOTE(review): the values returned here are stored in *_path variables, but none
# of the visible later cells use them; the CSV and MSA cells read from hard-coded
# '/kaggle/input/...' paths instead. Confirm which location is intended.
stanford_rna_3d_folding_path = kagglehub.competition_download('stanford-rna-3d-folding')
biniroun_protenix_checkpoints_path = kagglehub.dataset_download('biniroun/protenix-checkpoints')
biniroun_usalign_path = kagglehub.dataset_download('biniroun/usalign')
biniroun_protenix_src_path = kagglehub.dataset_download('biniroun/protenix-src')

print('Data source import complete.')


In [None]:
# Install required packages
!pip install torch torchvision torchaudio --quiet
!pip install transformers --quiet --no-deps
!pip install biopython --quiet --no-deps
!pip install ml-collections --quiet --no-deps
!pip install pandas numpy --quiet


In [None]:
import pandas as pd

# Inspect the sequences: load the CSV, then print its shape and first rows
df_seq = pd.read_csv('/kaggle/input/stanford-rna-3d-folding/train_sequences.csv')
print("train_sequences.csv:", df_seq.shape)
print(df_seq.head())

# Inspect the structure coordinates: load the CSV, then print its shape and columns
df_labels = pd.read_csv('/kaggle/input/stanford-rna-3d-folding/train_labels.csv')
print("train_labels.csv:", df_labels.shape)
print(df_labels.columns)

# Check the residue names and the layout of the first coordinate triple
print(df_labels[['resname', 'x_1', 'y_1', 'z_1']].head())



In [None]:
from pathlib import Path
from Bio import SeqIO
from collections import Counter

def scan_msa_for_degenerate_bases(msa_dir):
    """Tally the non-canonical symbols found in the FASTA files of a directory.

    Every ``*.fasta`` file directly inside ``msa_dir`` is parsed. Records whose
    id starts with ``"query"`` are skipped. In all other records, each symbol
    that is not one of ``A``, ``U``, ``G``, ``C`` is counted.

    Returns:
        A ``Counter`` mapping each such symbol to its number of occurrences.
    """
    canonical = "AUGC"
    tally = Counter()
    for fasta_path in Path(msa_dir).glob("*.fasta"):
        aligned_records = (
            rec
            for rec in SeqIO.parse(str(fasta_path), "fasta")
            if not rec.id.startswith("query")
        )
        for rec in aligned_records:
            tally.update(symbol for symbol in rec.seq if symbol not in canonical)
    return tally

# Count the non-AUGC symbols across the competition's MSA directory and print the tally
msa_counts = scan_msa_for_degenerate_bases("/kaggle/input/stanford-rna-3d-folding/MSA")
print("MSA 확장 염기 사용 현황:", msa_counts)


In [None]:
# Mapping from RNA base symbol to integer token index.
# Indices 0-3 are the four canonical bases, 4-14 the ambiguity codes and
# 15 the gap symbol used in MSAs:
#   R = A/G      Y = C/U      S = G/C      W = A/U      K = G/U      M = A/C
#   B = C/G/U    D = A/G/U    H = A/C/U    V = A/C/G    N = any base
#   - = gap in MSA
BASE2IDX = {symbol: index for index, symbol in enumerate("AUGCRYSWKMBDHVN-")}

In [None]:
def tokenize_sequence(seq, base2idx=BASE2IDX):
    """Convert an RNA sequence into a list of integer token indices.

    Each element of ``seq`` is looked up in ``base2idx``. Elements that are not
    keys of the mapping fall back to the index stored under ``'N'``.
    """
    tokens = []
    for symbol in seq:
        tokens.append(base2idx.get(symbol, base2idx["N"]))
    return tokens


In [None]:
import torch
def one_hot_encode(tokens, vocab_size=len(BASE2IDX)):
    """One-hot encode token indices.

    Args:
        tokens: Integer token indices (e.g. the list returned by
            ``tokenize_sequence``); nested lists are encoded element-wise.
        vocab_size: Number of classes, i.e. the size of the appended last
            dimension.

    Returns:
        Integer tensor whose shape is the shape of ``tokens`` followed by
        ``vocab_size``.
    """
    # Force an integer dtype. torch.tensor([]) defaults to float32, which
    # F.one_hot rejects, so an empty token list used to raise.
    indices = torch.tensor(tokens, dtype=torch.long)
    return torch.nn.functional.one_hot(indices, num_classes=vocab_size)


In [None]:
# Tokenize a few example sequences (canonical bases, ambiguity codes and a gap)
sequences = ["AGCU", "NYR-", "UGCGAU"]
token_batch = [tokenize_sequence(seq) for seq in sequences]


In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

def batchify_token_sequences(sequences, pad_token=BASE2IDX["-"]):
    """Tokenize a batch of sequences and right-pad them to a common length.

    Args:
        sequences: Iterable of sequences accepted by ``tokenize_sequence``.
        pad_token: Index written into positions past the end of shorter
            sequences. It defaults to the gap symbol's index, so padding
            cannot be told apart from a real '-' token downstream.

    Returns:
        Long tensor of shape ``(batch, longest_sequence_length)``.
    """
    # Explicit long dtype: an empty sequence would otherwise become a float32
    # tensor, which is not a valid index tensor for nn.Embedding.
    token_lists = [
        torch.tensor(tokenize_sequence(seq), dtype=torch.long) for seq in sequences
    ]
    return pad_sequence(token_lists, batch_first=True, padding_value=pad_token)


In [None]:
# Pad three example sequences of different lengths into one batch tensor
batch = ["AGCU", "NYR-", "UGCGAU"]
padded_batch = batchify_token_sequences(batch)
print(padded_batch)
print(padded_batch.shape)


In [None]:
# Step 1: implement the PositionalEncoding module
import torch
import math
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch-first input.

    Args:
        d_model: Size of the last (feature) dimension of the inputs. Both even
            and odd values are supported.
        max_len: Number of positions in the precomputed table (raised from the
            earlier 512 to a value large enough for long sequences).
    """

    def __init__(self, d_model, max_len=4096):
        super().__init__()

        position = torch.arange(0, max_len).unsqueeze(1)  # [max_len, 1]
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the surplus frequency is dropped to keep the shapes aligned.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Stored as a buffer: it follows the module across devices but is not trained.
        self.register_buffer('pe', pe.unsqueeze(0))  # [1, max_len, d_model]

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``[batch, seq_len, d_model]``.

        Raises:
            RuntimeError: From the broadcasted addition when ``seq_len``
                exceeds ``max_len``.
        """
        return x + self.pe[:, :x.size(1), :]


In [None]:
# Step 2: define the RNA3DTransformer model
class RNA3DTransformer(nn.Module):
    """Transformer encoder that predicts one (x, y, z) triple per input token.

    Args:
        vocab_size: Number of distinct token indices the embedding accepts.
        d_model: Embedding and encoder feature size.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout rate inside each encoder layer.
    """

    def __init__(self, vocab_size=16, d_model=128, nhead=4, num_layers=4, dropout=0.1):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.output_layer = nn.Linear(d_model, 3)  # x, y, z coordinate output

    def forward(self, src, src_key_padding_mask=None):
        """Predict coordinates for every position of a padded token batch.

        Args:
            src: Token indices of shape ``[batch, seq_len]``.
            src_key_padding_mask: Optional bool tensor of shape
                ``[batch, seq_len]`` where ``True`` marks padded positions
                that attention should ignore. When omitted (the default),
                padded positions take part in attention exactly as before.
                A row that is entirely ``True`` yields NaN outputs.

        Returns:
            Tensor of shape ``[batch, seq_len, 3]``.
        """
        x = self.embedding(src)                        # [batch, seq_len, d_model]
        x = self.pos_encoder(x)                        # add positional encoding
        x = x.permute(1, 0, 2)                         # [seq_len, batch, d_model], the layout the encoder expects
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)
        x = x.permute(1, 0, 2)                         # back to [batch, seq_len, d_model]
        return self.output_layer(x)                    # [batch, seq_len, 3]


In [None]:
# Smoke test: run an untrained model on a small padded batch and print the output shape
model = RNA3DTransformer()

# Example input
input_ids = batchify_token_sequences(["AGCU", "RYNK-", "UGCGAU"])  # includes padding
output_coords = model(input_ids)

print("출력 좌표 shape:", output_coords.shape)  # [batch, seq_len, 3]


In [None]:
# Define the loss function
# NOTE(review): the train_one_epoch / evaluate functions visible in this file
# compute their own masked losses and never call this object.
import torch.nn as nn

criterion = nn.MSELoss()


In [None]:
# Prepare the optimizer and the model
import torch.optim as optim

# A fresh model instance; this rebinds the `model` created in the earlier demo cell
model = RNA3DTransformer()
optimizer = optim.Adam(model.parameters(), lr=1e-4)


In [None]:
import torch.nn.functional as F

def train_one_epoch(model, dataloader, optimizer, criterion=None, device='cpu'):
    """Run one training pass over ``dataloader`` with a masked MSE loss.

    A position counts as valid when its target triple is finite and not all
    zeros (all-zero rows are the padding / missing-coordinate placeholder).
    Only valid positions contribute to the loss.

    Args:
        model: Module mapping ``input_ids`` to predictions shaped like the targets.
        dataloader: Iterable of ``(input_ids, target_coords)`` tensor pairs,
            with ``target_coords`` shaped ``[B, L, 3]``.
        optimizer: Optimizer over the model parameters.
        criterion: Unused. Kept only so existing calls keep working; the
            masked MSE below is always applied.
        device: Device the batch tensors are moved to. The model is not moved.

    Returns:
        Mean of the per-batch losses as a float, or ``0.0`` when no batch
        contained a valid target position.
    """
    model.train()
    total_loss = 0.0
    total_batches = 0

    for input_ids, target_coords in dataloader:
        input_ids = input_ids.to(device)
        target_coords = target_coords.to(device)

        # Step 1: mask of positions that carry a usable target
        finite = target_coords.isfinite().all(dim=-1)
        valid_mask = finite & (target_coords.abs().sum(dim=-1) != 0)  # [B, L]
        if not valid_mask.any():
            # 0 / 0 would give a NaN loss, and back-propagating it would
            # overwrite the weights with NaN, so the batch is skipped.
            continue

        pred_coords = model(input_ids)  # [B, L, 3]

        # Step 2: per-position loss. Unusable targets are zeroed first because
        # NaN * 0 is still NaN and would otherwise leak into the sum.
        safe_target = target_coords.masked_fill(~valid_mask.unsqueeze(-1), 0.0)
        loss_each = F.mse_loss(pred_coords, safe_target, reduction='none').mean(dim=-1)  # [B, L]

        # Step 3: average over the valid positions only
        loss = (loss_each * valid_mask).sum() / valid_mask.sum()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_batches += 1

    return total_loss / total_batches if total_batches else 0.0


In [None]:
from torch.utils.data import Dataset

class RNADataset(Dataset):
    """Dataset yielding ``(token_indices, coords)`` for every unique ``target_id``.

    Args:
        df_seq: Frame with ``target_id`` and ``sequence`` columns.
        df_labels: Frame with ``ID``, ``x_1``, ``y_1`` and ``z_1`` columns,
            one row per residue.

    Label rows are assigned to a target when their ``ID`` equals the
    ``target_id`` or, failing that, when ``ID`` with its last ``_<suffix>``
    removed equals it.
    NOTE(review): the second rule assumes the per-residue ID format
    ``<target_id>_<resid>`` of the Stanford RNA 3D Folding labels; with the
    exact-match rule alone every target came back without coordinates.
    Confirm against the data description.
    """

    def __init__(self, df_seq, df_labels):
        self.df_seq = df_seq
        self.df_labels = df_labels
        self.id_list = df_seq['target_id'].unique()

        # Build both lookups once. Filtering the full frames inside
        # __getitem__ costs a complete scan of each frame per sample.
        first_rows = df_seq.drop_duplicates(subset='target_id')
        self._sequence_by_id = dict(zip(first_rows['target_id'], first_rows['sequence']))

        label_ids = df_labels['ID'].astype(str)
        known_ids = {str(target_id) for target_id in self.id_list}
        prefixes = label_ids.str.rsplit('_', n=1).str[0]
        # An exact ID match wins; otherwise the row belongs to its prefix.
        owners = label_ids.where(label_ids.isin(known_ids), prefixes)
        coord_frame = df_labels[['x_1', 'y_1', 'z_1']]
        self._coords_by_id = {
            owner: rows.to_numpy(dtype='float32')
            for owner, rows in coord_frame.groupby(owners.to_numpy(), sort=False)
        }

    def __len__(self):
        return len(self.id_list)

    def __getitem__(self, idx):
        """Return ``(tokens, coords)`` for the ``idx``-th unique target.

        ``tokens`` is the list of token indices of the target's first listed
        sequence. ``coords`` is a float32 tensor of shape ``(n, 3)`` in label
        row order. When fewer label rows than residues exist (including
        none), it is padded with all-zero rows up to the sequence length, the
        same placeholder that batch padding produces.
        """
        target_id = self.id_list[idx]

        # Sequence
        sequence = self._sequence_by_id[target_id]
        tokenized = tokenize_sequence(sequence)

        # Coordinates
        rows = self._coords_by_id.get(str(target_id))
        if rows is None:
            coords = torch.zeros((0, 3), dtype=torch.float32)
        else:
            coords = torch.tensor(rows, dtype=torch.float32)

        missing = len(tokenized) - coords.shape[0]
        if missing > 0:
            coords = torch.cat([coords, coords.new_zeros((missing, 3))])

        return tokenized, coords



In [None]:
from torch.utils.data import DataLoader

def collate_rna_batch(samples):
    """Pad a list of ``(token_indices, coords)`` samples into two batch tensors.

    RNADataset already returns token indices. Passing them through
    ``batchify_token_sequences`` would tokenize them a second time, and since
    integers are not keys of ``BASE2IDX`` every position would collapse to
    the 'N' index. The index lists are therefore padded directly.

    Returns:
        ``(tokens, coords)``: a long tensor ``[B, L]`` padded with the gap
        index, and a float tensor padded with zeros along the length axis.
    """
    token_tensors = [torch.tensor(tokens, dtype=torch.long) for tokens, _ in samples]
    coord_tensors = [coords for _, coords in samples]
    padded_tokens = torch.nn.utils.rnn.pad_sequence(
        token_tensors, batch_first=True, padding_value=BASE2IDX["-"]
    )
    padded_coords = torch.nn.utils.rnn.pad_sequence(coord_tensors, batch_first=True)
    return padded_tokens, padded_coords


train_dataset = RNADataset(df_seq, df_labels)

train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_rna_batch,
)


In [None]:
# Sanity checks for the preprocessing helpers and the dataset.
# Tokenize one sequence containing every symbol of the vocabulary.
seq = "AGCUWRYKMBDHVN-"
tokens = tokenize_sequence(seq)
print("🔢 Tokenized:", tokens)
encoded = one_hot_encode(tokens)
print("🔍 One-hot shape:", encoded.shape)  # (len(seq), 16)
print(encoded)
# Pad sequences of different lengths into one batch
batch = ["AGCU", "NYR-", "UGCGAU"]
padded_batch = batchify_token_sequences(batch)
print("📦 Batch shape:", padded_batch.shape)  # e.g. torch.Size([3, 6])
print(padded_batch)
# Fetch the first sample from the dataset
dataset = RNADataset(df_seq, df_labels)
sample = dataset[0]

print("🎯 Sample input:", sample[0])  # token indices
print("🎯 Sample coords shape:", sample[1].shape)  # coordinates (N, 3)


In [None]:
import torch.nn.functional as F

def train_one_epoch(model, dataloader, optimizer, device='cpu'):
    """Run one training pass over ``dataloader`` with a masked MSE loss.

    A position counts as valid when its target triple is finite and not all
    zeros (all-zero rows are the padding / missing-coordinate placeholder).
    Only valid positions contribute to the loss.

    Args:
        model: Module mapping ``input_ids`` to predictions shaped like the targets.
        dataloader: Iterable of ``(input_ids, target_coords)`` tensor pairs,
            with ``target_coords`` shaped ``[B, L, 3]``.
        optimizer: Optimizer over the model parameters.
        device: Device the batch tensors are moved to. The model is not moved.

    Returns:
        Mean of the per-batch losses as a float, or ``0.0`` when no batch
        contained a valid target position.
    """
    model.train()
    total_loss = 0.0
    total_batches = 0

    for input_ids, target_coords in dataloader:
        input_ids = input_ids.to(device)
        target_coords = target_coords.to(device)

        # 1) Mask of positions that carry a usable target
        finite = target_coords.isfinite().all(dim=-1)
        valid_mask = finite & (target_coords.abs().sum(dim=-1) != 0)  # [B, L]
        if not valid_mask.any():
            # 0 / 0 would give a NaN loss, and back-propagating it would
            # overwrite the weights with NaN, so the batch is skipped.
            continue

        pred_coords = model(input_ids)  # [B, L, 3]

        # 2) Per-position MSE. Unusable targets are zeroed first because
        #    NaN * 0 is still NaN and would otherwise leak into the sum.
        safe_target = target_coords.masked_fill(~valid_mask.unsqueeze(-1), 0.0)
        loss_each = F.mse_loss(pred_coords, safe_target, reduction='none').mean(dim=-1)  # [B, L]

        # 3) Average over the valid positions only
        loss = (loss_each * valid_mask).sum() / valid_mask.sum()

        # 4) Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_batches += 1

    return total_loss / total_batches if total_batches else 0.0


In [None]:
import torch.nn.functional as F

def evaluate(model, dataloader, device='cpu'):
    """Compute the masked mean absolute error of ``model`` over ``dataloader``.

    A position counts as valid when its target triple is finite and not all
    zeros. Only valid positions contribute to the error.

    Args:
        model: Module mapping ``input_ids`` to predictions shaped like the
            targets. It is switched to eval mode and left there.
        dataloader: Iterable of ``(input_ids, target_coords)`` tensor pairs,
            with ``target_coords`` shaped ``[B, L, 3]``.
        device: Device the batch tensors are moved to. The model is not moved.

    Returns:
        Mean of the per-batch MAE values as a float, or ``0.0`` when no batch
        contained a valid target position.
    """
    model.eval()
    total_mae = 0.0
    total_count = 0

    with torch.no_grad():
        for input_ids, target_coords in dataloader:
            input_ids = input_ids.to(device)
            target_coords = target_coords.to(device)

            # Only positions with a usable target take part in the MAE
            finite = target_coords.isfinite().all(dim=-1)
            valid_mask = finite & (target_coords.abs().sum(dim=-1) != 0)  # [B, L]
            if not valid_mask.any():
                # Dividing by zero valid positions would turn the metric into NaN
                continue

            pred_coords = model(input_ids)  # [B, L, 3]

            # Zero the unusable targets first: NaN * 0 is still NaN
            safe_target = target_coords.masked_fill(~valid_mask.unsqueeze(-1), 0.0)
            abs_error = (pred_coords - safe_target).abs().mean(dim=-1)  # [B, L]
            mae = (abs_error * valid_mask).sum() / valid_mask.sum()  # batch MAE

            total_mae += mae.item()
            total_count += 1

    return total_mae / total_count if total_count else 0.0


In [None]:
import matplotlib.pyplot as plt

def plot_metrics(train_losses, val_maes):
    """Plot training loss and validation MAE per epoch in two side-by-side panels.

    The x-axis of both panels runs over epochs ``1 .. len(train_losses)``.
    """
    epoch_axis = [step + 1 for step in range(len(train_losses))]

    # (subplot slot, series, line options, y-axis label, panel title)
    panels = (
        (1, train_losses, {'marker': 'o', 'label': 'Train Loss (MSE)'},
         'Loss', 'Training Loss'),
        (2, val_maes, {'marker': 's', 'color': 'orange', 'label': 'Validation MAE'},
         'MAE', 'Validation MAE'),
    )

    plt.figure(figsize=(10, 4))
    for slot, series, line_options, y_label, panel_title in panels:
        plt.subplot(1, 2, slot)
        plt.plot(epoch_axis, series, **line_options)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.title(panel_title)
        plt.grid()
        plt.legend()

    plt.tight_layout()
    plt.show()


In [None]:
# Find target_ids from the sequence table that never appear verbatim in the label IDs.
# NOTE(review): this compares whole strings. If label IDs are per-residue keys such
# as '<target_id>_<resid>', every target would be reported as missing here. Verify
# the ID format before trusting this count.
set_seq_ids = set(df_seq['target_id'].unique())
set_label_ids = set(df_labels['ID'].unique())

missing_ids = set_seq_ids - set_label_ids
print("❗ 좌표가 없는 target_id 개수:", len(missing_ids))
print("예시:", list(missing_ids)[:5])


In [None]:
# Hyperparameter setup
model = RNA3DTransformer()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# NOTE(review): `criterion` is not passed to, or used by, the calls below.
criterion = nn.MSELoss()

# Lists for recording the metrics
train_losses = []
val_maes = []

# First training epoch
train_loss = train_one_epoch(model, train_loader, optimizer)
# NOTE(review): the "validation" MAE is computed on train_loader, i.e. on the
# training data; no separate validation split is visible in this file.
val_mae = evaluate(model, train_loader)

train_losses.append(train_loss)
val_maes.append(val_mae)

print(f"✅ Epoch 1 - Train Loss: {train_loss:.4f}, Val MAE: {val_mae:.4f}")
plot_metrics(train_losses, val_maes)
