<a href="https://colab.research.google.com/github/Hanbin-git/kaggle/blob/main/Randadomness3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Important: some of the Kaggle data sources used by this notebook are private.
# Run this cell first so the data-source downloads further down are
# authenticated with your Kaggle account.
import kagglehub
kagglehub.login()


In [None]:
# Download every Kaggle data source this notebook depends on (one competition,
# several datasets and one notebook output). Run this cell once; it can be
# deleted afterwards.
# Note: this notebook environment differs from Kaggle's Python environment, so
# libraries used by the notebook may be missing.
# NOTE(review): none of the *_path variables assigned here is referenced by the
# code visible in this notebook, which reads from hard-coded /kaggle/input/...
# paths instead — confirm those paths exist when running outside Kaggle.

stanford_rna_3d_folding_path = kagglehub.competition_download('stanford-rna-3d-folding')
metric_usalign_path = kagglehub.dataset_download('metric/usalign')
geraseva_protenix_checkpoints_path = kagglehub.dataset_download('geraseva/protenix-checkpoints')
biniroun_protenix_packages_path = kagglehub.dataset_download('biniroun/protenix-packages')
biniroun_linux_biowheels_310_path = kagglehub.dataset_download('biniroun/linux-biowheels-310')
biniroun_linux_mlwhl_311_path = kagglehub.dataset_download('biniroun/linux-mlwhl-311')
biniroun_protenix_main_path = kagglehub.dataset_download('biniroun/protenix-main')
biniroun_rdkit_cp310_path = kagglehub.dataset_download('biniroun/rdkit-cp310')
metric_ribonanza_tm_score_path = kagglehub.notebook_output_download('metric/ribonanza-tm-score')

print('Data source import complete.')


In [None]:
# Install the Python dependencies from wheel directories attached as Kaggle
# datasets. `--no-index` stops pip from consulting a package index and
# `--find-links` points it at a local directory of wheels, so the installs work
# without internet access.
# NOTE(review): several packages are requested more than once, from different
# wheel directories. Presumably the repeats act as fallbacks when a wheel built
# for another Python version (cp310 vs cp311) cannot be used — confirm before
# pruning any of the commands.
flag = 1  # the installs below run only while this equals 1
if flag == 1:
    !pip install --no-index --find-links=/kaggle/input/linux-biowheels-310 biopython numpy
    !pip install --no-index --find-links=/kaggle/input/linux-mlwhl-311 ml-collections contextlib2
    !pip install --no-index --find-links=/kaggle/input/linux-biowheels-310 biopython
    !pip install --no-index --find-links=/kaggle/input/rdkit-cp310 rdkit
    !pip install --no-index --find-links=/kaggle/input/linux-mlwhl-311 biopython ml-collections contextlib2 rdkit
    !pip install --no-index --find-links=/kaggle/input/linux-mlwhl-311 biotite
    !pip install --no-index --find-links=/kaggle/input/linux-mlwhl-311 biopython ml-collections contextlib2 rdkit biotite


Looking in links: /kaggle/input/linux-biowheels-310
Processing /kaggle/input/linux-biowheels-310/biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: biopython
Successfully installed biopython-1.81
Looking in links: /kaggle/input/linux-mlwhl-311
Processing /kaggle/input/linux-mlwhl-311/ml_collections-0.1.0-py3-none-any.whl
Processing /kaggle/input/linux-mlwhl-311/contextlib2-21.6.0-py2.py3-none-any.whl
Installing collected packages: contextlib2, ml-collections
Successfully installed contextlib2-21.6.0 ml-collections-0.1.0
Looking in links: /kaggle/input/linux-biowheels-310
Looking in links: /kaggle/input/rdkit-cp310
Processing /kaggle/input/rdkit-cp310/rdkit-2024.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2024.3.1
Looking in links: /kaggle/input/linux-mlwhl-311
Looking in links: /kaggle/input/linux-mlwhl-311
Processing /kaggle/input/linux-mlwhl-311/b

In [None]:
# 📌 Basic setup
import os, sys, numpy as np, pandas as pd, torch, warnings
from tqdm import tqdm
# Install a global "ignore" filter: no warning of any category is shown from
# here on.
warnings.filterwarnings("ignore")

# 📌 Device selection: use CUDA when torch reports it as available, otherwise
# fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"📌 Using device: {device}")

# 📌 USalign setup: copy the binary into the working directory and mark it
# executable for the current user. The exit status of each shell command is
# not checked.
for _shell_cmd in (
    "cp /kaggle/input/usalign/USalign /kaggle/working/",
    "chmod u+x /kaggle/working/USalign",
):
    os.system(_shell_cmd)

# 📌 Register the Protenix code: append both source directories to the import
# search path so the `protenix`, `runner` and `configs` imports further down
# can be resolved.
sys.path.extend(
    [
        "/kaggle/input/protenix-main",
        "/kaggle/input/protenix-packages",
    ]
)

# 📌 Load and merge the training data (v2 takes precedence)
# Both releases of the sequence and label tables are read; a v1 entry is kept
# only when its target does not appear in v2.
seq_v2 = pd.read_csv("/kaggle/input/stanford-rna-3d-folding/train_sequences.v2.csv")
seq_v1 = pd.read_csv("/kaggle/input/stanford-rna-3d-folding/train_sequences.csv")
label_v2 = pd.read_csv("/kaggle/input/stanford-rna-3d-folding/train_labels.v2.csv")
label_v1 = pd.read_csv("/kaggle/input/stanford-rna-3d-folding/train_labels.csv")

# Sequences: drop v1 rows whose target_id is already present in v2.
_seq_already_in_v2 = seq_v1["target_id"].isin(seq_v2["target_id"])
seq_v1 = seq_v1.loc[~_seq_already_in_v2]

# Labels: the regex strips the trailing "_<digits>" from each ID so that rows
# are compared on the remaining prefix.
_v2_label_targets = label_v2["ID"].str.extract(r"^(.+)_\d+$")[0]
_v1_label_targets = label_v1["ID"].str.extract(r"^(.+)_\d+$")[0]
label_v1 = label_v1.loc[~_v1_label_targets.isin(_v2_label_targets)]

# v2 rows come first in both merged tables; the index is renumbered from 0.
merged_label = pd.concat([label_v2, label_v1], ignore_index=True)
merged_seq = pd.concat([seq_v2, seq_v1], ignore_index=True)

# 📌 Build the model inputs
# Group the label rows by target once, instead of rescanning the whole label
# table with a string prefix test for every sequence. The target id is taken
# from the label ID with the same "<target>_<digits>" pattern used for the
# v1/v2 de-duplication, so a target only receives rows whose ID is exactly
# "<target_id>_<number>". With plain prefix matching a target could also pick
# up rows of another target whose id merely starts with "<target_id>_", and
# would then be dropped by the length check below.
_label_target_ids = merged_label["ID"].str.extract(r"^(.+)_\d+$")[0]
_labels_by_target = {
    target: rows
    for target, rows in merged_label.groupby(_label_target_ids, sort=False)
}
_no_labels = merged_label.iloc[:0]  # empty frame for targets without any label row

inputs = []
for tid, seq in zip(merged_seq["target_id"], merged_seq["sequence"]):
    lbl = _labels_by_target.get(tid, _no_labels)
    # Keep a target only when there is exactly one label row per residue.
    if len(lbl) != len(seq):
        continue
    # One (x_1, y_1, z_1) triple per residue, ordered by residue id.
    coords = lbl.sort_values("resid")[["x_1", "y_1", "z_1"]].to_numpy(dtype=np.float32)
    inputs.append({
        "name": tid,
        "sequences": [{
            "rnaSequence": {"sequence": seq, "count": 1},
            "coordinates": coords.tolist(),
        }],
    })
print(f"✅ 학습 가능한 시퀀스 수: {len(inputs)}")

# 📌 Dataset class definition
from protenix.data.infer_data_pipeline import InferenceDataset


class TrainDataset(InferenceDataset):
    """InferenceDataset subclass fed from an in-memory list of input records.

    The constructor only stores three attributes on the instance and does not
    call the parent constructor.
    NOTE(review): presumably the parent class reads ``inputs``, ``use_msa`` and
    ``dump_dir`` when items are fetched — confirm against Protenix.
    """

    def __init__(self, data_list, use_msa=False):
        # data_list: the records to serve (stored as ``inputs``).
        # use_msa:   stored unchanged; defaults to False.
        self.dump_dir = "output"
        self.use_msa = use_msa
        self.inputs = data_list


dataset = TrainDataset(inputs)

# 📌 Model configuration
from runner.inference import update_inference_configs, InferenceRunner
from configs.configs_base import configs as configs_base
from configs.configs_data import data_configs
from configs.configs_inference import inference_configs
from protenix.config.config import parse_configs

configs_base["model"]["N_cycle"] = 10
configs_base["sample_diffusion"]["N_sample"] = 5
configs_base["sample_diffusion"]["N_step"] = 200
inference_configs["load_checkpoint_path"] = "/kaggle/input/protenix-checkpoints/model_v0.2.0.pt"

# Later mappings win on duplicate keys: base < {"data": ...} < inference.
configs = {**configs_base, "data": data_configs, **inference_configs}

# ✅ Avoid the CUTLASS assertion. Building the runner failed with
#   "if use ds4sci, set `CUTLASS_PATH` env ..."
# even though configs_base["model"]["use_ds4sci"] had been set to False, so
# that key does not control the check. The switch is applied to the merged
# dict so that neither source dict can override it.
# NOTE(review): `use_deepspeed_evo_attention` is the top-level option name in
# upstream Protenix — confirm against the bundled protenix-main version.
configs["use_deepspeed_evo_attention"] = False

configs = parse_configs(configs=configs, fill_required_with_null=True)
runner = InferenceRunner(configs)
torch.cuda.empty_cache()

# 📌 Load the test set
test_df = pd.read_csv("/kaggle/input/stanford-rna-3d-folding/test_sequences.csv")


class TestDataset(InferenceDataset):
    """InferenceDataset subclass built from parallel sequences and ids.

    Each (id, sequence) pair becomes one input record holding a single RNA
    chain with ``count`` 1. MSA use is switched off and ``dump_dir`` is set to
    "output". The parent constructor is not called.
    """

    def __init__(self, seq_list, id_list):
        self.dump_dir = "output"
        self.use_msa = False

        records = []
        # zip() stops at the shorter of the two iterables.
        for target_name, rna_seq in zip(id_list, seq_list):
            chain = {"rnaSequence": {"sequence": rna_seq, "count": 1}}
            records.append({"sequences": [chain], "name": target_name})
        self.inputs = records


test_dataset = TestDataset(test_df["sequence"], test_df["target_id"])

# 📌 Run the predictions and write the submission file
def _submission_rows(sample_name, sequence, coords=None, n_models=5):
    """Build one submission row (a dict) per residue of ``sequence``.

    Args:
        sample_name: target name used as the prefix of each row ID.
        sequence: residue letters; row ``j`` gets ``resid`` ``j + 1``.
        coords: optional array-like indexed as ``[model, residue, axis]``.
            ``None`` means no prediction is available.
        n_models: number of x/y/z column triples per row.

    Returns:
        A list of dicts with keys ``ID``, ``resname``, ``resid`` followed by
        ``x_k``, ``y_k``, ``z_k`` for ``k`` in ``1..n_models``. Models that
        ``coords`` does not provide are filled with 0.0.

    Raises:
        IndexError: if ``coords`` has fewer residues than ``sequence``.
    """
    n_available = 0 if coords is None else coords.shape[0]
    rows = []
    for j, res in enumerate(sequence):
        row = {"ID": f"{sample_name}_{j+1}", "resname": res, "resid": j + 1}
        for k in range(n_models):
            if k < n_available:
                # float() makes every cell a plain Python number, so the CSV
                # gets numeric text whatever array/tensor type coords is.
                xyz = [float(coords[k, j, axis]) for axis in range(3)]
            else:
                xyz = [0.0, 0.0, 0.0]
            row[f"x_{k+1}"], row[f"y_{k+1}"], row[f"z_{k+1}"] = xyz
        rows.append(row)
    return rows


# Track the header separately from the loop index: the first target may fail,
# and the header must still be written exactly once.
header_pending = True

# newline="" is what pandas asks for when to_csv receives an open file handle.
with open("submission.csv", "w", newline="", encoding="utf-8") as f:
    for i in tqdm(range(len(test_dataset))):
        try:
            data, _atom_array, err_msg = test_dataset[i]
            if err_msg:
                raise Exception(err_msg)

            runner.update_model_configs(update_inference_configs(configs, data["N_token"].item()))
            out = runner.predict(data)

            if "coordinate" not in out:
                raise Exception("❗ 'coordinate' key missing")

            # Keep only the atoms whose per-token atom index equals 12.
            # NOTE(review): presumably this selects the C1' atom of each
            # residue — confirm against the Protenix atom ordering.
            coords = out["coordinate"][:, data["input_feature_dict"]["atom_to_tokatom_idx"] == 12]

            result = _submission_rows(
                data["sample_name"],
                data["sequences"][0]["rnaSequence"]["sequence"],
                coords,
            )

        except Exception as e:
            print(f"❌ Error at index {i}: {e}")
            # Do not drop the target: emit all-zero rows built from the raw
            # input record so every test target still appears in the file.
            entry = test_dataset.inputs[i]
            result = _submission_rows(
                entry["name"],
                entry["sequences"][0]["rnaSequence"]["sequence"],
            )

        if not result:
            # Nothing to write for an empty sequence; keep the header pending.
            continue

        pd.DataFrame(result).to_csv(f, index=False, header=header_pending)
        header_pending = False

print("✅ submission.csv 생성 완료 🎉")


📌 Using device: cpu
✅ 학습 가능한 시퀀스 수: 5379
Try to find the ccd cache data in the code directory for inference.


AssertionError: if use ds4sci, set `CUTLASS_PATH` env as https://www.deepspeed.ai/tutorials/ds4sci_evoformerattention/