## Install requirements and link the CCD cache

In [1]:
    import os

    # Expose the CCD cache files from the attached Kaggle dataset inside
    # dist-packages via symlinks (presumably the location Protenix searches
    # for its CCD cache at inference time — confirm for the installed version).
    CCD_SOURCE_DIR = "/kaggle/input/protenix/af3-dev/release_data/ccd_cache"
    CCD_TARGET_DIR = "/usr/local/lib/python3.11/dist-packages/release_data/ccd_cache"
    os.makedirs(CCD_TARGET_DIR, exist_ok=True)

    source_ccd_file = f"{CCD_SOURCE_DIR}/components.cif"
    target_ccd_file = f"{CCD_TARGET_DIR}/components.cif"

    source_rdkit_file = f"{CCD_SOURCE_DIR}/components.cif.rdkit_mol.pkl"
    target_rdkit_file = f"{CCD_TARGET_DIR}/components.cif.rdkit_mol.pkl"

    # Create each symlink only if its source exists and the target does not,
    # so re-running the cell is a no-op.
    for label, src, dst in (
        ("CCD", source_ccd_file, target_ccd_file),
        ("RDKIT", source_rdkit_file, target_rdkit_file),
    ):
        if os.path.exists(src) and not os.path.exists(dst):
            try:
                os.symlink(src, dst)
                print(f"Created symlink for {label} file")
            except OSError as e:
                # Best effort: report the failure and keep going.
                print(f"Error creating symlink for {label} file: {e}")
    

Created symlink for CCD file
Created symlink for RDKIT file


In [2]:
import os, sys, json, tempfile, pathlib, subprocess, re, time
from   timeit import default_timer as timer
import numpy  as np
import pandas as pd
import torch
from   tqdm   import tqdm

# ── user flags ──────────────────────────────────────────────────────────
MODE        = "submit"            #  <<<  "local"  or  "submit"
RUN_LOCAL   = False
RUN_KAGGLE  = not RUN_LOCAL

NUM_CONF=5
MAX_LENGTH=20000

assert torch.cuda.is_available(), "Need an NVIDIA GPU."
print("torch", torch.__version__, "| cuda:", torch.version.cuda,
      "| gpu:", torch.cuda.get_device_name(0))

# ── pip installs (done once) ────────────────────────────────────────────
# !pip install --no-deps protenix biopython ml-collections \
#                       biotite==1.0.1 rdkit

!pip install --no-index --no-deps --find-links=/kaggle/input/protenix-wheel-bundle/wheels protenix biopython ml-collections biotite==1.0.1 rdkit

# ── Protenix resource directory ────────────────────────────────────────
os.environ["USE_DEEPSPEED_EVO_ATTENTION"] = "false"

if RUN_LOCAL:
    ROOT_DIR = "/home/max/Documents/Protenix-KaggleRNA3D/af3-dev"
else:
    ROOT_DIR = "/kaggle/input/protenix/af3-dev"
        
os.environ["PROTENIX_DATA_ROOT_DIR"] = ROOT_DIR
print("PROTENIX_DATA_ROOT_DIR →", ROOT_DIR)

torch 2.5.1+cu124 | cuda: 12.4 | gpu: Tesla P100-PCIE-16GB
Looking in links: /kaggle/input/protenix-wheel-bundle/wheels
Processing /kaggle/input/protenix-wheel-bundle/wheels/protenix-0.4.6-py3-none-any.whl
Processing /kaggle/input/protenix-wheel-bundle/wheels/biopython-1.83-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Processing /kaggle/input/protenix-wheel-bundle/wheels/ml_collections-1.1.0-py3-none-any.whl
Processing /kaggle/input/protenix-wheel-bundle/wheels/biotite-1.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Processing /kaggle/input/protenix-wheel-bundle/wheels/rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit, protenix, ml-collections, biotite, biopython
Successfully installed biopython-1.83 biotite-1.0.1 ml-collections-1.1.0 protenix-0.4.6 rdkit-2024.9.6
PROTENIX_DATA_ROOT_DIR → /kaggle/input/protenix/af3-dev


In [3]:
# Sequences to fold: the validation set when MODE == "local", the test set
# otherwise. Alternative path for a run outside Kaggle:
# SEQ_CSV = ("/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding/"
#            f'{"validation" if MODE=="local" else "test"}_sequences.csv')
SEQ_CSV = ("/kaggle/input/stanford-rna-3d-folding/"
           f'{"validation" if MODE=="local" else "test"}_sequences.csv')
df      = pd.read_csv(SEQ_CSV)

if MODE == "local":
    # LABEL_CSV  = "/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding/validation_labels.csv"
    LABEL_CSV  = "/kaggle/input/stanford-rna-3d-folding/validation_labels.csv"
    label_df   = pd.read_csv(LABEL_CSV)
    # Drop everything after the last "_" of the ID to recover the target id
    # (presumably IDs have the form "<target_id>_<resid>").
    label_df["target_id"] = label_df.ID.str.rsplit(pat="_", n=1).str[0]

# build input JSON --------------------------------------------------------
# One single-chain RNA job per target, named after its target_id.
samples = [{"name":tid,
            "sequences":[{"rnaSequence":{"sequence":seq,"count":1}}]}
           for seq,tid in zip(df.sequence, df.target_id)]

# NamedTemporaryFile creates the file safely (tempfile.mktemp is deprecated
# and racy), and the with-block guarantees the JSON is flushed and closed
# before anything reads it. delete=False keeps the file for later cells.
with tempfile.NamedTemporaryFile("w", prefix="protenix_inputs_",
                                 suffix=".json", delete=False) as json_file:
    json.dump(samples, json_file)
    json_path = json_file.name
print("json →", json_path)

json → /tmp/protenix_inputs_9euvm72_.json


In [4]:
# !cat /tmp/protenix_inputs_7x1cqjsu.json


In [5]:
# ! ln -s /kaggle/input/protenix/af3-dev/release_data /release_data
# ! ls /release_data

In [6]:
from configs.configs_base       import configs as cfg_base
from configs.configs_data       import data_configs
from configs.configs_inference  import inference_configs
from protenix.config.config     import parse_configs
from runner.inference           import InferenceRunner, update_inference_configs

# Checkpoint loaded by the inference runner.
ckpt_path = "/kaggle/input/7ksteps-protenix-nomsa-75cropping/6999_ema_0.999.pt"

cfg_base["use_deepspeed_evo_attention"]     = False
cfg_base["model"]["N_cycle"]                = 10
cfg_base["sample_diffusion"]["N_step"]      = 200
# Sample exactly as many structures as the submission has conformer columns,
# so the sampler and the CSV layout cannot drift apart.
cfg_base["sample_diffusion"]["N_sample"]    = NUM_CONF   # 1 if VRAM is tight
inference_configs["load_checkpoint_path"]   = ckpt_path
inference_configs["dtype"]                  = "bf16"     # GPU friendly

# Later entries override earlier ones on key clashes.
cfg = {
    **cfg_base,
    "data": data_configs,
    **inference_configs,
    "input_json_path": json_path,
    "dump_dir": tempfile.mkdtemp(prefix="pred_out_"),
}

cfg = parse_configs(cfg, fill_required_with_null=True)
runner = InferenceRunner(cfg)
print("model is", type(runner.model))

Try to find the ccd cache data in the code directory for inference.
train scheduler 16.0
inference scheduler 16.0
Diffusion Module has 16.0


  checkpoint = torch.load(checkpoint_path, self.device)


model is <class 'protenix.model.protenix.Protenix'>


In [7]:
from protenix.data.infer_data_pipeline import InferenceDataset
import time

ds = InferenceDataset(json_path, dump_dir=".", use_msa=False)
rows = []

# Value of atom_to_tokatom_idx used to pick one atom per residue.
# NOTE(review): 12 is taken as-is from the original code and is presumably
# the C1' slot in Protenix's per-token atom ordering — confirm.
C1_TOKATOM_IDX = 12


def _zero_rows(tid, seq):
    """Return all-zero placeholder rows, one per residue of ``seq``.

    Used for targets that are skipped or fail featurization so that the
    submission keeps one row per residue.
    """
    return [[f"{tid}_{i}", res, i] + [0.0] * (NUM_CONF * 3)
            for i, res in enumerate(seq, 1)]


for idx in tqdm(range(len(ds)), desc="Featurize → Predict"):
    start = time.time()
    data, _atom_array, err = ds[idx]
    print(f"[{idx}] Dataset load time: {time.time() - start:.2f}s")

    tid  = data["sample_name"]
    seq  = df.loc[df.target_id == tid, "sequence"].values[0]

    # Length cutoff: too-long targets get zero rows instead of a prediction.
    if len(seq) > MAX_LENGTH:
        print(f"  ▶ Skipping {tid}: length {len(seq)} > {MAX_LENGTH}")
        rows.extend(_zero_rows(tid, seq))
        continue

    # Featurization failed: report it and pad with zero rows.
    if err:
        print("error:" + err)
        rows.extend(_zero_rows(tid, seq))
        continue

    # now safe to run Protenix on a sequence ≤ MAX_LENGTH
    start = time.time()
    runner.update_model_configs(update_inference_configs(cfg, int(data["N_token"])))
    print(f"[{idx}] Config update time: {time.time() - start:.2f}s")

    start = time.time()
    with torch.no_grad():
        coord = runner.predict(data)["coordinate"]
    print(f"[{idx}] Prediction time: {time.time() - start:.2f}s")

    c1_mask = data["input_feature_dict"]["atom_to_tokatom_idx"] == C1_TOKATOM_IDX
    coord   = coord[:, c1_mask, :]                  # [N_sample, L, 3]

    # Ensure exactly NUM_CONF samples: drop extras, repeat the last one if
    # the model returned fewer.
    coord = coord[:NUM_CONF]
    while coord.shape[0] < NUM_CONF:
        coord = torch.cat([coord, coord[-1:]], dim=0)

    # One device→host copy per target instead of one per residue.
    coord_np = coord.cpu().numpy()
    for i, res in enumerate(seq, 1):
        # Flatten [NUM_CONF, 3] → x_1, y_1, z_1, x_2, ... for residue i.
        triplets = coord_np[:, i - 1, :].reshape(-1)
        rows.append([f"{tid}_{i}", res, i] + triplets.tolist())


# Column order matches the flattened triplets: conformer-major, then axis.
cols = (["ID", "resname", "resid"] +
        [f"{ax}_{k}" for k in range(1, NUM_CONF + 1) for ax in ("x","y","z")])
sub  = pd.DataFrame(rows, columns=cols)
sub.to_csv("submission.csv", index=False)
print("submission.csv written — shape:", sub.shape)
# sub.head()


Featurize → Predict:   0%|          | 0/12 [00:00<?, ?it/s]

[0] Dataset load time: 11.90s
[0] Config update time: 0.00s


  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
Featurize → Predict:   8%|▊         | 1/12 [01:05<11:56, 65.12s/it]

[0] Prediction time: 53.22s
[1] Dataset load time: 0.19s
[1] Config update time: 0.00s


Featurize → Predict:  17%|█▋        | 2/12 [01:57<09:35, 57.55s/it]

[1] Prediction time: 52.05s
[2] Dataset load time: 0.53s
[2] Config update time: 0.00s


Featurize → Predict:  25%|██▌       | 3/12 [03:26<10:45, 71.75s/it]

[2] Prediction time: 88.12s
[3] Dataset load time: 0.08s
[3] Config update time: 0.00s


Featurize → Predict:  33%|███▎      | 4/12 [04:15<08:22, 62.79s/it]

[3] Prediction time: 48.97s
[4] Dataset load time: 1.77s
[4] Config update time: 0.00s


Featurize → Predict:  42%|████▏     | 5/12 [08:46<16:05, 137.87s/it]

[4] Prediction time: 269.20s
[5] Dataset load time: 0.92s
[5] Config update time: 0.00s


Featurize → Predict:  50%|█████     | 6/12 [11:13<14:06, 141.15s/it]

[5] Prediction time: 146.57s
[6] Dataset load time: 1.88s
[6] Config update time: 0.00s


Featurize → Predict:  58%|█████▊    | 7/12 [15:55<15:36, 187.22s/it]

[6] Prediction time: 280.17s
[7] Dataset load time: 5.56s
[7] Config update time: 0.00s


Featurize → Predict:  67%|██████▋   | 8/12 [30:26<26:59, 404.76s/it]

[7] Prediction time: 864.93s
[8] Dataset load time: 0.38s
[8] Config update time: 0.00s


Featurize → Predict:  75%|███████▌  | 9/12 [31:35<14:59, 299.75s/it]

[8] Prediction time: 68.47s
[9] Dataset load time: 0.42s
[9] Config update time: 0.00s


Featurize → Predict:  83%|████████▎ | 10/12 [32:52<07:42, 231.15s/it]

[9] Prediction time: 77.10s
[10] Dataset load time: 0.36s
[10] Config update time: 0.00s


Featurize → Predict:  92%|█████████▏| 11/12 [33:58<03:00, 180.68s/it]

[10] Prediction time: 65.86s
[11] Dataset load time: 0.37s
[11] Config update time: 0.00s


Featurize → Predict: 100%|██████████| 12/12 [35:05<00:00, 175.43s/it]

[11] Prediction time: 65.91s
submission.csv written — shape: (2515, 18)





In [8]:
import subprocess

if MODE == "local":
    # Path of the USalign binary used for local TM-score evaluation.
    USALIGN = "/home/max/Documents/Protenix-KaggleRNA3D/af3-dev/USalign/USalign"
    # Set the execute bits if the binary is not yet executable.
    usalign_is_executable = os.access(USALIGN, os.X_OK)
    if not usalign_is_executable:
        os.chmod(USALIGN, 0o755)

    def write_c1_pdb(xyz, seq, fname):
        """Write a C1'-only PDB file with one ATOM record per residue.

        Args:
            xyz: iterable of (x, y, z) coordinates, one triple per residue.
            seq: iterable of residue-name strings, paired with ``xyz`` in order.
            fname: path of the file to create or overwrite.

        Atom serial and residue number both count from 1, and all records
        are placed on chain ``A``. Pairs are formed with ``zip``, so extra
        entries in the longer input are ignored.
        """
        lines = []
        for i, (r, (x, y, z)) in enumerate(zip(seq, xyz), start=1):
            lines.append(
                f"ATOM  {i:5d}  C1' {r:>3s} A{i:4d}"
                # Columns 27-30 (insertion code + 3 blanks). Without them the
                # coordinates sit 4 columns left of the fixed 31-54 fields.
                f"    {x:8.3f}{y:8.3f}{z:8.3f}  1.00  0.00           C\n"
            )
        # Context manager so the file is flushed and closed before USalign
        # reads it.
        with open(fname, "w") as fh:
            fh.writelines(lines)

    def align_once(pred_pdb, truth_pdb, timeout=15):
        """Run USalign on two PDB files and return a TM-score.

        USalign's stdout is scanned for ``TM-score=`` values and the second
        match is returned (presumably the score normalized by the second
        structure, i.e. ``truth_pdb`` — confirm against USalign's output).

        Returns 0.0, after printing a message, if USalign runs longer than
        ``timeout`` seconds or if anything else goes wrong (for example
        fewer than two scores in the output).
        """
        usalign_cmd = [USALIGN, pred_pdb, truth_pdb, "-atom", " C1'", "-m", "-"]
        try:
            proc = subprocess.run(
                usalign_cmd,
                capture_output=True,   # pipes both stdout and stderr
                text=True,
                timeout=timeout,
            )
            scores = re.findall(r"TM-score=\s+([\d.]+)", proc.stdout)
            return float(scores[1])
        except subprocess.TimeoutExpired:
            print("⏱  USalign timed out")
            return 0.0
        except Exception as e:
            print("‼️ alignment failed:", e)
            return 0.0

    # Best-of-NUM_CONF TM-score per target, in df order.
    tm_scores = []
    for i, row in df.iterrows():
        tid, seq = row.target_id, row.sequence
        print(f"\n[{i}] {tid} (len={len(seq)})")

        # 1) ground-truth rows of this target, ordered by residue index
        truth_df = (
            label_df
              .query("target_id == @tid")
              .sort_values("resid")
              .reset_index(drop=True)
        )

        # 2) boolean mask of “real” coords (i.e. not the -1e+18 sentinel),
        #    taken from the first reference structure (x_1, y_1, z_1)
        coords = truth_df[["x_1","y_1","z_1"]].to_numpy(dtype=np.float64)
        valid = (np.abs(coords) < 1e17).all(axis=1)   # keep anything <1e17

        # 3) filtered truth coords and the matching sequence letters
        truth_xyz   = coords[valid].astype(np.float32)
        resid_keep  = truth_df.loc[valid, "resid"].to_numpy(dtype=int)
        truth_seq   = [ seq[r-1] for r in resid_keep ]

        # Prediction rows of this target, selected once. The ID is split on
        # its last "_" (same rule used to build label_df.target_id), so a
        # target whose id merely starts with f"{tid}_" is not picked up.
        pred_rows = sub.loc[sub.ID.str.rsplit(pat="_", n=1).str[0] == tid]

        # 4) align each of the NUM_CONF predictions against the truth; all
        #    PDB files live in a scratch directory that is removed afterwards
        best = 0.0
        with tempfile.TemporaryDirectory(prefix="usalign_") as workdir:
            truth_pdb = os.path.join(workdir, "truth.pdb")
            write_c1_pdb(truth_xyz, truth_seq, truth_pdb)

            for c in range(NUM_CONF):
                all_pred_xyz = (
                    pred_rows[[f"{ax}_{c+1}" for ax in ("x","y","z")]]
                    .to_numpy(dtype=np.float32)
                    .reshape(-1,3)
                )
                # apply the same mask to the predictions
                pred_xyz = all_pred_xyz[valid]

                pred_pdb = os.path.join(workdir, f"pred_{c+1}.pdb")
                write_c1_pdb(pred_xyz, truth_seq, pred_pdb)

                best = max(best, align_once(pred_pdb, truth_pdb))

        tm_scores.append(best)

    print("\nALL TM:", tm_scores)
    print("MEAN TM:", np.mean(tm_scores))
