## Install requirements 

In [1]:
    import os
    os.makedirs("/usr/local/lib/python3.11/dist-packages/release_data/ccd_cache", exist_ok=True)
    
    source_ccd_file = "/kaggle/input/protenix-public/af3-dev/release_data/ccd_cache/components.cif"
    target_ccd_file = "/usr/local/lib/python3.11/dist-packages/release_data/ccd_cache/components.cif"
    
    source_rdkit_file = "/kaggle/input/protenix-public/af3-dev/release_data/ccd_cache/components.cif.rdkit_mol.pkl"
    target_rdkit_file = "/usr/local/lib/python3.11/dist-packages/release_data/ccd_cache/components.cif.rdkit_mol.pkl"
    
    # Create the symlinks if the source files exist
    if os.path.exists(source_ccd_file) and not os.path.exists(target_ccd_file):
        try:
            os.symlink(source_ccd_file, target_ccd_file)
            print(f"Created symlink for CCD file")
        except Exception as e:
            print(f"Error creating symlink for CCD file: {e}")
    
    if os.path.exists(source_rdkit_file) and not os.path.exists(target_rdkit_file):
        try:
            os.symlink(source_rdkit_file, target_rdkit_file)
            print(f"Created symlink for RDKIT file")
        except Exception as e:
            print(f"Error creating symlink for RDKIT file: {e}")
    

Created symlink for CCD file
Created symlink for RDKIT file


In [2]:
import os, sys, json, tempfile, pathlib, subprocess, re, time
from   timeit import default_timer as timer
import numpy  as np
import pandas as pd
import torch
from   tqdm   import tqdm

# ── user flags ──────────────────────────────────────────────────────────
MODE        = "local"            #  <<<  "local"  or  "submit"
RUN_LOCAL   = False
RUN_KAGGLE  = not RUN_LOCAL

NUM_CONF=5
MAX_LENGTH=20000

assert torch.cuda.is_available(), "Need an NVIDIA GPU."
print("torch", torch.__version__, "| cuda:", torch.version.cuda,
      "| gpu:", torch.cuda.get_device_name(0))

# ── pip installs (done once) ────────────────────────────────────────────
# !pip install --no-deps protenix biopython ml-collections \
#                       biotite==1.0.1 rdkit

!pip install --no-index --no-deps --find-links=/kaggle/input/protenix-wheel/wheels protenix biopython ml-collections biotite==1.0.1 rdkit

# ── Protenix resource directory ────────────────────────────────────────
os.environ["USE_DEEPSPEED_EVO_ATTENTION"] = "false"

if RUN_LOCAL:
    ROOT_DIR = "/home/max/Documents/Protenix-KaggleRNA3D/af3-dev"
else:
    ROOT_DIR = "/kaggle/input/protenix-public/af3-dev"
        
os.environ["PROTENIX_DATA_ROOT_DIR"] = ROOT_DIR
print("PROTENIX_DATA_ROOT_DIR →", ROOT_DIR)

# ── CONFIDENCE CALCULATION FUNCTION ─────────────────────────────────────────
def compute_single_confidence_score(plddt_per_residue):
    """
    Compute a single confidence score from per-residue pLDDT scores.
    This is the STANDARD method used to ensure consistency.
    """
    if len(plddt_per_residue) == 0:
        return 0.0
    return float(np.mean(plddt_per_residue))

torch 2.5.1+cu124 | cuda: 12.4 | gpu: Tesla P100-PCIE-16GB
Looking in links: /kaggle/input/protenix-wheel/wheels
Processing /kaggle/input/protenix-wheel/wheels/protenix-0.4.6-py3-none-any.whl
Processing /kaggle/input/protenix-wheel/wheels/biopython-1.83-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Processing /kaggle/input/protenix-wheel/wheels/ml_collections-1.1.0-py3-none-any.whl
Processing /kaggle/input/protenix-wheel/wheels/biotite-1.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Processing /kaggle/input/protenix-wheel/wheels/rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit, protenix, ml-collections, biotite, biopython
Successfully installed biopython-1.83 biotite-1.0.1 ml-collections-1.1.0 protenix-0.4.6 rdkit-2024.9.6
PROTENIX_DATA_ROOT_DIR → /kaggle/input/protenix-public/af3-dev


In [3]:
# SEQ_CSV = ("/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding/"
#            f'{"validation" if MODE=="local" else "test"}_sequences.csv')
SEQ_CSV = "/kaggle/input/validation-sequences-clean-csv/validation_sequences_clean.csv"
df      = pd.read_csv(SEQ_CSV)

if MODE == "local":
    # LABEL_CSV  = "/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding/validation_labels.csv"
    LABEL_CSV  = "/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv"
    label_df   = pd.read_csv(LABEL_CSV)
    label_df["target_id"] = label_df.ID.str.rsplit(pat="_", n=1).str[0]

# build input JSON --------------------------------------------------------
samples = [{"name":tid,
            "sequences":[{"rnaSequence":{"sequence":seq,"count":1}}]}
           for seq,tid in zip(df.sequence, df.target_id)]
json_path = tempfile.mktemp(prefix="protenix_inputs_", suffix=".json")
json.dump(samples, open(json_path,"w"))
print("json →", json_path)

json → /tmp/protenix_inputs_8q_1iu4i.json


In [4]:
# !cat /tmp/protenix_inputs_7x1cqjsu.json


In [5]:
# ! ln -s /kaggle/input/protenix/af3-dev/release_data /release_data
# ! ls /release_data

In [6]:
from configs.configs_base       import configs as cfg_base
from configs.configs_data       import data_configs
from configs.configs_inference  import inference_configs
from protenix.config.config     import parse_configs
from runner.inference           import InferenceRunner, update_inference_configs

ckpt_path = f"{ROOT_DIR}/release_model/model_v0.2.0.pt"

cfg_base["use_deepspeed_evo_attention"]     = False
cfg_base["model"]["N_cycle"]                = 10
cfg_base["sample_diffusion"]["N_step"]      = 200
cfg_base["sample_diffusion"]["N_sample"]    = 5          # 1 if VRAM is tight
inference_configs["load_checkpoint_path"]   = ckpt_path
inference_configs["dtype"]                  = "bf16"     # GPU friendly

cfg = { **cfg_base,
        **{"data": data_configs},
        **inference_configs,
        "input_json_path": json_path,
        "dump_dir": tempfile.mkdtemp(prefix="pred_out_") }

cfg = parse_configs(cfg, fill_required_with_null=True)
runner = InferenceRunner(cfg) 
print("model is", type(runner.model))

Try to find the ccd cache data in the code directory for inference.
train scheduler 16.0
inference scheduler 16.0
Diffusion Module has 16.0


  checkpoint = torch.load(checkpoint_path, self.device)


model is <class 'protenix.model.protenix.Protenix'>


In [7]:
from protenix.data.infer_data_pipeline import InferenceDataset
import time

ds = InferenceDataset(json_path, dump_dir=".", use_msa=False)
rows = []

for idx in tqdm(range(len(ds)), desc="Featurize → Predict"):
    start = time.time()
    data, atom_array, err = ds[idx]
    print(f"[{idx}] Dataset load time: {time.time() - start:.2f}s")
    
    tid  = data["sample_name"]
    seq  = df.loc[df.target_id == tid, "sequence"].values[0]
    
    # —— NEW LENGTH‐CUTOFF —— 
    if len(seq) > MAX_LENGTH:
        print(f"  ▶ Skipping {tid}: length {len(seq)} > {MAX_LENGTH}")
        # pad with zeros so row‐count stays correct
        for i, res in enumerate(seq, 1):
            rows.append([f"{tid}_{i}", res, i] + [0.0]* (NUM_CONF*3))
        continue
    # ————————————————

    if err:
        print("error:" + err)
        # your existing error‐padding
        for i, res in enumerate(seq, 1):
            rows.append([f"{tid}_{i}", res, i] + [0.0]* (NUM_CONF*3))
        continue

    # now safe to run Protenix on a sequence ≤ MAX_LENGTH
    start = time.time()
    runner.update_model_configs(update_inference_configs(cfg, int(data["N_token"])))
    print(f"[{idx}] Config update time: {time.time() - start:.2f}s")
    
    start = time.time()
    with torch.no_grad():
        prediction_output = runner.predict(data)
        coord = prediction_output["coordinate"]
        
        # Extract pLDDT scores
        plddt_scores = None
        for key in ["plddt", "confidence", "plDDT", "lddt"]:
            if key in prediction_output:
                plddt_scores = prediction_output[key]
                break
        
        if plddt_scores is None:
            print(f"Warning: No confidence scores found for {tid}")
            print(f"Available keys: {list(prediction_output.keys())}")
            plddt_scores = torch.zeros(coord.shape[0], coord.shape[1])
            
    print(f"[{idx}] Prediction time: {time.time() - start:.2f}s")

    c1_mask = data["input_feature_dict"]["atom_to_tokatom_idx"] == 12
    coord   = coord[:, c1_mask, :]                  # [N_sample, L, 3]
    # Extract pLDDT for C1' atoms only (to match coordinate extraction)
    if isinstance(plddt_scores, torch.Tensor):
        plddt_c1 = plddt_scores[:, c1_mask.cpu().numpy()]  # [N_sample, L_c1]
    else:
        plddt_c1 = torch.zeros(NUM_CONF, len(seq))

    # ensure exactly NUM_CONF samples for both coordinates and confidence
    while coord.shape[0] < NUM_CONF:
        coord = torch.cat([coord, coord[-1:]], dim=0)
        plddt_c1 = torch.cat([plddt_c1, plddt_c1[-1:]], dim=0)

    # Compute single confidence score per prediction
    confidence_scores = []
    for sample_idx in range(NUM_CONF):
        if isinstance(plddt_c1, torch.Tensor):
            plddt_per_residue = plddt_c1[sample_idx].cpu().numpy()
        else:
            plddt_per_residue = np.zeros(len(seq))
        
        sequence_confidence = compute_single_confidence_score(plddt_per_residue)
        confidence_scores.append(sequence_confidence)
    
    # Store confidence for this target
    confidence_data = {
        'target_id': tid,
        'sequence_length': len(seq),
        'confidence_1': confidence_scores[0],
        'confidence_2': confidence_scores[1],
        'confidence_3': confidence_scores[2],
        'confidence_4': confidence_scores[3],
        'confidence_5': confidence_scores[4]
    }
    if 'confidence_rows' not in globals():
        confidence_rows = []
    confidence_rows.append(confidence_data)

    for i, res in enumerate(seq, 1):
        triplets = coord[:, i-1, :].cpu().numpy().reshape(-1)
        rows.append([f"{tid}_{i}", res, i] + triplets.tolist() + confidence_scores)


coord_cols = [f"{ax}_{k}" for k in range(1,6) for ax in ("x","y","z")]
conf_cols = [f"confidence_{k}" for k in range(1,6)]
cols = (["ID", "resname", "resid"] + coord_cols + conf_cols)
sub  = pd.DataFrame(rows, columns=cols)
sub.to_csv("submission.csv", index=False)
print("submission.csv written — shape:", sub.shape)
sub.head()

Featurize → Predict:   0%|          | 0/94 [00:00<?, ?it/s]

[0] Dataset load time: 16.67s
[0] Config update time: 0.00s


  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
  with torch.cuda.amp.autocast(enabled=False):
Featurize → Predict:   1%|          | 1/94 [02:10<3:21:41, 130.12s/it]

[0] Prediction time: 113.44s
[1] Dataset load time: 0.23s
[1] Config update time: 0.00s


Featurize → Predict:   2%|▏         | 2/94 [03:03<2:09:58, 84.77s/it] 

[1] Prediction time: 52.78s
[2] Dataset load time: 5.23s
[2] Config update time: 0.00s


Featurize → Predict:   3%|▎         | 3/94 [16:55<10:46:27, 426.23s/it]

[2] Prediction time: 827.31s
[3] Dataset load time: 0.22s
[3] Config update time: 0.00s


Featurize → Predict:   4%|▍         | 4/94 [17:48<6:58:12, 278.81s/it] 

[3] Prediction time: 52.58s
[4] Dataset load time: 0.19s
[4] Config update time: 0.00s


Featurize → Predict:   5%|▌         | 5/94 [18:40<4:52:13, 197.00s/it]

[4] Prediction time: 51.75s
[5] Dataset load time: 0.23s
[5] Config update time: 0.00s


Featurize → Predict:   6%|▋         | 6/94 [19:33<3:37:09, 148.07s/it]

[5] Prediction time: 52.84s
[6] Dataset load time: 3.91s
[6] Config update time: 0.00s


Featurize → Predict:   7%|▋         | 7/94 [29:31<7:08:11, 295.31s/it]

[6] Prediction time: 594.50s
[7] Dataset load time: 3.80s
[7] Config update time: 0.00s


Featurize → Predict:   9%|▊         | 8/94 [39:27<9:20:08, 390.80s/it]

[7] Prediction time: 591.43s
[8] Dataset load time: 0.20s
[8] Config update time: 0.00s


Featurize → Predict:  10%|▉         | 9/94 [40:19<6:43:36, 284.90s/it]

[8] Prediction time: 51.83s
[9] Dataset load time: 0.38s
[9] Config update time: 0.00s


Featurize → Predict:  11%|█         | 10/94 [41:23<5:03:28, 216.77s/it]

[9] Prediction time: 63.83s
[10] Dataset load time: 0.10s
[10] Config update time: 0.00s


Featurize → Predict:  12%|█▏        | 11/94 [42:12<3:48:51, 165.44s/it]

[10] Prediction time: 48.97s
[11] Dataset load time: 3.48s
[11] Config update time: 0.00s


Featurize → Predict:  13%|█▎        | 12/94 [51:07<6:19:32, 277.72s/it]

[11] Prediction time: 531.00s
[12] Dataset load time: 0.44s
[12] Config update time: 0.00s


Featurize → Predict:  14%|█▍        | 13/94 [52:24<4:52:53, 216.95s/it]

[12] Prediction time: 76.68s
[13] Dataset load time: 0.23s
[13] Config update time: 0.00s


Featurize → Predict:  15%|█▍        | 14/94 [53:17<3:43:28, 167.60s/it]

[13] Prediction time: 53.34s
[14] Dataset load time: 1.06s
[14] Config update time: 0.00s


Featurize → Predict:  16%|█▌        | 15/94 [55:59<3:38:12, 165.72s/it]

[14] Prediction time: 160.29s
[15] Dataset load time: 0.38s
[15] Config update time: 0.00s


Featurize → Predict:  17%|█▋        | 16/94 [57:05<2:56:39, 135.89s/it]

[15] Prediction time: 66.23s
[16] Dataset load time: 0.23s
[16] Config update time: 0.00s


Featurize → Predict:  18%|█▊        | 17/94 [57:58<2:22:26, 111.00s/it]

[16] Prediction time: 52.86s
[17] Dataset load time: 0.24s
[17] Config update time: 0.00s


Featurize → Predict:  19%|█▉        | 18/94 [58:52<1:58:46, 93.77s/it] 

[17] Prediction time: 53.42s
[18] Dataset load time: 0.23s
[18] Config update time: 0.00s


Featurize → Predict:  20%|██        | 19/94 [59:45<1:41:48, 81.44s/it]

[18] Prediction time: 52.50s
[19] Dataset load time: 0.16s
[19] Config update time: 0.00s


Featurize → Predict:  21%|██▏       | 20/94 [1:00:35<1:28:44, 71.95s/it]

[19] Prediction time: 49.64s
[20] Dataset load time: 0.11s
[20] Config update time: 0.00s


Featurize → Predict:  22%|██▏       | 21/94 [1:01:24<1:19:24, 65.26s/it]

[20] Prediction time: 49.56s
[21] Dataset load time: 0.66s
[21] Config update time: 0.00s


Featurize → Predict:  23%|██▎       | 22/94 [1:03:01<1:29:45, 74.80s/it]

[21] Prediction time: 96.36s
[22] Dataset load time: 0.10s
[22] Config update time: 0.00s


Featurize → Predict:  24%|██▍       | 23/94 [1:03:50<1:19:23, 67.09s/it]

[22] Prediction time: 49.01s
[23] Dataset load time: 0.16s
[23] Config update time: 0.00s


Featurize → Predict:  26%|██▌       | 24/94 [1:04:40<1:12:15, 61.94s/it]

[23] Prediction time: 49.75s
[24] Dataset load time: 0.18s
[24] Config update time: 0.00s


Featurize → Predict:  27%|██▋       | 25/94 [1:05:31<1:07:17, 58.51s/it]

[24] Prediction time: 50.34s
[25] Dataset load time: 0.05s
[25] Config update time: 0.00s


Featurize → Predict:  28%|██▊       | 26/94 [1:06:19<1:02:39, 55.28s/it]

[25] Prediction time: 47.70s
[26] Dataset load time: 3.88s
[26] Config update time: 0.00s


Featurize → Predict:  29%|██▊       | 27/94 [1:16:17<4:03:39, 218.20s/it]

[26] Prediction time: 594.39s
[27] Dataset load time: 0.22s
[27] Config update time: 0.00s


Featurize → Predict:  30%|██▉       | 28/94 [1:17:10<3:05:26, 168.58s/it]

[27] Prediction time: 52.60s
[28] Dataset load time: 0.45s
[28] Config update time: 0.00s


Featurize → Predict:  31%|███       | 29/94 [1:18:26<2:32:38, 140.89s/it]

[28] Prediction time: 75.82s
[29] Dataset load time: 0.23s
[29] Config update time: 0.00s


Featurize → Predict:  32%|███▏      | 30/94 [1:19:19<2:02:13, 114.58s/it]

[29] Prediction time: 52.97s
[30] Dataset load time: 0.17s
[30] Config update time: 0.00s


Featurize → Predict:  33%|███▎      | 31/94 [1:20:09<1:39:56, 95.18s/it] 

[30] Prediction time: 49.72s
[31] Dataset load time: 0.57s
[31] Config update time: 0.00s


Featurize → Predict:  34%|███▍      | 32/94 [1:21:39<1:36:40, 93.55s/it]

[31] Prediction time: 89.18s
[32] Dataset load time: 3.99s
[32] Config update time: 0.00s


Featurize → Predict:  36%|███▌      | 34/94 [1:32:51<3:11:27, 191.45s/it]

[33] Prediction time: 48.94s
[34] Dataset load time: 0.33s
[34] Config update time: 0.00s


Featurize → Predict:  37%|███▋      | 35/94 [1:33:52<2:29:44, 152.28s/it]

[34] Prediction time: 60.53s
[35] Dataset load time: 0.16s
[35] Config update time: 0.00s


Featurize → Predict:  38%|███▊      | 36/94 [1:34:42<1:57:28, 121.53s/it]

[35] Prediction time: 49.63s
[36] Dataset load time: 0.83s
[36] Config update time: 0.00s


Featurize → Predict:  39%|███▉      | 37/94 [1:36:50<1:57:19, 123.50s/it]

[36] Prediction time: 127.24s
[37] Dataset load time: 0.22s
[37] Config update time: 0.00s


Featurize → Predict:  40%|████      | 38/94 [1:37:43<1:35:30, 102.33s/it]

[37] Prediction time: 52.69s
[38] Dataset load time: 0.47s
[38] Config update time: 0.00s


Featurize → Predict:  41%|████▏     | 39/94 [1:39:05<1:28:20, 96.37s/it] 

[38] Prediction time: 81.97s
[39] Dataset load time: 5.21s
[39] Config update time: 0.00s


Featurize → Predict:  43%|████▎     | 40/94 [1:52:58<4:45:31, 317.25s/it]

[39] Prediction time: 827.39s
[40] Dataset load time: 0.17s
[40] Config update time: 0.00s


Featurize → Predict:  44%|████▎     | 41/94 [1:53:48<3:29:31, 237.19s/it]

[40] Prediction time: 50.21s
[41] Dataset load time: 0.32s
[41] Config update time: 0.00s


Featurize → Predict:  45%|████▍     | 42/94 [1:54:49<2:39:46, 184.36s/it]

[41] Prediction time: 60.74s
[42] Dataset load time: 0.16s
[42] Config update time: 0.00s


Featurize → Predict:  46%|████▌     | 43/94 [1:55:40<2:02:35, 144.22s/it]

[42] Prediction time: 50.40s
[43] Dataset load time: 0.46s
[43] Config update time: 0.00s


Featurize → Predict:  47%|████▋     | 44/94 [1:56:57<1:43:24, 124.09s/it]

[43] Prediction time: 76.64s
[44] Dataset load time: 0.15s
[44] Config update time: 0.00s


Featurize → Predict:  48%|████▊     | 45/94 [1:57:47<1:23:14, 101.93s/it]

[44] Prediction time: 50.09s
[45] Dataset load time: 0.24s
[45] Config update time: 0.00s


Featurize → Predict:  49%|████▉     | 46/94 [1:58:41<1:09:58, 87.47s/it] 

[45] Prediction time: 53.48s
[46] Dataset load time: 0.23s
[46] Config update time: 0.00s


Featurize → Predict:  50%|█████     | 47/94 [1:59:35<1:00:35, 77.36s/it]

[46] Prediction time: 53.52s
[47] Dataset load time: 0.15s
[47] Config update time: 0.00s


Featurize → Predict:  51%|█████     | 48/94 [2:00:25<52:58, 69.10s/it]  

[47] Prediction time: 49.68s
[48] Dataset load time: 0.13s
[48] Config update time: 0.00s


Featurize → Predict:  52%|█████▏    | 49/94 [2:01:14<47:22, 63.16s/it]

[48] Prediction time: 49.15s
[49] Dataset load time: 0.31s
[49] Config update time: 0.00s


Featurize → Predict:  53%|█████▎    | 50/94 [2:02:13<45:30, 62.05s/it]

[49] Prediction time: 59.15s
[50] Dataset load time: 0.18s
[50] Config update time: 0.00s


Featurize → Predict:  54%|█████▍    | 51/94 [2:03:05<42:15, 58.96s/it]

[50] Prediction time: 51.58s
[51] Dataset load time: 0.07s
[51] Config update time: 0.00s


Featurize → Predict:  55%|█████▌    | 52/94 [2:03:54<39:11, 55.98s/it]

[51] Prediction time: 48.96s
[52] Dataset load time: 0.19s
[52] Config update time: 0.00s


Featurize → Predict:  56%|█████▋    | 53/94 [2:04:46<37:26, 54.79s/it]

[52] Prediction time: 51.82s
[53] Dataset load time: 0.11s
[53] Config update time: 0.00s


Featurize → Predict:  57%|█████▋    | 54/94 [2:05:35<35:21, 53.03s/it]

[53] Prediction time: 48.79s
[54] Dataset load time: 0.19s
[54] Config update time: 0.00s


Featurize → Predict:  59%|█████▊    | 55/94 [2:06:27<34:14, 52.68s/it]

[54] Prediction time: 51.67s
[55] Dataset load time: 0.13s
[55] Config update time: 0.00s


Featurize → Predict:  60%|█████▉    | 56/94 [2:07:16<32:37, 51.51s/it]

[55] Prediction time: 48.65s
[56] Dataset load time: 0.19s
[56] Config update time: 0.00s


Featurize → Predict:  61%|██████    | 57/94 [2:08:08<31:50, 51.65s/it]

[56] Prediction time: 51.78s
[57] Dataset load time: 0.11s
[57] Config update time: 0.00s


Featurize → Predict:  62%|██████▏   | 58/94 [2:08:57<30:31, 50.88s/it]

[57] Prediction time: 48.96s
[58] Dataset load time: 0.21s
[58] Config update time: 0.00s


Featurize → Predict:  63%|██████▎   | 59/94 [2:09:50<30:00, 51.46s/it]

[58] Prediction time: 52.59s
[59] Dataset load time: 0.43s
[59] Config update time: 0.00s


Featurize → Predict:  64%|██████▍   | 60/94 [2:11:07<33:31, 59.16s/it]

[59] Prediction time: 76.69s
[60] Dataset load time: 0.19s
[60] Config update time: 0.00s


Featurize → Predict:  65%|██████▍   | 61/94 [2:11:59<31:22, 57.04s/it]

[60] Prediction time: 51.92s
[61] Dataset load time: 0.30s
[61] Config update time: 0.00s


Featurize → Predict:  66%|██████▌   | 62/94 [2:12:57<30:39, 57.48s/it]

[61] Prediction time: 58.19s
[62] Dataset load time: 0.43s
[62] Config update time: 0.00s


Featurize → Predict:  67%|██████▋   | 63/94 [2:14:14<32:44, 63.36s/it]

[62] Prediction time: 76.62s
[63] Dataset load time: 0.11s
[63] Config update time: 0.00s


Featurize → Predict:  68%|██████▊   | 64/94 [2:15:04<29:32, 59.08s/it]

[63] Prediction time: 48.99s
[64] Dataset load time: 0.10s
[64] Config update time: 0.00s


Featurize → Predict:  69%|██████▉   | 65/94 [2:15:53<27:07, 56.12s/it]

[64] Prediction time: 49.12s
[65] Dataset load time: 0.85s
[65] Config update time: 0.00s


Featurize → Predict:  70%|███████   | 66/94 [2:17:32<32:16, 69.17s/it]

[65] Prediction time: 98.76s
[66] Dataset load time: 0.44s
[66] Config update time: 0.00s


Featurize → Predict:  71%|███████▏  | 67/94 [2:18:49<32:11, 71.54s/it]

[66] Prediction time: 76.60s
[67] Dataset load time: 0.40s
[67] Config update time: 0.00s


Featurize → Predict:  72%|███████▏  | 68/94 [2:19:58<30:35, 70.59s/it]

[67] Prediction time: 67.97s
[68] Dataset load time: 0.23s
[68] Config update time: 0.00s


Featurize → Predict:  73%|███████▎  | 69/94 [2:20:51<27:12, 65.29s/it]

[68] Prediction time: 52.69s
[69] Dataset load time: 0.23s
[69] Config update time: 0.00s


Featurize → Predict:  74%|███████▍  | 70/94 [2:21:43<24:34, 61.43s/it]

[69] Prediction time: 52.17s
[70] Dataset load time: 3.94s
[70] Config update time: 0.00s


Featurize → Predict:  76%|███████▌  | 71/94 [2:31:42<1:25:20, 222.63s/it]

[70] Prediction time: 594.80s
[71] Dataset load time: 0.49s
[71] Config update time: 0.00s


Featurize → Predict:  77%|███████▋  | 72/94 [2:33:05<1:06:19, 180.88s/it]

[71] Prediction time: 82.97s
[72] Dataset load time: 0.15s
[72] Config update time: 0.00s


Featurize → Predict:  78%|███████▊  | 73/94 [2:33:56<49:36, 141.75s/it]  

[72] Prediction time: 50.28s
[73] Dataset load time: 0.54s
[73] Config update time: 0.00s


Featurize → Predict:  79%|███████▊  | 74/94 [2:35:26<42:02, 126.13s/it]

[73] Prediction time: 89.15s
[74] Dataset load time: 0.18s
[74] Config update time: 0.00s


Featurize → Predict:  80%|███████▉  | 75/94 [2:36:17<32:53, 103.86s/it]

[74] Prediction time: 51.69s
[75] Dataset load time: 0.30s
[75] Config update time: 0.00s


Featurize → Predict:  81%|████████  | 76/94 [2:37:16<27:04, 90.24s/it] 

[75] Prediction time: 58.15s
[76] Dataset load time: 3.85s
[76] Config update time: 0.00s


Featurize → Predict:  82%|████████▏ | 77/94 [2:47:19<1:09:12, 244.24s/it]

[76] Prediction time: 599.68s
[77] Dataset load time: 0.45s
[77] Config update time: 0.00s


Featurize → Predict:  83%|████████▎ | 78/94 [2:48:37<51:45, 194.09s/it]  

[77] Prediction time: 76.63s
[78] Dataset load time: 0.28s
[78] Config update time: 0.00s


Featurize → Predict:  84%|████████▍ | 79/94 [2:49:32<38:05, 152.39s/it]

[78] Prediction time: 54.80s
[79] Dataset load time: 4.20s
[79] Config update time: 0.00s


Featurize → Predict:  85%|████████▌ | 80/94 [3:00:35<1:11:21, 305.81s/it]

[79] Prediction time: 659.56s
[80] Dataset load time: 0.38s
[80] Config update time: 0.00s


Featurize → Predict:  86%|████████▌ | 81/94 [3:01:41<50:37, 233.69s/it]  

[80] Prediction time: 65.03s
[81] Dataset load time: 0.22s
[81] Config update time: 0.00s


Featurize → Predict:  87%|████████▋ | 82/94 [3:02:32<35:47, 178.98s/it]

[81] Prediction time: 51.10s
[82] Dataset load time: 0.22s
[82] Config update time: 0.00s


Featurize → Predict:  88%|████████▊ | 83/94 [3:03:25<25:52, 141.16s/it]

[82] Prediction time: 52.66s
[83] Dataset load time: 0.19s
[83] Config update time: 0.00s


Featurize → Predict:  89%|████████▉ | 84/94 [3:04:17<19:02, 114.25s/it]

[83] Prediction time: 51.26s
[84] Dataset load time: 0.37s
[84] Config update time: 0.00s


Featurize → Predict:  90%|█████████ | 85/94 [3:05:23<14:58, 99.78s/it] 

[84] Prediction time: 65.66s
[85] Dataset load time: 0.52s
[85] Config update time: 0.00s


Featurize → Predict:  91%|█████████▏| 86/94 [3:06:47<12:41, 95.15s/it]

[85] Prediction time: 83.82s
[86] Dataset load time: 0.18s
[86] Config update time: 0.00s


Featurize → Predict:  93%|█████████▎| 87/94 [3:07:37<09:31, 81.67s/it]

[86] Prediction time: 50.02s
[87] Dataset load time: 0.68s
[87] Config update time: 0.00s


Featurize → Predict:  94%|█████████▎| 88/94 [3:09:24<08:55, 89.21s/it]

[87] Prediction time: 106.14s
[88] Dataset load time: 0.39s
[88] Config update time: 0.00s


Featurize → Predict:  95%|█████████▍| 89/94 [3:10:31<06:52, 82.58s/it]

[88] Prediction time: 66.70s
[89] Dataset load time: 0.32s
[89] Config update time: 0.00s


Featurize → Predict:  96%|█████████▌| 90/94 [3:11:31<05:02, 75.69s/it]

[89] Prediction time: 59.28s
[90] Dataset load time: 0.29s
[90] Config update time: 0.00s


Featurize → Predict:  97%|█████████▋| 91/94 [3:12:25<03:27, 69.26s/it]

[90] Prediction time: 53.98s
[91] Dataset load time: 0.32s
[91] Config update time: 0.00s


Featurize → Predict:  98%|█████████▊| 92/94 [3:13:23<02:12, 66.01s/it]

[91] Prediction time: 58.09s
[92] Dataset load time: 0.19s
[92] Config update time: 0.00s


Featurize → Predict:  99%|█████████▉| 93/94 [3:14:15<01:01, 61.59s/it]

[92] Prediction time: 51.08s
[93] Dataset load time: 0.44s
[93] Config update time: 0.00s


Featurize → Predict: 100%|██████████| 94/94 [3:15:32<00:00, 124.81s/it]

[93] Prediction time: 76.66s





submission.csv written — shape: (13782, 23)


Unnamed: 0,ID,resname,resid,x_1,y_1,z_1,x_2,y_2,z_2,x_3,...,y_4,z_4,x_5,y_5,z_5,confidence_1,confidence_2,confidence_3,confidence_4,confidence_5
0,9L5R_2_1,A,1,-23.133446,5.648046,-55.872604,24.893173,15.280736,39.96698,5.991918,...,-9.33133,64.695862,-48.705055,13.416087,29.238045,0.062999,0.048493,0.090558,0.110703,0.090135
1,9L5R_2_2,G,2,-24.690014,7.824708,-51.199677,19.621548,17.113182,38.998108,5.593056,...,-12.77895,60.663078,-43.725693,15.635481,29.560118,0.062999,0.048493,0.090558,0.110703,0.090135
2,9L5R_2_3,C,3,-27.746859,7.639439,-46.665176,15.80904,18.199286,35.121109,2.458481,...,-15.455084,55.955589,-38.693649,15.85153,31.901075,0.062999,0.048493,0.090558,0.110703,0.090135
3,9L5R_2_4,U,4,-30.402573,4.543214,-43.304604,14.658719,16.58939,30.191492,-2.46685,...,-15.825209,51.451309,-34.772202,13.331332,34.730221,0.062999,0.048493,0.090558,0.110703,0.090135
4,9L5R_2_5,C,5,-31.527815,-0.321229,-41.511032,15.135366,11.992863,27.347424,-7.38567,...,-13.567739,47.877548,-32.685989,8.666122,36.40274,0.062999,0.048493,0.090558,0.110703,0.090135


In [8]:
import shutil
import os

# Copy USalign to working directory and make it executable
shutil.copy2("/kaggle/input/usalign/USalign", "/kaggle/working/USalign")
os.chmod("/kaggle/working/USalign", 0o755)

print("USalign copied to /kaggle/working/ and made executable")

USalign copied to /kaggle/working/ and made executable


In [9]:
import os
import re
import numpy as np
import pandas as pd

def parse_tmscore_output(output):
    """Parse TM-score from USalign output with detailed debugging"""
    print(f"DEBUG: Raw USalign output:")
    print(f"'{output}'")
    print(f"DEBUG: Output length: {len(output)}")
    
    if not output.strip():
        print("Warning: Empty output from USalign")
        return 0.0
    
    # Look for all TM-score patterns
    tm_score_matches = re.findall(r'TM-score=\s*([\d.]+)', output)
    print(f"DEBUG: Found TM-score matches: {tm_score_matches}")
    
    if len(tm_score_matches) == 0:
        print("Warning: No TM-score found in output")
        return 0.0
    elif len(tm_score_matches) == 1:
        print("Warning: Only one TM-score found, using it")
        return float(tm_score_matches[0])
    else:
        print(f"Found {len(tm_score_matches)} TM-scores, using the second one")
        return float(tm_score_matches[1])

def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num,
    x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    return (
        f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s} '
        f'{residue_num:>3d}    {x_coord:>8.3f}{y_coord:>8.3f}'
        f'{z_coord:>8.3f}{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'
    )

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str) -> int:
    resolved_cnt = 0
    with open(target_path, 'w') as f:
        for _, row in df.iterrows():
            x = row[f'x_{xyz_id}']; y = row[f'y_{xyz_id}']; z = row[f'z_{xyz_id}']
            if x > -1e17 and y > -1e17 and z > -1e17:
                resolved_cnt += 1
                f.write(write_target_line(
                    atom_name="C1'", atom_serial=int(row['resid']),
                    residue_name=row['resname'], chain_id='0',
                    residue_num=int(row['resid']),
                    x_coord=x, y_coord=y, z_coord=z, atom_type='C'
                ))
    return resolved_cnt

def test_usalign():
    """Test if USalign is working properly"""
    usalign_path = "/kaggle/working/USalign"
    
    # Check if file exists
    if not os.path.exists(usalign_path):
        print(f"ERROR: USalign not found at {usalign_path}")
        return False
    
    # Check if it's executable
    if not os.access(usalign_path, os.X_OK):
        print(f"ERROR: USalign at {usalign_path} is not executable")
        print("Trying to make it executable...")
        os.chmod(usalign_path, 0o755)
    
    # Test basic execution
    try:
        test_output = os.popen(f'{usalign_path} 2>&1').read()
        print(f"USalign test output: {test_output[:200]}...")
        return True
    except Exception as e:
        print(f"ERROR testing USalign: {e}")
        return False

def score_and_report_debug(solution: pd.DataFrame, submission: pd.DataFrame):
    """Scoring function with extensive debugging"""
    print("=== Starting scoring with debug output ===")
    
    # Test USalign first
    if not test_usalign():
        print("USalign test failed, cannot proceed with scoring")
        return {}, 0.0
    
    # extract target_id
    solution['target_id'] = solution['ID'].str.split('_').str[0]
    submission['target_id'] = submission['ID'].str.split('_').str[0]

    native_idxs = sorted(int(c.split('_')[1])
                         for c in solution.columns if c.startswith('x_'))
    print(f"Native structure indices: {native_idxs}")

    usalign = "/kaggle/working/USalign"
    per_target = {}

    # Test with just the first target for debugging
    target_ids = solution['target_id'].unique()
    print(f"Found {len(target_ids)} targets, testing first one for debugging...")
    
    for target_idx, (tid, grp_nat) in enumerate(solution.groupby('target_id')):
        print(f"\n=== Processing target {tid} ({target_idx+1}/{len(target_ids)}) ===")
        grp_pred = submission[submission['target_id'] == tid]
        
        print(f"Native group shape: {grp_nat.shape}")
        print(f"Predicted group shape: {grp_pred.shape}")
        
        best_of_five = []

        for pred_cnt in range(1, 6):
            print(f"\n--- Testing prediction {pred_cnt} ---")
            best_for_this_pred = 0.0
            
            for nat_cnt in native_idxs:
                print(f"Comparing prediction {pred_cnt} vs native {nat_cnt}")
                
                n_nat = write2pdb(grp_nat, nat_cnt, 'native.pdb')
                n_pred = write2pdb(grp_pred, pred_cnt, 'predicted.pdb')
                
                print(f"Native atoms written: {n_nat}, Predicted atoms written: {n_pred}")
                
                if n_nat > 0 and n_pred > 0:
                    cmd = f'{usalign} predicted.pdb native.pdb -atom " C1\'"'
                    print(f"Running command: {cmd}")
                    
                    try:
                        out = os.popen(cmd).read()
                        score = parse_tmscore_output(out)
                        print(f"TM-score: {score}")
                        best_for_this_pred = max(best_for_this_pred, score)
                    except Exception as e:
                        print(f"Error running USalign: {e}")
                        continue
                else:
                    print("Skipping due to empty structures")
            
            best_of_five.append(best_for_this_pred)
            print(f"Best score for prediction {pred_cnt}: {best_for_this_pred}")

        per_target[tid] = best_of_five
        print(f"{tid}: TM-scores per model = {best_of_five}, best = {max(best_of_five):.4f}")
        
        # Only process first target for debugging, remove this break for full scoring
        if target_idx == 0:
            print("=== Debug mode: stopping after first target ===")
            break

    overall = np.mean([max(v) for v in per_target.values()]) if per_target else 0.0
    print(f"\n>>> mean best-of-5 TM-score = {overall:.4f}")
    return per_target, overall

# Quick function to check PDB files
def check_pdb_files():
    """Check if PDB files are being created correctly"""
    for filename in ['native.pdb', 'predicted.pdb']:
        if os.path.exists(filename):
            with open(filename, 'r') as f:
                content = f.read()
                print(f"\n=== {filename} content (first 500 chars) ===")
                print(content[:500])
                print(f"=== {filename} total lines: {len(content.splitlines())} ===")
        else:
            print(f"{filename} does not exist")

# Main execution
if __name__ == "__main__":
    solution = pd.read_csv(
        "/kaggle/input/validation-labels-clean-csv/validation_labels_clean.csv"
    )
    submission = pd.read_csv("submission.csv")

    print("Solution columns:", solution.columns.tolist())
    print("Submission columns:", submission.columns.tolist())
    print("Solution shape:", solution.shape)
    print("Submission shape:", submission.shape)

    # Run debug scoring
    per_target_scores, mean_tm = score_and_report_debug(solution, submission)
    
    # Check PDB files after scoring
    check_pdb_files()

Solution columns: ['ID', 'resname', 'resid', 'x_1', 'y_1', 'z_1']
Submission columns: ['ID', 'resname', 'resid', 'x_1', 'y_1', 'z_1', 'x_2', 'y_2', 'z_2', 'x_3', 'y_3', 'z_3', 'x_4', 'y_4', 'z_4', 'x_5', 'y_5', 'z_5', 'confidence_1', 'confidence_2', 'confidence_3', 'confidence_4', 'confidence_5']
Solution shape: (13782, 6)
Submission shape: (13782, 23)
=== Starting scoring with debug output ===
USalign test output: 
 ********************************************************************
 * US-align (Version 20241108)                                      *
 * Universal Structure Alignment of Proteins and Nucleic Ac...
Native structure indices: [1]
Found 83 targets, testing first one for debugging...

=== Processing target 8K85 (1/83) ===
Native group shape: (56, 7)
Predicted group shape: (56, 24)

--- Testing prediction 1 ---
Comparing prediction 1 vs native 1
Native atoms written: 56, Predicted atoms written: 56
Running command: /kaggle/working/USalign predicted.pdb native.pdb -atom " C1

In [10]:
submission.to_csv("/kaggle/working/protenix_submission_with_confidence.csv", index=False)