## Install requirements and configure the environment

In [None]:
import os, sys, json, tempfile, pathlib, subprocess, re, time
from   timeit import default_timer as timer
import numpy  as np
import pandas as pd
import torch
from   tqdm   import tqdm

# ── user flags ──────────────────────────────────────────────────────────
MODE        = "local"            #  <<<  "local"  or  "submit"
RUN_LOCAL   = True
RUN_KAGGLE  = not RUN_LOCAL

NUM_CONF=5
MAX_LENGTH=1000

# assert torch.cuda.is_available(), "Need an NVIDIA GPU."
# print("torch", torch.__version__, "| cuda:", torch.version.cuda,
#       "| gpu:", torch.cuda.get_device_name(0))

# ── pip installs (done once) ────────────────────────────────────────────
!pip install --no-deps protenix biopython ml-collections \
                      biotite==1.0.1 rdkit

# ── Protenix resource directory ────────────────────────────────────────
os.environ["USE_DEEPSPEED_EVO_ATTENTION"] = "True"

if RUN_LOCAL:
    ROOT_DIR = "/home/max/Documents/Protenix-KaggleRNA3D/af3-dev"
else:
    ROOT_DIR = "/kaggle/input/protenix/af3-dev"
        
os.environ["PROTENIX_DATA_ROOT_DIR"] = ROOT_DIR
print("PROTENIX_DATA_ROOT_DIR →", ROOT_DIR)

print("Setting random seeds for deterministic prediction...")
np.random.seed(0)
torch.random.manual_seed(0)
torch.cuda.manual_seed_all(0)

In [None]:
# ── input tables ────────────────────────────────────────────────────────
# Choose the competition data directory for the machine we are running on
# (RUN_LOCAL), so the labels are never read from a Kaggle path on the local
# box and the sequences are never read from a local path on Kaggle.
if RUN_LOCAL:
    DATA_DIR = "/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding"
    LABEL_FILE = "validation_labels_clean.csv"
else:
    DATA_DIR = "/kaggle/input/stanford-rna-3d-folding"
    LABEL_FILE = "validation_labels.csv"

# MODE == "local" evaluates on the validation split; otherwise the test split.
SEQ_CSV = f'{DATA_DIR}/{"validation" if MODE == "local" else "test"}_sequences.csv'
df = pd.read_csv(SEQ_CSV)

if MODE == "local":
    # Labels are only needed when scoring; load them once.
    LABEL_CSV = f"{DATA_DIR}/{LABEL_FILE}"
    label_df = pd.read_csv(LABEL_CSV)
    # Row IDs look like "<target_id>_<resid>"; strip the trailing residue index.
    label_df["target_id"] = label_df.ID.str.rsplit(pat="_", n=1).str[0]

# build input JSON --------------------------------------------------------
# One entry per target, each holding a single RNA chain.
samples = [
    {"name": tid,
     "sequences": [{"rnaSequence": {"sequence": seq, "count": 1}}]}
    for seq, tid in zip(df.sequence, df.target_id)
]

# NamedTemporaryFile creates the file atomically (tempfile.mktemp is
# deprecated and race-prone); delete=False keeps it on disk for later cells,
# and the `with` block guarantees the JSON is flushed and closed.
with tempfile.NamedTemporaryFile(
    "w", prefix="protenix_inputs_", suffix=".json", delete=False
) as json_file:
    json.dump(samples, json_file)
    json_path = json_file.name
print("json →", json_path)

In [None]:
from configs.configs_base       import configs as cfg_base
from configs.configs_data       import data_configs
from configs.configs_inference  import inference_configs
from protenix.config.config     import parse_configs
from runner.inference           import InferenceRunner, update_inference_configs

# Checkpoint loaded by the inference runner.
ckpt_path = "/home/max/Documents/ProtenixFinetuningFinalResults/9999_NoMSA.pt"

# NOTE(review): presumably consumed by the DeepSpeed Evo-attention kernels
# enabled below — confirm against the Protenix/DeepSpeed setup notes.
os.environ["CUTLASS_PATH"] = "/home/max/Documents/Protenix-KaggleRNA3D/Cutlass/cutlass"

# ── model / sampling overrides ──────────────────────────────────────────
cfg_base["use_deepspeed_evo_attention"]     = True
cfg_base["model"]["N_cycle"]                = 10
cfg_base["sample_diffusion"]["N_step"]      = 200
# The CSV writers emit exactly NUM_CONF conformers per residue, so request
# that many samples instead of repeating the literal.
cfg_base["sample_diffusion"]["N_sample"]    = NUM_CONF
inference_configs["load_checkpoint_path"]   = ckpt_path
inference_configs["dtype"]                  = "bf16"
inference_configs["template"] = {
    "use_templates": False,
    "template_mmcif_dir": "",
}

# Merge order matters: later entries override earlier ones on key clashes.
cfg = {
    **cfg_base,
    "data": data_configs,
    **inference_configs,
    "input_json_path": json_path,
    "dump_dir": tempfile.mkdtemp(prefix="pred_out_"),
}

cfg = parse_configs(cfg, fill_required_with_null=True)
runner = InferenceRunner(cfg)
print("model is", type(runner.model))

In [None]:
from protenix.data.infer_data_pipeline import InferenceDataset

# --- Prediction Loop ---
# Featurize each target, run the model, and collect per-residue rows for the
# submission (coordinates), confidence (pLDDT) and ranking tables.
ds = InferenceDataset(json_path, dump_dir=".", use_msa=False)
submission_rows = []
confidence_rows = []
ranking_data = []

# Targets longer than this are not predicted; they get all-zero rows instead.
MAX_SEQ_LENGTH_THRESHOLD = 800


def _append_placeholder_rows(tid, seq):
    """Append all-zero coordinate, pLDDT and ranking entries for one target.

    Used for targets that are skipped or fail featurization, so every target
    still contributes one row per residue to the output tables.
    """
    for i, res in enumerate(seq, 1):
        submission_rows.append([f"{tid}_{i}", res, i] + [0.0] * (NUM_CONF * 3))
        confidence_rows.append([f"{tid}_{i}", res, i] + [0.0] * NUM_CONF)
    ranking_entry = {'target_id': tid}
    for k in range(NUM_CONF):
        ranking_entry[f'ptm_{k+1}'] = 0.0
        ranking_entry[f'ranking_score_{k+1}'] = 0.0
    ranking_data.append(ranking_entry)


for idx in tqdm(range(len(ds)), desc="Featurize → Predict"):
    data, atom_array, err = ds[idx]
    tid = data["sample_name"]
    seq = df.loc[df.target_id == tid, "sequence"].values[0]

    # Skip long sequences
    if len(seq) > MAX_SEQ_LENGTH_THRESHOLD:
        print(f"  - ID: {tid}, Length: {len(seq)} > {MAX_SEQ_LENGTH_THRESHOLD}. SKIPPING.")
        _append_placeholder_rows(tid, seq)
        continue

    # Handle featurization errors
    if err:
        print(f"  - ID: {tid}, Featurization ERROR: {err}")
        _append_placeholder_rows(tid, seq)
        continue

    # Run prediction
    runner.update_model_configs(update_inference_configs(cfg, int(data["N_token"])))
    with torch.no_grad():
        prediction_output = runner.predict(data)

    # Extract coordinates, pLDDT, and ranking scores
    coord = prediction_output["coordinate"]
    # atom_plddt is multiplied by 100 — presumably mapping 0-1 to 0-100; TODO confirm
    atom_plddts = [d['atom_plddt'] * 100 for d in prediction_output['full_data']]
    plddt_per_atom = torch.stack(atom_plddts, dim=0)
    ptm_scores = [d['ptm'].item() for d in prediction_output['summary_confidence']]
    ranking_scores = [d['ranking_score'].item() for d in prediction_output['summary_confidence']]

    # Missing samples (fewer than NUM_CONF) are reported as 0.0.
    ranking_entry = {'target_id': tid}
    for k in range(NUM_CONF):
        ranking_entry[f'ptm_{k+1}'] = ptm_scores[k] if k < len(ptm_scores) else 0.0
        ranking_entry[f'ranking_score_{k+1}'] = ranking_scores[k] if k < len(ranking_scores) else 0.0
    ranking_data.append(ranking_entry)
    print(f"  - ID: {tid}, Length: {len(seq)}, pTMs: {np.round(ptm_scores, 3)}")

    # Filter for C1' atoms
    # NOTE(review): assumes token-atom index 12 is C1' in Protenix's RNA atom ordering — confirm
    c1_mask = data["input_feature_dict"]["atom_to_tokatom_idx"] == 12
    coord_c1 = coord[:, c1_mask, :]
    plddt_c1 = plddt_per_atom[:, c1_mask]

    # Pad if necessary by repeating the last available sample
    while coord_c1.shape[0] < NUM_CONF:
        coord_c1 = torch.cat([coord_c1, coord_c1[-1:]], dim=0)
        plddt_c1 = torch.cat([plddt_c1, plddt_c1[-1:]], dim=0)
    # ...and drop surplus samples so every row matches the NUM_CONF-wide CSV header
    coord_c1 = coord_c1[:NUM_CONF]
    plddt_c1 = plddt_c1[:NUM_CONF]

    # Move to host memory once per target rather than once per residue.
    coord_np = coord_c1.to(torch.float32).cpu().numpy()
    plddt_np = plddt_c1.to(torch.float32).cpu().numpy()

    # Populate data for CSV files
    for i, res in enumerate(seq, 1):
        # Flatten (sample, xyz) → x_1, y_1, z_1, x_2, ...
        coord_triplets = coord_np[:, i - 1, :].reshape(-1)
        submission_rows.append([f"{tid}_{i}", res, i] + coord_triplets.tolist())

        plddt_scores_per_res = plddt_np[:, i - 1]
        confidence_rows.append([f"{tid}_{i}", res, i] + plddt_scores_per_res.tolist())

# --- Save Output Files ---
# Write the three result tables collected by the prediction loop to the
# current working directory.
print("\nCreating output files...")

# submission.csv: ID columns followed by x/y/z for each of the NUM_CONF models.
cols_sub = ["ID", "resname", "resid"]
cols_sub += [f"{ax}_{k}" for k in range(1, NUM_CONF + 1) for ax in "xyz"]
sub_df = pd.DataFrame(data=submission_rows, columns=cols_sub)
sub_df.to_csv("submission.csv", index=False)
print(f"submission.csv written — shape: {sub_df.shape}")

# confidence.csv: ID columns followed by one pLDDT value per model.
cols_conf = ["ID", "resname", "resid"]
cols_conf += [f"plddt_{k}" for k in range(1, NUM_CONF + 1)]
conf_df = pd.DataFrame(data=confidence_rows, columns=cols_conf)
conf_df.to_csv("confidence.csv", index=False)
print(f"confidence.csv written — shape: {conf_df.shape}")

# ranking_scores.csv: one row per target; columns come from the dict keys.
ranking_df = pd.DataFrame(data=ranking_data)
ranking_df.to_csv("ranking_scores.csv", index=False)
print(f"ranking_scores.csv written — shape: {ranking_df.shape}")

In [None]:
import os
import re
import numpy as np
import pandas as pd

def parse_tmscore_output(output):
    """Return the second ``TM-score=`` value found in USalign's text output.

    Args:
        output: Text captured from a USalign run.

    Returns:
        The second reported TM-score as a float.
        NOTE(review): USalign is expected to print one TM-score line per input
        structure, so with ``predicted native`` argument order the second value
        should be the one normalized by the native structure — confirm.

    Raises:
        ValueError: If fewer than two TM-score lines are present (for example
            when USalign produced no output at all).
    """
    scores = re.findall(r'TM-score=\s+([\d.]+)', output)
    if len(scores) < 2:
        raise ValueError(
            f"Expected at least 2 TM-score lines in USalign output, found {len(scores)}"
        )
    return float(scores[1])

def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num,
    x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    """Format one fixed-width PDB-style ``ATOM`` line, newline-terminated.

    Args:
        atom_name: Atom name, left-aligned in a 5-character field.
        atom_serial: Integer atom serial, right-aligned in 5 characters.
        residue_name: Residue name, left-aligned in a 3-character field.
        chain_id: Accepted for interface compatibility; it is not written to
            the line (the chain column is left blank).
        residue_num: Integer residue number, right-aligned in 4 characters.
        x_coord, y_coord, z_coord: Coordinates, each written as ``8.3f``.
        occupancy: Written as ``6.2f``.
        b_factor: Written as ``6.2f``.
        atom_type: Symbol written at the end of the line.

    Returns:
        The formatted line.
    """
    # The residue number takes a 4-wide field directly after the residue name.
    # For 1-999 this matches a blank followed by a 3-wide field, and residue
    # numbers 1000-9999 no longer push the coordinate columns one position
    # to the right.
    return (
        f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s}'
        f'{residue_num:>4d}    {x_coord:>8.3f}{y_coord:>8.3f}'
        f'{z_coord:>8.3f}{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'
    )

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str) -> int:
    """Write the C1' atoms of one conformer to ``target_path`` as ATOM lines.

    Args:
        df: Table with ``resid``, ``resname`` and ``x_<k>``/``y_<k>``/``z_<k>``
            columns.
        xyz_id: Which conformer's coordinate columns to read.
        target_path: Output file path; overwritten if it exists.

    Returns:
        Number of residues written, i.e. rows whose three coordinates are all
        greater than -1e17.
    """
    x_col, y_col, z_col = (f'{axis}_{xyz_id}' for axis in 'xyz')
    written = 0
    with open(target_path, 'w') as pdb_file:
        for _, record in df.iterrows():
            xyz = (record[x_col], record[y_col], record[z_col])
            # Rows failing the threshold test (including NaN) are skipped.
            if not all(value > -1e17 for value in xyz):
                continue
            written += 1
            resid = int(record['resid'])
            pdb_line = write_target_line(
                atom_name="C1'",
                atom_serial=resid,
                residue_name=record['resname'],
                chain_id='0',
                residue_num=resid,
                x_coord=xyz[0],
                y_coord=xyz[1],
                z_coord=xyz[2],
                atom_type='C',
            )
            pdb_file.write(pdb_line)
    return written

def get_base_target_id(long_id):
    """Return everything before the last underscore of ``long_id``.

    The value is converted with ``str`` first. When there is no underscore
    at all the result is the empty string.
    """
    head, _sep, _tail = str(long_id).rpartition("_")
    return head

def score_and_report(solution: pd.DataFrame, submission: pd.DataFrame):
    """Score predicted conformers against native structures with USalign.

    For every target present in both tables, each of the five predicted
    conformers is aligned against every native conformer and the highest
    TM-score is kept. Results are printed per target.

    Args:
        solution: Label table with an ``ID`` column and ``x_<k>``/``y_<k>``/
            ``z_<k>`` columns for each native conformer. A ``target_id``
            column is added in place.
        submission: Prediction table with the same layout for conformers
            1-5. A ``target_id`` column is added in place.

    Returns:
        ``(per_target, mean_tm)`` where ``per_target`` maps each target id to
        its list of five best TM-scores (one per predicted conformer) and
        ``mean_tm`` is the mean over targets of the best of those five
        (``nan`` when no target is common to both tables).

    Raises:
        FileNotFoundError: If the USalign executable does not exist.
    """
    # Local import: the cell defining this function imports only os/re/numpy/pandas.
    import subprocess

    solution['target_id'] = solution['ID'].apply(get_base_target_id)
    submission['target_id'] = submission['ID'].apply(get_base_target_id)

    # Native conformer indices are discovered from the x_<k> column names.
    native_idxs = sorted(int(c.split('_')[1])
                         for c in solution.columns if c.startswith('x_'))

    usalign = "/home/max/Documents/Protenix-KaggleRNA3D/af3-dev/USalign/USalign"
    per_target = {}

    # Find common targets to iterate over
    common_targets = sorted(
        set(solution['target_id'].unique()) & set(submission['target_id'].unique())
    )

    print(f"Scoring {len(common_targets)} common targets...")

    for tid in common_targets:
        grp_nat = solution[solution['target_id'] == tid]
        grp_pred = submission[submission['target_id'] == tid]
        best_of_five = []

        for pred_cnt in range(1, 6):
            best_for_this_pred = 0.0
            # The predicted file does not depend on the native index, so it
            # is written once per predicted conformer.
            n_pred = write2pdb(grp_pred, pred_cnt, 'predicted.pdb')
            if n_pred > 0:
                for nat_cnt in native_idxs:
                    n_nat = write2pdb(grp_nat, nat_cnt, 'native.pdb')
                    if n_nat == 0:
                        # Native conformer with no resolved residues: nothing to align.
                        continue
                    # Argument list instead of a shell string: no quoting pitfalls.
                    # The atom selector keeps its leading space, as in the
                    # original command line.
                    out = subprocess.run(
                        [usalign, 'predicted.pdb', 'native.pdb', '-atom', " C1'"],
                        capture_output=True, text=True, check=False,
                    ).stdout
                    best_for_this_pred = max(best_for_this_pred,
                                             parse_tmscore_output(out))
            best_of_five.append(best_for_this_pred)

        per_target[tid] = best_of_five
        print(f"{tid}: TM-scores per model = {best_of_five}, "
              f"best = {max(best_of_five):.4f}")

    if per_target:
        mean_tm = float(np.mean([max(scores) for scores in per_target.values()]))
    else:
        mean_tm = float("nan")
    print(f"Mean best-of-five TM-score over {len(per_target)} targets = {mean_tm:.4f}")

    # The caller unpacks two values; previously nothing was returned.
    return per_target, mean_tm

# Load the cleaned validation labels and the predictions written to
# submission.csv, then score the predictions against the labels.
solution = pd.read_csv(
    "/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding/"
    "validation_labels_clean.csv"
)
submission = pd.read_csv("submission.csv")

per_target_scores, mean_tm = score_and_report(solution, submission)
