## Install requirements and configure the environment

In [None]:
import os, sys, json, tempfile, pathlib, subprocess, re, time
from   timeit import default_timer as timer
import numpy  as np
import pandas as pd
import torch
from   tqdm   import tqdm

# ── user flags ──────────────────────────────────────────────────────────
MODE        = "local"            #  <<<  "local"  or  "submit"
RUN_LOCAL   = True
RUN_KAGGLE  = not RUN_LOCAL

NUM_CONF=5
MAX_LENGTH=1000

# assert torch.cuda.is_available(), "Need an NVIDIA GPU."
# print("torch", torch.__version__, "| cuda:", torch.version.cuda,
#       "| gpu:", torch.cuda.get_device_name(0))

# ── pip installs (done once) ────────────────────────────────────────────
!pip install --no-deps protenix biopython ml-collections \
                      biotite==1.0.1 rdkit

# ── Protenix resource directory ────────────────────────────────────────
os.environ["USE_DEEPSPEED_EVO_ATTENTION"] = "True"

if RUN_LOCAL:
    ROOT_DIR = "/home/max/Documents/Protenix-KaggleRNA3D/af3-dev"
else:
    ROOT_DIR = "/kaggle/input/protenix/af3-dev"
        
os.environ["PROTENIX_DATA_ROOT_DIR"] = ROOT_DIR
print("PROTENIX_DATA_ROOT_DIR →", ROOT_DIR)

print("Setting random seeds for deterministic prediction...")
np.random.seed(0)
torch.random.manual_seed(0)
torch.cuda.manual_seed_all(0)

In [None]:
# Sequence table: validation sequences when MODE == "local", test sequences
# otherwise.
SEQ_CSV = ("/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding/"
           f'{"validation" if MODE=="local" else "test"}_sequences.csv')
# SEQ_CSV = ("/kaggle/input/stanford-rna-3d-folding/"
#            f'{"validation" if MODE=="local" else "test"}_sequences.csv')
df      = pd.read_csv(SEQ_CSV)

if MODE == "local":
    # Only the cleaned local label file is loaded. The original cell first
    # read "/kaggle/input/stanford-rna-3d-folding/validation_labels.csv",
    # which does not exist on a local machine and was immediately
    # overwritten by this table anyway.
    LABEL_CSV  = "/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding/validation_labels_clean.csv"
    label_df   = pd.read_csv(LABEL_CSV)
    # IDs look like "<target_id>_<resid>"; drop the trailing residue index.
    label_df["target_id"] = label_df.ID.str.rsplit(pat="_", n=1).str[0]

# build input JSON --------------------------------------------------------
samples = [{"name":tid,
            "sequences":[{"rnaSequence":{"sequence":seq,"count":1}}]}
           for seq,tid in zip(df.sequence, df.target_id)]
# NamedTemporaryFile creates the file atomically (tempfile.mktemp is
# deprecated and race-prone); delete=False keeps it on disk after the handle
# is closed, and the `with` block guarantees the JSON is flushed and closed.
with tempfile.NamedTemporaryFile(
        "w", prefix="protenix_inputs_", suffix=".json", delete=False
) as json_file:
    json.dump(samples, json_file)
    json_path = json_file.name
print("json →", json_path)

In [None]:
from protenix.data.json_to_feature import SampleDictToFeatures
from protenix.data.featurizer import Featurizer as ProtenixFeaturizer
from protenix.data.msa_featurizer import MSAFeaturizer
from protenix.data.utils import make_dummy_feature, data_type_transform
from protenix.utils.torch_utils import dict_to_tensor


def featurize_rna_with_msa(sample_row: pd.Series, msa_featurizer: MSAFeaturizer) -> dict:
    """
    Build the model input for one single-chain RNA target.

    Args:
        sample_row: Row of the sequence table; its ``"target_id"`` and
            ``"sequence"`` entries are read.
        msa_featurizer: Callable MSA featurizer, invoked with the keyword
            arguments ``bioassembly_dict``, ``selected_indices`` and
            ``entity_to_asym_id_int``.

    Returns:
        Dict with the keys ``"input_feature_dict"``, ``"label_dict"``
        (empty), ``"label_full_dict"`` (empty) and ``"basic"``. ``"basic"``
        holds ``pdb_id``, ``N_token``, ``N_atom`` and, only when MSA
        features were produced, ``N_msa``.
    """
    target_name = sample_row["target_id"]
    rna_seq = sample_row["sequence"]

    # Atom/token arrays for a single RNA entity with one copy.
    rna_entity = {"rnaSequence": {"sequence": rna_seq, "count": 1}}
    _, atom_array, token_array = SampleDictToFeatures(
        {"name": target_name, "sequences": [rna_entity]}
    ).get_feature_dict()

    # MSA lookup over every token (no cropping). The target is described as
    # one chain: entity id "1" mapped to asym id 0.
    msa_feats = msa_featurizer(
        bioassembly_dict={
            "pdb_id": target_name,
            "sequences": {"1": rna_seq},
            "atom_array": atom_array,
            "token_array": token_array,
            "entity_poly_type": {"1": "polyribonucleotide"},
        },
        selected_indices=np.arange(len(token_array)),
        entity_to_asym_id_int={"1": [0]},
    )
    has_msa = not (msa_feats is None or len(msa_feats) == 0)

    # Remaining input features, computed on the full arrays without
    # reference-position augmentation or ligand atom renaming.
    feats = ProtenixFeaturizer(
        cropped_token_array=token_array,
        cropped_atom_array=atom_array,
        ref_pos_augment=False,
        lig_atom_rename=False,
    ).get_all_input_features()

    if has_msa:
        feats.update(dict_to_tensor(msa_feats))

    # Templates are always replaced by a dummy; the MSA only when the
    # featurizer returned nothing.
    dummies = ["template"] if has_msa else ["template", "msa"]
    feats = data_type_transform(make_dummy_feature(feats, dummies))

    basic = {
        "pdb_id": target_name,
        "N_token": torch.tensor([feats["token_index"].shape[0]]),
        "N_atom": torch.tensor([feats["atom_to_token_idx"].shape[0]]),
    }
    if has_msa:
        basic["N_msa"] = torch.tensor([feats["msa"].shape[0]])

    # The label dicts stay empty: there is no ground truth at inference.
    return {
        "input_feature_dict": feats,
        "label_dict": {},
        "label_full_dict": {},
        "basic": basic,
    }

In [None]:
from configs.configs_base       import configs as cfg_base
from configs.configs_data       import data_configs
from configs.configs_inference  import inference_configs
from protenix.config.config     import parse_configs
from runner.inference           import InferenceRunner, update_inference_configs

# Checkpoint loaded by the inference runner (plain string: the original
# f-prefix had no placeholders).
ckpt_path = "/home/max/Documents/ProtenixFinetuningFinalResults/5999.pt"

# NOTE(review): presumably required to build the DeepSpeed evoformer
# attention kernel enabled below — confirm against the DeepSpeed docs.
os.environ["CUTLASS_PATH"] = "/home/max/Documents/Protenix-KaggleRNA3D/Cutlass/cutlass"

cfg_base["use_deepspeed_evo_attention"]     = True
cfg_base["model"]["N_cycle"]                = 10
cfg_base["sample_diffusion"]["N_step"]      = 200
# Tied to NUM_CONF (previously a literal 5) so the number of diffusion
# samples always matches the number of conformations the prediction loop
# writes per target.
cfg_base["sample_diffusion"]["N_sample"]    = NUM_CONF
inference_configs["load_checkpoint_path"]   = ckpt_path
inference_configs["dtype"]                  = "bf16"
# Templates are disabled; the featurizer substitutes a dummy template.
inference_configs["template"] = {
    "use_templates": False,
    "template_mmcif_dir": ""
}

# RNA-only MSA settings; protein MSA is switched off.
msa_configs = {
    "enable": True,
    "enable_rna_msa": True,
    "enable_prot_msa": False,
    "merge_method": "dense_max",
    "strategy": "random",
    "max_size": {"train": 512, "test": 2048},
    "sample_cutoff": {"train": 64, "test": 1024},
    "min_size": {"train": 1, "test": 1},
    "prot": {},
    "rna": {
        "rna_msa_dir": "/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding/MSA_v2/RNA_MSA_Stockholm",
        "seq_to_pdb_idx_path": os.path.join("/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding", "MSA_v2/seq_to_target_map.json"),
        "indexing_method": "sequence",
        "seq_limits": {},
    }
}

print("Initializing standalone MSAFeaturizer for data prep...")
shared_msa_args = {
    "merge_method": msa_configs["merge_method"],
    "max_size": msa_configs["max_size"]["test"]  # Explicitly use the test size for inference
}
msa_featurizer = MSAFeaturizer(
    prot_msa_args={},  # Protein MSA is disabled
    rna_msa_args={**msa_configs.get('rna', {}), **shared_msa_args},
    enable_rna_msa=msa_configs.get('enable_rna_msa', False),
    enable_prot_msa=msa_configs.get('enable_prot_msa', False)
)

# 1. Start with the base model and inference configs (inference keys win on
#    any collision because they are unpacked last).
cfg = {**cfg_base, **inference_configs}

# 2. Inject the custom MSA settings into the main data configurations
data_configs['msa'] = msa_configs

# 3. Add the entire data_configs under a 'data' key in the final config
cfg['data'] = data_configs

# 4. Predictions are dumped into a fresh temporary directory
cfg["dump_dir"] = tempfile.mkdtemp(prefix="pred_out_")

# Now, parse the correctly structured config
cfg = parse_configs(cfg, fill_required_with_null=True)

# --- Initialize Inference Runner ---
print("Initializing InferenceRunner with corrected config...")
runner = InferenceRunner(cfg)
print("Model ready:", type(runner.model))

In [None]:
import traceback

print("Starting prediction loop...")
df_test = pd.read_csv(SEQ_CSV)
rows = []

# Value of `atom_to_tokatom_idx` used to pick out the C1' atoms.
# NOTE(review): 12 is taken from the original cell — confirm it is the C1'
# slot in Protenix's per-token atom ordering.
C1_PRIME_TOKATOM_IDX = 12

for _, sample_row in tqdm(
        df_test.iterrows(),
        total=len(df_test),
        desc="Featurize → Predict",
        leave=True
    ):
    tid = sample_row["target_id"]
    seq = sample_row["sequence"]

    try:
        # 1. Generate features
        data = featurize_rna_with_msa(sample_row, msa_featurizer)

        # 2. Update model configs for this target's token count
        runner.update_model_configs(
            update_inference_configs(cfg, int(data["basic"]["N_token"]))
        )

        # 3. Predict
        with torch.no_grad():
            prediction_output = runner.predict(data)
            coord = prediction_output["coordinate"]

        # 4. Extract C1' coords
        c1_mask = (
            data["input_feature_dict"]["atom_to_tokatom_idx"]
            == C1_PRIME_TOKATOM_IDX
        )
        coord = coord[:, c1_mask, :]

        # An empty sample axis cannot be padded by repeating the last
        # sample, so route it to the zero-coordinate fallback below.
        if coord.shape[0] == 0:
            raise ValueError("model returned no samples")

        # 5. Sanity check / padding along the residue axis
        if coord.shape[1] != len(seq):
            tqdm.write(
                f"Warning: Mismatch in predicted length for {tid}. "
                f"Expected {len(seq)}, got {coord.shape[1]}. Padding with zeros."
            )
            new_coord = torch.zeros(coord.shape[0], len(seq), 3, device=coord.device)
            min_len = min(len(seq), coord.shape[1])
            new_coord[:, :min_len, :] = coord[:, :min_len, :]
            coord = new_coord

    except Exception as e:
        # Best effort: a failed target still gets rows (all zeros) so the
        # submission keeps one line per residue.
        tqdm.write(f"ERROR processing {tid}: {e}")
        traceback.print_exc()
        coord = torch.zeros(NUM_CONF, len(seq), 3)

    # 6. Ensure NUM_CONF predictions by repeating the last sample
    n_missing = NUM_CONF - coord.shape[0]
    if n_missing > 0:
        coord = torch.cat([coord, coord[-1:].expand(n_missing, -1, -1)], dim=0)

    # 7. Build rows for submission
    coord = coord[:NUM_CONF].detach()
    if coord.dtype == torch.bfloat16:
        # NumPy has no bfloat16, so Tensor.numpy() would raise on bf16 output.
        coord = coord.float()
    # One device→host copy per target instead of one per residue, then
    # (NUM_CONF, L, 3) → (L, NUM_CONF * 3) so each row reads x_1, y_1, z_1,
    # x_2, ...
    per_residue = coord.cpu().numpy().transpose(1, 0, 2).reshape(len(seq), -1)
    for i, (res, triplets) in enumerate(zip(seq, per_residue), 1):
        rows.append([f"{tid}_{i}", res, i] + triplets.tolist())

# write out — the coordinate columns follow NUM_CONF so that they always
# match the number of values appended per row above.
cols = (
    ["ID", "resname", "resid"] +
    [f"{ax}_{k}" for k in range(1, NUM_CONF + 1) for ax in ("x","y","z")]
)
sub = pd.DataFrame(rows, columns=cols)
sub.to_csv("submission.csv", index=False)
print("submission.csv written — shape:", sub.shape)


In [None]:
import os
import re
import numpy as np
import pandas as pd

def parse_tmscore_output(output):
    """Return the second ``TM-score=`` value found in USalign text output.

    NOTE(review): USalign is expected to print two TM-score lines, one per
    normalisation; the second presumably refers to the second structure
    passed on the command line — confirm against the USalign documentation.

    Args:
        output: Captured stdout of a USalign run.

    Returns:
        The second reported TM-score as a float.

    Raises:
        IndexError: If fewer than two ``TM-score=`` values are present, e.g.
            because USalign failed and printed nothing. The exception type is
            the one the bare ``[1]`` index used to raise; the message now
            includes the start of the offending output.
    """
    scores = re.findall(r'TM-score=\s+([\d.]+)', output)
    if len(scores) < 2:
        raise IndexError(
            "expected at least two 'TM-score=' values in USalign output, "
            f"found {len(scores)}; output started with: {output[:200]!r}"
        )
    return float(scores[1])

def write_target_line(
    atom_name, atom_serial, residue_name, chain_id, residue_num,
    x_coord, y_coord, z_coord, occupancy=1.0, b_factor=0.0, atom_type='P'
) -> str:
    """Format one ``ATOM`` record as a newline-terminated text line.

    Integer fields are right-aligned, name fields are left-aligned,
    coordinates use three decimals in eight columns, and occupancy/B-factor
    use two decimals in six columns.

    Note:
        ``chain_id`` is accepted for interface compatibility but is not
        written to the line.
    """
    head = f'ATOM  {atom_serial:>5d}  {atom_name:<5s} {residue_name:<3s} '
    middle = f'{residue_num:>3d}    {x_coord:>8.3f}{y_coord:>8.3f}'
    tail = f'{z_coord:>8.3f}{occupancy:>6.2f}{b_factor:>6.2f}           {atom_type}\n'
    return head + middle + tail

def write2pdb(df: pd.DataFrame, xyz_id: int, target_path: str) -> int:
    """Write the C1' atoms of coordinate set ``xyz_id`` to ``target_path``.

    Args:
        df: Table with ``resid``, ``resname`` and ``x_<id>``/``y_<id>``/
            ``z_<id>`` columns, one row per residue.
        xyz_id: Suffix selecting which coordinate columns to read.
        target_path: File to (over)write.

    Returns:
        Number of residues written. Rows where any coordinate is not greater
        than ``-1e17`` are treated as unresolved and skipped.
    """
    x_col, y_col, z_col = (f'{axis}_{xyz_id}' for axis in ('x', 'y', 'z'))
    written = 0
    with open(target_path, 'w') as pdb_file:
        for _, rec in df.iterrows():
            x, y, z = rec[x_col], rec[y_col], rec[z_col]
            if not (x > -1e17 and y > -1e17 and z > -1e17):
                continue
            written += 1
            # The residue number doubles as the atom serial number.
            resid = int(rec['resid'])
            pdb_file.write(write_target_line(
                atom_name="C1'", atom_serial=resid,
                residue_name=rec['resname'], chain_id='0',
                residue_num=resid,
                x_coord=x, y_coord=y, z_coord=z, atom_type='C'
            ))
    return written

def get_base_target_id(long_id):
    """Return ``long_id`` (as text) with its last ``_``-separated field removed.

    An id without any underscore yields the empty string.
    """
    base, _sep, _last = str(long_id).rpartition("_")
    return base

def score_and_report(
    solution: pd.DataFrame,
    submission: pd.DataFrame,
    usalign: str = "/home/max/Documents/Protenix-KaggleRNA3D/af3-dev/USalign/USalign",
    temp_dir: str = "./scoring_temp/",
):
    """Score a submission against reference structures with USalign.

    For every target present in both tables, each predicted conformation is
    aligned against every reference conformation that has at least one
    resolved residue, and the best TM-score per prediction is kept. The
    target's score is the best over its predictions; the final number is the
    mean over targets.

    Args:
        solution: Reference table with ``ID``, ``resname``, ``resid`` and
            ``x_k``/``y_k``/``z_k`` columns.
        submission: Prediction table with the same layout.
        usalign: Path of the USalign executable.
        temp_dir: Scratch directory for the intermediate PDB files (created
            if missing, not cleaned up afterwards).

    Returns:
        ``(per_target, mean_tm)`` where ``per_target`` maps each target id to
        the list of best TM-scores, one per predicted conformation, and
        ``mean_tm`` is the mean best-of-N score (``0.0`` without common
        targets).

    Raises:
        IndexError: Propagated from ``parse_tmscore_output`` when USalign
            produces no parsable TM-scores.
        FileNotFoundError: If the ``usalign`` executable does not exist.

    Side effects:
        Adds a ``target_id`` column to both input DataFrames, writes PDB
        files under ``temp_dir`` and prints a summary line.
    """
    solution['target_id'] = solution['ID'].apply(get_base_target_id)
    submission['target_id'] = submission['ID'].apply(get_base_target_id)

    native_idxs = sorted(int(c.split('_')[1])
                         for c in solution.columns if c.startswith('x_'))
    # Derived from the submission instead of a hard-coded 1..5; identical
    # for the standard five-conformation layout.
    pred_idxs = sorted(int(c.split('_')[1])
                       for c in submission.columns if c.startswith('x_'))

    per_target = {}
    all_best_scores = []

    os.makedirs(temp_dir, exist_ok=True)

    common_targets = sorted(
        set(solution['target_id'].unique()) & set(submission['target_id'].unique())
    )
    print(f"Scoring {len(common_targets)} common targets...")

    for tid in tqdm(common_targets):
        grp_nat = solution[solution['target_id'] == tid]
        grp_pred = submission[submission['target_id'] == tid]

        # Reference structures do not depend on the prediction, so write
        # each one once per target rather than once per prediction.
        native_paths = []
        for nat_cnt in native_idxs:
            native_path = os.path.join(temp_dir, f'native_{tid}_{nat_cnt}.pdb')
            if write2pdb(grp_nat, nat_cnt, native_path) > 0:
                native_paths.append(native_path)

        predicted_path = os.path.join(temp_dir, f'predicted_{tid}.pdb')
        best_per_pred = []
        for pred_cnt in pred_idxs:
            if write2pdb(grp_pred, pred_cnt, predicted_path) == 0:
                best_per_pred.append(0.0)
                continue

            best_for_this_pred = 0.0
            for native_path in native_paths:
                # Argument list instead of a shell string: target ids and
                # paths cannot be misinterpreted by a shell. The atom
                # selector keeps its leading space, exactly as the quoted
                # shell argument did.
                result = subprocess.run(
                    [usalign, predicted_path, native_path, "-atom", " C1'"],
                    stdout=subprocess.PIPE,
                    text=True,
                    check=False,
                )
                best_for_this_pred = max(best_for_this_pred,
                                         parse_tmscore_output(result.stdout))
            best_per_pred.append(best_for_this_pred)

        per_target[tid] = best_per_pred
        all_best_scores.append(max(best_per_pred, default=0.0))

    mean_tm = np.mean(all_best_scores) if all_best_scores else 0.0
    print(f"\n>>> FINAL mean best-of-5 TM-score = {mean_tm:.4f}")

    return per_target, mean_tm

    
# Score the submission written by the prediction cell against the cleaned
# validation labels.
submission = pd.read_csv("submission.csv")
solution = pd.read_csv("/home/max/Documents/Protenix-KaggleRNA3D/data/stanford-rna-3d-folding/validation_labels_clean.csv")

per_target_scores, mean_tm = score_and_report(solution, submission)
