In [3]:
from Bio import PDB
import numpy as np
import os

## 2d Seq from rnasolo dataset

In [2]:
def extract_rna_secondary_structure(pdb_file):
    """Extracts secondary structure information from an RNA PDB file."""
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("RNA", pdb_file)
    residues = []
    for model in structure:
        for chain in model:
            for residue in chain:
                # Check if it's an RNA nucleotide (not amino acid)
                if not PDB.is_aa(residue, standard=True):  
                    residues.append(residue)
    sequence = "".join([residue.get_resname().strip() for residue in residues])
    
    print(f"Extracted RNA Sequence from {pdb_file}:")
    print(sequence)
    return sequence

In [3]:
pdb_folder = "rnasolo_dataset"  

for pdb_file in os.listdir(pdb_folder):
    if pdb_file.endswith(".pdb"):
        pdb_path = os.path.join(pdb_folder, pdb_file)
        extract_rna_secondary_structure(pdb_path)
        break #limit one pdb file for testing

Extracted RNA Sequence from rnasolo_dataset\1A4D_1_A-B.pdb:
GGCCGAUGGUAGUGUGGGGUCUCCCCAUGCGAGAGUAGGCC


## 3d reference structure from rnasolo

In [4]:
def extract_reference_structure(pdb_file: str) -> np.ndarray:
    """
    Extracts the reference 3D coordinates for an RNA structure from a PDB file.
    The function uses the P atom as the representative coordinate for each nucleotide.
    If the P atom is missing, it falls back to the C4' atom.
    
    Parameters:
        pdb_file (str): Path to the PDB file.
        
    Returns:
        np.ndarray: An array of shape (N, 3) where N is the number of RNA residues.
    """
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("RNA", pdb_file)
    coordinates = []
    
    for model in structure:
        for chain in model:
            for residue in chain:
                # Check if the residue is a standard RNA nucleotide
                if residue.get_resname() in ["A", "U", "G", "C"]:
                    if "P" in residue:
                        coordinates.append(residue["P"].get_coord())
                    elif "C4'" in residue:  # fallback if P atom is not available
                        coordinates.append(residue["C4'"].get_coord())
                    else:
                        print(f"Warning: Neither 'P' nor 'C4'' found in residue {residue.get_resname()}. Skipping.")
    
    if not coordinates:
        raise ValueError("No valid RNA nucleotide coordinates were extracted from the PDB file.")
    
    coordinates = np.array(coordinates)
    print(f"Extracted reference structure coordinates from {pdb_file}: shape {coordinates.shape}")
    return coordinates


In [None]:
pdb_file = "rnasolo_dataset/1A4D_1_A-B.pdb"
ref_structure = extract_reference_structure(pdb_file)
print(ref_structure)

Extracted reference structure coordinates from rnasolo_dataset/1A4D_1_A-B.pdb: shape (41, 3)
[[ 8.5160e+01 -2.7882e+01 -1.6070e+00]
 [ 8.1858e+01 -2.6285e+01 -2.2800e-01]
 [ 7.8960e+01 -2.4568e+01  3.0330e+00]
 [ 7.8229e+01 -1.9660e+01  7.2980e+00]
 [ 7.9238e+01 -1.5308e+01  1.0169e+01]
 [ 8.3165e+01 -1.0617e+01  7.7120e+00]
 [ 8.6866e+01 -5.6910e+00  9.7280e+00]
 [ 9.0108e+01 -2.2710e+00  7.5020e+00]
 [ 9.1317e+01  4.8000e-02  2.1510e+00]
 [ 8.7869e+01 -4.8800e-01 -3.9230e+00]
 [ 8.3222e+01 -1.0890e+00 -7.3280e+00]
 [ 7.6679e+01 -4.4000e-01 -7.3050e+00]
 [ 7.3166e+01  3.1090e+00 -5.2270e+00]
 [ 7.2424e+01  9.3790e+00 -3.1710e+00]
 [ 7.4137e+01  1.4678e+01 -4.2900e-01]
 [ 7.8093e+01  1.9454e+01  7.7000e-02]
 [ 8.2739e+01  2.3089e+01 -2.1230e+00]
 [ 8.6377e+01  2.5023e+01 -7.1760e+00]
 [ 8.8125e+01  2.4882e+01 -1.3290e+01]
 [ 8.5758e+01  2.7655e+01 -1.7334e+01]
 [ 7.9707e+01  2.9885e+01 -1.6187e+01]
 [ 7.4319e+01  2.8092e+01 -1.5503e+01]
 [ 6.9224e+01  2.3390e+01 -1.6497e+01]
 [ 7.2589e