# Projekt 4: Generalna ocena jakości modeli przestrzennych RNA

In [1]:
# %pip install pandas biopython nglview

In [2]:
import xml.etree.ElementTree as ET
from pathlib import Path

import pandas as pd
from Bio.PDB import PDBParser

1. Zapoznanie się z udostępnionymi zbiorami danych i ewentualne przetransformowanie ich do postaci ułatwiającej zastosowanie technik sztucznej inteligencji np. integracja danych składowych przechowywanych w różnych formatach z wykorzystaniem jednej spójnej reprezentacji.

In [3]:
DATASET_PATH = Path("./resources/datasets/RNA-Puzzles")

challenges = [f"pz{index:02}" for index in range(1, 11)]

In [4]:
def parse_motifs(challenges: list[str]) -> pd.DataFrame:
    result_df = pd.DataFrame()
    for challenge in challenges:
        current_df = pd.read_csv(
            DATASET_PATH / f"{challenge}/filter-results.txt",
            sep="\t",
            header=None,
            names=[
                "filename",
                "number_of_segments",
                "number_of_residues",
                "nucleotide_ranges",
                "sequences",
            ],
        )
        current_df["challenge_number"] = challenge
        result_df = pd.concat([result_df, current_df])

    # UWAGA! Warto ograniczyć analizy do motywów strukturalnie złożonych składających się z przynajmniej dwóch,
    # a najlepiej trzech lub więcej segmentów.
    result_df = result_df[result_df["number_of_segments"] >= 3]
    return result_df.reset_index(drop=True)


motifs = parse_motifs(challenges)
motifs

Unnamed: 0,filename,number_of_segments,number_of_residues,nucleotide_ranges,sequences,challenge_number
0,1_solution_0_rpr_A_4_C,3,20,"A1-A8, B10-B16, B19-B23","CCGCCGCG, CAUGCCU, GGCGG",pz01
1,1_solution_0_rpr_A_5_C,3,23,"A1-A9, B8-B15, B18-B23","CCGCCGCGC, GCCAUGCC, UGGCGG",pz01
2,1_solution_0_rpr_A_6_G,3,24,"A2-A10, B7-B14, B17-B23","CGCCGCGCC, CGCCAUGC, GUGGCGG",pz01
3,1_solution_0_rpr_A_7_C,3,24,"A3-A11, B6-B13, B16-B22","GCCGCGCCA, GCGCCAUG, UGUGGCG",pz01
4,1_solution_0_rpr_A_8_G,3,24,"A4-A12, B5-B12, B15-B21","CCGCGCCAU, CGCGCCAU, CUGUGGC",pz01
...,...,...,...,...,...,...
673,10_0_solution_4LCK_rpr_B_55_C,3,32,"A48-A63, B16-B21, B50-B59","AGGAUAGUGAAAGCUA, UGGUAG, GGGUUCGAAU",pz10
674,10_0_solution_4LCK_rpr_B_56_G,3,38,"A48-A64, B15-B22, B49-B61","AGGAUAGUGAAAGCUAG, GUGGUAGA, CGGGUUCGAAUCC",pz10
675,10_0_solution_4LCK_rpr_B_57_A,4,42,"A49-A54, A58-A64, B14-B22, B44-B63","GGAUAG, AAGCUAG, AGUGGUAGA, GGUCGCGGGUUCGAAUCCCG",pz10
676,10_0_solution_4LCK_rpr_B_58_A,4,43,"A59-A63, B6-B11, B13-B23, B43-B63","AGCUA, AGUAGU, CAGUGGUAGAA, GGGUCGCGGGUUCGAAUCCCG",pz10


In [5]:
def parse_xml(xml_path: Path) -> pd.DataFrame:
    tree = ET.parse(xml_path)
    root = tree.getroot()
    return pd.DataFrame(
        [
            {
                "filename": child.find("description").find("filename").text,
                "score": child.find("score").text,
            }
            for child in root
        ]
    )


def parse_scores(motifs: pd.DataFrame) -> pd.DataFrame:
    result_df = pd.DataFrame()
    for index, row in motifs.iterrows():
        xml_path = DATASET_PATH / f"{row['challenge_number']}" / f"{row['filename']}-rmsd.xml"

        if xml_path.exists():
            current_df = parse_xml(xml_path)
            current_df["solution"] = row["filename"]
            result_df = pd.concat([result_df, current_df])
        else:
            motifs.drop(index, inplace=True)

    return result_df


scores = parse_scores(motifs)
scores

Unnamed: 0,filename,score,solution
0,1_bujnicki_1_rpr.pdb,4.769,1_solution_0_rpr_A_4_C
1,1_bujnicki_2_rpr.pdb,4.594,1_solution_0_rpr_A_4_C
2,1_bujnicki_3_rpr.pdb,3.921,1_solution_0_rpr_A_4_C
3,1_bujnicki_4_rpr.pdb,4.522,1_solution_0_rpr_A_4_C
4,1_bujnicki_5_rpr.pdb,4.616,1_solution_0_rpr_A_4_C
...,...,...,...
21,10_DING_5_rpr.pdb,4.516,10_0_solution_4LCK_rpr_B_59_U
22,10_DING_6_rpr.pdb,3.939,10_0_solution_4LCK_rpr_B_59_U
23,10_DING_7_rpr.pdb,4.639,10_0_solution_4LCK_rpr_B_59_U
24,10_DING_8_rpr.pdb,5.66,10_0_solution_4LCK_rpr_B_59_U


In [6]:
# TODO
import warnings
from Bio.PDB.PDBExceptions import PDBConstructionWarning
warnings.filterwarnings('ignore', category=PDBConstructionWarning)

def parse_coordinates(pbd_path: Path):
    parser = PDBParser()

    structure = parser.get_structure(
        "example", DATASET_PATH / f"pz01/1_solution_0_rpr_A_3_G/1_bujnicki_1_rpr.pdb"
    )

    coordinates: list = []
    for model in structure:
        for chain in model:
            for residue in chain:
                for atom in residue:
                    coordinates.append(atom.coord)

    return coordinates

def parse_torsion_angles(tor_path: Path):
    pass

2. Krótkie zapoznanie się z dostępnymi przestrzeniami reprezentacji struktur 3D RNA (przestrzenie kartezjańska i kątów torsyjnych) i ich formatami zapisu. Wybór obiecującej przestrzeni na której będziecie Państwo bazować wraz z uzasadnieniem.

In [7]:
coordinates = parse_coordinates('xd')
coordinates

[array([-20.322,  -0.227,  69.923], dtype=float32),
 array([-19.307,  -1.206,  71.968], dtype=float32),
 array([-19.38 ,   0.818,  69.327], dtype=float32),
 array([-19.303,   0.409,  67.86 ], dtype=float32),
 array([-18.016,  -3.167,  71.892], dtype=float32),
 array([-20.706,  -0.14 ,  67.634], dtype=float32),
 array([-18.208,  -3.283,  70.477], dtype=float32),
 array([-20.771,  -1.091,  66.433], dtype=float32),
 array([-18.967,  -2.335,  69.875], dtype=float32),
 array([-19.534,  -1.305,  70.59 ], dtype=float32),
 array([-18.561,  -2.163,  72.592], dtype=float32),
 array([-17.311,  -4.081,  72.546], dtype=float32),
 array([-19.792,  -0.295,  72.642], dtype=float32),
 array([-19.927,   2.116,  69.44 ], dtype=float32),
 array([-19.038,   1.493,  66.991], dtype=float32),
 array([-21.115,  -0.747,  68.855], dtype=float32),
 array([-19.894,  -2.186,  66.613], dtype=float32),
 array([-19.556,  -2.607,  64.166], dtype=float32),
 array([-18.637,  -4.228,  65.933], dtype=float32),
 array([-19.

In [8]:
# works on google colab (sometimes X D)
# import nglview as nv

# nv.show_biopython(structure)