In [1]:
import os
from dataclasses import dataclass
from collections import defaultdict
import xml.etree.ElementTree as ET

In [2]:
# Root directory that is listed to discover the puzzles; each puzzle name is
# joined onto this path when its files are opened.
DATA_DIR = "./RNA-Puzzles"
# Name of the per-puzzle file that is read line by line into NativeMotive records.
RESULTS_FILENAME = "filter-results.txt"
# Upper bound on the number of puzzles processed (applied as `puzzles[:N]`).
N = 100

In [3]:
def float_or_none(val: str):
    """Convert a text field to ``float``; the ``"-"`` placeholder maps to ``None``.

    Any other value is handed to ``float`` unchanged, so a non-numeric string
    raises ``ValueError``.
    """
    if val == "-":
        return None
    return float(val)


@dataclass
class NativeMotive:
    """One parsed record describing a native motif (spelled "motive" in this notebook)."""

    name: str
    segments_count: int
    residues_count: int
    nucleotide_ranges: list[str]
    sequences: list[str]

    @staticmethod
    def from_split_line(l: list[str]):
        """Build a ``NativeMotive`` from the first five fields of a split line.

        Fields 0-2 are the name and two integer counts. Fields 3 and 4 are
        comma-separated lists whose items are stripped of surrounding
        whitespace. Extra trailing fields are ignored.

        Raises:
            ValueError: if field 1 or 2 is not an integer.
            IndexError: if fewer than five fields are supplied.
        """
        motive_name = l[0]
        segments = int(l[1])
        residues = int(l[2])
        ranges = [item.strip() for item in l[3].split(",")]
        seqs = [item.strip() for item in l[4].split(",")]
        return NativeMotive(motive_name, segments, residues, ranges, seqs)


@dataclass
class NucleotideTorAngles:
    """Torsion angles of a single nucleotide, parsed from one split text line."""

    chain: str
    residue_num: int
    name: str
    # The seven angle fields are annotated ``float`` but hold ``None`` whenever
    # the source column contains the "-" placeholder (see ``float_or_none``).
    alpha: float
    beta: float
    gamma: float
    delta: float
    epsilon: float
    zeta: float
    chi: float

    @staticmethod
    def from_split_line(l: list[str]):
        """Build a ``NucleotideTorAngles`` from a list of column strings.

        Column 0 is the chain, column 1 the integer residue number, column 3
        the name, and columns 4-10 the angles alpha..chi in that order.
        Column 2 is not read.

        Raises:
            ValueError: if column 1 is not an integer or an angle column is
                neither "-" nor a number.
            IndexError: if fewer than eleven columns are supplied.
        """
        chain_id = l[0]
        residue_number = int(l[1])
        residue_name = l[3]
        # Read the angle columns left to right, exactly as they are laid out.
        angle_values = [float_or_none(l[column]) for column in range(4, 11)]
        return NucleotideTorAngles(chain_id, residue_number, residue_name, *angle_values)
    
@dataclass
class MotivePrediction:
    """A single predicted motif: its name, its score and its torsion angles."""

    # Identifier of the prediction; this notebook uses the prediction's file
    # name with the ".tor" suffix removed.
    name: str
    # Score of the prediction; this notebook reads it from the `score` element
    # of the motif's "-rmsd.xml" file.
    rmsd: float
    # Torsion-angle records parsed from the prediction's ".tor" file.
    nucleotide_tor_angles: list[NucleotideTorAngles]
    

def get_tor_angles_from_file(filename: str) -> list[NucleotideTorAngles]:
    """Parse a tab-separated torsion-angle file into ``NucleotideTorAngles`` records.

    The first line is treated as a header and skipped. Parsing is best-effort:
    rows that cannot be converted (non-numeric fields, blank lines, or too few
    columns) are skipped rather than aborting the whole file.

    Args:
        filename: Path of the file to read.

    Returns:
        The successfully parsed records in file order; an empty list when the
        file is empty or contains only the header.

    Raises:
        OSError: if the file cannot be opened.
    """
    res = []
    with open(filename) as target_tor_file:
        # Skip the header. The default argument keeps an empty file from
        # raising StopIteration.
        next(target_tor_file, None)
        for line in target_tor_file:
            split_line = line.strip().split("\t")
            try:
                res.append(NucleotideTorAngles.from_split_line(l=split_line))
            except (ValueError, IndexError):
                # ValueError: a field is not numeric. IndexError: the row is
                # blank or has fewer columns than expected. Both are skipped.
                continue
    return res
    

In [4]:
# Names of the puzzle directories under DATA_DIR.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the later `puzzles[:N]` selection reproducible. Non-directory entries
# are dropped because every puzzle is subsequently opened as a directory.
puzzles = sorted(
    entry
    for entry in os.listdir(os.path.join(os.curdir, DATA_DIR))
    if os.path.isdir(os.path.join(os.curdir, DATA_DIR, entry))
)

FileNotFoundError: [WinError 3] System nie może odnaleźć określonej ścieżki: '.\\RNA-Puzzles'

In [None]:
# Map each puzzle name to the list of NativeMotive records parsed from its
# results file (one record per non-blank, tab-separated line).
native_motives = defaultdict(list)

for puzzle in puzzles[:N]:
    results_path = os.path.join(os.curdir, DATA_DIR, puzzle, RESULTS_FILENAME)
    with open(results_path) as results_file:
        for line in results_file:
            stripped_line = line.strip()
            if not stripped_line:
                # A blank (e.g. trailing) line has no fields to parse and
                # would otherwise fail with IndexError.
                continue
            split_line = stripped_line.split("\t")
            # TODO filter by segments count > 2/3?
            native_motives[puzzle].append(
                NativeMotive.from_split_line(l=split_line)
            )


In [None]:
# Build motive_solutions[puzzle][motive_name] = {
#     'native_motive': NativeMotive,
#     'target_tor_angles': list[NucleotideTorAngles] of the native motif,
#     'predictions': list[MotivePrediction],
# }
# A motif is included only when its "-rmsd.xml" file, its ".tor" file and its
# predictions directory are all present.
motive_solutions = defaultdict(lambda: defaultdict(dict))
for puzzle in puzzles[:N]:
    puzzle_dir = os.path.join(os.curdir, DATA_DIR, puzzle)
    for native_motive in native_motives[puzzle]:
        rmsd_xml_path = os.path.join(puzzle_dir, f'{native_motive.name}-rmsd.xml')
        target_tor_path = os.path.join(puzzle_dir, f'{native_motive.name}.tor')
        predictions_dir = os.path.join(puzzle_dir, native_motive.name)
        if not (
            os.path.exists(rmsd_xml_path)
            and os.path.exists(target_tor_path)
            # The directory is listed below, so it must exist as well.
            and os.path.isdir(predictions_dir)
        ):
            continue

        # Scores keyed by the structure's file name with its last four
        # characters removed (assumes a four-character extension such as
        # ".pdb" - TODO confirm against the XML files).
        xml_tree = ET.parse(rmsd_xml_path)
        prediction_scores = {
            s.find('description/filename').text[:-4]: float(s.find('score').text)
            for s in xml_tree.getroot().findall('structure')
        }

        predictions = []
        # Sorted because os.listdir() returns entries in arbitrary order.
        for pred_file in sorted(os.listdir(predictions_dir)):
            if not pred_file.endswith(".tor"):
                continue
            pred_name = pred_file[:-4]  # drop the ".tor" suffix
            predictions.append(
                MotivePrediction(
                    name=pred_name,
                    # Raises KeyError when the XML holds no score for this prediction.
                    rmsd=prediction_scores[pred_name],
                    nucleotide_tor_angles=get_tor_angles_from_file(
                        os.path.join(predictions_dir, pred_file)
                    ),
                )
            )

        motive_solutions[puzzle][native_motive.name] = {
            'native_motive': native_motive,
            'target_tor_angles': get_tor_angles_from_file(target_tor_path),
            'predictions': predictions,
        }
        

In [5]:
# Display one example entry. `.get` is used instead of indexing because
# indexing the nested defaultdict would silently insert empty entries when the
# puzzle or motif is absent.
motive_solutions.get('pz01', {}).get('1_solution_0_rpr_A_3_G')

NameError: name 'motive_solutions' is not defined

In [None]:
"""
struktura słownika
motive_solutions{
    puzzle_name: {
        motive_name: {
            native_motive: NativeMotive() - klasa,
            target_tor_angles: list[NucleotideTorAngles()] - lista kątów kolejnych nukleotydów z natywnego motywu,
            predictions: [
                MotivePrediction(
                name=...,
                rmsd=...,
                nucleotide_tor_angles: list[NucleotideTorAngles()] - lista kątów kolejnych nukleotydów z danej predykcji motywu
                ),
                ...
            ] - lista wszystkich predykcji dla danego motywu
        }
    }
}
"""