# Projekt 6: Przewidywanie struktury drugorzędowej RNA

In [1]:
# %pip install matplotlib pandas seaborn

In [2]:
import os
import random
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


# Folders used throughout the notebook, all located under ./resources.
# Dataset folders and the generated *.lst split files live here.
DatasetDirectory = Path("./resources", "datasets")
# Model parameter (*.pth) and configuration (*.conf) files.
ModelDirectory = Path("./resources", "models")
# Prediction output and the per-dataset *-results.csv files.
ResultsDirectory = Path("./resources", "results")
# Rendered plots.
FiguresDirectory = Path("./resources", "figures")

In [3]:
def split_datasets(
    path: Path, train_ratio: float = 0.7, valid_ratio: float = 0.15, test_ratio: float = 0.15
) -> None:
    """Split the ``.bpseq`` files of a dataset directory into train/valid/test lists.

    Six list files are written next to the dataset directory, named
    ``<path>-<split>-bpseq.lst`` and ``<path>-<split>-fa.lst`` for each split in
    train, valid and test. Each list holds one file path per line, without a
    trailing newline. The ``fa`` lists contain the same paths as the ``bpseq``
    lists with the ``.bpseq`` extension swapped for ``.fa``.

    Args:
        path: Directory containing the ``.bpseq`` files.
        train_ratio: Fraction of the files assigned to the training list.
        valid_ratio: Fraction of the files assigned to the validation list.
        test_ratio: Kept for interface compatibility; it is not used in the
            computation. The test list always receives every file that is left
            over after the train and valid lists were filled.

    Raises:
        ValueError: If a ratio is negative or ``train_ratio + valid_ratio``
            exceeds 1, which would silently produce truncated splits.
    """
    if train_ratio < 0 or valid_ratio < 0 or train_ratio + valid_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and valid_ratio must be non-negative and sum to at most 1, "
            f"got {train_ratio} and {valid_ratio}"
        )

    suffix = ".bpseq"
    # os.listdir returns entries in arbitrary order, so sort first: only then
    # does the fixed seed give the same split on every machine.
    files = sorted(f"{path}/{name}" for name in os.listdir(path) if name.endswith(suffix))
    # A private generator keeps the split reproducible without reseeding the
    # global random module.
    random.Random(42).shuffle(files)

    total_files = len(files)
    train_count = int(total_files * train_ratio)
    valid_end = train_count + int(total_files * valid_ratio)

    splits = {
        "train": files[:train_count],
        "valid": files[train_count:valid_end],
        "test": files[valid_end:],
    }

    for split_name, bpseq_files in splits.items():
        # Swap only the trailing extension so that directory names containing
        # ".bpseq" are left untouched.
        fa_files = [f"{name.removesuffix(suffix)}.fa" for name in bpseq_files]
        with open(f"{path}-{split_name}-bpseq.lst", "w") as f:
            f.write("\n".join(bpseq_files))
        with open(f"{path}-{split_name}-fa.lst", "w") as f:
            f.write("\n".join(fa_files))


def datasets_sanity_check() -> None:
    """Print, for every ``.lst`` file in ``DatasetDirectory``, its share of the dataset.

    List files are expected to be named ``<dataset>-<split>-<kind>.lst``. For
    each one the number of non-empty lines is compared with the number of
    ``.bpseq`` files in ``DatasetDirectory / <dataset>``, and the percentage is
    printed. Lists whose dataset directory is missing or holds no ``.bpseq``
    files are reported instead of raising.
    """
    lst_files = sorted(name for name in os.listdir(DatasetDirectory) if name.endswith(".lst"))

    for file in lst_files:
        with open(DatasetDirectory / file, "r") as f:
            num_lines = sum(1 for line in f if line.strip())

        # Split from the right so dataset names that contain "-" stay intact.
        dataset_dir = DatasetDirectory / file.rsplit("-", 2)[0]
        if not dataset_dir.is_dir():
            print(f"{file:<25} has no matching dataset directory ({num_lines})")
            continue

        # The split lists are built from the .bpseq files, so count exactly those.
        total_files = sum(1 for name in os.listdir(dataset_dir) if name.endswith(".bpseq"))
        if total_files == 0:
            print(f"{file:<25} has no .bpseq files to compare against ({num_lines})")
            continue

        print(f"{file:<25} has {round(num_lines/total_files * 100)}% files ({num_lines})")

In [4]:
# Write the train/valid/test list files for both dataset directories, then
# print what share of its dataset every generated list covers.
split_datasets(DatasetDirectory / "ArchiveII")
split_datasets(DatasetDirectory / "PDB")
datasets_sanity_check()

FileNotFoundError: [Errno 2] No such file or directory: 'resources/datasets/ArchiveII'

## ArchiveII Dataset

In [None]:
# Values interpolated into the mxfold2 train/predict shell commands of this notebook.
# NOTE(review): GpuCount is passed as a GPU selector; -1 presumably means "run on CPU"
# in mxfold2 — confirm against its command-line help.
GpuCount: int = -1
EpochCount: int = 1

In [None]:
DatasetName = "ArchiveII"
!mxfold2 train {DATASET_DIR}/{DATASET_NAME}-train-bpseq.lst \
    --param {ModelDirectory} / {DatasetName} - model.pth - -save - config {ModelDirectory} / {DatasetName} - model.conf \
                                                                          - -gpu {GpuCount} - -epoch {EpochCount}

In [None]:
DatasetName = "ArchiveII"
!mxfold2 predict @./{MODELS_DIR}/{DATASET_NAME}-model.conf {DATASET_DIR}/{DATASET_NAME}-test-bpseq.lst \
    --bpseq {ResultsDirectory} / {DatasetName} - -result {ResultsDirectory} / {DatasetName} - results.csv \
                                                         - -gpu {GpuCount}

## PDB Dataset

In [None]:
DatasetName = "PDB"
!mxfold2 train {DATASET_DIR}/{DATASET_NAME}-train-bpseq.lst \
    --param {ModelDirectory} / {DatasetName} - model.pth - -save - config {ModelDirectory} / {DatasetName} - model.conf \
                                                                          - -gpu {GpuCount} - -epoch {EpochCount}

In [None]:
DatasetName = "PDB"
!mxfold2 predict @./{MODELS_DIR}/{DATASET_NAME}-model.conf {DATASET_DIR}/{DATASET_NAME}-test-bpseq.lst \
    --bpseq {ResultsDirectory} / {DatasetName} - -result {ResultsDirectory} / {DatasetName} - results.csv \
                                                         - -gpu {GpuCount}

## Transfer Learning (ArchiveII -> PDB)

In [None]:
DatasetName = "PDB"
!mxfold2 train @./{MODELS_DIR}/ArchiveII-model.conf {DATASET_DIR}/{DATASET_NAME}-train-bpseq.lst --init-param {MODELS_DIR}/ArchiveII-model.pth \
    --param {ModelDirectory} / TransferLearning - model.pth - -save - config {ModelDirectory} / TransferLearning - model.conf \
                                                                             - -gpu {GpuCount} - -epoch {EpochCount}

In [None]:
DatasetName = "PDB"
!mxfold2 predict @./resources/models/TransferLearning-model.conf {DATASET_DIR}/{DATASET_NAME}-test-bpseq.lst \
    --bpseq {ResultsDirectory} / TransferLearning - -result {ResultsDirectory} / TransferLearning - results.csv \
                                                            - -gpu {GpuCount}

## Results

In [None]:
# Column names applied to the results CSV files, which are read without a header row.
# NOTE(review): the order is assumed to match what the prediction step writes — confirm.
columns: list[str] = [
    "filename",
    "sequence_length",
    "elapsed_time",
    "sc",
    "tp",
    "tn",
    "fp",
    "fn",
    "sen",
    "ppv",
    "fval",
    "mcc",
]

# Load the ArchiveII results and preview the first rows.
results = pd.read_csv(ResultsDirectory / "ArchiveII-results.csv", header=None, names=columns)
results.head()

In [None]:
def calculate_metrics(results: pd.DataFrame) -> tuple[float, float, float, float]:
    """Calculate INF, PPV, TPR and TNR from the summed confusion counts.

    The ``tp``, ``tn``, ``fp`` and ``fn`` columns are summed over all rows and
    the metrics are derived from those totals:

    * PPV = tp / (tp + fp)
    * TPR = tp / (tp + fn)
    * INF = sqrt(PPV * TPR)
    * TNR = tn / (tn + fp)

    Args:
        results: Frame with numeric ``tp``, ``tn``, ``fp`` and ``fn`` columns.

    Returns:
        The tuple ``(inf, ppv, tpr, tnr)``. A metric whose denominator is zero
        is returned as NaN instead of raising ``ZeroDivisionError``; INF is NaN
        whenever PPV or TPR is.
    """
    tp_sum, tn_sum, fp_sum, fn_sum = results[["tp", "tn", "fp", "fn"]].sum()
    ppv = _safe_ratio(tp_sum, tp_sum + fp_sum)
    tpr = _safe_ratio(tp_sum, tp_sum + fn_sum)
    inf = (ppv * tpr) ** 0.5
    tnr = _safe_ratio(tn_sum, tn_sum + fp_sum)
    return inf, ppv, tpr, tnr


def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    if not denominator:
        return float("nan")
    return float(numerator) / float(denominator)


def plot_metrics(datasets: tuple[str, ...]) -> None:
    """Plot INF, PPV, TPR and TNR for each dataset as a 2x2 grid of bar charts.

    For every name in ``datasets`` the file ``<name>-results.csv`` is read from
    ``ResultsDirectory`` and reduced with ``calculate_metrics``. The figure is
    saved as ``results.png`` in ``FiguresDirectory``, which is created if it
    does not exist yet.

    Args:
        datasets: Names of the result sets to compare, one bar per name.
    """
    # Same order as the tuple returned by calculate_metrics.
    metric_names = ("inf", "ppv", "tpr", "tnr")
    metrics: dict[str, dict[str, float]] = {name: {} for name in metric_names}
    for dataset in datasets:
        df = pd.read_csv(ResultsDirectory / f"{dataset}-results.csv", header=None, names=columns)
        for name, value in zip(metric_names, calculate_metrics(df)):
            metrics[name][dataset] = value

    # One colour per dataset, so colours do not repeat when more than three
    # datasets are compared.
    colors = sns.color_palette("magma", len(datasets))
    fig, axs = plt.subplots(2, 2, figsize=(12, 8), tight_layout=True)

    for ax, name in zip(axs.flatten(), metric_names):
        title = name.upper()
        scores = metrics[name]
        ax.set_title(title)
        ax.bar(list(scores.keys()), list(scores.values()), color=colors, label=title)
        ax.grid(axis="y", linestyle="--", alpha=0.25)
        ax.set_xlabel("dataset")
        ax.set_ylabel("score")

    # savefig does not create missing folders.
    FiguresDirectory.mkdir(parents=True, exist_ok=True)
    fig.savefig(FiguresDirectory / "results.png")

In [None]:
# Result sets to compare; each name selects the matching "<name>-results.csv" file.
datasets = ("ArchiveII", "PDB", "TransferLearning")

plot_metrics(datasets)