In [1]:
"""
The purpose of this Jupyter notebook is to generate a results file for
each of the 10 test sets from the file encompassing all test set
predictions. The necessity to do so stems from the fact that inference
has not been performed on the 10 test set splits individually, but on
the file comprising the entire test set.
"""

'\nThe purpose of this Jupyter notebook is to generate a results file for\neach of the 10 test sets from the file encompassing all test set\npredictions. The necessity to do so stems from the fact that inference\nhas not been performed on the 10 test set splits individually, but on\nthe file comprising the entire test set.\n'

In [2]:
import pandas as pd

In [3]:
# Read the TSV file holding the predictions for the entire test set,
# i.e. for all test set splits combined
# The file has no header row, which is why the columns of the resulting
# DataFrame carry integer labels
path_to_entire_test_set_preds = (
    "/Users/jacobanter/Documents/Code/VACV_screen/MaTPIP/"
    "Inference_on_combined_VACV_PPI_data_set/"
    "predicted_probs_pos_label_combined_VACV_PPIs_data_set.tsv"
)

# `pd.read_table` uses the tab character as default separator
test_set_preds_df = pd.read_table(
    path_to_entire_test_set_preds,
    header=None
)

In [4]:
# Iterate over the test set splits in order to extract the corresponding
# PPI pairs and generate a file with predictions for each test set split

# The lookup table of predictions is built once, outside of the loop:
# the first two columns of the predictions DataFrame identify the PPI
# pair, whereas the third one holds the predicted probability
# If a PPI pair occurs more than once, only its first occurrence is
# kept, so that each pair of a test set split is matched to exactly one
# probability
preds_lookup_df = test_set_preds_df.iloc[:, [0, 1, 2]].copy()
preds_lookup_df.columns = ["seq1", "seq2", "interaction_prob"]
preds_lookup_df = preds_lookup_df.drop_duplicates(
    subset=["seq1", "seq2"],
    keep="first"
)

for i in range(10):
    path_to_current_test_set_split = (
        "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_"
        "instances_with_nucleolus_neg_instances/data_set_splits/VACV_"
        f"WR_pos_and_neg_PPIs_test_split_{i}.tsv"
    )

    current_test_set_split_df = pd.read_csv(
        path_to_current_test_set_split,
        sep="\t",
        header=None
    )

    # Only the first two columns, i.e. the two interaction partners, are
    # required from the test set split at hand
    current_split_pairs_df = (
        current_test_set_split_df.iloc[:, [0, 1]].copy()
    )
    current_split_pairs_df.columns = ["seq1", "seq2"]

    # A single left merge replaces the row-wise lookup; it preserves
    # the row order of the test set split, and the `_merge` indicator
    # column reveals PPI pairs lacking a prediction
    current_test_set_split_with_probs_df = current_split_pairs_df.merge(
        preds_lookup_df,
        on=["seq1", "seq2"],
        how="left",
        indicator=True
    )

    # Fail with an informative message rather than silently writing
    # missing probabilities to the output file
    pair_without_pred = (
        current_test_set_split_with_probs_df["_merge"] == "left_only"
    )
    if pair_without_pred.any():
        first_missing_pair = current_test_set_split_with_probs_df.loc[
            pair_without_pred, ["seq1", "seq2"]
        ].iloc[0]
        raise IndexError(
            f"{int(pair_without_pred.sum())} PPI pair(s) of test set "
            f"split {i} have no predicted probability; first missing "
            f"pair: ({first_missing_pair['seq1']!r}, "
            f"{first_missing_pair['seq2']!r})"
        )

    # Save the current test set split's predictions as TSV file with
    # the columns `seq1`, `seq2` and `interaction_prob`
    current_test_set_split_with_probs_df = (
        current_test_set_split_with_probs_df.drop(columns="_merge")
    )
    current_test_set_split_with_probs_df.to_csv(
        "predictions_on_VACV_WR_pos_and_neg_data_set_test_set_split_"
        f"{i}_without_training.tsv",
        sep="\t",
        index=False
    )

In [5]:
# Final step: add a `label` column to each of the test set TSV files,
# with the labels being derived from the predicted probability

# Relative imports are not available here: the notebook does not belong
# to a package, so its `__name__` attribute is `__main__` and carries no
# package information
# Hence, the directory containing the required module is appended to
# the module search path by hand
import sys
sys.path.append("..")

import evaluation_utils

# NOTE(review): the path below is a plain string, not an f-string, so
# `{i}` is passed on literally; presumably the called function fills in
# the split index for each of the `n_fold` files - TODO confirm
evaluation_utils.add_labels_based_on_probs(
    path_tsv_files=(
        "predictions_on_VACV_WR_pos_and_neg_data_set_test_"
        "set_split_{i}_without_training.tsv"
    ),
    pred_col_name="interaction_prob",
    n_fold=10
)