In [5]:
"""
The purpose of this Jupyter notebook is to evaluate the performance of
SENSE-PPI, which serves as one of the three published benchmark models.

Beyond just computing five different metrics (accuracy, precision,
recall, F1-score, specificity) as average across the 10 splits, the ROC
AUC score is also computed, which summarises a classifier's performance
across different classification thresholds.
"""

"\nThe purpose of this Jupyter notebook is to evaluate the performance of\nSENSE-PPI, which serves as one of the three published benchmark models.\n\nBeyond just computing five different metrics (accuracy, precision,\nrecall, F1-score, specificity) as average across the 10 splits, the ROC\nAUC score is also computed, which summarises a classifier's performance\nacross different classification thresholds.\n"

In [6]:
# As this Jupyter notebook is not part of a package, the module's/file's
# `__name__` attribute is set to `__main__`, i.e. it does not contain
# any package information
# This, in turn, makes relative imports infeasible
# Thus, the directory the desired file is located in has to be manually
# added to path
import sys
sys.path.append("..")


import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

In [8]:
# When three columns are provided in the input TSV file, i.e. the TSV
# file harbouring PPI pairs to predict, SENSE-PPI interprets the third
# column as `label` column and adopts the ground truth labels rather
# than determining predicted labels based on the probability
# Therefore, the `label` column has to be removed from the TSV files
# and replaced with labels based on the predicted probabilities

# SENSE-PPI outputs two different types of results files per chunk, the
# first of which encompasses all PPI pairs and the second of which
# comprises exclusively PPI pairs predicted to be positive
# The two file names only differ in the suffix preceding `.tsv`
results_file_suffixes = ("", "_positive_interactions")

# Iterate over the results TSV files and remove the `label` column
for i in range(10):
    # Bear in mind that due to computational limitations, each test set
    # split has been subdivided into four chunks
    for j in range(4):
        for suffix in results_file_suffixes:
            results_file_path = (
                f"results_split_{i}/predictions_on_VACV_WR_pos_and_neg_"
                f"data_set_test_set_split_{i}_chunk_{j}_without_training"
                f"{suffix}.tsv"
            )
            results_df = pd.read_csv(results_file_path, sep="\t")
            # The files are overwritten in place, which is why on a
            # re-run of this cell the `label` column is already gone
            # `errors="ignore"` turns the removal into a no-op in that
            # case instead of raising a `KeyError`, i.e. it makes the
            # cell idempotent
            results_df = results_df.drop(columns="label", errors="ignore")
            results_df.to_csv(results_file_path, sep="\t", index=False)

In [9]:
# 10-fold cross-validation entails 10 data set splits and hence 10 test
# sets, each of which has been subdivided into four chunks owing to
# memory constraints
# Prior to any evaluation, the chunks belonging to one test set are
# therefore stitched together again
# As SENSE-PPI emits two output files (all PPI pairs as well as positive
# PPI pairs only), the concatenation is carried out for both file types

# Suffixes distinguishing the two results file types
chunk_file_suffixes = ("", "_positive_interactions")

for split_idx in range(10):
    # Part of the path shared by all files of the current test set
    split_path_stem = (
        f"results_split_{split_idx}/predictions_on_VACV_WR_pos_and_neg_"
        f"data_set_test_set_split_{split_idx}"
    )

    # Collect the chunks of the current test set, separately for each
    # of the two file types
    chunks_by_suffix = {suffix: [] for suffix in chunk_file_suffixes}

    for chunk_idx in range(4):
        for suffix in chunk_file_suffixes:
            chunk_df = pd.read_csv(
                f"{split_path_stem}_chunk_{chunk_idx}_without_training"
                f"{suffix}.tsv",
                sep="\t"
            )
            chunks_by_suffix[suffix].append(chunk_df)

    # Concatenate the chunks and save one merged TSV file per file type
    for suffix, chunk_dfs in chunks_by_suffix.items():
        merged_df = pd.concat(chunk_dfs, ignore_index=True)
        merged_df.to_csv(
            f"{split_path_stem}_without_training{suffix}.tsv",
            sep="\t",
            index=False
        )

In [10]:
# Next, each of the merged results TSV files receives a new `label`
# column, the entries of which are based on the predicted probability
import evaluation_utils

# Path templates of the two results file types (all PPI pairs and
# positive PPI pairs only)
# Note that these are plain strings rather than f-strings, i.e. the
# `{i}` placeholder is handed over verbatim; presumably, it is filled in
# per split by `add_labels_based_on_probs` -- confirm in
# `evaluation_utils`
merged_results_path_templates = (
    "results_split_{i}/predictions_on_VACV_WR_pos_and_neg_data_set_"
    "test_set_split_{i}_without_training.tsv",
    "results_split_{i}/predictions_on_VACV_WR_pos_and_neg_data_set_"
    "test_set_split_{i}_without_training_positive_interactions.tsv",
)

for path_template in merged_results_path_templates:
    evaluation_utils.add_labels_based_on_probs(
        path_tsv_files=path_template,
        pred_col_name="preds",
        n_fold=10
    )

In [11]:
# Load the ground truth, i.e. the combined VACV WR data encompassing
# confirmed positive PPIs as well as reliable negative PPIs involving
# nucleolus proteins
import os

# The location of the ground truth TSV file defaults to the original
# absolute path, but can be overridden by means of the
# `VACV_WR_GROUND_TRUTH_TSV` environment variable
# This allows running the notebook on machines with a different
# directory layout without having to edit the code
VACV_WR_ground_truth_tsv_path = os.environ.get(
    "VACV_WR_GROUND_TRUTH_TSV",
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances"
    "_with_nucleolus_neg_instances/VACV_WR_pos_and_nucleolus_prots_"
    "neg_PPI_instances.tsv"
)

VACV_WR_PPIs_ground_truth_df = pd.read_csv(
    VACV_WR_ground_truth_tsv_path,
    sep="\t"
)

In [12]:
# In order to be able to utilise scikit-learn's `confusion_matrix`
# function, the ground truth labels have to be extracted for each and
# every test set

# Build a lookup table mapping each (human protein, VACV protein) pair
# to its ground truth label once, instead of scanning the entire ground
# truth data frame for every single prediction row
# In case a pair occurs more than once in the ground truth, the label of
# its first occurrence is used
ground_truth_label_by_pair = (
    VACV_WR_PPIs_ground_truth_df
    .drop_duplicates(subset=["Human_prot", "VACV_prot"], keep="first")
    .set_index(["Human_prot", "VACV_prot"])["label"]
    .to_dict()
)

# One list of ground truth labels per test set; this list is consumed by
# the metric computations further below
ground_truth_label_list = []

for i in range(10):
    # Load the TSV with predictions for the current test set
    # This is done instead of loading e.g. the original test set TSV
    # file as the PPI pairs must have the same ordering in order for the
    # metrics to be computed correctly
    current_test_set_preds_df = pd.read_csv(
        f"results_split_{i}/predictions_on_VACV_WR_pos_and_neg_data_"
        f"set_test_set_split_{i}_without_training.tsv",
        sep="\t"
    )

    # `seq1` is looked up against the `Human_prot` column and `seq2`
    # against the `VACV_prot` column of the ground truth
    ppi_pairs = list(zip(
        current_test_set_preds_df["seq1"],
        current_test_set_preds_df["seq2"]
    ))

    # Fail with an explicit message if a predicted pair is absent from
    # the ground truth rather than with an opaque lookup error
    missing_pairs = [
        pair for pair in ppi_pairs
        if pair not in ground_truth_label_by_pair
    ]
    if missing_pairs:
        raise KeyError(
            f"{len(missing_pairs)} PPI pair(s) of test set split {i} "
            "are not contained in the ground truth, e.g. "
            f"{missing_pairs[0]}"
        )

    ground_truth_label_list.append(
        [ground_truth_label_by_pair[pair] for pair in ppi_pairs]
    )

In [13]:
# For each and every test set split, accuracy, precision, recall,
# F1-score and specificity are derived from the confusion matrix
# returned by scikit-learn's `confusion_matrix`

# One list per metric, each holding one value per test set
accuracy_list = []
precision_list = []
recall_list = []
f1_score_list = []
specificity_list = []

# Iterate over the 10 test sets
for split_idx, split_ground_truths in enumerate(ground_truth_label_list):
    # The predicted labels reside in the `label` column of the merged
    # results TSV file of the respective split
    split_preds_df = pd.read_csv(
        f"results_split_{split_idx}/predictions_on_VACV_WR_pos_and_neg_"
        f"data_set_test_set_split_{split_idx}_without_training.tsv",
        sep="\t"
    )
    split_predicted_labels = split_preds_df["label"].to_list()

    cm = confusion_matrix(split_ground_truths, split_predicted_labels)

    # Rows of the confusion matrix correspond to the true labels and
    # columns to the predicted labels
    # The second row/column is treated as the positive class, which
    # assumes that the negative label sorts before the positive one
    # (e.g. 0 and 1)
    tn = cm[0, 0]
    fp = cm[0, 1]
    fn = cm[1, 0]
    tp = cm[1, 1]

    # Accuracy: proportion of correct predictions in all predictions,
    # i.e. (TP + TN) / (TP + TN + FP + FN)
    accuracy_list.append((tp + tn) / (tp + tn + fp + fn))
    # Precision: proportion of correct positive predictions in all
    # positive predictions, i.e. TP / (TP + FP)
    precision_list.append(tp / (tp + fp))
    # Recall (sensitivity): proportion of correctly identified positive
    # instances in all positive instances, i.e. TP / (TP + FN)
    recall_list.append(tp / (tp + fn))
    # F1-score: harmonic mean of precision and recall, i.e.
    # 2*TP / (2*TP + FP + FN)
    f1_score_list.append(2*tp / (2*tp + fp + fn))
    # Specificity: proportion of correctly identified negative instances
    # in all negative instances, i.e. TN / (TN + FP)
    specificity_list.append(tn / (tn + fp))

In [14]:
# Compute the mean and the standard deviation for each of the five
# metrics
# Note that `np.std` is called with its default settings, i.e. the
# population standard deviation (`ddof=0`) is computed
accuracy_mean, accuracy_std = np.mean(accuracy_list), np.std(accuracy_list)
precision_mean, precision_std = (
    np.mean(precision_list), np.std(precision_list)
)
recall_mean, recall_std = np.mean(recall_list), np.std(recall_list)
f1_score_mean, f1_score_std = np.mean(f1_score_list), np.std(f1_score_list)
specificity_mean, specificity_std = (
    np.mean(specificity_list), np.std(specificity_list)
)

# One (name, mean, standard deviation) entry per line of the report
metric_summary_rows = [
    ("Accuracy:", accuracy_mean, accuracy_std),
    ("Precision:", precision_mean, precision_std),
    ("Recall:", recall_mean, recall_std),
    ("F1-Score:", f1_score_mean, f1_score_std),
    ("Specificity:", specificity_mean, specificity_std),
]

# The metric names are left-justified to a width of 13 characters so
# that the values are vertically aligned
metrics_result_text = (
    "Using 10-fold cross-validation, the metrics for SENSE-PPI are as "
    "follows:\n"
    + "\n".join(
        f"{metric_name.ljust(13)}{mean} \xB1 {std}"
        for metric_name, mean, std in metric_summary_rows
    )
)

print(metrics_result_text)

# Bear in mind that in the context of working with files, the `with`
# context manager is preferred as it automatically takes care of closing
# files, even in case of errors/exceptions
# The encoding is stated explicitly as the text contains the non-ASCII
# plus-minus sign; otherwise, the write depends on the platform's
# default encoding and may fail with a `UnicodeEncodeError`
with open(
    "SENSE-PPI_results_10-fold_cross-validation_on_combined_VACV_WR_"
    "data_set_without_training.txt",
    "w",
    encoding="utf-8"
) as f:
    f.write(metrics_result_text)

Using 10-fold cross-validation, the metrics for SENSE-PPI are as follows:
Accuracy:    0.6133386794974606 ± 0.04995135031841616
Precision:   0.6157614576020649 ± 0.10754226473564313
Recall:      0.4869537218078638 ± 0.09551093513614
F1-Score:    0.5369952540534972 ± 0.0801484796991584
Specificity: 0.7340540650071831 ± 0.05374001705623114


In [15]:
# Now, address the computation of the ROC AUC score
# A Receiver Operating Characteristic curve, abbreviated as ROC curve,
# summarises a classifier's performance across different classification
# thresholds
# Conceptually, each point of an ROC curve represents a confusion matrix
# obtained for the respective classification threshold
# Thus, an ROC curve visually summarises the information contained in
# multiple confusion matrices by plotting the True Positive Rate (TPR)
# against the False Positive Rate (FPR) for each confusion matrix/
# threshold
# The ROC AUC score, as its name already suggests, is the area under the
# curve of the ROC curve
# The ROC AUC score allows the comparison of different classifiers with
# greater ROC AUC scores indicating superior predictive capabilities
# This is due to the fact that points to the left of the diagonal
# indicate a threshold for which the TPR exceeds the FPR

# Import libraries required for ROC AUC score computation as well as the
# visualisation of the ROC curve
from sklearn.metrics import roc_auc_score

In [16]:
# Owing to the 10-fold cross-validation, the ROC AUC score is determined
# per test set and averaged afterwards:
# For each of the 10 test sets, the ground truth labels are paired with
# the predicted probabilities and fed into `roc_auc_score`
# The mean and the standard deviation across the 10 resulting scores are
# reported in the end

roc_auc_score_list = []

for split_idx, split_ground_truths in enumerate(ground_truth_label_list):
    # The predicted probabilities reside in the `preds` column of the
    # merged results TSV file of the respective split
    split_preds_df = pd.read_csv(
        f"results_split_{split_idx}/predictions_on_VACV_WR_pos_and_neg_"
        f"data_set_test_set_split_{split_idx}_without_training.tsv",
        sep="\t"
    )
    split_probs = split_preds_df["preds"].to_list()

    roc_auc_score_list.append(
        roc_auc_score(split_ground_truths, split_probs)
    )

roc_auc_score_mean = np.mean(roc_auc_score_list)
roc_auc_score_std = np.std(roc_auc_score_list)

print(
    "The average ROC AUC score across the 10 test sets\nis "
    f"{roc_auc_score_mean} \xB1 {roc_auc_score_std}."
)

The average ROC AUC score across the 10 test sets
is 0.718051512823087 ± 0.04692949363532433.
