In [1]:
"""
The purpose of this Jupyter notebook is to evaluate the performance of
xCAPT5 ...
"""

'\nThe purpose of this Jupyter notebook is to evaluate the performance of\nxCAPT5 ...\n'

In [1]:
# This notebook runs as `__main__` and therefore carries no package
# information, which rules out relative imports
# The directory holding the helper module (presumably
# `evaluation_utils`, which is imported in the next cell) is thus put on
# the module search path by hand
# NOTE(review): this is an absolute path on one specific machine and
# has to be adapted when the notebook is run elsewhere
import sys

RESULTS_DIR = (
    "/Users/jacobanter/Documents/Code/VACV_screen/"
    "HVIDB_pos_instances_with_nucleolus_neg_instances/Results"
)

# Append only when absent so that re-running this cell does not pile up
# duplicate entries in `sys.path`
if RESULTS_DIR not in sys.path:
    sys.path.append(RESULTS_DIR)

In [2]:
import evaluation_utils

In [4]:
# The xCAPT5 output file holds predicted interaction probabilities
# only, without the corresponding binary labels
# Hence, a `label` column is added to the output file first
# Intended decision rule: probability >= 0.5 -> label 1 (PPI predicted
# to occur), probability < 0.5 -> label 0 (PPI predicted not to occur)
# NOTE(review): the 0.5 threshold is not passed explicitly below, so it
# is presumably the default of `add_labels_based_on_probs` -- TODO
# confirm
path_to_results_file = (
    "xCAPT5_interaction_probs_sled_ckpt_"
    "bullet-proof_data_set_with_XGBoost.tsv"
)

# Column of the TSV file that stores the predicted probabilities
probability_column = "interaction_probability"
# Only one results file is processed here
n_result_files = 1

evaluation_utils.add_labels_based_on_probs(
    path_tsv_files=path_to_results_file,
    pred_col_name=probability_column,
    n_fold=n_result_files,
)

In [5]:
# Compare the labelled predictions with the ground truth of the entire
# data set
# The helper prints the metrics and returns them as (mean, std) tuples;
# presumably it also writes them to `output_path` -- TODO confirm
# NOTE(review): the ground truth location is an absolute path on one
# specific machine
ground_truth_file = (
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_"
    "with_nucleolus_neg_instances/new_combined_data_set_creation/"
    "data_set_files/entire_bullet-proof_ppi_data_set.tsv"
)
metrics_report_file = (
    "xCAPT5_with_XGBoost_Sled_ckpt_results_on_new_combined_VACV_WR_"
    "data_set_without_training.txt"
)

# Kept as the last expression so that the returned tuple is displayed
evaluation_utils.evaluation_k_fold_cross_val(
    ground_truth_path=ground_truth_file,
    splits_path=path_to_results_file,
    n_fold=1,
    probability_key="interaction_probability",
    model_name="xCAPT5 with XGBoost Sled ckpt",
    output_path=metrics_report_file,
)

Using 1-fold cross-validation, the metrics for xCAPT5 with XGBoost Sled ckpt are as follows:
Accuracy:      0.8398058252427184 ± 0.0
Precision:     0.7916666666666666 ± 0.0
Recall:        0.9223300970873787 ± 0.0
F1-score:      0.852017937219731 ± 0.0
Specificity:   0.7572815533980582 ± 0.0
MCC:           0.689061827088388 ± 0.0
ROC AUC score: 0.9353879960410971 ± 0.0
AUPRC score    0.9344352406386947 ± 0.0


((0.8398058252427184, 0.0),
 (0.7916666666666666, 0.0),
 (0.9223300970873787, 0.0),
 (0.852017937219731, 0.0),
 (0.7572815533980582, 0.0),
 (0.689061827088388, 0.0),
 (0.9353879960410971, 0.0),
 (0.9344352406386947, 0.0))

In [3]:
# Table 2 of the publication is supposed to report performance on the
# test set only
# XGBoost has been fitted in two ways: from scratch, and by fine-tuning
# from a checkpoint (either Pan or Sled); this notebook deals with the
# Sled checkpoint exclusively
# The variant fitted from scratch is evaluated first
# `{i}` stays a literal placeholder in the path; presumably the helper
# substitutes the seed index for it -- TODO confirm
path_to_test_set_results_XGBoost_from_scratch = (
    "fine-tuning_XGBoost_on_training_set_only_from_scratch/"
    "xCAPT5_interaction_probs_Sled_ckpt_bullet-proof_test_set_"
    "XGBoost_from_scratch_seed_{i}_with_XGBoost_no_fitting.tsv"
)

# Column of the TSV files that stores the predicted probabilities
probability_column = "interaction_probability"
# Five result files are processed
n_result_files = 5

evaluation_utils.add_labels_based_on_probs(
    path_tsv_files=path_to_test_set_results_XGBoost_from_scratch,
    pred_col_name=probability_column,
    n_fold=n_result_files,
)

In [4]:
# Evaluate the from-scratch XGBoost variant against the test set ground
# truth, aggregated over the five result files
# NOTE(review): the ground truth location is an absolute path on one
# specific machine
ground_truth_file = (
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_"
    "with_nucleolus_neg_instances/new_combined_data_set_creation/"
    "data_set_files/bullet-proof_test_set.tsv"
)
# NOTE(review): the file name ends in `test_test_5_seeds.txt`, whereas
# the Sled checkpoint counterpart uses `test_set_5_seeds.txt`; this
# looks like a typo, but the name is left untouched as other code may
# rely on it -- TODO confirm
metrics_report_file = (
    "fine-tuning_XGBoost_on_training_set_only_from_scratch/"
    "xCAPT5_Sled_ckpt_XGBoost_from_scratch_results_on_new_"
    "test_test_5_seeds.txt"
)

# Kept as the last expression so that the returned tuple is displayed
evaluation_utils.evaluation_k_fold_cross_val(
    ground_truth_path=ground_truth_file,
    splits_path=path_to_test_set_results_XGBoost_from_scratch,
    n_fold=5,
    probability_key="interaction_probability",
    model_name="xCAPT5 Sled ckpt XGBoost from scratch",
    output_path=metrics_report_file,
)

Using 5-fold cross-validation, the metrics for xCAPT5 Sled ckpt XGBoost from scratch are as follows:
Accuracy:      0.8942857142857144 ± 0.014568627181693668
Precision:     0.8712343922870238 ± 0.018085383399919393
Recall:        0.9257142857142858 ± 0.01399708424447531
F1-score:      0.8975571848135999 ± 0.013657129454246127
Specificity:   0.8628571428571428 ± 0.02138089935299392
MCC:           0.7903003187962664 ± 0.028771815854130068
ROC AUC score: 0.946857142857143 ± 0.001514060162529917
AUPRC score    0.9523353978200391 ± 0.0013385932814591883


((0.8942857142857144, 0.014568627181693668),
 (0.8712343922870238, 0.018085383399919393),
 (0.9257142857142858, 0.01399708424447531),
 (0.8975571848135999, 0.013657129454246127),
 (0.8628571428571428, 0.02138089935299392),
 (0.7903003187962664, 0.028771815854130068),
 (0.946857142857143, 0.001514060162529917),
 (0.9523353978200391, 0.0013385932814591883))

In [5]:
# Next, the XGBoost variant fine-tuned from the Sled checkpoint is
# dealt with
# `{i}` stays a literal placeholder in the path; presumably the helper
# substitutes the seed index for it -- TODO confirm
path_to_test_set_results_XGBoost_from_Sled_ckpt = (
    "fine-tuning_XGBoost_on_training_set_only_resumption_from_Sled_ckpt/"
    "xCAPT5_interaction_probs_Sled_ckpt_bullet-proof_test_set_"
    "XGBoost_fine-tuned_from_Sled_ckpt_seed_{i}_with_XGBoost_no_"
    "fitting.tsv"
)

# Column of the TSV files that stores the predicted probabilities
probability_column = "interaction_probability"
# Five result files are processed
n_result_files = 5

evaluation_utils.add_labels_based_on_probs(
    path_tsv_files=path_to_test_set_results_XGBoost_from_Sled_ckpt,
    pred_col_name=probability_column,
    n_fold=n_result_files,
)

In [6]:
# Evaluate the XGBoost variant fine-tuned from the Sled checkpoint
# against the test set ground truth, aggregated over the five result
# files
# NOTE(review): the recorded output of this cell shows a standard
# deviation of (numerically) zero for every metric, i.e. all five
# result files yield the same scores; it should be verified that the
# seed actually affects the fine-tuned model and that the five files
# are not copies of each other
# NOTE(review): the ground truth location is an absolute path on one
# specific machine
ground_truth_file = (
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_"
    "with_nucleolus_neg_instances/new_combined_data_set_creation/"
    "data_set_files/bullet-proof_test_set.tsv"
)
metrics_report_file = (
    "fine-tuning_XGBoost_on_training_set_only_resumption_from_Sled_ckpt/"
    "xCAPT5_Sled_ckpt_XGBoost_from_Sled_ckpt_results_on_new_test_set_"
    "5_seeds.txt"
)

# Kept as the last expression so that the returned tuple is displayed
evaluation_utils.evaluation_k_fold_cross_val(
    ground_truth_path=ground_truth_file,
    splits_path=path_to_test_set_results_XGBoost_from_Sled_ckpt,
    n_fold=5,
    probability_key="interaction_probability",
    model_name="xCAPT5 Sled ckpt XGBoost from Sled ckpt",
    output_path=metrics_report_file,
)

Using 5-fold cross-validation, the metrics for xCAPT5 Sled ckpt XGBoost from Sled ckpt are as follows:
Accuracy:      0.5714285714285714 ± 0.0
Precision:     0.8571428571428571 ± 0.0
Recall:        0.17142857142857143 ± 0.0
F1-score:      0.2857142857142857 ± 0.0
Specificity:   0.9714285714285713 ± 1.1102230246251565e-16
MCC:           0.23809523809523808 ± 0.0
ROC AUC score: 0.6367346938775511 ± 0.0
AUPRC score    0.6590250412519321 ± 0.0


((0.5714285714285714, 0.0),
 (0.8571428571428571, 0.0),
 (0.17142857142857143, 0.0),
 (0.2857142857142857, 0.0),
 (0.9714285714285713, 1.1102230246251565e-16),
 (0.23809523809523808, 0.0),
 (0.6367346938775511, 0.0),
 (0.6590250412519321, 0.0))