In [1]:
"""
The purpose of this Jupyter notebook is to determine the distribution of
protein sequence lengths in the combined data set involving both
confirmed human-VACV WR PPIs and reliable negative PPIs. This is done as
one of the published benchmark PPI prediction models, SENSE-PPI, is
especially susceptible to memory issues brought about by large
sequences.
"""

'\nThe purpose of this Jupyter notebook is to determine the distribution of\nprotein sequence lengths in the combined data set involving both\nconfirmed human-VACV WR PPIs and reliable negative PPIs. This is done as\none of the published benchmark PPI prediction models, SENSE-PPI, is\nespecially susceptible to memory issues brought about by large\nsequences.\n'

In [19]:
import pandas as pd
from biotite.sequence.io import fasta

In [3]:
# Load the combined FASTA file, i.e. the one holding every protein that
# takes part in either a confirmed positive interaction or a reliable
# negative interaction, so that the distribution of its sequence lengths
# can be inspected
path_to_combined_fasta = (
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_"
    "with_nucleolus_neg_instances/human_nucleolus_and_VACV_WR_prot_"
    "seqs.fasta"
)

prot_seqs_VACV_WR_pos_and_neg_PPIs_fasta = fasta.FastaFile.read(
    path_to_combined_fasta
)

# One length per FASTA entry, ordered from longest to shortest;
# `sorted()` arranges elements in ascending order unless `reverse=True`
# is explicitly passed
combined_prot_seq_len_list = sorted(
    (
        len(sequence) for _header, sequence
        in prot_seqs_VACV_WR_pos_and_neg_PPIs_fasta.items()
    ),
    reverse=True
)

print(combined_prot_seq_len_list)

[35991, 34350, 6885, 5795, 5596, 4128, 3433, 3256, 3224, 3177, 3177, 2997, 2871, 2850, 2839, 2785, 2677, 2623, 2554, 2514, 2482, 2440, 2426, 2390, 2382, 2364, 2364, 2351, 2346, 2346, 2325, 2297, 2271, 2144, 2027, 2004, 1995, 1979, 1907, 1905, 1898, 1883, 1871, 1849, 1820, 1800, 1798, 1792, 1722, 1720, 1711, 1690, 1685, 1683, 1675, 1670, 1626, 1570, 1531, 1524, 1500, 1488, 1478, 1432, 1419, 1374, 1365, 1341, 1336, 1328, 1324, 1312, 1297, 1282, 1273, 1270, 1257, 1255, 1245, 1235, 1233, 1230, 1226, 1222, 1222, 1214, 1209, 1200, 1193, 1167, 1164, 1157, 1151, 1146, 1140, 1135, 1132, 1130, 1128, 1123, 1097, 1096, 1093, 1071, 1070, 1068, 1063, 1060, 1058, 1053, 1042, 1025, 1025, 1024, 1023, 1020, 1014, 1013, 1012, 1012, 1009, 1002, 1001, 989, 980, 979, 970, 970, 963, 960, 958, 956, 951, 950, 950, 943, 937, 929, 927, 919, 916, 907, 907, 906, 905, 900, 896, 894, 894, 892, 886, 885, 882, 881, 875, 874, 869, 868, 860, 857, 856, 853, 851, 851, 847, 844, 839, 830, 828, 825, 821, 819, 812, 810, 808,

In [4]:
# The two extremely long proteins could be VACV WR proteins, human
# proteins involved in confirmed positive interactions or human
# nucleolus proteins; to narrow this down, the sequence lengths in the
# FASTA file containing the human nucleolus proteins are listed in the
# same manner
path_to_human_nucleolus_prots_fasta = (
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_"
    "with_nucleolus_neg_instances/uniprotkb_organism_id_9606_AND_scl_"
    "SL-0188_2025_01_16_uniprot_only_header.fasta"
)

human_nucleolus_prots_fasta = fasta.FastaFile.read(
    path_to_human_nucleolus_prots_fasta
)

# Sequence lengths of all entries, longest first
nucleolus_prots_len_list = sorted(
    [
        len(sequence) for _header, sequence
        in human_nucleolus_prots_fasta.items()
    ],
    reverse=True
)

print(nucleolus_prots_len_list)

[5596, 4128, 3256, 2997, 2839, 2785, 2677, 2271, 2144, 2004, 1905, 1883, 1871, 1849, 1820, 1792, 1722, 1720, 1690, 1626, 1531, 1524, 1500, 1488, 1432, 1419, 1374, 1365, 1336, 1328, 1297, 1282, 1273, 1270, 1255, 1226, 1209, 1200, 1167, 1164, 1157, 1146, 1135, 1132, 1130, 1123, 1097, 1096, 1071, 1063, 1060, 1058, 1053, 1042, 1025, 1025, 1024, 1014, 1009, 989, 979, 970, 970, 960, 958, 951, 950, 943, 929, 919, 907, 905, 900, 896, 894, 892, 885, 882, 881, 874, 869, 860, 857, 856, 853, 851, 851, 847, 830, 828, 821, 819, 812, 808, 804, 800, 796, 796, 795, 783, 781, 773, 771, 767, 766, 765, 764, 762, 759, 758, 756, 756, 755, 749, 748, 746, 741, 740, 737, 734, 732, 731, 730, 729, 719, 711, 710, 709, 707, 707, 706, 702, 699, 692, 690, 688, 687, 686, 685, 681, 679, 677, 670, 669, 666, 653, 653, 651, 648, 646, 646, 645, 641, 639, 638, 634, 632, 629, 627, 614, 610, 599, 598, 597, 594, 588, 588, 582, 582, 578, 574, 572, 570, 570, 567, 560, 558, 558, 556, 551, 549, 547, 543, 534, 531, 530, 529, 528, 

In [5]:
# Neither of the two extremely long proteins is found among the
# nucleolus proteins; the next candidates to check are the VACV WR
# proteins known to engage in human-virus PPIs
path_to_VACV_WR_prots_fasta = (
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_"
    "with_nucleolus_neg_instances/VACV_WR_prots_in_HVIDB_uniprot_only_"
    "header.fasta"
)

VACV_WR_prots_fasta = fasta.FastaFile.read(path_to_VACV_WR_prots_fasta)

# Sequence lengths of all entries, longest first
VACV_WR_prots_len_list = sorted(
    (len(sequence) for _header, sequence in VACV_WR_prots_fasta.items()),
    reverse=True
)

print(VACV_WR_prots_len_list)

[844, 785, 771, 634, 564, 512, 439, 426, 371, 345, 331, 319, 300, 284, 273, 272, 244, 240, 226, 224, 221, 219, 204, 204, 203, 190, 190, 187, 177, 175, 171, 166, 165, 151, 150, 149, 149, 125, 117, 110, 90, 88, 61]


In [6]:
# It emerges that the two extremely long proteins do not belong to the
# VACV WR proteins either
# Hence, they must be part of the human proteins confirmed to engage in
# human-VACV WR PPIs
# The precise number of confirmed human-VACV WR PPIs involving these two
# extremely long proteins is determined

# First, determine the UniProt IDs of the two proteins
# The two largest sequence lengths are derived from the combined FASTA
# file itself rather than being copied by hand from previously printed
# output, so that this cell keeps working if the FASTA file changes
seq_len_per_uniprot_id = {
    uniprot_id: len(seq_str)
    for uniprot_id, seq_str
    in prot_seqs_VACV_WR_pos_and_neg_PPIs_fasta.items()
}

two_largest_seq_lens = set(
    sorted(seq_len_per_uniprot_id.values(), reverse=True)[:2]
)

# Dictionaries preserve insertion order, so the IDs are reported in the
# order in which they are encountered in the FASTA file
extremely_long_uniprots_ids = [
    uniprot_id
    for uniprot_id, seq_len in seq_len_per_uniprot_id.items()
    if seq_len in two_largest_seq_lens
]

print(extremely_long_uniprots_ids)

['C9JQJ2', 'Q8WZ42']


In [9]:
# Count the confirmed PPIs in which either of the two extremely long
# proteins takes part
path_to_confirmed_VACV_WR_PPIs = (
    "/Users/jacobanter/Documents/Code/VACV_screen/"
    "all_HVIDB_VACV_WR_interactions.csv"
)

confirmed_VACV_WR_PPIs_df = pd.read_csv(path_to_confirmed_VACV_WR_PPIs)

# The text preceding the first hyphen of each "Human-virus PPI" entry is
# taken as the human interaction partner
human_prots_in_PPIs = []
for ppi_label in confirmed_VACV_WR_PPIs_df["Human-virus PPI"]:
    human_prots_in_PPIs.append(ppi_label.split("-")[0])

# Add up the occurrences of both UniProt IDs
n_PPIs_with_long_prots = sum(
    human_prots_in_PPIs.count(long_prot_id)
    for long_prot_id in ("C9JQJ2", "Q8WZ42")
)

print(
    "Total amount of confirmed PPIs involving the two extremely long "
    f"protein sequences: {n_PPIs_with_long_prots}"
)

Total amount of confirmed PPIs involving the two extremely long protein sequences: 2


In [23]:
# According to its specification, SENSE-PPI works best with proteins
# that are 50 - 800 amino acids long
# Out of curiosity, the size of the data set is therefore determined for
# several values of the maximum sequence length
VACV_WR_pos_and_neg_PPIs_df = pd.read_csv(
    "VACV_WR_pos_and_nucleolus_prots_neg_PPI_instances.tsv", sep="\t"
)

# For each and every PPI, record the length of the longer of its two
# interaction partners
larger_len_PPIs_list = []
for human_id, VACV_id in zip(
    VACV_WR_pos_and_neg_PPIs_df["Human_prot"],
    VACV_WR_pos_and_neg_PPIs_df["VACV_prot"]
):
    human_prot_len = len(prot_seqs_VACV_WR_pos_and_neg_PPIs_fasta[human_id])
    VACV_prot_len = len(prot_seqs_VACV_WR_pos_and_neg_PPIs_fasta[VACV_id])
    larger_len_PPIs_list.append(max(human_prot_len, VACV_prot_len))

# Order the recorded lengths from longest to shortest
larger_len_PPIs_list = sorted(larger_len_PPIs_list, reverse=True)

In [26]:
# Size of the data set when no maximum sequence length is imposed
ds_size_without_max_len = len(VACV_WR_pos_and_neg_PPIs_df)

max_len_vals = [800, 1000, 5000, 10000]

# For each maximum sequence length, count the PPIs whose longer protein
# does not exceed it
n_PPIs_per_max_len_val = [
    sum(1 for prot_len in larger_len_PPIs_list if prot_len <= len_cap)
    for len_cap in max_len_vals
]

In [27]:
# Report the unrestricted data set size, followed by the sizes obtained
# for the individual maximum sequence lengths, on separate lines
print(ds_size_without_max_len, n_PPIs_per_max_len_val, sep="\n")

869
[664, 736, 864, 867]
