In [1]:
"""
The purpose of this Jupyter notebook is to create a new combined data
set meeting two requirements. The first requirement is that the human
proteins involved in negative PPIs exclusively occur in the nucleolus.
The second requirement is that the proteins involved in negative PPIs
(both VACV and human) have a length of at most 1,700 amino acids. The
second requirement is imposed so as to ensure the feasibility of
molecular dynamics simulations.
"""

'\nThe purpose of this Jupyter notebook is to create a new combined data\nset meeting two requirements. The first requirement is that the human\nproteins involved in negative PPIs exclusively occur in the nucleolus.\nThe second requirement is that the proteins involved in negative PPIs\n(both VACV and human) have a length of at most 1,700 amino acids. The\nsecond requirement is imposed so as to ensure the feasibility of\nmolecular dynamics simulations.\n'

In [2]:
# Unfortunately, restricting the search query to proteins occurring
# exclusively in the nucleolus via the GUI (i.e. the UniProt website) is
# not possible
# Therefore, the XML export of the search results is used in order to
# filter the results programmatically

# In detail, on the UniProt website, a search is conducted with the
# following settings: Organism [OS] 9606 AND subcellular location term
# Nucleolus [SL-0188]
# The search results are downloaded as XML file; the search has been
# conducted on 19th June 2025

# The filtering is based on the following rationale: Each protein has at
# least one subcellular location, namely the nucleolus
# Only proteins with exactly one subcellular location (i.e. the
# nucleolus) are retained
# In the case of proteins with multiple isoforms, the protein is only
# retained if all isoforms occur exclusively in the nucleolus

import xml.etree.ElementTree as ET

# Absolute location of the UniProt XML export of the nucleolus search
path_to_nucleolus_xml = "".join((
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_",
    "with_nucleolus_neg_instances/new_combined_data_set_creation/",
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0188_nucleolus_2025_06_",
    "19.xml",
))

# Parse the complete document; `root` is the element whose direct
# children are the individual UniProt entries
tree = ET.parse(path_to_nucleolus_xml)
root = tree.getroot()

In [3]:
# XML namespace every tag of the UniProt export is qualified with
UNIPROT_NS = "{http://uniprot.org/uniprot}"


def _occurs_only_in_nucleolus(entry):
    """
    Decide whether a UniProt `entry` element is annotated exclusively
    for the nucleolus.

    An entry qualifies if
    * it carries at least one "subcellular location" comment (entries
      with isoform-specific annotation carry one comment per isoform),
    * every such comment lists exactly one `subcellularLocation`, and
    * that single `subcellularLocation` names the nucleolus in one of
      its `location` children.

    Parameters
    ----------
    entry: xml.etree.ElementTree.Element
        An `entry` element of the UniProt XML export.

    Returns
    -------
    bool
        True if all of the above conditions hold, False otherwise.
    """
    scl_comments = entry.findall(
        f"{UNIPROT_NS}comment[@type='subcellular location']"
    )

    # Without this guard, an entry lacking any annotation would pass
    # vacuously (`all()` of an empty sequence is True)
    if not scl_comments:
        return False

    for scl_comment in scl_comments:
        subcell_locs = scl_comment.findall(
            f"./{UNIPROT_NS}subcellularLocation"
        )

        if len(subcell_locs) != 1:
            return False

        # Merely counting locations is not sufficient: an isoform whose
        # only location is e.g. the cytoplasm also has a count of 1
        # The comparison is a case-insensitive substring test so that
        # both "Nucleolus" and a combined "Nucleus, nucleolus" match
        location_texts = [
            (location.text or "").lower()
            for location in subcell_locs[0].findall(
                f"./{UNIPROT_NS}location"
            )
        ]

        if not any("nucleolus" in text for text in location_texts):
            return False

    return True


# Some entries have multiple `accession` tags, only the first of which
# is of interest; `find()` returns the first child with a given tag
# NOTE(review): this check is stricter than a pure location count, so
# the resulting amount of proteins may differ from earlier runs; any
# hard-coded expected counts in later cells must then be re-derived
uniprot_accs_only_in_nucleolus = [
    entry.find(f"{UNIPROT_NS}accession").text
    for entry in root.iterfind(f"./{UNIPROT_NS}entry")
    if _occurs_only_in_nucleolus(entry)
]


In [4]:
# Report how many accessions survived the location-based filtering
n_only_in_nucleolus = len(uniprot_accs_only_in_nucleolus)

print(
    "The amount of human proteins occurring exclusively in the "
    f"nucleolus is {n_only_in_nucleolus:,}."
)

The amount of human proteins occurring exclusively in the nucleolus is 433.


In [5]:
# As it turns out, one of the human proteins involved in positive
# reliable PPIs from HVIDB is a human nucleolus protein
# In detail, it is the protein with the uniprot accession F4ZW62
# Therefore, this nucleolus protein cannot be used in the construction
# of negative PPI instances and has to be removed from the list
# Report whether the accession is (still) part of the list
print(
    "F4ZW62" in uniprot_accs_only_in_nucleolus
)

# Filter the list in place rather than calling `list.remove()`:
# `remove()` raises a ValueError when this cell is executed a second
# time and only deletes the first occurrence, whereas the filter is
# idempotent and drops every occurrence
uniprot_accs_only_in_nucleolus[:] = [
    uniprot_acc
    for uniprot_acc in uniprot_accs_only_in_nucleolus
    if uniprot_acc != "F4ZW62"
]

assert "F4ZW62" not in uniprot_accs_only_in_nucleolus, (
    "The removal of the UniProt accession \"F4ZW62\" was not successful!"
)

True


In [None]:
import pandas as pd

# In a subsequent step, the TSV file downloaded from UniProt is filtered
# to only contain proteins occurring exclusively in the nucleolus
# This TSV file does not only contain proteins exclusively occurring in
# the nucleolus, but also proteins occurring in the nucleolus among
# other locations
# Load the TSV file into a Pandas DataFrame
# Absolute location of the unfiltered UniProt TSV export
path_to_uniprot_tsv = "".join((
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_",
    "with_nucleolus_neg_instances/new_combined_data_set_creation/",
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0188_nucleolus_2025_",
    "06_19.tsv",
))
nucleolus_uniprot_df = pd.read_csv(path_to_uniprot_tsv, sep="\t")

n_rows_before_location_filter = len(nucleolus_uniprot_df)
print(
    "DataFrame length prior to filtering based on location: "
    f"{n_rows_before_location_filter:,}"
)

# Keep only the rows whose accession passed the XML-based filtering
is_only_in_nucleolus = nucleolus_uniprot_df["Entry"].isin(
    uniprot_accs_only_in_nucleolus
)
nucleolus_uniprot_df = nucleolus_uniprot_df.loc[is_only_in_nucleolus]

n_rows_after_location_filter = len(nucleolus_uniprot_df)
print(
    "DataFrame length after filtering based on location: "
    f"{n_rows_after_location_filter:,}"
)

DataFrame length prior to filtering based on location: 1,156
DataFrame length after filtering based on location: 432


In [None]:
# Molecular Dynamics (MD) simulations are supposed to be performed to
# verify the reliability of negative PPI instances
# As too long protein lengths make MD simulations infeasible, it is
# investigated how many proteins have a sequence length of at most 1,700
# amino acids
# Count the rows whose sequence length does not exceed the cut-off
n_prots_below_cut_off = nucleolus_uniprot_df["Length"].le(1700).sum()
n_prots_total = len(nucleolus_uniprot_df)

print(
    "Amount of proteins meeting length requirement: "
    f"{n_prots_below_cut_off} of {n_prots_total}"
)

Amount of proteins meeting length requirement: 421 of 432


In [None]:
# Apply this sequence length cut-off
# Boolean mask marking the proteins short enough for MD simulations
meets_length_requirement = nucleolus_uniprot_df["Length"].le(1700)
nucleolus_uniprot_df = nucleolus_uniprot_df.loc[meets_length_requirement]

# The expected amount stems from the count determined beforehand
assert len(nucleolus_uniprot_df) == 421, (
    "Something went wrong while applying the length cut-off!"
)

In [None]:
# Save the filtered DataFrame to a TSV file
# The file is written relative to the current working directory
filtered_nucleolus_tsv_name = (
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0188_nucleolus_2025_06_19"
    "_exclusively_nucleolus_filtered.tsv"
)

# Tab-separated, with column names but without the row index
nucleolus_uniprot_df.to_csv(
    filtered_nucleolus_tsv_name,
    index=False,
    header=True,
    sep="\t"
)

In [3]:
# Load the filtered TSV file into a DataFrame
import pandas as pd

# Absolute location of the TSV file holding only the proteins that
# passed both the location and the length filter
path_to_uniprot_tsv = "".join((
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_",
    "with_nucleolus_neg_instances/new_combined_data_set_creation/",
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0188_nucleolus_2025_06_",
    "19_exclusively_nucleolus_filtered.tsv",
))
nucleolus_uniprot_df = pd.read_csv(path_to_uniprot_tsv, sep="\t")

In [4]:
from biotite.sequence.io import fasta

In [None]:
# The FASTA file of the nucleolus proteins has also been downloaded on
# 19th June 2025 as "FASTA (canonical)"
# Load the FASTA file into a FASTA file object and adjust the headers
# such that they exclusively contain the UniProt accession

path_to_nucleolus_fasta = (
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0188_nucleolus_2025_06_19.fasta"
)
nucleolus_fasta = fasta.FastaFile.read(path_to_nucleolus_fasta)

# Build a second FASTA object whose headers are reduced to the
# accession, i.e. the second "|"-separated field of the full header
nucleolus_fasta_uniprot_only_header = fasta.FastaFile()

for full_header, sequence in nucleolus_fasta.items():
    accession = full_header.split("|", maxsplit=2)[1]
    nucleolus_fasta_uniprot_only_header[accession] = sequence

# The file is written relative to the current working directory
path_to_nucleolus_fasta_uniprot_only_header = (
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0188_nucleolus_2025_06_19_"
    "uniprot_only_header.fasta"
)
nucleolus_fasta_uniprot_only_header.write(
    path_to_nucleolus_fasta_uniprot_only_header
)

In [5]:
# Load the FASTA file with adjusted headers
# From here on, `nucleolus_fasta` refers to the accession-only variant
path_to_nucleolus_fasta_uniprot_only_header = (
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0188_nucleolus_2025_06_19_"
    "uniprot_only_header.fasta"
)
nucleolus_fasta = fasta.FastaFile.read(
    path_to_nucleolus_fasta_uniprot_only_header
)

In [None]:
# Regarding the human proteins involved in reliable positive
# interactions, a FASTA file encompassing all of them has already been
# generated earlier
# However, it contains full headers
# The headers are also adjusted to only contain the UniProt accession
path_to_human_prots_in_HVIDB_fasta = (
    "human_prots_in_HVIDB_VACV_WR_data_set.fasta"
)
human_prots_in_HVIDB_data_set_fasta = fasta.FastaFile.read(
    path_to_human_prots_in_HVIDB_fasta
)

# Copy every sequence into a new FASTA object, keyed by the accession
# only, i.e. the second "|"-separated field of the full header
human_prots_in_HVIDB_data_set_uniprot_only_header_fasta = fasta.FastaFile()

for full_header, sequence in human_prots_in_HVIDB_data_set_fasta.items():
    accession = full_header.split("|", maxsplit=2)[1]
    human_prots_in_HVIDB_data_set_uniprot_only_header_fasta[
        accession
    ] = sequence

In [None]:
# Save the FASTA file with updated headers to disk
# The file is written relative to the current working directory
path_to_human_prots_uniprot_only_header_fasta = (
    "human_prots_in_HVIDB_VACV_WR_data_set_uniprot_only_header.fasta"
)
human_prots_in_HVIDB_data_set_uniprot_only_header_fasta.write(
    path_to_human_prots_uniprot_only_header_fasta
)

In [6]:
# Load the FASTA file with adjusted headers
# From here on, `human_prots_in_HVIDB_data_set_fasta` refers to the
# accession-only variant
path_to_human_prots_uniprot_only_header_fasta = (
    "human_prots_in_HVIDB_VACV_WR_data_set_uniprot_only_header.fasta"
)
human_prots_in_HVIDB_data_set_fasta = fasta.FastaFile.read(
    path_to_human_prots_uniprot_only_header_fasta
)

In [13]:
# In addition to nucleolus proteins, peroxisome proteins are also good
# candidates for negative human interaction partners
# Thus, peroxisome proteins are subjected to the same procedure as
# nucleolus proteins
# In detail, on the UniProt website, a search is conducted with the
# following settings: Organism [OS] 9606 AND subcellular location term
# Peroxisome [SL-0204]
# The search results are downloaded both as XML file and FASTA file;
# regarding the FASTA file, only canonical protein sequences are
# downloaded, i.e. no isoforms ("FASTA (canonical)" is chosen as
# "Format" option); the search has been conducted on 23rd June 2025

# Here, a different filtering rationale is being applied: The individual
# proteins as well as their isoforms are allowed to contain multiple
# subcellular locations, but each subcellular location must contain the
# term "Peroxisome"

# Load the XML file
# Absolute location of the UniProt XML export of the peroxisome search
path_to_peroxisome_xml = "".join((
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_",
    "with_nucleolus_neg_instances/new_combined_data_set_creation/",
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0204_peroxisome_2025_",
    "06_23.xml",
))

# `peroxisome_root` is the element whose direct children are the
# individual UniProt entries
peroxisome_tree = ET.parse(path_to_peroxisome_xml)
peroxisome_root = peroxisome_tree.getroot()

In [14]:
uniprot_accs_only_in_peroxisome = []

# XML namespace every tag of the UniProt export is qualified with
uniprot_ns = "{http://uniprot.org/uniprot}"

# Path from an entry to every `location` element of every subcellular
# location comment, irrespective of the isoform it is listed for
scl_location_path = (
    f"{uniprot_ns}comment[@type='subcellular location']/"
    f"{uniprot_ns}subcellularLocation/"
    f"{uniprot_ns}location"
)

for entry in peroxisome_root.iterfind(f"./{uniprot_ns}entry"):
    # Some entries have multiple `accession` tags, only the first of
    # which is of interest
    # Conveniently enough, the `find()` method finds the first child
    # with a particular tag
    uniprot_acc = entry.find(f"{uniprot_ns}accession").text

    # Retrieve all subcellular location entries for the protein at hand
    scl_entries = entry.findall(scl_location_path)

    # An element without text yields `None`; it is mapped to the empty
    # string so that the substring test below rejects it rather than
    # raising a TypeError
    scl_texts = [(scl.text or "") for scl in scl_entries]

    # `all()` of an empty sequence is True; hence, it is explicitly
    # required that at least one location is listed, otherwise an entry
    # without any location would be accepted as "exclusively peroxisome"
    if scl_texts and all(
        "Peroxisome" in scl_text for scl_text in scl_texts
    ):
        uniprot_accs_only_in_peroxisome.append(uniprot_acc)

In [15]:
# Report how many accessions survived the location-based filtering
n_only_in_peroxisome = len(uniprot_accs_only_in_peroxisome)

print(
    "The amount of human proteins occurring exclusively in the "
    f"peroxisome is {n_only_in_peroxisome:,}."
)

The amount of human proteins occurring exclusively in the peroxisome is 213.


In [17]:
# Quickly check whether any of the human proteins involved in reliable
# positive PPIs from HVIDB is part of this peroxisome subset
# Accessions of the human proteins involved in positive interactions
HVIDB_human_uniprot_accs = human_prots_in_HVIDB_data_set_fasta.keys()

# The check fails as soon as a single peroxisome protein also occurs
# among the HVIDB proteins
assert not any(
    peroxisome_prot in HVIDB_human_uniprot_accs
    for peroxisome_prot in uniprot_accs_only_in_peroxisome
), (
    "One or more peroxisome proteins indeed are involved in reliable "
    "positive interactions!"
)

In [19]:
# Fortunately, no peroxisome proteins are involved in reliable positive
# interactions from HVIDB
# Now, the procedure is analogous to that of nucleolus proteins, i.e.
# the TSV file downloaded from UniProt is filtered to only contain
# proteins occurring exclusively in peroxisomes
# Subsequently, a second filtering step is applied, removing all
# proteins exceeding a length of 1,700 amino acids

# Load the TSV file into a Pandas DataFrame; the TSV file has been
# downloaded on 24th June 2025
# Absolute location of the unfiltered UniProt TSV export
path_to_peroxisome_uniprot_tsv = "".join((
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_",
    "with_nucleolus_neg_instances/new_combined_data_set_creation/",
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0204_peroxisome_2025_",
    "06_24.tsv",
))
peroxisome_uniprot_df = pd.read_csv(
    path_to_peroxisome_uniprot_tsv, sep="\t"
)

n_peroxisome_rows_before_location_filter = len(peroxisome_uniprot_df)
print(
    "DataFrame length prior to filtering based on location: "
    f"{n_peroxisome_rows_before_location_filter:,}"
)

# Keep only the rows whose accession passed the XML-based filtering
is_only_in_peroxisome = peroxisome_uniprot_df["Entry"].isin(
    uniprot_accs_only_in_peroxisome
)
peroxisome_uniprot_df = peroxisome_uniprot_df.loc[is_only_in_peroxisome]

n_peroxisome_rows_after_location_filter = len(peroxisome_uniprot_df)
print(
    "DataFrame length after filtering based on location: "
    f"{n_peroxisome_rows_after_location_filter:,}"
)

DataFrame length prior to filtering based on location: 359
DataFrame length after filtering based on location: 213


In [20]:
# Perform the length-based filtering
# Count the rows whose sequence length does not exceed the cut-off
n_peroxisome_prots_below_cut_off = (
    peroxisome_uniprot_df["Length"].le(1700).sum()
)
n_peroxisome_prots_total = len(peroxisome_uniprot_df)

print(
    "Amount of peroxisome proteins meeting the length requirement: "
    f"{n_peroxisome_prots_below_cut_off} of {n_peroxisome_prots_total}"
)

Amount of peroxisome proteins meeting the length requirement: 212 of 213


In [21]:
# Apply this sequence length cut-off and save the filtered DataFrame to
# a TSV file
# Boolean mask marking the proteins short enough for MD simulations
peroxisome_meets_length_requirement = (
    peroxisome_uniprot_df["Length"].le(1700)
)
peroxisome_uniprot_df = peroxisome_uniprot_df.loc[
    peroxisome_meets_length_requirement
]

# The expected amount stems from the count determined beforehand
assert len(peroxisome_uniprot_df) == 212, (
    "Something went wrong while applying the length cut-off!"
)

# The file is written relative to the current working directory, as a
# tab-separated file with column names but without the row index
filtered_peroxisome_tsv_name = (
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0204_peroxisome_2025_06_"
    "24_exclusively_peroxisome_filtered.tsv"
)
peroxisome_uniprot_df.to_csv(
    filtered_peroxisome_tsv_name,
    index=False,
    header=True,
    sep="\t"
)

In [7]:
# Load the filtered TSV file into a DataFrame
# Absolute location of the TSV file holding only the proteins that
# passed both the location and the length filter
path_to_peroxisome_uniprot_tsv = "".join((
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_",
    "with_nucleolus_neg_instances/new_combined_data_set_creation/",
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0204_peroxisome_2025_06_",
    "24_exclusively_peroxisome_filtered.tsv",
))
peroxisome_uniprot_df = pd.read_csv(
    path_to_peroxisome_uniprot_tsv, sep="\t"
)

In [23]:
# The FASTA file of the peroxisome proteins has been downloaded on 23rd
# June 2025 as "FASTA (canonical)"
# Load the FASTA file into a FASTA file object and adjust the headers
# such that they exclusively contain the UniProt accessions
path_to_peroxisome_fasta = "".join((
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_",
    "with_nucleolus_neg_instances/new_combined_data_set_creation/",
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0204_peroxisome_2025_06_",
    "23.fasta",
))
peroxisome_fasta = fasta.FastaFile.read(path_to_peroxisome_fasta)

# Copy every sequence into a new FASTA object, keyed by the accession
# only, i.e. the second "|"-separated field of the full header
peroxisome_fasta_uniprot_only_header = fasta.FastaFile()

for full_header, sequence in peroxisome_fasta.items():
    accession = full_header.split("|", maxsplit=2)[1]
    peroxisome_fasta_uniprot_only_header[accession] = sequence

# The file is written relative to the current working directory
path_to_peroxisome_fasta_uniprot_only_header = (
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0204_peroxisome_2025_06_23"
    "_uniprot_only_header.fasta"
)
peroxisome_fasta_uniprot_only_header.write(
    path_to_peroxisome_fasta_uniprot_only_header
)

In [8]:
# Load the FASTA file with adjusted headers
# From here on, `peroxisome_fasta` refers to the accession-only variant
path_to_peroxisome_fasta_uniprot_only_header = (
    "uniprotkb_organism_id_9606_AND_cc_scl_t_0204_peroxisome_2025_06_"
    "23_uniprot_only_header.fasta"
)
peroxisome_fasta = fasta.FastaFile.read(
    path_to_peroxisome_fasta_uniprot_only_header
)

In [None]:
# Now that eligible human proteins for the negative PPI instances have
# been determined, the focus shifts to the VACV WR proteins
# All VACV WR proteins available on UniProt are downloaded both as FASTA
# and TSV file on 19th June 2025
# In total, there are 440 VACV WR proteins
# Regarding the FASTA file, only canonical protein sequences are
# downloaded, i.e. no isoforms ("FASTA (canonical)" is chosen as
# "Format" option)
# Load the FASTA file and modify the headers such that it only contains
# the UniProt accessions
path_to_VACV_WR_fasta = (
    "uniprotkb_organism_id_10254_2025_06_19_all_VACV_WR_prots.fasta"
)
all_VACV_WR_prots_fasta = fasta.FastaFile.read(path_to_VACV_WR_fasta)

# Copy every sequence into a new FASTA object, keyed by the accession
# only, i.e. the second "|"-separated field of the full header
all_VACV_WR_prots_uniprot_only_header_fasta = fasta.FastaFile()

for full_header, sequence in all_VACV_WR_prots_fasta.items():
    accession = full_header.split("|", maxsplit=2)[1]
    all_VACV_WR_prots_uniprot_only_header_fasta[accession] = sequence

# The file is written relative to the current working directory
path_to_VACV_WR_uniprot_only_header_fasta = (
    "uniprotkb_organism_id_10254_2025_06_19_all_VACV_WR_prots_"
    "uniprot_only_header.fasta"
)
all_VACV_WR_prots_uniprot_only_header_fasta.write(
    path_to_VACV_WR_uniprot_only_header_fasta
)

In [9]:
# Load the FASTA file with adjusted headers
# From here on, `all_VACV_WR_prots_fasta` refers to the accession-only
# variant
path_to_VACV_WR_fasta = "".join((
    "uniprotkb_organism_id_10254_2025_06_19_all_VACV_WR_prots_uniprot_",
    "only_header.fasta",
))
all_VACV_WR_prots_fasta = fasta.FastaFile.read(path_to_VACV_WR_fasta)

In [26]:
# Quickly verify that all VACV WR proteins involved in positive
# interactions are covered by the VACV WR FASTA file
# To this end, the interaction file has to be loaded
import numpy as np

uniprot_accs_in_VACV_WR_fasta = list(all_VACV_WR_prots_fasta.keys())

path_to_pos_ints = "all_HVIDB_VACV_WR_interactions.csv"

pos_VACV_WR_ints_df = pd.read_csv(
    path_to_pos_ints
)

# Extract the UniProt IDs of VACV WR proteins
interaction_pairs = pos_VACV_WR_ints_df["Human-virus PPI"]


def _virus_acc_from_int_pair(int_pair):
    """
    Return the accession of the viral partner of a "human-virus" pair.

    The two accessions are joined by a hyphen, but a hyphen also
    separates an accession from its isoform number (e.g. "P12345-2").
    Simply taking the second hyphen-separated field would therefore
    return the isoform number whenever the human partner is an isoform.
    Purely numeric fields are hence discarded before the second
    remaining field, i.e. the viral accession, is picked.

    Parameters
    ----------
    int_pair: str
        Interaction pair in the format "<human acc>-<virus acc>".

    Returns
    -------
    str
        Accession of the viral interaction partner without isoform
        number.

    Raises
    ------
    IndexError
        If `int_pair` does not contain two accessions.
    """
    accs = [
        field for field in int_pair.split("-") if not field.isdigit()
    ]

    return accs[1]


VACV_WR_in_pos_ints_uniprot_accs = np.unique([
    _virus_acc_from_int_pair(int_pair)
    for int_pair in interaction_pairs
])

# A set allows for constant-time membership tests
uniprot_accs_in_VACV_WR_fasta_set = set(uniprot_accs_in_VACV_WR_fasta)

assert all(
    VACV_WR_uniprot_acc in uniprot_accs_in_VACV_WR_fasta_set
    for VACV_WR_uniprot_acc in VACV_WR_in_pos_ints_uniprot_accs
), (
    "Not all VACV WR proteins involved in reliable positive "
    "interactions are covered by the FASTA file!"
)

In [27]:
# Investigate whether imposing a length cut-off reduces the amount of
# eligible VACV WR proteins
path_to_VACV_WR_tsv = (
    "uniprotkb_organism_id_10254_2025_06_19_all_VACV_WR_prots.tsv"
)
all_VACV_WR_prots_df = pd.read_csv(path_to_VACV_WR_tsv, sep="\t")

# Count the rows whose sequence length does not exceed the cut-off
n_VACV_WR_prots_below_cut_off = (
    all_VACV_WR_prots_df["Length"].le(1700).sum()
)
n_VACV_WR_prots_total = len(all_VACV_WR_prots_df)

print(
    "Amount of VACV WR proteins meeting length requirement: "
    f"{n_VACV_WR_prots_below_cut_off} out of {n_VACV_WR_prots_total}"
)

Amount of VACV WR proteins meeting length requirement: 440 out of 440


In [11]:
# A FASTA file is generated encompassing all protein classes, i.e. VACV 
# WR proteins, human nucleolus proteins as well as human proteins
# involved in reliable positive interactions
# Human peroxisome proteins are not included as the amount of eligible
# nucleolus proteins already suffices
VACV_WR_pos_human_and_human_nucleolus_prots_fasta = fasta.FastaFile()

# The VACV WR proteins are added first, followed by the human proteins
# involved in reliable positive interactions; both are taken over in
# their entirety
for unfiltered_fasta in (
    all_VACV_WR_prots_fasta,
    human_prots_in_HVIDB_data_set_fasta,
):
    for uniprot_acc, sequence in unfiltered_fasta.items():
        VACV_WR_pos_human_and_human_nucleolus_prots_fasta[
            uniprot_acc
        ] = sequence

# Human peroxisome proteins are deliberately left out
# In order to include them, the entries of `peroxisome_fasta` whose
# accession occurs in `peroxisome_uniprot_df["Entry"]` would have to be
# added at this point, `len(peroxisome_uniprot_df)` would have to be
# added to the expected amount of entries below and the output file
# would be named "VACV_WR_prots_pos_human_prots_human_peroxisome_prots_
# and_human_nucleolus_prots_max_length_1700_AAs.fasta"

# Finally, the human nucleolus proteins are added, but only those
# listed in the filtered DataFrame, i.e. those meeting the length
# requirement
eligible_nucleolus_accs = nucleolus_uniprot_df["Entry"].values

for uniprot_acc, sequence in nucleolus_fasta.items():
    if uniprot_acc not in eligible_nucleolus_accs:
        continue

    VACV_WR_pos_human_and_human_nucleolus_prots_fasta[
        uniprot_acc
    ] = sequence

# Sanity check: the amount of entries must equal the sum of the sizes
# of the three sources, which also implies that no accession occurs in
# more than one of them
expected_n_entries = sum(
    len(prot_collection)
    for prot_collection in (
        all_VACV_WR_prots_fasta,
        human_prots_in_HVIDB_data_set_fasta,
        nucleolus_uniprot_df,
    )
)
actual_n_entries = len(VACV_WR_pos_human_and_human_nucleolus_prots_fasta)

assert actual_n_entries == expected_n_entries, (
    "The newly created FASTA file does not encompass the expected "
    "amount of entries!"
)

# Save the FASTA file to disk, relative to the current working directory
path_to_combined_fasta = (
    "VACV_WR_prots_pos_human_prots_and_human_nucleolus_prots_max_"
    "length_1700_AAs.fasta"
)
VACV_WR_pos_human_and_human_nucleolus_prots_fasta.write(
    path_to_combined_fasta
)