In [1]:
"""
The objective of this Jupyter notebook is to create a data set combining
the 412 experimentally confirmed human-VACV PPIs from HVIDB, i.e. the
positive instances, with negative instances. To be more precise, PPIs
involving proteins occurring in the human nucleolus serve as negative
instances. As opposed to the positive instances, the negative instances
are not experimentally confirmed, but can nevertheless be considered
reliable. This is due to the fact that the currently known VACV biology
suggests the absence of major interactions between nucleolus proteins
and VACV proteins.

Note that the above-mentioned 412 human-VACV PPIs are specific to VACV
strain Western Reserve (which will be abbreviated from now on as VACV
WR).
"""

'\nThe objective of this Jupyter notebook is to create a data set combining\nthe 412 experimentally confirmed human-VACV PPIs from HVIDB, i.e. the\npositive instances, with negative instances. To be more precise, PPIs\ninvolving proteins occurring in the human nucleolus serve as negative\ninstances. As opposed to the positive instances, the negative instances\nare not experimentally confirmed, but can nevertheless be considered\nreliable. This is due to the fact that the currently known VACV biology\nsuggests the absence of major interactions between nucleolus proteins\nand VACV proteins.\n\nNote that the above-mentioned 412 human-VACV PPIs are specific to VACV\nstrain Western Reserve (which will be abbreviated from now on as VACV\nWR).\n'

In [2]:
import random

import pandas as pd
from biotite.sequence.io import fasta

In [3]:
# Positive instances: the 412 experimentally confirmed human-VACV WR
# PPIs exported from HVIDB, read from a CSV file one directory above
# this notebook
hvidb_interactions_csv = "../all_HVIDB_VACV_WR_interactions.csv"
all_HVIDB_pairs = pd.read_csv(hvidb_interactions_csv)

In [4]:
# The human nucleolus proteins were downloaded from UniProt on
# 16th January 2025 using the following search settings:
#   Searching in: UniProtKB
#   Organism: 9606 (Homo sapiens)
#   Subcellular location term: SL-0188
# Restricting the results to reviewed entries (Swiss-Prot) yields
# 457 nucleolus proteins
nucleolus_fasta_path = (
    "uniprotkb_organism_id_9606_AND_scl_SL-0188_2025_01_16.fasta"
)
nucleolus_prots_fasta = fasta.FastaFile.read(nucleolus_fasta_path)

In [16]:
# The headers of the nucleolus proteins FASTA file are quite verbose,
# which is why they are simplified to just the UniProt ID
nucleolus_fasta_simple_header = fasta.FastaFile()

for header, seq_str in nucleolus_prots_fasta.items():
    # Conveniently enough, the UniProt ID is separated by vertical bars
    # ("pipes") from the remaining header, i.e. it is the second field
    header_fields = header.split("|")
    # A later cell rebinds `nucleolus_prots_fasta` to the file written
    # below, whose headers already consist of the bare UniProt ID. In
    # that case there is no pipe and the header is used as is, so that
    # re-running this cell does not fail with an `IndexError`
    uniprot_id = header_fields[1] if len(header_fields) > 1 else header

    nucleolus_fasta_simple_header[uniprot_id] = seq_str

nucleolus_fasta_simple_header.write(
    "uniprotkb_organism_id_9606_AND_scl_SL-0188_2025_01_16_uniprot_only_header.fasta"
)

In [18]:
# Header simplification for the VACV WR proteins: read the FASTA file
# with the verbose headers, keep only the UniProt ID (the field between
# the first and the second vertical bar) as header and write the result
# to a new FASTA file
VACV_WR_prots_fasta = fasta.FastaFile.read("../VACV_WR_prots_in_HVIDB.fasta")
VACV_WR_prots_fasta_simple_header = fasta.FastaFile()

for full_header, sequence in VACV_WR_prots_fasta.items():
    VACV_WR_prots_fasta_simple_header[full_header.split("|")[1]] = sequence

VACV_WR_simple_header_path = "VACV_WR_prots_in_HVIDB_uniprot_only_header.fasta"
VACV_WR_prots_fasta_simple_header.write(VACV_WR_simple_header_path)

In [10]:
# Prepare the columns of the TSV file combining the positive and the
# negative instances
# The TSV file encompasses three columns:
# `Human_prot`
# `VACV_prot`
# `Interaction`, where the value 1 represents the presence of a PPI and
# the value 0 represents the absence of a PPI, conversely

# Conveniently enough, the UniProt IDs of the human and VACV WR protein
# are separated from one another by a hyphen; each pair string is split
# only once and both IDs are taken from the result
pos_instances_human_prots = []
pos_instances_VACV_WR_prots = []
for int_pair in all_HVIDB_pairs["Human-virus PPI"]:
    pair_ids = int_pair.split("-")
    pos_instances_human_prots.append(pair_ids[0])
    pos_instances_VACV_WR_prots.append(pair_ids[1])
pos_instances_interaction_vals = [1] * len(all_HVIDB_pairs)

# Set of all known positive (human, VACV WR) pairs, used below to make
# sure that no negative instance contradicts a positive instance
known_pos_pairs = set(
    zip(pos_instances_human_prots, pos_instances_VACV_WR_prots)
)

# Negative instances are generated by pairing each nucleolus protein
# (i.e. human protein) with a randomly chosen VACV WR protein
# To this end, the VACV WR UniProt IDs as well as the nucleolus protein
# UniProt IDs, i.e. FASTA file keys have to be extracted in a first step
VACV_WR_prots_fasta = fasta.FastaFile.read(
    "VACV_WR_prots_in_HVIDB_uniprot_only_header.fasta"
)
VACV_WR_uniprot_ids = list(VACV_WR_prots_fasta.keys())

nucleolus_prots_fasta = fasta.FastaFile.read(
    "uniprotkb_organism_id_9606_AND_scl_SL-0188_2025_01_16_uniprot_"
    "only_header.fasta"
)
nucleolus_prots_uniprot_ids = list(nucleolus_prots_fasta.keys())

# The random selection of a list element is accomplished via
# `random.choice()`; the seed is fixed for the sake of reproducibility
random.seed(0)
neg_instances_human_prots = []
neg_instances_VACV_WR_prots = []
for human_id in nucleolus_prots_uniprot_ids:
    vacv_id = random.choice(VACV_WR_uniprot_ids)

    # A nucleolus protein may itself occur among the positive
    # instances. If the random draw happens to reproduce a known
    # positive pair, the same pair would end up in the data set with
    # both label 1 and label 0. In this case, the VACV WR protein is
    # redrawn from those without a recorded interaction with the human
    # protein. As long as no such collision occurs, the drawn pairs are
    # identical to those obtained by plain random pairing
    if (human_id, vacv_id) in known_pos_pairs:
        non_interacting_ids = [
            candidate_id for candidate_id in VACV_WR_uniprot_ids
            if (human_id, candidate_id) not in known_pos_pairs
        ]
        if not non_interacting_ids:
            # The human protein interacts with every VACV WR protein,
            # so no negative instance can be formed for it
            continue
        vacv_id = random.choice(non_interacting_ids)

    neg_instances_human_prots.append(human_id)
    neg_instances_VACV_WR_prots.append(vacv_id)

neg_instances_interaction_vals = [0] * len(neg_instances_human_prots)

In [12]:
# Concatenate the positive and the negative instances column by column
# (positives first), wrap them in a Pandas DataFrame and store the
# result as a tab-separated file without the index column
human_prot_column = [*pos_instances_human_prots, *neg_instances_human_prots]
VACV_prot_column = [
    *pos_instances_VACV_WR_prots, *neg_instances_VACV_WR_prots
]
interaction_column = [
    *pos_instances_interaction_vals, *neg_instances_interaction_vals
]

pos_and_neg_instances_df = pd.DataFrame(
    {
        "Human_prot": human_prot_column,
        "VACV_prot": VACV_prot_column,
        "Interaction": interaction_column,
    }
)

output_tsv_path = "VACV_WR_pos_and_nucleolus_prots_neg_PPI_instances.tsv"
pos_and_neg_instances_df.to_csv(output_tsv_path, sep="\t", index=False)