In [1]:
"""
The purpose of this Jupyter notebook is to generate the TSV file as well
as the FASTA file required to evaluate xCAPT5's off-target filtering
performance. In detail, the TSV file pairs each human protein occurring
in the combined data (both confirmed positive PPIs as well as reliable
negative PPIs) with all VACV WR proteins. Accordingly, the corresponding
FASTA file encompasses the sequences of all VACV WR proteins and of
human proteins occurring in the combined data set.
"""

"\nThe purpose of this Jupyter notebook is to generate the TSV file as well\nas the FASTA file required to evaluate xCAPT5's off-target filtering\nperformance. In detail, the TSV file pairs each human protein occurring\nin the combined data (both confirmed positive PPIs as well as reliable\nnegative PPIs) with all VACV WR proteins. Accordingly, the corresponding\nFASTA file encompasses the sequences of all VACV WR proteins and of\nhuman proteins occurring in the combined data set.\n"

In [2]:
import numpy as np
import pandas as pd
from biotite.sequence.io import fasta

In [None]:
# Determine the unique UniProt accessions of human proteins occurring in
# the combined data set
path_to_combined_data_set_tsv = (
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_"
    "with_nucleolus_neg_instances/VACV_WR_pos_and_nucleolus_prots_neg_"
    "PPI_instances.tsv"
)

combined_data_set_df = pd.read_csv(
    path_to_combined_data_set_tsv,
    sep="\t"
)

human_prots = np.unique(combined_data_set_df["Human_prot"]).tolist()

In [6]:
n_human_prots_in_combined_data_set = len(human_prots)

print(
    "Amount of unique human proteins occurring in the combined data "
    f"set: {n_human_prots_in_combined_data_set:,}"
)

Amount of unique human proteins occurring in the combined data set: 800


In [7]:
# Now, load the UniProt accessions of all VACV WR proteins
# They are obtained from the FASTA file downloaded from UniProt
path_to_VACV_WR_fasta_file = (
    "/Users/jacobanter/Documents/Code/VACV_screen/uniprotkb_taxonomy_"
    "id_10254_all_VACV_WR_prots_05_11_2024.fasta"
)

VACV_WR_fasta = fasta.FastaFile.read(path_to_VACV_WR_fasta_file)

VACV_WR_uniprot_accessions = [
    header.split("|")[1] for header in VACV_WR_fasta.keys()
]

In [8]:
n_VACV_WR_prots = len(VACV_WR_uniprot_accessions)

print(
    f"Amount of VACV WR proteins: {n_VACV_WR_prots}"
)

Amount of VACV WR proteins: 442


In [9]:
# Now, each human protein occurring in the combined data set is paired
# with all 442 VACV WR proteins
# The resulting PPI pairs are saved to a TSV file
human_column_list = [
    uniprot_id
    for uniprot_id in human_prots
    for _ in range(n_VACV_WR_prots)
]

VACV_column_list = (
    VACV_WR_uniprot_accessions * n_human_prots_in_combined_data_set
)

In [10]:
assert len(human_column_list) == len(VACV_column_list), (
    "Something went wrong while creating the column lists!"
)

In [11]:
prot_pair_dict = {
    "Human_protein": human_column_list,
    "VACV_protein": VACV_column_list
}

prot_pair_df = pd.DataFrame(data=prot_pair_dict)
print(
    "Amount of PPI pairs in the TSV file for table 2: "
    f"{len(prot_pair_df):,}"
)

prot_pair_df.to_csv(
    "PPI_pairs_between_human_prots_in_combined_data_set_and_VACV_WR_"\
    "proteome.tsv",
    sep="\t",
    header=False,
    index=False
)

Amount of PPI pairs in the TSV file for table 2: 353,600


In [12]:
# Now, address the generation of the corresponding FASTA file
# The file `human_nucleolus_and_VACV_WR_prot_seqs.fasta` is supposed to
# contain all sequences; it is verified whether this indeed is the case
path_to_combined_data_set_fasta_file = (
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_"
    "with_nucleolus_neg_instances/human_nucleolus_and_VACV_WR_prot_"
    "seqs.fasta"
)
combined_data_set_fasta = fasta.FastaFile.read(
    path_to_combined_data_set_fasta_file
)

fasta_file_accs = combined_data_set_fasta.keys()

human_prot_presence_list = [
    human_prot in fasta_file_accs for human_prot in human_prots
]

VACV_WR_presence_list = [
    VACV_WR_prot in fasta_file_accs
    for VACV_WR_prot in VACV_WR_uniprot_accessions
]

assert all(human_prot_presence_list), (
    "Not all human proteins occurring in the combined data set are "
    "covered by the FASTA file!"
)

assert all(VACV_WR_presence_list), (
    "Not all VACV WR proteins are covered by the FASTA file!"
)

AssertionError: Not all VACV WR proteins are covered by the FASTA file!

In [13]:
# Generate a FASTA file encompassing all VACV WR protein sequences as
# well as sequences of human proteins occurring in the combined data set
path_to_VACV_WR_fasta_file = (
    "/Users/jacobanter/Documents/Code/VACV_screen/uniprotkb_taxonomy_"
    "id_10254_all_VACV_WR_prots_05_11_2024.fasta"
)
VACV_WR_fasta = fasta.FastaFile.read(
    path_to_VACV_WR_fasta_file
)

human_prots_in_combined_data_set_and_VACV_WR_proteome_fasta = fasta.FastaFile()

for header, seq in VACV_WR_fasta.items():
    uniprot_accession = header.split("|")[1]

    human_prots_in_combined_data_set_and_VACV_WR_proteome_fasta[
        uniprot_accession
    ] = seq

for header, seq in combined_data_set_fasta.items():
    if header in human_prots:
        human_prots_in_combined_data_set_and_VACV_WR_proteome_fasta[
            header
        ] = seq

In [14]:
# Verify whether the total amount of sequences stored in the FASTA file
# corresponds to the sum of human proteins in the combined data set and
# the VACV WR proteome
assert (
    len(human_prots_in_combined_data_set_and_VACV_WR_proteome_fasta)
    ==
    (len(VACV_WR_fasta) + len(human_prots))
), (
    "Not all proteins of the VACV WR proteome and/or the human "
    "proteins occurring in the combined data set have been added!"
)

In [15]:
# Finally, having passed the sanity check, the FASTA file is saved to
# disk
human_prots_in_combined_data_set_and_VACV_WR_proteome_fasta.write(
    "human_proteins_in_combined_data_set_and_VACV_WR_proteome.fasta"
)

In [8]:
# Strangely enough, SENSE-PPI does not manage to process 350,000 PPIs
# within 48 hours although it manages to process 500,000 PPIs within 48
# hours on other occasions
# Therefore, the TSV file is split into two parts of roughly equal size
path_to_PPI_tsv_file = (
    "PPI_pairs_between_human_prots_in_combined_data_set_and_VACV_WR_"
    "proteome.tsv"
)

prot_pair_df = pd.read_csv(
    path_to_PPI_tsv_file,
    sep="\t",
    header=None
)

len_PPI_df = len(prot_pair_df)

n_PPIs_first_half = len_PPI_df // 2

first_half_df = prot_pair_df.iloc[:n_PPIs_first_half]
first_half_df.to_csv(
    "PPI_pairs_between_human_prots_in_combined_data_set_and_VACV_WR_"\
    "proteome_part_1.tsv",
    sep="\t",
    index=False,
    header=False
)

second_half_df = prot_pair_df.iloc[n_PPIs_first_half:]
second_half_df.to_csv(
    "PPI_pairs_between_human_prots_in_combined_data_set_and_VACV_WR_"\
    "proteome_part_2.tsv",
    sep="\t",
    index=False,
    header=False
)