In [1]:
"""
The purpose of this Jupyter notebook is to generate the TSV file as well
as the FASTA file required to evaluate xCAPT5's off-target filtering
performance. In detail, the TSV file pairs each human protein occurring
in the HVIDB data set for VACV WR with all VACV WR proteins.
Accordingly, the corresponding FASTA file encompasses the sequences of
all VACV WR proteins and of human proteins occurring in the HVIDB data
set.
"""

"\nThe purpose of this Jupyter notebook is to generate the TSV file as well\nas the FASTA file required to evaluate xCAPT5's off-target filtering\nperformance. In detail, the TSV file pairs each human protein occurring\nin the HVIDB data set for VACV WR with all VACV WR proteins.\nAccordingly, the corresponding FASTA file encompasses the sequences of\nall VACV WR proteins and of human proteins occurring in the HVIDB data\nset.\n"

In [2]:
import numpy as np
import pandas as pd
from biotite.sequence.io import fasta

In [3]:
# Collect the unique UniProt accessions of the human proteins that occur
# in the HVIDB data set for VACV WR
path_to_HVIDB_data_set = (
    "/Users/jacobanter/Documents/Code/VACV_screen/all_HVIDB_VACV_"
    "WR_interactions.csv"
)

HVIDB_df = pd.read_csv(path_to_HVIDB_data_set)

# Each entry of the `Human-virus PPI` column encodes one PPI pair as two
# UniProt accessions joined by a hyphen: the first accession belongs to
# the human protein, the second one to the VACV WR protein
# Hence, only the part preceding the first hyphen is kept
human_accessions_with_duplicates = [
    ppi_pair.split("-")[0] for ppi_pair in HVIDB_df["Human-virus PPI"]
]

# `np.unique` removes the duplicates and returns the accessions sorted
human_prots = np.unique(human_accessions_with_duplicates).tolist()

In [4]:
# Count the unique human proteins and report the number using a
# thousands separator
n_human_prots_in_HVIDB = len(human_prots)

human_count_message = (
    "Amount of unique human proteins occurring in the HVIDB data set: "
    f"{n_human_prots_in_HVIDB:,}"
)
print(human_count_message)

Amount of unique human proteins occurring in the HVIDB data set: 354


In [5]:
# Read the VACV WR proteome from the FASTA file downloaded from UniProt
# and extract the UniProt accession of every entry
path_to_VACV_WR_fasta_file = (
    "/Users/jacobanter/Documents/Code/VACV_screen/uniprotkb_taxonomy_"
    "id_10254_all_VACV_WR_prots_05_11_2024.fasta"
)

VACV_WR_fasta = fasta.FastaFile.read(path_to_VACV_WR_fasta_file)

# The accession is taken from the second `|`-separated field of each
# FASTA header
VACV_WR_headers = VACV_WR_fasta.keys()
VACV_WR_uniprot_accessions = [
    fasta_header.split("|")[1] for fasta_header in VACV_WR_headers
]

In [6]:
# Count the VACV WR proteins, i.e. the entries of the proteome FASTA
# file, and report the number
n_VACV_WR_prots = len(VACV_WR_uniprot_accessions)

print(f"Amount of VACV WR proteins: {n_VACV_WR_prots}")

Amount of VACV WR proteins: 442


In [7]:
# Build the two columns of the TSV file such that each human protein
# occurring in the HVIDB data set is paired with every VACV WR protein
# Human column: each accession is repeated once per VACV WR protein
# before moving on to the next human protein
human_column_list = []
for human_accession in human_prots:
    human_column_list.extend([human_accession] * n_VACV_WR_prots)

# VACV column: the complete list of VACV WR accessions is repeated once
# per human protein, so that row i of both columns forms one pair
VACV_column_list = [
    VACV_accession
    for _ in range(n_human_prots_in_HVIDB)
    for VACV_accession in VACV_WR_uniprot_accessions
]

In [8]:
# Both columns must contain exactly one entry per PPI pair and therefore
# have to be of equal length
n_human_entries = len(human_column_list)
n_VACV_entries = len(VACV_column_list)

assert n_human_entries == n_VACV_entries, (
    "Something went wrong while creating the column lists!"
)

In [9]:
# Assemble the PPI pairs in a two-column data frame and save them to a
# TSV file
prot_pair_dict = dict(
    Human_protein=human_column_list,
    VACV_protein=VACV_column_list,
)
prot_pair_df = pd.DataFrame(prot_pair_dict)

# Neither the column names nor the index are written to the file, i.e.
# each line consists of the two tab-separated accessions only
tsv_export_options = {"sep": "\t", "header": False, "index": False}
prot_pair_df.to_csv(
    "PPI_pairs_between_human_prots_from_HVIDB_and_VACV_WR_proteome.tsv",
    **tsv_export_options
)

In [11]:
# Now, address the generation of the corresponding FASTA file
# To this end, the sequences of the human proteins occurring in the
# HVIDB data set are loaded from their FASTA file
path_to_human_prots_in_HVIDB_fasta_file = (
    "/Users/jacobanter/Documents/Code/VACV_screen/human_prots_in_"
    "HVIDB_VACV_WR_data_set.fasta"
)
human_prots_in_HVIDB_fasta = fasta.FastaFile.read(
    path_to_human_prots_in_HVIDB_fasta_file
)

# The combined FASTA file receives the VACV WR entries first and the
# human entries second; each sequence is stored under its UniProt
# accession, i.e. the second `|`-separated field of the original header
human_prots_from_HVIDB_and_VACV_WR_proteome_fasta = fasta.FastaFile()

for source_fasta in (VACV_WR_fasta, human_prots_in_HVIDB_fasta):
    for header, seq in source_fasta.items():
        uniprot_accession = header.split("|")[1]

        human_prots_from_HVIDB_and_VACV_WR_proteome_fasta[
            uniprot_accession
        ] = seq

In [12]:
# Verify whether the total amount of sequences stored in the FASTA file
# corresponds to the sum of human proteins in the HVIDB data set and the
# VACV WR proteome
# A mismatch would indicate, for instance, that two entries share the
# same UniProt accession
assert (
    len(human_prots_from_HVIDB_and_VACV_WR_proteome_fasta)
    ==
    (len(VACV_WR_fasta) + len(human_prots_in_HVIDB_fasta))
), (
    "Not all proteins of the VACV WR proteome and/or the human "
    "proteins occurring in the HVIDB data set have been added!"
)

# Additionally verify that every human UniProt accession used in the TSV
# file has a sequence in the FASTA file
# This is not guaranteed by construction, as the accessions of the TSV
# file stem from the HVIDB CSV file, whereas the human sequences stem
# from a separate FASTA file
# (The VACV WR accessions of the TSV file and the VACV WR sequences stem
# from the very same FASTA file and thus need no such check)
human_prots_without_seq = set(human_prots) - set(
    human_prots_from_HVIDB_and_VACV_WR_proteome_fasta.keys()
)

assert not human_prots_without_seq, (
    "The following human proteins occur in the TSV file, but lack a "
    f"sequence in the FASTA file: {sorted(human_prots_without_seq)}"
)

In [None]:
# As the sanity check has been passed, the combined FASTA file is
# eventually written to disk (into the current working directory)
output_fasta_file_name = (
    "human_proteins_in_HVIDB_and_VACV_WR_proteome.fasta"
)
human_prots_from_HVIDB_and_VACV_WR_proteome_fasta.write(
    output_fasta_file_name
)