In [1]:
"""
The purpose of this script is to prepare the input required by the
SENSE-PPI LLM for PPI prediction. According to the package's
documentation, a TSV file can be passed as input, the rows of which
contain pairs of proteins to test.

Thus, each human protein targeted in the siRNA screen is paired with all
440 VACV proteins (comprised in the file "uniprotkb_taxonomy_id_10254_
all_VACV_WR_prots_05_11_2024.fasta").

For the time being, the pairing is performed only for the Qiagen single
siRNA subset as it is the most comprehensive, encompassing 20,213 genes.
"""
import math

import numpy as np
import pandas as pd
from biotite.sequence.io import fasta
from biotite.database import uniprot

In [8]:
# Read the report of the VACV screen and narrow it down to the rows
# whose manufacturer is Qiagen
# The columns enumerated below are explicitly parsed as strings
dtype_dict = {
    column_name: str
    for column_name in (
        "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
        "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
        "Gene_Description",
        "ID",
        "ID_OnTarget_Ensembl_GRCh38_release_87",
        "ID_OnTarget_Merge",
        "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
        "ID_OnTarget_RefSeq_20170215",
        "ID_manufacturer",
        "Name_alternatives",
        "PLATE_QUALITY_DESCRIPTION",
        "RefSeq_ID_OnTarget_RefSeq_20170215",
        "Seed_sequence_common",
        "WELL_QUALITY_DESCRIPTION",
        "siRNA_error",
        "siRNA_number",
        "Precursor_Name",
    )
}

VACV_df = pd.read_csv(
    "VACV_Report_only_valid_single_pooled_siRNA_and_esiRNA_single_"
    "entries_only_without_Qiagen_mismatches.tsv",
    dtype=dtype_dict,
    sep="\t"
)

# Boolean row selection on the "Manufacturer" column
Qiagen_df = VACV_df[VACV_df["Manufacturer"].eq("Qiagen")]

In [9]:
# Some targeted genes are pseudogenes without any UniProt entry; such
# rows carry the placeholder "Not available" in the "UniProt_IDs"
# column and are discarded here
Qiagen_df = Qiagen_df[Qiagen_df["UniProt_IDs"].ne("Not available")]

# An entry may be composite, i.e. list several UniProt IDs (different
# isoforms) separated by semicolons
# Every entry is therefore split at the semicolons before the IDs are
# deduplicated (`np.unique()` also sorts them)
split_uniprot_ids = []
for composite_entry in Qiagen_df["UniProt_IDs"]:
    split_uniprot_ids.extend(composite_entry.split(";"))

Qiagen_uniprot_ids = np.unique(split_uniprot_ids).tolist()

In [4]:
# Investigate whether all UniProt IDs also occur in the Homo sapiens
# reference proteome
homo_sapiens_reference_proteome_fasta = fasta.FastaFile.read(
    "uniprotkb_Homo_sapiens_reference_proteome_06_11_2024.fasta"
)
# The UniProt ID is the second "|"-separated field of each FASTA header
ref_proteome_uniprot_ids = [
    header.split("|")[1]
    for header in homo_sapiens_reference_proteome_fasta.keys()
]

# Membership is tested against a set rather than the list: a list
# lookup scans the whole reference proteome for every single Qiagen ID,
# whereas a set lookup takes constant time on average
ref_proteome_uniprot_id_set = set(ref_proteome_uniprot_ids)

# One boolean per entry of `Qiagen_uniprot_ids`, in the same order
contained_in_ref_proteome = [
    Qiagen_id in ref_proteome_uniprot_id_set
    for Qiagen_id in Qiagen_uniprot_ids
]

# The assertion message reports how many IDs are missing
assert all(contained_in_ref_proteome), (
    f"{len(contained_in_ref_proteome) - sum(contained_in_ref_proteome)}"
    " proteins are not contained in the reference proteome of Homo "
    "sapiens!"
)

AssertionError: 24 proteins are not contained in the reference proteome of Homo sapiens!

In [5]:
# 24 proteins of the Qiagen subset turn out to be absent from the
# reference proteome, which means that their sequences must be obtained
# through a database query
# The membership flags are inverted and used as a boolean mask in order
# to select exactly those IDs
absent_from_ref_proteome = np.invert(np.array(contained_in_ref_proteome))
ids_not_contained = np.array(Qiagen_uniprot_ids)[absent_from_ref_proteome]
print(ids_not_contained)

['A0A090N7S4' 'A4D112' 'A4D1Y7' 'B7ZGW9' 'B9EIR0' 'B9EIR1' 'D3DUG6'
 'L8E6Z1' 'Q0VGM3' 'Q3MIM1' 'Q4G0H6' 'Q5G014' 'Q6P462' 'Q6PJD4' 'Q6PL46'
 'Q6ZW74' 'Q86X61' 'Q8N5R8' 'Q8N5S0' 'Q8N7Z3' 'Q93065' 'Q96QB4' 'Q9BUY1'
 'Q9BXE6']


In [7]:
# Retrieve the sequences of the proteins not contained in the reference
# proteome (24 at the time of writing) from UniProt
uniprot_entries = uniprot.fetch(
    ids_not_contained,
    format="fasta"
)

# The expected amount of entries is derived from the query itself rather
# than hard-coded, so that the check remains valid should the amount of
# missing proteins change with an updated data set
assert len(uniprot_entries) == len(ids_not_contained), (
    "An entry has not been retrieved for each of the "
    f"{len(ids_not_contained)} proteins!"
)

In [8]:
# Start out with an empty FASTA file, which is subsequently populated
# with the Qiagen proteins found in the reference proteome
human_prots_Qiagen_subset_fasta = fasta.FastaFile()

# For convenient look-ups, the reference proteome is re-keyed by the
# bare UniProt IDs instead of the full FASTA headers
# `ref_proteome_uniprot_ids` is paired position-wise with the sequences
# of the FASTA file
homo_sapiens_reference_proteome_dict = dict(
    zip(
        ref_proteome_uniprot_ids,
        (
            seq_str
            for _, seq_str
            in homo_sapiens_reference_proteome_fasta.items()
        )
    )
)

for Qiagen_uniprot_id in Qiagen_uniprot_ids:
    # IDs absent from the reference proteome are skipped at this point
    if Qiagen_uniprot_id not in homo_sapiens_reference_proteome_dict:
        continue

    sequence = homo_sapiens_reference_proteome_dict[Qiagen_uniprot_id]
    human_prots_Qiagen_subset_fasta[Qiagen_uniprot_id] = sequence

In [9]:
# The 24 proteins missing from the reference proteome are appended to
# the FASTA file as well
for uniprot_entry in uniprot_entries:
    # Without a target path, `uniprot.fetch()` hands back StringIO
    # objects, each of which can be parsed as a FASTA file
    current_fasta_file = fasta.FastaFile.read(uniprot_entry)
    # Only one entry is expected per file, so the first item is taken
    fetched_entries = list(current_fasta_file.items())
    header, seq_str = fetched_entries[0]
    # Reduce the header to the bare UniProt ID, i.e. its second
    # "|"-separated field
    header = header.split("|")[1]

    # Append the entry under its UniProt ID
    human_prots_Qiagen_subset_fasta[header] = seq_str

In [10]:
# Sanity check: the FASTA file must hold exactly as many entries as
# there are unique UniProt IDs in the Qiagen subset
n_entries_in_fasta = len(human_prots_Qiagen_subset_fasta)
n_entries_expected = len(Qiagen_uniprot_ids)

assert n_entries_in_fasta == n_entries_expected, (
    "An entry hasn't been added to the FASTA file for each protein in "
    "the Qiagen subset!"
)

In [11]:
# The sanity check has passed, which is why the assembled FASTA file is
# now persisted on disk
human_prots_Qiagen_subset_fasta_path = (
    "human_proteins_in_VACV_data_set_Qiagen_subset.fasta"
)
human_prots_Qiagen_subset_fasta.write(human_prots_Qiagen_subset_fasta_path)

In [14]:
# Contrary to the initial expectation, SENSE-PPI requires at least one
# FASTA file as input, the headers of which consist solely of UniProt
# IDs and the sequences of which are the corresponding protein sequences
# This FASTA file has to contain all interacting protein sequences,
# i.e. in the case of human-virus PPIs, both human and virus sequences
# If only this FASTA file is provided, all possible interaction pairs
# are computed
# If interaction scores are supposed to be computed only for selected
# protein pairs, a second file must be passed as input: a TSV file the
# rows of which represent the protein pairs to compute the interaction
# probability for
# The proteins are represented by their UniProt IDs; the TSV file must
# have at least two columns harbouring the UniProt IDs of the pairs and
# may optionally contain a third column harbouring labels (i.e. 1 for
# "true" and 0 for "false")

# Therefore, the two FASTA files containing the human and VACV sequences
# have to be merged
# The merge operates on the raw sequence strings stored in the FASTA
# files instead of going through `fasta.get_sequences()` and
# `fasta.set_sequences()`
# NOTE(review): per the Biotite documentation, `get_sequences()` builds
# `Sequence` objects after guessing the sequence type and substitutes
# selenocysteine (U) by cysteine (C) as well as pyrrolysine (O) by
# lysine (K) for proteins - TODO confirm for the installed version
# Copying the strings guarantees that the sequences are written out
# exactly as they were read

# Read the VACV WR proteome; its headers are still unaltered, so the
# UniProt IDs (second "|"-separated field) have to be extracted
VACV_WR_prots_fasta = fasta.FastaFile.read(
    "uniprotkb_taxonomy_id_10254_all_VACV_WR_prots_05_11_2024.fasta"
)
VACV_WR_fasta_dict = {
    header.split("|")[1]: seq_str
    for header, seq_str in VACV_WR_prots_fasta.items()
}

# The headers of the human FASTA file already are bare UniProt IDs
human_prots_Qiagen_subset_fasta = fasta.FastaFile.read(
    "human_proteins_in_VACV_data_set_Qiagen_subset.fasta"
)
human_prots_Qiagen_subset_fasta_dict = dict(
    human_prots_Qiagen_subset_fasta.items()
)

# Merge the two dictionaries via the double asterisk (**) operator used
# to unpack dictionaries; human entries come first, VACV entries second
human_Qiagen_and_VACV_WR_prots_dict = {
    **human_prots_Qiagen_subset_fasta_dict, **VACV_WR_fasta_dict
}

# Write the merged entries into a new FASTA file
human_Qiagen_and_VACV_WR_prots_fasta = fasta.FastaFile()
for merged_uniprot_id, merged_seq_str in (
    human_Qiagen_and_VACV_WR_prots_dict.items()
):
    human_Qiagen_and_VACV_WR_prots_fasta[merged_uniprot_id] = (
        merged_seq_str
    )

n_human_Qiagen_prots = len(human_prots_Qiagen_subset_fasta)
n_VACV_WR_prots = len(VACV_WR_prots_fasta)

# Entries sharing a UniProt ID would silently overwrite one another
# during the merge, in which case the sizes no longer add up
assert (
    len(human_Qiagen_and_VACV_WR_prots_fasta)
    ==
    n_human_Qiagen_prots + n_VACV_WR_prots
), (
    "Something went wrong while merging the two dictionaries!"
)

human_Qiagen_and_VACV_WR_prots_fasta.write(
    "human_prots_Qiagen_subset_and_all_VACV_WR_prots.fasta"
)



In [24]:
# Next up is the TSV file itemising the protein pairs to test
# It is assembled from two equally long lists, one per column
# Every human protein of the Qiagen subset is to be paired with each and
# every protein of VACV strain Western Reserve
# Human column: each human protein is repeated once per VACV WR protein,
# i.e. 442 times (the sequences have been downloaded including isoforms,
# which is why the FASTA file encompasses 442 instead of 440 entries)
# VACV column: the entire VACV WR proteome is repeated once per human
# protein of the Qiagen subset
proteome_size_message = (
    f"There are {n_VACV_WR_prots} proteins in the VACV WR proteome, "
    "whereas the Qiagen subset\nof the VACV screen encompasses "
    f"{n_human_Qiagen_prots:,} proteins."
)
print(proteome_size_message)

# Consecutive runs of one and the same human protein
human_column_list = []
for human_uniprot_id in Qiagen_uniprot_ids:
    human_column_list.extend([human_uniprot_id] * n_VACV_WR_prots)

# The VACV headers are full FASTA headers; the UniProt ID is their
# second "|"-separated field
VACV_WR_uniprot_IDs = list(
    map(
        lambda VACV_header: VACV_header.split("|")[1],
        VACV_WR_prots_fasta.keys()
    )
)
VACV_column_list = VACV_WR_uniprot_IDs * n_human_Qiagen_prots

assert len(human_column_list) == len(VACV_column_list), (
    "Something went wrong while creating the column lists!"
)

There are 442 proteins in the VACV WR proteome, whereas the Qiagen subset
of the VACV screen encompasses 18,610 proteins.


In [25]:
# Combine the two column lists into a DataFrame and dump it as a TSV
# file without header row and without index column
prot_pair_dict = dict(
    Human_protein=human_column_list,
    VACV_protein=VACV_column_list
)
prot_pair_df = pd.DataFrame(prot_pair_dict)

prot_pair_df.to_csv(
    "PPI_pairs_between_Qiagen_subset_and_VACV_WR_proteome.tsv",
    index=False,
    header=False,
    sep="\t"
)

In [26]:
# For testing purposes, a miniature TSV file holding merely the first
# five PPI pairs is written as well
test_df = prot_pair_df.head(5)
test_df.to_csv(
    "SENSE-PPI_test_prot_pairs.tsv",
    index=False,
    header=False,
    sep="\t"
)

In [23]:
# Assemble the FASTA file accompanying the test TSV file: it holds the
# sequences of the unique human proteins followed by those of the unique
# VACV proteins occurring in the test pairs
uniprot_ids_in_test_tsv = [
    *test_df["Human_protein"].unique().tolist(),
    *test_df["VACV_protein"].unique().tolist(),
]

test_fasta = fasta.FastaFile()

# The sequences are looked up in the merged human/VACV FASTA file
for test_uniprot_id in uniprot_ids_in_test_tsv:
    test_fasta[test_uniprot_id] = (
        human_Qiagen_and_VACV_WR_prots_fasta[test_uniprot_id]
    )

test_fasta.write("SENSE-PPI_test_prot_seqs.fasta")

In [33]:
# Report the size of the pair table and of the merged FASTA file
n_interaction_pairs = len(prot_pair_df)
n_merged_sequences = len(human_Qiagen_and_VACV_WR_prots_fasta)

print(
    "Amount of interaction pairs between the Qiagen subset and the "
    f"VACV strain\nWestern Reserve proteome: {n_interaction_pairs:,}"
)

print(
    "Amount of protein sequences from both the Qiagen subset and the "
    "VACV\nstrain WR proteome: "
    f"{n_merged_sequences:,}"
)

Amount of interaction pairs between the Qiagen subset and the VACV strain
Western Reserve proteome: 8,225,620
Amount of protein sequences from both the Qiagen subset and the VACV
strain WR proteome: 19,052


In [None]:
# The TSV file encompasses more than 8 million interactions, which are
# way too many to be processed in one batch job
# For this reason, the TSV file is split into 17 chunks each
# encompassing 500,000 interaction pairs (except the last one)
# The same is done to the FASTA file harbouring the human and VACV
# protein sequences
# The TSV file has been written without a header row, so `header=None`
# is mandatory; otherwise, pandas would consume the first protein pair
# as column names and that pair would be missing from the chunks
prot_pair_df = pd.read_csv(
    "SENSE-PPI/PPI_pairs_between_Qiagen_subset_and_VACV_WR_proteome.tsv",
    sep="\t",
    header=None
)

# Providing the FASTA file harbouring all sequences causes an Out of
# Memory (OOM) error
# Therefore, a separate FASTA file is generated for each chunk
human_Qiagen_and_VACV_WR_prots_fasta = fasta.FastaFile.read(
    "SENSE-PPI/human_prots_Qiagen_subset_and_all_VACV_WR_prots.fasta"
)

CHUNK_SIZE = 500000
n_chunks = math.ceil(len(prot_pair_df) / CHUNK_SIZE)

# Consecutive, non-overlapping row slices; the last one may be shorter
prot_pair_chunks = [
    prot_pair_df.iloc[i * CHUNK_SIZE : (i + 1) * CHUNK_SIZE]
    for i in range(n_chunks)
]

# Save each of the chunks to a separate TSV file
for i, prot_pair_chunk in enumerate(prot_pair_chunks):
    # Save a chunk from the TSV file to a new TSV file
    # Note that the thousands separator in the formatted chunk size ends
    # up in the file name (e.g. "500,000")
    prot_pair_chunk.to_csv(
        (
            "SENSE-PPI/PPI_pairs_between_Qiagen_subset_and_VACV_WR_"
            f"proteome_chunk_{i}_size_{CHUNK_SIZE:,}.tsv"
        ),
        sep="\t",
        header=False,
        index=False
    )

    # Extract the corresponding chunk from the FASTA file
    # This requires determining the unique UniProt IDs in the current
    # chunk
    # As `pandas.unique()` accepts exclusively one-dimensional
    # array-like objects as input, the function is employed in
    # conjunction with `pandas.melt()`, which transforms a DataFrame
    # from a wide format into a long format
    # This yields a DataFrame encompassing only two columns bearing the
    # names "variable" and "value", respectively
    # Finally, the `unique()` method is applied to the "value" column of
    # the DataFrame
    unique_prot_ids_current_chunk = pd.melt(
        prot_pair_chunk
    )["value"].unique()

    current_chunk_fasta = fasta.FastaFile()
    for uniprot_id in unique_prot_ids_current_chunk:
        current_chunk_fasta[uniprot_id] = (
            human_Qiagen_and_VACV_WR_prots_fasta[uniprot_id]
        )
    current_chunk_fasta.write(
        "SENSE-PPI/"
        f"human_prots_Qiagen_subset_and_VACV_WR_prots_seqs_chunk_{i}_"
        f"size_{CHUNK_SIZE:,}.fasta"
    )

    # Additionally, a TXT file is created in which each line is
    # populated with one UniProt ID of the current chunk
    # This is done with the intention to automate the copying of
    # embedding files into the corresponding directories
    # The `with` context manager takes care of closing the file, even in
    # case of exceptions/errors
    # The IDs are joined with newlines (no trailing newline), which also
    # avoids re-using the chunk index `i` as an inner loop variable
    with open(f"SENSE-PPI/UniProt_IDs_chunk_{i}.txt", "w") as f:
        f.write("\n".join(unique_prot_ids_current_chunk))

19052


In [6]:
# Weirdly enough, while all other chunks could be processed without any
# problems, OOM errors occurred for chunks 11 as well as 13
# Therefore, the aforementioned chunks are further split into three
# sub-chunks each
N_SUB_CHUNKS = 3
SUB_CHUNK_SIZE = math.ceil(500000 / N_SUB_CHUNKS)

# Read in the FASTA file encompassing all human as well as VACV proteins
human_Qiagen_and_VACV_WR_prots_fasta = fasta.FastaFile.read(
    "SENSE-PPI/human_prots_Qiagen_subset_and_all_VACV_WR_prots.fasta"
)


def _split_chunk_into_sub_chunks(chunk_number):
    """
    Split the TSV file of one chunk into sub-chunks and save a TSV file
    as well as a matching FASTA file for each of them.

    Parameters
    ----------
    chunk_number: int
        Number of the chunk to split, as it appears in the name of the
        chunk's TSV file.

    Returns
    -------
    prot_pairs_chunk: DataFrame
        The protein pairs of the entire chunk.
    prot_pair_subchunks: list of DataFrame
        The consecutive row slices the chunk has been split into.
    """
    # The chunk files have been written without a header row, so
    # `header=None` is mandatory; otherwise, pandas would consume the
    # first protein pair as column names and that pair would be lost
    prot_pairs_chunk = pd.read_csv(
        (
            "SENSE-PPI/Files_per_chunk_except_chunk_0/PPI_pairs_between_"
            "Qiagen_subset_and_VACV_WR_proteome_chunk_"
            f"{chunk_number}_size_500,000.tsv"
        ),
        sep="\t",
        header=None
    )

    prot_pair_subchunks = [
        prot_pairs_chunk.iloc[
            i * SUB_CHUNK_SIZE : (i + 1) * SUB_CHUNK_SIZE
        ]
        for i in range(N_SUB_CHUNKS)
    ]

    # Save each of the sub-chunks to a separate TSV file
    for i, prot_pair_sub_chunk in enumerate(prot_pair_subchunks):
        prot_pair_sub_chunk.to_csv(
            (
                "SENSE-PPI/PPI_pairs_between_Qiagen_subset_and_VACV_WR_"
                f"proteome_chunk_{chunk_number}_{i}_size_"
                f"{SUB_CHUNK_SIZE:,}.tsv"
            ),
            sep="\t",
            header=False,
            index=False
        )

        # Extract the corresponding sequences from the FASTA file, i.e.
        # those of the unique UniProt IDs across both columns
        unique_prot_ids_current_sub_chunk = pd.melt(
            prot_pair_sub_chunk
        )["value"].unique()

        current_sub_chunk_fasta = fasta.FastaFile()
        for uniprot_id in unique_prot_ids_current_sub_chunk:
            current_sub_chunk_fasta[uniprot_id] = (
                human_Qiagen_and_VACV_WR_prots_fasta[uniprot_id]
            )
        current_sub_chunk_fasta.write(
            "SENSE-PPI/"
            "human_prots_Qiagen_subset_and_VACV_WR_prots_seqs_chunk_"
            f"{chunk_number}_{i}_size_{SUB_CHUNK_SIZE:,}.fasta"
        )

    return prot_pairs_chunk, prot_pair_subchunks


# First, deal with chunk 11
prot_pairs_chunk_11, prot_pair_subchunks_11 = (
    _split_chunk_into_sub_chunks(11)
)

# Now, turn to chunk 13
prot_pairs_chunk_13, prot_pair_subchunks_13 = (
    _split_chunk_into_sub_chunks(13)
)

In [None]:
# Contrary to expectation, splitting each of the two chunks into three
# sub-chunks did not resolve the issue
# As a last resort, both chunks are processed with CPU only