In [12]:
"""
The purpose of this script is to prepare the input required by the
SENSE-PPI LLM for PPI prediction. According to the package's
documentation, a TSV file can be passed as input the rows of which
contains pairs of proteins to test.

Thus, each human protein targeted in the siRNA screen is paired with all
440 VACV proteins (comprised in the file "uniprotkb_taxonomy_id_10254_
all_VACV_WR_prots_05_11_2024.fasta").

For the time being, the pairing is performed only for the Qiagen single
siRNA subset as it is the most comprehensive, encompassing 20,213 genes.
"""

import numpy as np
import pandas as pd
from biotite.sequence.io import fasta

In [13]:
# Load the Qiagen subset of the VACV data set and extract the UniProt
# IDs
dtype_dict = {
    "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87": str,
    "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB": str,
    "Gene_Description": str,
    "ID": str,
    "ID_OnTarget_Ensembl_GRCh38_release_87": str,
    "ID_OnTarget_Merge": str,
    "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB": str,
    "ID_OnTarget_RefSeq_20170215": str,
    "ID_manufacturer": str,
    "Name_alternatives": str,
    "PLATE_QUALITY_DESCRIPTION": str,
    "RefSeq_ID_OnTarget_RefSeq_20170215": str,
    "Seed_sequence_common": str,
    "WELL_QUALITY_DESCRIPTION": str,
    "siRNA_error": str,
    "siRNA_number": str,
    "Precursor_Name": str
}

VACV_df = pd.read_csv(
    (
        "VACV_Report_only_valid_single_pooled_siRNA_and_esiRNA_single_"
        "entries_only_without_Qiagen_mismatches.tsv"
    ),
    sep="\t",
    dtype=dtype_dict
)

Qiagen_df = VACV_df[
    VACV_df["Manufacturer"] == "Qiagen"
]

In [14]:
# Bear in mind that some of the targeted genes are pseudogenes and are
# thus not associated with any UniProt entries
# They have the entry "Not available" in the "UniProt_IDs" column and
# are filtered out
Qiagen_df = Qiagen_df[
    Qiagen_df["UniProt_IDs"] != "Not available"
]

# Also keep in mind that some entries are composite entries, i.e.
# comprise multiple UniProt IDs representing different isoforms
# The different isoforms are separated from one another via semicolons
Qiagen_uniprot_ids = np.unique([
    uniprot_id for entry in Qiagen_df["UniProt_IDs"]
    for uniprot_id in entry.split(";")
]).tolist()

In [15]:
# Investigate whether all UniProt IDs also occur in the Homo sapiens
# reference proteome
homo_sapiens_reference_proteome_fasta = fasta.FastaFile.read(
    "uniprotkb_Homo_sapiens_reference_proteome_06_11_2024.fasta"
)
ref_proteome_uniprot_ids = [
    header.split("|")[1]
    for header in homo_sapiens_reference_proteome_fasta.keys()
]

contained_in_ref_proteome = [
    Qiagen_id in ref_proteome_uniprot_ids
    for Qiagen_id in Qiagen_uniprot_ids
]

assert all(contained_in_ref_proteome), (
    f"{len(contained_in_ref_proteome) - sum(contained_in_ref_proteome)}"
    " proteins are not contained in the reference proteome of Homo "
    "sapiens!"
)

AssertionError: 24 proteins are not contained in the reference proteome of Homo sapiens!

In [16]:
# It emerges that 24 of the proteins belonging to the Qiagen subset are
# not contained in the reference proteome; therefore, their sequences
# have to be retrieved by performing a database query
ids_not_contained = np.array(Qiagen_uniprot_ids)[
    ~np.array(contained_in_ref_proteome)
]
print(ids_not_contained)

['A0A090N7S4' 'A4D112' 'A4D1Y7' 'B7ZGW9' 'B9EIR0' 'B9EIR1' 'D3DUG6'
 'L8E6Z1' 'Q0VGM3' 'Q3MIM1' 'Q4G0H6' 'Q5G014' 'Q6P462' 'Q6PJD4' 'Q6PL46'
 'Q6ZW74' 'Q86X61' 'Q8N5R8' 'Q8N5S0' 'Q8N7Z3' 'Q93065' 'Q96QB4' 'Q9BUY1'
 'Q9BXE6']


In [None]:
...

In [20]:
# Create an empty FASTA file and add the proteins contained in the
# reference proteome to it
human_prots_Qiagen_subset_fasta = fasta.FastaFile()

# The dictionary keys, i.e. FASTA entry headers have to be simplified to
# just the UniProt IDs so as to enable effortless dictionary indexing
homo_sapiens_reference_proteome_dict = {
    uniprot_id: seq_str
    for uniprot_id, (_, seq_str) in zip(
        ref_proteome_uniprot_ids,
        homo_sapiens_reference_proteome_fasta.items()
    )
}

for Qiagen_uniprot_id in Qiagen_uniprot_ids:
    try:
        sequence = homo_sapiens_reference_proteome_dict[
            Qiagen_uniprot_id
        ]
    except KeyError:
        continue

    human_prots_Qiagen_subset_fasta[Qiagen_uniprot_id] = sequence

In [None]:
# ...