In [1]:
"""
The purpose of this Jupyter notebook is to preprocess/prepare the VACV
screen TSV file for the evaluation of off-target filtering performance
(i.e. for populating table number 2).
"""

'\nThe purpose of this Jupyter notebook is to preprocess/prepare the VACV\nscreen TSV file for the evaluation of off-target filtering performance\n(i.e. for populating table number 2).\n'

In [2]:
import pandas as pd

path_to_VACV_screen_tsv_file = (
    "/Users/jacobanter/Documents/Code/VACV_screen/VACV_Report_only_"
    "valid_single_pooled_siRNA_and_esiRNA_single_entries_only_without_"
    "Qiagen_mismatches.tsv"
)

VACV_screen_df = pd.read_csv(
    path_to_VACV_screen_tsv_file,
    sep="\t"
)

  VACV_screen_df = pd.read_csv(


In [3]:
# As the evaluation of off-target filtering performance is done on the
# same data set as in the comparison of protein language models, the
# corresponding data set (confirmed positive interactions and reliable
# negative interactions between human and VACV WR proteins) is loaded as
# well
path_to_combined_VACV_WR_human_PPI_data_set = (
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_"
    "with_nucleolus_neg_instances/VACV_WR_pos_and_nucleolus_prots_"
    "neg_PPI_instances.tsv"
)

VACV_WR_pos_and_neg_PPIs_df = pd.read_csv(
    path_to_combined_VACV_WR_human_PPI_data_set,
    sep="\t"
)

In [4]:
# Now, extract the human interaction partners from the PPI pairs and
# determine whether they are present in the VACV screen TSV file
# The file with confirmed positive and reliable negative interactions
# uses UniProt Accessions (in the VACV screen TSV file, the
# corresponding column is incorrectly named "UniProt_IDs")
import numpy as np

human_int_partners = np.unique(VACV_WR_pos_and_neg_PPIs_df["Human_prot"])

# Bear in mind that the `UniProt_IDs` column also contains composite
# entries, i.e. entries encompassing multiple UniProt Accessions
# separated by semicolons in order to accommodate isoforms generated by
# alternative splicing, etc.
prots_interrogated_by_screen = np.unique([
    sub_entry
    for entry in VACV_screen_df["UniProt_IDs"]
    for sub_entry in entry.split(";")
])

coverage_list = [
    human_int_partner in prots_interrogated_by_screen
    for human_int_partner in human_int_partners
]

assert all(coverage_list), (
    "Not all human interaction partners comprised in the PPI data set "
    "are interrogated in the VACV screen!"
)

AssertionError: Not all human interaction partners comprised in the PPI data set are interrogated in the VACV screen!

In [5]:
n_human_int_partners_uniprot_accession_mismatch = (
    len(coverage_list) - np.count_nonzero(coverage_list)
)

print(
    "Amount of human interaction partners exhibiting a UniProt "
    f"accession mismatch: {n_human_int_partners_uniprot_accession_mismatch:,}"
)

Amount of human interaction partners exhibiting a UniProt accession mismatch: 63


In [6]:
# Output the UniProt Accessions of the proteins not interrogated in the
# screen
prots_with_uniprot_acc_mismatch = human_int_partners[
    ~np.array(coverage_list)
]

for prot_with_mismatch in prots_with_uniprot_acc_mismatch:
    print(prot_with_mismatch)

A0A075B749
A0A0D9SG04
A0A0J9YX62
A0A0U1RRM6
A0A1B0GTL5
A0A2R8Y5A3
A0A3B3IRW5
A0A3B3IS91
A0A3B3ISQ4
A0N0N7
A0N0Q3
A3KPC7
A4FTV9
A6NFX8
A6NNZ2
A8ASI8
A8MPP1
A8MUS3
B2R4P9
B2R4R0
B2R4S9
B2RDW1
B2ZZ89
B4DJ51
B4DLJ1
B8ZZN6
C9JQJ2
D9YZV4
D9ZGF2
E5KTA5
E9KL37
E9PDI4
F4ZW62
F8VVA7
F8VXC8
F8VZQ9
F8WBV6
G3V5R9
G5E9I4
H3BSR6
I0J062
J3QK89
K7EQ78
K7ERV3
P0DW81
P22532
P57053
P61571
P61572
P61573
P61574
P61575
P61576
P61578
P61579
P86452
Q0WX57
Q548T7
Q5VTE0
Q6ZN40
Q99878
Q9UN81
V9GZ56


In [7]:
# A UniProt database query is performed for the UniProt accessions not
# comprised in the VACV screen TSV file
# The files are requested in XML format as this allows to extract the
# official gene symbol in a systematic and reliable manner
from biotite.database import uniprot

# As no target directory to store files in is specified, the file
# contents are stored in StringIO objects
uniprot_not_queried = True

while uniprot_not_queried:
    try:
        uniprot_io_objects = uniprot.fetch(
            prots_with_uniprot_acc_mismatch,
            format="xml"
        )
        uniprot_not_queried = False
    except ConnectionError:
        continue

In [8]:
# Iterate over the StringIO objects, access their contents via the
# `.getvalue()` method and determine the official gene symbol
# Determination of the official gene name is accomplished by means of
# the `xml` library, which allows parsing XML files
import xml.etree.ElementTree as ET

off_gene_symbol_list = []

for io_object, uniprot_accession in zip(
    uniprot_io_objects, prots_with_uniprot_acc_mismatch
):
    xml_string = io_object.getvalue()
    
    root = ET.fromstring(xml_string)

    # An XPath expression is used in order to find the official gene
    # symbol
    off_gene_symbol_entries = root.findall(
        "./{http://uniprot.org/uniprot}entry/"
        "{http://uniprot.org/uniprot}gene/"
        "{http://uniprot.org/uniprot}name[@type='primary']"
    )

    # One protein can be encoded by multiple genes, which is why a
    # sublist is generated for each protein containing the potentially
    # multiple gene names
    current_gene_name_list = []

    for gene_symbol_entry in off_gene_symbol_entries:
        current_gene_name_list.append(gene_symbol_entry.text)
    
    off_gene_symbol_list.append(current_gene_name_list)

In [9]:
# Determine unique values for the amount of genes associated with a
# UniProt Accession
n_genes_per_prot_list = list(map(len, off_gene_symbol_list))

print(np.unique(n_genes_per_prot_list))

[0 1 7]


In [10]:
# Determine the amount of UniProt accessions without an associated gene
n_prots_without_gene = np.count_nonzero(
    np.array(n_genes_per_prot_list) == 0
)

print(
    "Amount of UniProt accessions without an associated gene: "
    f"{n_prots_without_gene}"
)

Amount of UniProt accessions without an associated gene: 1


In [11]:
# Determine the protein's identity and subject its XML file to closer
# scrutiny
orphan_prot_idx = n_genes_per_prot_list.index(0)

orphan_xml_str = uniprot_io_objects[orphan_prot_idx].getvalue()

print(orphan_xml_str)

<?xml version="1.0" encoding="UTF-8"  standalone="no" ?>
<uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/docs/uniprot.xsd">
<entry dataset="TrEMBL" created="2011-06-28" modified="2025-02-05" version="81" xmlns="http://uniprot.org/uniprot">
  <accession>F4ZW62</accession>
  <name>F4ZW62_HUMAN</name>
  <protein>
    <submittedName>
      <fullName evidence="8">NF45</fullName>
    </submittedName>
  </protein>
  <organism evidence="8">
    <name type="scientific">Homo sapiens</name>
    <name type="common">Human</name>
    <dbReference type="NCBI Taxonomy" id="9606"/>
    <lineage>
      <taxon>Eukaryota</taxon>
      <taxon>Metazoa</taxon>
      <taxon>Chordata</taxon>
      <taxon>Craniata</taxon>
      <taxon>Vertebrata</taxon>
      <taxon>Euteleostomi</taxon>
      <taxon>Mammalia</taxon>
      <taxon>Eutheria</taxon>
      <taxon>Euarchontoglires</taxon>
      <tax

In [12]:
# Strangely enough, while the XML file does specify an NCBI gene ID, it
# does not indicate the corresponding gene name
# Thus, for this UniProt accession, the official gene symbol is manually
# added to the correct sublist
off_gene_symbol_list[orphan_prot_idx].append("ILF2")

In [13]:
# Now that each and every UniProt accession has at least one associated
# gene symbol, it is investigated for each UniProt accession whether at
# least one of them is interrogated in the VACV screen
gene_symbols_in_screen = np.unique(
    VACV_screen_df["Name"]
)

acc_mismatch_subset_with_gene_name_mismatch = []

for prot, sub_list in zip(prots_with_uniprot_acc_mismatch, off_gene_symbol_list):
    presence_list = [
        gene_symbol in gene_symbols_in_screen
        for gene_symbol in sub_list
    ]

    if not any(presence_list):
        print(
            f"UniProt accession {prot}'s gene name(s) do(es) not match "
            "any of the gene names in the VACV screen!"
        )
        acc_mismatch_subset_with_gene_name_mismatch.append(prot)

UniProt accession A0A3B3IS91's gene name(s) do(es) not match any of the gene names in the VACV screen!
UniProt accession A3KPC7's gene name(s) do(es) not match any of the gene names in the VACV screen!
UniProt accession A4FTV9's gene name(s) do(es) not match any of the gene names in the VACV screen!
UniProt accession A6NNZ2's gene name(s) do(es) not match any of the gene names in the VACV screen!
UniProt accession A8MPP1's gene name(s) do(es) not match any of the gene names in the VACV screen!
UniProt accession B2R4P9's gene name(s) do(es) not match any of the gene names in the VACV screen!
UniProt accession B2R4R0's gene name(s) do(es) not match any of the gene names in the VACV screen!
UniProt accession B2R4S9's gene name(s) do(es) not match any of the gene names in the VACV screen!
UniProt accession B4DJ51's gene name(s) do(es) not match any of the gene names in the VACV screen!
UniProt accession I0J062's gene name(s) do(es) not match any of the gene names in the VACV screen!
UniPro

In [14]:
n_prots_acc_and_name_mismatch = len(
    acc_mismatch_subset_with_gene_name_mismatch
)

print(
    "The precise amount of proteins not matching both the UniProt "
    "accessions\nand the gene names in the VACV screen is "
    f"{n_prots_acc_and_name_mismatch:,}."
)

The precise amount of proteins not matching both the UniProt accessions
and the gene names in the VACV screen is 26.


In [15]:
# Upon closer scrutiny of the proteins with both UniProt accession and
# gene name mismatch, it emerges that although the corresponding gene
# indeed is present in the screen, a gene name different from the one
# used by NCBI is specified
# Therefore, for those proteins the name of which does not match the one
# used NCBI, it is investigated whether the corresponding gene is
# covered by the screen
# This is accomplished by means of the NCBI gene ID
# Performing this coverage investigation in multiple steps is necessary
# as e.g. not all proteins the gene symbol of which matches the one used
# by NCBI have an associated NCBI gene ID in their XML file

# As the different UniProt entries have different amounts of associated
# genes, the unique numbers of associated gene names are determined
n_associated_gene_IDs = []

for io_object, uniprot_accession in zip(
    uniprot_io_objects, prots_with_uniprot_acc_mismatch
):
    if uniprot_accession in acc_mismatch_subset_with_gene_name_mismatch:
        xml_string = io_object.getvalue()

        root = ET.fromstring(xml_string)

        # An XPath expression is used in order to find the NCBI gene ID
        gene_id_entries = root.findall(
            "./{http://uniprot.org/uniprot}entry/"
            "{http://uniprot.org/uniprot}dbReference[@type='GeneID']"
        )

        n_associated_gene_IDs.append(len(gene_id_entries))

print(
    "Unique numbers of associated gene IDs: "
    f"{np.unique(n_associated_gene_IDs)}"
)

Unique numbers of associated gene IDs: [ 0  1  2  3  5  7 14]


In [16]:
# Based on the amount of associated gene IDs, the UniProt entries
# exhibiting both a UniProt accession and a gene name mismatch are
# divided into three groups
# The first group encompasses proteins without any associated gene IDs
# The second group comprises proteins with only one associated gene ID
# The third group contains proteins with more than one associated gene
# ID

# Output the UniProt accessions for each of the three groups
uniprot_accs_group_one = []

uniprot_accs_group_two = []
gene_ids_group_two = []

uniprot_accs_group_three = []
gene_ids_group_three = []

for io_object, uniprot_accession in zip(
    uniprot_io_objects, prots_with_uniprot_acc_mismatch
):
    if uniprot_accession in acc_mismatch_subset_with_gene_name_mismatch:
        xml_string = io_object.getvalue()

        root = ET.fromstring(xml_string)

        # An XPath expression is used in order to find the NCBI gene ID
        gene_id_entries = root.findall(
            "./{http://uniprot.org/uniprot}entry/"
            "{http://uniprot.org/uniprot}dbReference[@type='GeneID']"
        )

        n_gene_ids = len(gene_id_entries)

        if n_gene_ids == 0:
            uniprot_accs_group_one.append(uniprot_accession)
        elif n_gene_ids == 1:
            uniprot_accs_group_two.append(uniprot_accession)
            gene_id = gene_id_entries[0].attrib["id"]
            gene_ids_group_two.append(gene_id)
        else:
            uniprot_accs_group_three.append(uniprot_accession)
            gene_ids = [
                gene_id_entry.attrib["id"]
                for gene_id_entry in gene_id_entries
            ]
            gene_ids_group_three.append(gene_ids)

print("UniProt accessions of proteins with no associated gene ID:")
for accession in uniprot_accs_group_one:
    print(accession)
print()

print("Uniprot accessions of proteins with one associated gene ID:")
for accession, gene_id in zip(uniprot_accs_group_two, gene_ids_group_two):
    print(f"{accession} -> {gene_id}")
print()

print("UniProt accessions of proteins with multiple associated gene IDs:")
for accession, gene_ids in zip(uniprot_accs_group_three, gene_ids_group_three):
    print(f"{accession} -> {gene_ids}")

UniProt accessions of proteins with no associated gene ID:
A0A3B3IS91
A6NNZ2
A8MPP1
P0DW81
P61571
P61572
P61573
P61574
P61575
P61576
P61578
P61579
Q5VTE0
Q9UN81

Uniprot accessions of proteins with one associated gene ID:
A3KPC7 -> 85235
I0J062 -> 101927423
P22532 -> 6703
P57053 -> 102724334
P86452 -> 100381270
Q99878 -> 8331

UniProt accessions of proteins with multiple associated gene IDs:
A4FTV9 -> ['8329', '8330', '8332', '8336', '8969']
B2R4P9 -> ['3020', '3021']
B2R4R0 -> ['121504', '554313', '8294', '8359', '8360', '8361', '8362', '8363', '8364', '8365', '8366', '8367', '8368', '8370']
B2R4S9 -> ['8339', '8343', '8344', '8346', '8347']
B4DJ51 -> ['801', '805', '808']
Q0WX57 -> ['728369', '728373', '728379', '728393', '728400', '728405', '728419']


In [17]:
# The UniProt IDs of each of the three groups are subjected to closer
# scrutiny

# With regard to the first group (no associated gene ID), it emerges
# that the gene IDs can be found out manually by querying the NCBI
# database
# While some genes are interrogated by the VACV screen, others are not
# A0A3B3IS91 -> 5428, present
# A6NNZ2 -> 260334 (TUBB8B), not present
# A8MPP1 -> 100302090 (DDX11L8), not present
# P0DW81 -> 101929977 (MARCHF6-DT), not present
# P61571 -> 100616103 (ERVK-21), not present
# P61572 -> 105376906 (ERVK-19), not present
# P61573 -> 100616102 (ERVK-9), not present
# P61574 -> 124902738 (LOC124902738), not present
# P61575 -> 619465 (ERVK-8), not present
# P61576 -> no gene available
# P61578 -> 100616411 (ERVK-16), not present
# P61579 -> 100862683 (ERVK-25), not presen
# Q5VTE0 -> 158078 (EEF1A1P5), not present
# Q9UN81 -> no gene available

# Investigating the UniProt accessions of the second group (one
# associated gene ID) reveals the following:
# A3KPC7 -> 85235, present
# I0J062 -> 101927423, not present
# P22532 -> 6703, not present
# P57053 -> 102724334, not present
# P86452 -> 100381270, not present
# Q99878 -> 8331, not present

# As to the third group (multiple associated gene IDs), many UniProt
# accessions are covered by multiple gene IDs
#
# A4FTV9 (Histone H2A) -> ['8329', '8330', '8332', '8336', '8969']; gene
# ids 8330, 8332, 8336 as well as 8969 are present
#
# B2R4P9 (Histone H3) -> ['3020', '3021']; gene ids 3020 as well as 3021
# are present
#
# B2R4R0 (Histone H4) -> ['121504', '554313', '8294', '8359', '8360',
# '8361', '8362', '8363', '8364', '8365', '8366', '8367', '8368',
# '8370']; gene ids 121504, 554313, 8359, 8360, 8361, 8362, 8363, 8364,
# 8365, 8366, 8367, 8368 as well as 8370 are present
#
# B2R4S9 (Histone H2B) -> ['8339', '8343', '8344', '8346', '8347']; gene
# ids 8339, 8343, 8344, 8346 as well as 8347 are present
#
# B4DJ51 (Calmodulin 1) -> ['801', '805', '808']; gene ids 801, 805 as
# well as 808 are present
#
# Q0WX57 (Ubiquitin carboxyl-terminal hydrolase 17-like protein 24) ->
# ['728369', '728373', '728379', '728393', '728400', '728405', '728419'];
# no gene ids are present

# The question arises how to treat the multiple gene ids covering one
# and the same UniProt accession
# Maybe just take the highest intensity? -> Ask Artur!

In [33]:
# Now that the coverage of the human interaction partners of the
# human-VACV PPIs has been exhaustively analysed, a dictionary is
# created mapping each UniProt accession of a human interaction partner
# to the corresponding NCBI gene ID
# Just as the analysis, the dictionary creation is accomplished in a
# stepwise manner
uniprot_acc_gene_id_dict = {}

# Start with the human interaction partners the gene name of which
# matches the official gene symbols listed in the screen TSV file
for int_partner in human_int_partners:
    if int_partner in prots_interrogated_by_screen:
        # As the `UniProt_IDs` column may contain multiple UniProt
        # accessions separated from each other via semicolons, it is
        # checked for the presence of substrings rather than for exact
        # matches
        # Also bear in mind that selecting a column yields a Pandas
        # Series
        # It is likely that there are multiple rows for the interaction
        # partner at hand; however, as all rows are supposed to contain
        # the same gene ID, simply the first gene ID is selected
        current_gene_id = VACV_screen_df.loc[
            VACV_screen_df["UniProt_IDs"].str.contains(int_partner),
            "ID"
        ].iloc[0]

        # The rationale behind setting a list as value instead of the
        # mere gene ID is that there are proteins with multiple
        # associated gene IDs
        # Consistently setting a list as value simplifies the
        # implementation of downstream code
        uniprot_acc_gene_id_dict[int_partner] = [current_gene_id]

In [34]:
print(len(uniprot_acc_gene_id_dict))

737


In [35]:
# Now, address the human interaction partners exhibiting a UniProt
# accession mismatch
for prot, sub_list in zip(prots_with_uniprot_acc_mismatch, off_gene_symbol_list):
    presence_list = [
        gene_symbol in gene_symbols_in_screen
        for gene_symbol in sub_list
    ]

    if any(presence_list):
        # Determine the precise gene symbol present in the screen; it is
        # always just one
        present_symbol = np.array(sub_list)[np.array(presence_list)][0]
        
        current_gene_id = VACV_screen_df.loc[
            VACV_screen_df["Name"] == present_symbol,
            "ID"
        ].iloc[0]

        uniprot_acc_gene_id_dict[prot] = [current_gene_id]

In [36]:
print(len(uniprot_acc_gene_id_dict))

774


In [37]:
# As a next step, turn to human interaction partners exhibiting both a
# UniProt accession mismatch and a gene name mismatch
# For these human interaction partners, three subgroups could be
# identified, namely human interaction partners with no, one and more
# than one associated gene IDs

# For proteins with one and multiple associated gene IDs, the respective
# gene IDs are set as value in the dictionary
for uniprot_accession, gene_id in zip(
    uniprot_accs_group_two, gene_ids_group_two
):
    uniprot_acc_gene_id_dict[uniprot_accession] = [gene_id]

for uniprot_accession, gene_id_list in zip(
    uniprot_accs_group_three, gene_ids_group_three
):
    uniprot_acc_gene_id_dict[uniprot_accession] = gene_id_list

In [38]:
print(len(uniprot_acc_gene_id_dict))

786


In [39]:
# Finally, deal with proteins with no associated gene ID
# The gene IDs for these proteins were found out by manually querying
# the NCBI database
# Thus, a list of gene IDs has to be generated manually as well
gene_ids_group_one = [
    5428, 260334, 100302090, 101929977, 100616103, 105376906, 100616102,
    124902738, 619465, None, 100616411, 100862683, 158078, None
]

for uniprot_accession, gene_id in zip(
    uniprot_accs_group_one, gene_ids_group_one
):
    uniprot_acc_gene_id_dict[uniprot_accession] = [gene_id]

In [40]:
print(len(uniprot_acc_gene_id_dict))

800


In [41]:
# Perform a couple of final sanity checks
# One of them consists of verifying that the total amount of UniProt
# accessions in the dictionary equals the total amount of unique UniProt
# accessions occurring the combined VACV data set
assert len(uniprot_acc_gene_id_dict) == len(human_int_partners), (
    "Not all unique UniProt accessions occurring in the combined VACV "
    "data set have been added to the dictionary!"
)

In [42]:
# Another sanity check involves verifying that all dictionary values are
# of data type `list`
dict_vals = [value for value in uniprot_acc_gene_id_dict.values()]

assert all([isinstance(value, list) for value in dict_vals]), (
    "Not all dictionary values are a list!"
)

In [43]:
# Now that all sanity checks have successfully been passed, the
# dictionary mapping UniProt accessions to NCBI gene IDs is pickled,
# i.e. saved to a file
import pickle

# Bear in mind that in the context of working with files, the `with`
# context manager is preferred as it automatically takes care of closing
# files, even in case of errors/exceptions
with open(
    "dictionary_mapping_UniProt_accessions_of_combined_VACV_WR_data_"\
    "set_to_gene_IDs.pkl",
    "wb"
) as f:
    pickle.dump(uniprot_acc_gene_id_dict, f)