In [1]:
import time

import numpy as np
import pandas as pd
from biotite.database import entrez
from biotite.database import uniprot
import biotite.sequence.io.fasta as fasta

In [2]:
# Unfortunately, to my knowledge, downloading all VACV interactions from
# the HVIDB database at once is not possible
# Hence, they have been downloaded in packages of up to 100 interactions
# It is investigated whether the 5 packages are all different from each
# other so as to rule out errors during the manual download, such as
# accidentally downloading the same package twice
# As a first step, the individual packages are loaded into the Jupyter
# notebook
HVIDB_package_1 = pd.read_csv("HVIDB_VACV_interactions_1.csv")
HVIDB_package_2 = pd.read_csv("HVIDB_VACV_interactions_2.csv")
HVIDB_package_3 = pd.read_csv("HVIDB_VACV_interactions_3.csv")
HVIDB_package_4 = pd.read_csv("HVIDB_VACV_interactions_4.csv")
HVIDB_package_5 = pd.read_csv("HVIDB_VACV_interactions_5.csv")

# According to the HVIDB database, the total amount of interactions is
# 456
# It is checked whether this also applies to the five downloaded
# packages
packages_list = [
    HVIDB_package_1,
    HVIDB_package_2,
    HVIDB_package_3,
    HVIDB_package_4,
    HVIDB_package_5
]

assert sum(map(len, packages_list)) == 456, (
    "An error has been made during the manual download of the "
    "interaction packages!"
)

In [3]:
# Now, verify that all downloaded interaction packages are distinct from
# one another
# Unfortunately, checking equality for more than two DataFrames at once
# is not possible to the best of my knowledge
# Hence, it is resorted to for-loops
package_equality = False

for i, package in enumerate(packages_list):
    # For obvious reasons, equality checks are only performed between
    # different interaction packages
    # Moreover, redundant equality checks are avoided
    for j in range(i + 1, 5):
        package_equality = package_equality or package.equals(packages_list[j])

assert package_equality == False, (
    "Identical interaction packages have erroneously been downloaded!"
)

In [4]:
# Now, for the sake of convenience, concatenate the five individual
# interaction packages to generate one coherent CSV file
HVIDB_VACV_interactions_df = pd.concat(packages_list, ignore_index=True)

assert (
    len(HVIDB_VACV_interactions_df) == 456
    and
    len(HVIDB_VACV_interactions_df.columns) == 4
), "A mistake was done during DataFrame concatenation!"

In [9]:
# Save the coherent DataFrame as a CSV file
HVIDB_VACV_interactions_df.to_csv(
    "all_HVIDB_VACV_interactions.csv", index=False
)

In [5]:
# Determine the amount of genes targeted by siRNAs/esiRNAs and in the
# VACV screen
# Low-molecular weight organic compounds are taken into account as
# despite having the same final outcome as siRNAs, i.e. the inhibition
# of the action of proteins, low-molecular weight compounds and siRNAs
# differ from one another in their mode of action: The former come into
# play after protein translation has taken place, whereas the latter
# unfold their effect on the post-transcriptional level by targeting
# mRNAs
# Hence, exactly the same effect is not expected
single_pooled_siRNA_and_esiRNA_STRING_IDs_df = pd.read_csv(
    "single_pooled_siRNA_and_esiRNA_STRING_IDs.csv",
    delimiter="\t"
)

n_targets = len(single_pooled_siRNA_and_esiRNA_STRING_IDs_df)

print(
    "Amount of human proteins targeted in the VACV screen by "
    f"single/pooled siRNAs as well as esiRNAs: {n_targets:,}"
)

Amount of human proteins targeted in the VACV screen by single/pooled siRNAs as well as esiRNAs: 18,395


In [6]:
# Now, determine the amount of human genes interrogated in the VACV
# screen that engage in PPIs recorded by HVIDB
# To this end, the protein target names have to be converted in
# UniProtIDs as they are employed by HVIDB
# This is achieved via the ID mapping GUI on
# https://www.uniprot.org/id-mapping
VACV_screen_human_targets_series = (
    single_pooled_siRNA_and_esiRNA_STRING_IDs_df["queryItem"]
)

VACV_screen_human_targets_series.to_csv(
    "VACV_screen_human_targets_single_pooled_siRNA_and_esiRNA_gene_names.txt",
    header=False,
    index=False
)

In [1]:
# Using the parameters "from: Gene_Name", "to: UniProtKB-Swiss-Prot" and
# "taxId: 9606", 18,185 of the 18,396 human protein IDs were mapped to
# 18,645 results, whereas 213 ID were not mapped
# This is subjected to close scrutiny
#
# For the ALG1L gene, the official gene symbol has apparently been
# changed after the VACV screen has been conducted; its new official
# gene symbol is ALG1L1P while its NCBI Gene ID remained unchanged
# (200810)
#
# Regarding the gene AKAP2, the NCBI Gene ID has been changed from 11217
# to 445815; note that AKAP2 is only an alias and that the offical gene
# symbol is PALM2AKAP2
#
# With respect to the gene PPAN-P2RY11, neither the official gene symbol
# nor the NCBI Gene ID have been altered after the VACV screen, but the
# respective UniProtKB entry belongs to the UniProtKB/TrEMBL section,
# i.e. has not been manually reviewed. Consequently, using the setting
# "to: UniProtKB-Swiss-Prot" does not yield any UniProtID for this gene
# It becomes apparent that the most convenient way of obtaining
# UniProtIDs in one single step is to use the settings
# "From database: Genomic annotation databases/GeneID" (Gene ID is the
# same as NCBI Gene ID) and "To database: UniProt/UniProtKB" (choosing
# UniProtKB includes both UniProtKB sections, i.e. UniProtKB/Swiss-Prot
# and UniProtKB/TrEMBL)
#
# With regard to the gene CDY, the specified NCBI Gene ID is still valid
# (203611), but its official gene symbol is CDY2B
#
# As to the gene FLJ40296, the specified NCBI Gene ID is still valid
# (122183), but its official gene symbol is PRR20A
#
# Regarding the gene MGC39606, the specified NCBI Gene ID is still valid
# (399668), but its official gene symbol is SMIM10L2A
#
# etc.

In [6]:
# Instead of doing this investigation manually, it is accomplished in an
# automated fashion by means of the Biotite bioinformatics library
# Biotite provides functionalities allowing to access various NCBI
# databases in a programmatic manner, i.e. from within Python scripts

# Two columns of interest for the following endeavour are "ID" as well
# as "ID_manufacturer"
# Upon closer scrutiny, it became apparent that contrary to their names,
# both features basically harbour the same information, namely the NCBI
# Gene ID of the targeted gene
# However, they differ in that while the feature "ID" is populated
# depending on whether the respective experiment was successful or not,
# "ID_manufacturer" is continuously populated, irrespective of the
# experiment's outcome
# Hence, the entries of the "ID_manufacturer" feature are employed in
# order to query NCBI's gene database
# As a first step, it is checker whether all entries of the
# "ID_manufacturer" feature are indeed populated with values or whether
# there are isolated occurrences of "Not available"
# Note that the features "ID" and "ID_manufacturer" only harbour NCBI
# Gene IDs in the case of siRNA, pooled siRNA and esiRNA

# Load the screen data
# Bear in mind that for certain columns, the data type has to be
# manually specified
dtype_dict = {
    "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87": str,
    "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB": str,
    "Gene_Description": str,
    "ID": str,
    "ID_OnTarget_Ensembl_GRCh38_release_87": str,
    "ID_OnTarget_Merge": str,
    "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB": str,
    "ID_OnTarget_RefSeq_20170215": str,
    "ID_manufacturer": str,
    "Name_alternatives": str,
    "PLATE_QUALITY_DESCRIPTION": str,
    "RefSeq_ID_OnTarget_RefSeq_20170215": str,
    "Seed_sequence_common": str,
    "WELL_QUALITY_DESCRIPTION": str,
    "siRNA_error": str,
    "siRNA_number": str,
    "Precursor_Name": str
}

# Dask DataFrames exhibit a peculiarity regarding the index labels: By
# default, the index labels are integers, just as with Pandas
# DataFrames. However, unlike Pandas DataFrames, the index labels do not
# monotonically increase from 0, but restart at 0 for each partition,
# thereby resulting in duplicated index labels (Dask subdivides a Dask
# DataFram into multiple so-called partitions as the whole idea behind
# Dask is to handle large data sets in a memory-efficient way, https://
# docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.reset_
# index.html)
# Hence, performing operations with Dask DataFrames might potentially
# raise the `ValueError: cannot reindex on an axis with duplicate
# labels` error
# In this case, loading the entire data set into a Pandas DataFrame is
# feasible, which is why this is preferred to loading it into a Dask
# DataFrame (strangely enough, this has not been possible in the very
# beginning, which is why Dask was used in the first place)
main_csv_df = pd.read_csv(
    "VacciniaReport_20170223-0958_ZScored_conc_and_NaN_adjusted.csv",
    sep="\t",
    dtype=dtype_dict
)

# Bear in mind that due to operator precedence, i.e. "|" (logical OR)
# having precedence over equality checks, the equality checks have to be
# surrounded by parentheses
single_pooled_siRNA_and_esiRNA_df = main_csv_df.loc[
    (main_csv_df["WellType"] == "SIRNA")
    |
    (main_csv_df["WellType"] == "POOLED_SIRNA")
    |
    (main_csv_df["WellType"] == "ESIRNA")
]

In [7]:
single_pooled_siRNA_and_esiRNA_IDs = single_pooled_siRNA_and_esiRNA_df[
    "ID_manufacturer"
]

assert "Not available" not in single_pooled_siRNA_and_esiRNA_IDs, (
    "Contrary to our expectations, the entry \"Not available\" does "
    "occur in the feature \"ID_manufacturer\"!"
)

In [8]:
ID_change_str = "This record was replaced with GeneID:"

# Iterate over the DataFrame rows using the iterrows() method
for i, row in main_csv_df.iterrows():
    # Incorporating kind of a numeric progress bar
    if i % 10000 == 0:
        print(i)
    well_type = row["WellType"]
    if (
        well_type == "SIRNA"
        or
        well_type == "POOLED_SIRNA"
        or
        well_type == "ESIRNA"
    ):
        gene_name = row["Name"]
        NCBI_Gene_ID = row["ID_manufacturer"]
        
        # Query NCBI's gene database with the Gene ID currently dealt
        # with
        # Code execution is suspended for two seconds in order to avoid
        # server-side errors
        # The VACV data set comprises 132,066 measurements involving
        # single siRNAs, pooled siRNAs and esiRNAs
        # For each of those measurements, the NCBI database is queried
        # at least once, entailing the suspension of code execution for
        # two seconds; in total, this amounts to a "waiting time" of
        # 264,132 seconds, which corresponds to slightly more than three
        # days; this period of time is acceptable
        time.sleep(2)
        NCBI_entry = entrez.fetch_single_file(
            uids=[NCBI_Gene_ID],
            file_name=None,
            db_name="gene",
            ret_type="",
            ret_mode="text"
        )

        # As the file_name is specified to be None, Biotite's
        # fetch_single_file() function returns a StringIO object
        # It's content can be accessed via the getvalue() method
        # Note that the getvalue() method is preferred to the read()
        # method as the latter moves the cursor to the last index so
        # that repeatedly using the read() method returns an empty
        # string
        NCBI_entry_str = NCBI_entry.getvalue()

        # Different approaches are necessary depending on whether merely
        # the official gene symbol was altered, whereas the NCBI Gene ID
        # remained unchanged, or the record has been replaced altogether
        # with an entirely new ID
        if ID_change_str in NCBI_entry_str:
            # The respective record has been replaced altogether with a
            # new ID
            # Hence, the new ID is retrieved and used to query NCBI's
            # gene database
            NCBI_entry_str_list = NCBI_entry_str.split("\n")
            # For some strange reason, the string retrieved from the
            # NCBI entry contains blank lines; they are removed
            while "" in NCBI_entry_str_list:
                NCBI_entry_str_list.remove("")
            
            # The new ID is comprised in the penultimate list element
            # and conveniently enough separated from the preceding
            # string by a space character
            new_gene_ID = NCBI_entry_str_list[-1].split()[-1]
            main_csv_df.at[i, "ID"] = new_gene_ID
            main_csv_df.at[i, "ID_manufacturer"] = new_gene_ID

            # Again, in a bid to prevent the occurrence of server-side
            # errors, code execution is suspended for two seconds
            time.sleep(2)
            NCBI_entry = entrez.fetch_single_file(
                uids=[new_gene_ID],
                file_name=None,
                db_name="gene",
                ret_type="",
                ret_mode="text"
            )
            NCBI_entry_str = NCBI_entry.getvalue()
            NCBI_entry_str_list = NCBI_entry_str.split("\n")
            while "" in NCBI_entry_str_list:
                NCBI_entry_str_list.remove("")
            
            # The official gene symbol is comprised in the first list
            # element, but is preceded by the string "1. ",
            # which encompasses three characters
            official_gene_symbol = NCBI_entry_str_list[0][3:]

            main_csv_df.at[i, "Name"] = official_gene_symbol
        else:
            # Remove blank lines from the string retrieved from the NCBI
            # entry
            NCBI_entry_str_list = NCBI_entry_str.split("\n")
            while "" in NCBI_entry_str_list:
                NCBI_entry_str_list.remove("")
            
            # Following the removal of empty strings, the official gene
            # symbol is represented by the first list element, but it is
            # preceded by the string "1. ", which encompasses three
            # characters
            # Hence, prior to comparing the gene names provided by the
            # VACV screen data set to the official gene symbols, the
            # first list element has to be sliced accordingly
            official_gene_symbol = NCBI_entry_str_list[0][3:]
            if gene_name != official_gene_symbol:
                main_csv_df.at[i, "Name"] = official_gene_symbol
            
            # Irrespective of whether the gene name provided by the VACV
            # data set and the official gene symbol match or not, the
            # corresponding cell of the "ID" feature is populated with
            # the NCBI Gene ID harboured by the "ID_manufacturer"
            # feature (remember that cells of the "ID" feature are not
            # continuously populated)
            main_csv_df.at[i, "ID"] = NCBI_Gene_ID

KeyboardInterrupt: 

In [None]:
# As a last step, save the new Pandas DataFrame to a new CSV file
main_csv_df.to_csv(
    "Vaccinia_Report_NCBI_Gene_IDs_and_official_gene_symbols_updated.csv",
    index=False
)

In [2]:
# A UniProt reference proteome has been found, which encompasses 219
# proteins (the respective web page claims that it encompasses 218
# proteins, but a closer look at the FASTA file reveals that they are
# 219 instead)
# It is checked whether these 219 proteins comprise all interacting VACV
# proteins recorded by HVIDB or whether HVIDB contains VACV proteins not
# comprised in the reference proteome
# First, perform a numerical check
HVIDB_VACV_interactions_df = pd.read_csv(
    "all_HVIDB_VACV_interactions.csv"
)
print(HVIDB_VACV_interactions_df.columns)

Index(['Human-virus PPI', 'Experimental System', 'PubMed ID', 'Virus acronym'], dtype='object')


In [3]:
# The VACV UniProt IDs are comprised in the first column bearing the
# name "Human-virus PPI"; to be more precise, the first column
# represents an interaction pair between a human and a VACV protein as a
# combination of the respective UniProt IDs, which are separated by a
# hyphen
# The first UniProt ID represents a human protein, whereas the second
# UniProt ID represents a VACV protein
interaction_pair_IDs = HVIDB_VACV_interactions_df[
    "Human-virus PPI"
].to_list()

In [4]:
VACV_interaction_IDs = [
    pair.split("-")[1] for pair in interaction_pair_IDs
]

assert len(interaction_pair_IDs) == len(VACV_interaction_IDs), (
    "Something went wrong!"
)

In [5]:
# Bear in mind that one and the same protein can engage in more than one
# PPI, which also applies to VACV
# Hence, duplicate UniProt IDs are discarded
VACV_unique_interaction_IDs = np.unique(VACV_interaction_IDs).tolist()

In [6]:
# Scrutinising the FASTA file revealed that it indeed comprises 218
# sequences as claimed by the UniProt web page (the ">" character, which
# represents the start of a header, has an additional occurrence in an
# entry description)
assert len(VACV_unique_interaction_IDs) <= 218, (
    "The amount of VACV proteins involved in interspecies PPI recorded "
    "by HVIDB exceeds the size of the UniProt reference proteome "
    "(219)."
)

print(
    "Total amount of proteins involved in interspecies PPI recorded by "
    f"HVIDB: {len(VACV_unique_interaction_IDs)}"
)

Total amount of proteins involved in interspecies PPI recorded by HVIDB: 61


In [7]:
# Now, beyond a simple numerical check, it is investigated for each VACV
# protein in HVIDB individually whether it is included in the UniProt
# reference proteome or not
# Conveniently enough, the header contains the respective protein's
# UniProt ID, thereby obviating the necessity of ID conversion
# Gather the UniProt IDs of the proteins comprised in the UniProt
# reference proteome
ref_proteome_fasta = fasta.FastaFile.read(
    "uniprotkb_proteome_UP000000344_2024_08_05.fasta"
)
ref_proteome_dict = fasta.get_sequences(ref_proteome_fasta)
ref_proteome_headers = list(ref_proteome_dict.keys())
ref_IDs = [header.split("|")[1] for header in ref_proteome_headers]

absence_list = [0] * len(VACV_unique_interaction_IDs)

for i, query_ID in enumerate(VACV_unique_interaction_IDs):
    if query_ID not in ref_IDs:
        absence_list[i] = 1

assert sum(absence_list) == 0, (
    f"{sum(absence_list)} proteins involved in interspecies PPIs "
    "recorded by HVIDB do not occur in the UniProt reference proteome!"
)

AssertionError: 18 proteins involved in interspecies PPIs recorded by HVIDB do not occur in the UniProt reference proteome!

In [9]:
# Determine the UniProt IDs of those 18 proteins not comprised in the
# reference proteome
outcast_protein_indices = np.nonzero(absence_list)
outcast_protein_IDs = np.array(VACV_unique_interaction_IDs)[
    outcast_protein_indices
]

In [10]:
print(outcast_protein_IDs)

['A4GDF4' 'O57173' 'O57263' 'P04299' 'P20505' 'P20639' 'P68318' 'P68451'
 'P68467' 'Q1M1E0' 'Q49PX0' 'Q49QD4' 'Q711A3' 'Q76RC6' 'Q76ZR2' 'Q86638'
 'Q8BDF8' 'Q8UYL3']


In [14]:
VACV_proteome_fasta = ref_proteome_fasta.copy()

# Querying the UniProtKB database with a couple of the IDs not included
# in the reference genome reveals that some IDs are unreviewes, while
# others have a low annotation score and where therefore not
# incorporated
# The functionalities provided by Biotite are again used in order to
# retrieve the respective proteins' sequences from the UniProt database,
# add them to the sequences from the reference proteome and save the
# resulting concatenation to a new FASTA file
# As a list of IDs is passed to `uniprot.fetch()` and `target_path` is
# not specified, a list of StringIO objects is returned which can be
# iterated over
for io_object in uniprot.fetch(outcast_protein_IDs, "fasta"):
    # Read the StringIO object into a Fasta file
    fasta_file = fasta.FastaFile.read(io_object)
    # Each file contains only one entry; hence, the first and only
    # element is retrieved from the the iterator returned by the 
    # `items()` method
    header, seq_str = list(fasta_file.items())[0]
    
    VACV_proteome_fasta[header] = seq_str

# Strangely enough, creating a deep copy of a FASTA file resets its
# length to zero although it is not empty (applies to Biotite version
# 0.39.0); hence, instead of evaluating whether the new file's length
# equals the old file's length plus the amount of new entries, it is
# checked whether the new file's length equals the amount of newly added
# items
assert len(VACV_proteome_fasta) == 18, (
    "Something went wrong while adding the 18 entries to the reference "
    "proteome!"
)

In [15]:
print(len(VACV_proteome_fasta))
print(VACV_proteome_fasta)

18
>sp|P01136|VGF_VACCW Pro-Viral epidermal growth factor OS=Vaccinia virus (strain Western Reserve) OX=10254 GN=OPG019 PE=1 SV=1
MSMKYLMLLFAAMIIRSFADSGNAIETTSPEITNATTDIPAIRLCGPEGDGYCLHGDCIH
ARDIDGMYCRCSHGYTGIRCQHVVLVDYQRSENPNTTTSYIPSPGIMLVLVGIIIITCCL
LSVYRFTRRTKLPIQDMVVP
>sp|P04021|PG057_VACCW Envelope phospholipase OPG057 OS=Vaccinia virus (strain Western Reserve) OX=10254 GN=OPG057 PE=1 SV=1
MWPFASVPAGAKCRLVETLPENMDFRSDHLTTFECFNEIITLAKKYIYIASFCCNPLSTT
RGALIFDKLKEASEKGIKIIVLLDERGKRNLGELQSHCPDINFITVNIDKKNNVGLLLGC
FWVSDDERCYVGNASFTGGSIHTIKTLGVYSDYPPLATDLRRRFDTFKAFNSAKNSWLNL
CSAACCLPVSTAYHIKNPIGGVFFTDSPEHLLGYSRDLDTDVVIDKLKSAKTSIDIEHLA
IVPTTRVDGNSYYWPDIYNSIIEAAINRGVKIRLLVGNWDKNDVYSMATARSLDALCVQN
DLSVKVFTIQNNTKLLIVDDEYVHITSANFDGTHYQNHGFVSFNSIDKQLVSEAKKIFER
DWVSSHSKSLKI
>sp|P04298|MCEL_VACCW mRNA-capping enzyme catalytic subunit OS=Vaccinia virus (strain Western Reserve) OX=10254 GN=OPG113 PE=1 SV=1
MDANVVSSSTIATYIDALAKNASELEQRSTAYEINNELELVFIKPPLITLTNVVNISTIQ
ESFIRFTVTNKEGVKIRTKIPLSKVHGLDV

In [None]:
# 64858 DCLRE1B
# Incorporate time.sleep() in order to avoid the error?

In [None]:
# A0 in Inkscape
# Prospective Poster
# Describe Task, Setting, Methods, intermediate results (e.g. XGBoost)
# Use HZDR corporate Design/CASUS template for poster (both HZDR and CASUS)
# Regarding compounds and siRNA: they are different, but one could be used
# to validate the other
# Exactly the same effect is not expected