In [10]:
import pandas as pd
import dask.dataframe as dd
from biotite.database import entrez

from requests.exceptions import ChunkedEncodingError

In [2]:
# To my knowledge, HVIDB offers no way of downloading all VACV
# interactions in one go, so they were exported manually in packages of
# up to 100 interactions each
# Manual exports are error-prone (e.g. the same package might be saved
# twice), which is why the packages are sanity-checked in the following
# To begin with, the five packages are read into the notebook
(
    HVIDB_package_1,
    HVIDB_package_2,
    HVIDB_package_3,
    HVIDB_package_4,
    HVIDB_package_5
) = (
    pd.read_csv(f"HVIDB_VACV_interactions_{package_number}.csv")
    for package_number in range(1, 6)
)

packages_list = [
    HVIDB_package_1,
    HVIDB_package_2,
    HVIDB_package_3,
    HVIDB_package_4,
    HVIDB_package_5
]

# HVIDB reports 456 VACV interactions in total; the row counts of the
# five packages must add up to exactly that number
assert sum(len(package) for package in packages_list) == 456, (
    "An error has been made during the manual download of the "
    "interaction packages!"
)

In [3]:
# Verify that the downloaded interaction packages are pairwise distinct
# DataFrame.equals() compares exactly two DataFrames, so every unordered
# pair of packages is checked once: each package is compared only with
# the packages following it in the list, which skips self-comparisons
# as well as redundant (b, a) checks after (a, b)
# The pairs are derived from the list itself rather than from a
# hard-coded package count, so the check stays valid if the number of
# downloaded packages changes
package_equality = any(
    first_package.equals(second_package)
    for position, first_package in enumerate(packages_list)
    for second_package in packages_list[position + 1:]
)

assert not package_equality, (
    "Identical interaction packages have erroneously been downloaded!"
)

In [4]:
# For convenience, stack the five interaction packages into one single
# DataFrame with a fresh, continuous index
HVIDB_VACV_interactions_df = pd.concat(packages_list, ignore_index=True)

# The combined table is expected to hold 456 rows and 4 columns
assert HVIDB_VACV_interactions_df.shape == (456, 4), (
    "A mistake was done during DataFrame concatenation!"
)

In [9]:
# Persist the combined interactions as one CSV file; the index is left
# out of the file
HVIDB_VACV_interactions_df.to_csv(
    path_or_buf="all_HVIDB_VACV_interactions.csv",
    index=False
)

In [5]:
# Count the entries of the tab-separated STRING ID table, i.e. the genes
# targeted by siRNAs/esiRNAs and low-molecular weight organic compounds
# in the VACV screen
siRNA_and_compounds_STRING_IDs_df = pd.read_csv(
    "siRNA_and_compounds_STRING_IDs.csv", sep="\t"
)

n_targets = siRNA_and_compounds_STRING_IDs_df.shape[0]

print(
    f"Amount of human proteins targeted in the VACV screen: {n_targets:,}"
)

Amount of human proteins targeted in the VACV screen: 18,396


In [13]:
# Next, the number of human genes interrogated in the VACV screen that
# take part in PPIs recorded by HVIDB is to be determined
# HVIDB works with UniProt IDs, so the target names first have to be
# converted via the ID mapping GUI on
# https://www.uniprot.org/id-mapping
# For that purpose, the "queryItem" column is written to a plain text
# file with one entry per line (neither header nor index)
VACV_screen_human_targets_series = siRNA_and_compounds_STRING_IDs_df.loc[
    :, "queryItem"
]

VACV_screen_human_targets_series.to_csv(
    "VACV_screen_human_targets_gene_names.txt",
    index=False,
    header=False
)

In [1]:
# Using the parameters "from: Gene_Name", "to: UniProtKB-Swiss-Prot" and
# "taxId: 9606", 18,185 of the 18,396 human protein IDs were mapped to
# 18,645 results, whereas 213 IDs were not mapped
# This is subjected to close scrutiny
#
# For the ALG1L gene, the official gene symbol has apparently been
# changed after the VACV screen has been conducted; its new official
# gene symbol is ALG1L1P while its NCBI Gene ID remained unchanged
# (200810)
#
# Regarding the gene AKAP2, the NCBI Gene ID has been changed from 11217
# to 445815; note that AKAP2 is only an alias and that the official gene
# symbol is PALM2AKAP2
#
# With respect to the gene PPAN-P2RY11, neither the official gene symbol
# nor the NCBI Gene ID have been altered after the VACV screen, but the
# respective UniProtKB entry belongs to the UniProtKB/TrEMBL section,
# i.e. has not been manually reviewed. Consequently, using the setting
# "to: UniProtKB-Swiss-Prot" does not yield any UniProtID for this gene
# It becomes apparent that the most convenient way of obtaining
# UniProtIDs in one single step is to use the settings
# "From database: Genomic annotation databases/GeneID" (Gene ID is the
# same as NCBI Gene ID) and "To database: UniProt/UniProtKB" (choosing
# UniProtKB includes both UniProtKB sections, i.e. UniProtKB/Swiss-Prot
# and UniProtKB/TrEMBL)
#
# With regard to the gene CDY, the specified NCBI Gene ID is still valid
# (203611), but its official gene symbol is CDY2B
#
# As to the gene FLJ40296, the specified NCBI Gene ID is still valid
# (122183), but its official gene symbol is PRR20A
#
# Regarding the gene MGC39606, the specified NCBI Gene ID is still valid
# (399668), but its official gene symbol is SMIM10L2A
#
# etc.

In [11]:
# Instead of doing this investigation manually, it is accomplished in an
# automated fashion by means of the Biotite bioinformatics library
# Biotite provides functionalities allowing to access various NCBI
# databases in a programmatic manner, i.e. from within Python scripts

# Two columns of interest for the following endeavour are "ID" as well
# as "ID_manufacturer"
# Upon closer scrutiny, it became apparent that contrary to their names,
# both features basically harbour the same information, namely the NCBI
# Gene ID of the targeted gene
# However, they differ in that while the feature "ID" is populated
# depending on whether the respective experiment was successful or not,
# "ID_manufacturer" is continuously populated, irrespective of the
# experiment's outcome
# Hence, the entries of the "ID_manufacturer" feature are employed in
# order to query NCBI's gene database
# As a first step, it is checked whether all entries of the
# "ID_manufacturer" feature are indeed populated with values or whether
# there are isolated occurrences of "Not available"
# Note that the features "ID" and "ID_manufacturer" only harbour NCBI
# Gene IDs in the case of siRNA, pooled siRNA and esiRNA

# Load the screen data lazily with Dask
# For the columns listed below, the data type has to be specified
# manually; all of them are read as strings
dtype_dict = {
    column_name: str
    for column_name in (
        "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
        "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
        "Gene_Description",
        "ID",
        "ID_OnTarget_Ensembl_GRCh38_release_87",
        "ID_OnTarget_Merge",
        "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
        "ID_OnTarget_RefSeq_20170215",
        "ID_manufacturer",
        "Name_alternatives",
        "PLATE_QUALITY_DESCRIPTION",
        "RefSeq_ID_OnTarget_RefSeq_20170215",
        "Seed_sequence_common",
        "WELL_QUALITY_DESCRIPTION",
        "siRNA_error",
        "siRNA_number",
        "Precursor_Name"
    )
}

main_csv_df = dd.read_csv(
    "VacciniaReport_20170223-0958_ZScored_conc_and_NaN_adjusted.csv",
    dtype=dtype_dict,
    sep="\t"
)

# Keep only the wells whose type is single siRNA, pooled siRNA or esiRNA
# A single membership test replaces three equality checks chained with
# the element-wise OR operator "|"
single_pooled_siRNA_and_esiRNA_df = main_csv_df.loc[
    main_csv_df["WellType"].isin(["SIRNA", "POOLED_SIRNA", "ESIRNA"])
]

In [12]:
single_pooled_siRNA_and_esiRNA_IDs = single_pooled_siRNA_and_esiRNA_df[
    "ID_manufacturer"
]

# Caution: applying the "in" operator to a pandas Series tests the
# index labels, not the values, so `"Not available" not in series`
# would pass no matter what the column contains
# Hence, the values themselves are compared element-wise (missing values
# compare as unequal and therefore do not trigger the assertion)
assert not (
    single_pooled_siRNA_and_esiRNA_IDs.compute().eq("Not available").any()
), (
    "Contrary to our expectations, the entry \"Not available\" does occur "
    "in the feature \"ID_manufacturer\"!"
)

In [13]:
# Extract an in-memory pandas DataFrame encompassing the features
# "WellType", "ID", "ID_manufacturer" and "Name"
# Dask's read_csv() restarts the index at 0 for every partition, so the
# computed DataFrame can carry duplicate index labels
# As label-based writes such as DataFrame.at would then hit several rows
# at once, the index is replaced by a unique, continuous one
sub_csv_df = main_csv_df[
    ["WellType", "ID", "ID_manufacturer", "Name"]
].compute().reset_index(drop=True)

In [14]:
ID_change_str = "This record was replaced with GeneID:"

# Only these well types harbour NCBI Gene IDs in "ID_manufacturer"
RNAi_well_types = ("SIRNA", "POOLED_SIRNA", "ESIRNA")


def _fetch_gene_summary_lines(gene_ID):
    """Fetch the text record of one NCBI Gene ID as non-blank lines.

    As file_name is None, Biotite's fetch_single_file() returns a
    StringIO object. Its content is read via getvalue() rather than
    read(), since read() moves the cursor to the end so that repeated
    calls return an empty string.

    Parameters
    ----------
    gene_ID : str
        The NCBI Gene ID to query NCBI's gene database with.

    Returns
    -------
    list of str
        The lines of the retrieved record, with empty lines removed.
    """
    NCBI_entry = entrez.fetch_single_file(
        uids=[gene_ID],
        file_name=None,
        db_name="gene",
        ret_type="",
        ret_mode="text"
    )
    # The retrieved text contains blank lines; they are dropped
    return [
        line for line in NCBI_entry.getvalue().split("\n") if line != ""
    ]


def _resolve_gene_record(gene_ID):
    """Determine the current Gene ID and official symbol of a gene.

    Parameters
    ----------
    gene_ID : str
        The NCBI Gene ID as provided by the VACV screen data set.

    Returns
    -------
    tuple of (str, str)
        The currently valid NCBI Gene ID (identical to the input unless
        the record has been replaced) and the official gene symbol.

    Raises
    ------
    ValueError
        If a record is empty or a replacement notice lacks the new ID.
    """
    summary_lines = _fetch_gene_summary_lines(gene_ID)

    # If the record has been replaced altogether, the new ID follows the
    # replacement notice, separated by a space character
    # The line carrying the notice is located explicitly instead of
    # relying on its position within the record
    replacement_lines = [
        line for line in summary_lines if ID_change_str in line
    ]
    if replacement_lines:
        tokens_after_notice = (
            replacement_lines[0].partition(ID_change_str)[2].split()
        )
        if not tokens_after_notice:
            raise ValueError(
                f"No replacement Gene ID found for Gene ID {gene_ID}"
            )
        gene_ID = tokens_after_notice[0]
        # The new ID is used to query NCBI's gene database once more
        summary_lines = _fetch_gene_summary_lines(gene_ID)

    if not summary_lines:
        raise ValueError(f"Empty NCBI record for Gene ID {gene_ID}")

    # The official gene symbol is comprised in the first line, but is
    # preceded by the string "1. ", which encompasses three characters
    return gene_ID, summary_lines[0][3:]


# The same Gene ID occurs in many rows; each ID is looked up only once
resolved_gene_records = {}

# Iterate over the DataFrame rows using the iterrows() method
# Note that DataFrame.at addresses rows by index label, i.e. the writes
# below rely on the index labels of sub_csv_df being unique
for i, row in sub_csv_df.iterrows():
    if row["WellType"] not in RNAi_well_types:
        continue

    gene_name = row["Name"]
    NCBI_Gene_ID = row["ID_manufacturer"]

    if NCBI_Gene_ID not in resolved_gene_records:
        # As before, a failed lookup is reported and stops the loop, but
        # KeyboardInterrupt/SystemExit are no longer intercepted and
        # the underlying error is shown along with the offending entry
        try:
            resolved_gene_records[NCBI_Gene_ID] = _resolve_gene_record(
                NCBI_Gene_ID
            )
        except Exception as error:
            print(NCBI_Gene_ID, gene_name, repr(error))
            break

    current_gene_ID, official_gene_symbol = resolved_gene_records[
        NCBI_Gene_ID
    ]

    # Irrespective of whether the record was replaced or not, the "ID"
    # feature is populated with the currently valid NCBI Gene ID
    # (remember that cells of the "ID" feature are not continuously
    # populated); "ID_manufacturer" only changes for replaced records
    sub_csv_df.at[i, "ID"] = current_gene_ID
    sub_csv_df.at[i, "ID_manufacturer"] = current_gene_ID
    # The gene name is aligned with the official gene symbol
    sub_csv_df.at[i, "Name"] = official_gene_symbol

64858 DCLRE1B
