In [2]:
import pandas as pd
import dask.dataframe as dd

In [3]:
# As far as I am aware, HVIDB offers no way to export all VACV
# interactions in one go, so they were fetched manually in packages of up
# to 100 interactions each
# Every package is read into its own DataFrame so that the packages can
# be checked against slips of the manual download (e.g. the same package
# having been fetched twice)
package_paths = (
    "HVIDB_VACV_interactions_1.csv",
    "HVIDB_VACV_interactions_2.csv",
    "HVIDB_VACV_interactions_3.csv",
    "HVIDB_VACV_interactions_4.csv",
    "HVIDB_VACV_interactions_5.csv",
)
packages_list = [pd.read_csv(path) for path in package_paths]

# Keep the individual packages addressable by name as well
(
    HVIDB_package_1,
    HVIDB_package_2,
    HVIDB_package_3,
    HVIDB_package_4,
    HVIDB_package_5,
) = packages_list

# HVIDB reports 456 interactions in total; the row counts of the five
# packages have to add up to exactly that number
assert sum(len(package) for package in packages_list) == 456, (
    "An error has been made during the manual download of the "
    "interaction packages!"
)

In [5]:
# Verify that no two downloaded interaction packages are identical, which
# would indicate that the same package was downloaded twice by mistake
# DataFrame.equals compares exactly two DataFrames, so the packages are
# compared pairwise: each package is checked only against the packages
# that follow it in the list, which skips self-comparisons and avoids
# testing the same pair twice
# The pairs are derived from the list itself rather than from a
# hard-coded package count, so every pair is covered regardless of how
# many packages the list holds; any() stops at the first identical pair
package_equality = any(
    package.equals(other_package)
    for i, package in enumerate(packages_list)
    for other_package in packages_list[i + 1:]
)

assert not package_equality, (
    "Identical interaction packages have erroneously been downloaded!"
)

In [8]:
# For convenience, stack the five interaction packages on top of each
# other into a single DataFrame; ignore_index=True discards the
# per-package row labels in favour of one continuous index
HVIDB_VACV_interactions_df = pd.concat(
    packages_list, axis=0, ignore_index=True
)

# The combined DataFrame must hold all 456 interactions in exactly 4
# columns
assert HVIDB_VACV_interactions_df.shape == (456, 4), (
    "A mistake was done during DataFrame concatenation!"
)

In [9]:
# Persist the combined interactions as a single CSV file; the row index
# is left out of the file
HVIDB_VACV_interactions_df.to_csv(
    path_or_buf="all_HVIDB_VACV_interactions.csv",
    index=False,
)

In [11]:
# Count the targets of the siRNAs/esiRNAs and low-molecular weight
# organic compounds used in the VACV screen
# The table is read as tab-separated despite its .csv extension, and
# each of its rows is counted as one target
siRNA_and_compounds_STRING_IDs_df = pd.read_csv(
    "siRNA_and_compounds_STRING_IDs.csv", sep="\t"
)

n_targets = siRNA_and_compounds_STRING_IDs_df.shape[0]

print(
    f"Amount of human proteins targeted in the VACV screen: {n_targets:,}"
)

Amount of human proteins targeted in the VACV screen: 18,396


In [13]:
# Next goal: find out how many of the human genes interrogated in the
# VACV screen take part in PPIs recorded by HVIDB
# HVIDB employs UniProt IDs, so the target names from the "queryItem"
# column first have to be converted into UniProt IDs
# The conversion is carried out via the ID mapping GUI at
# https://www.uniprot.org/id-mapping, which is why the names are written
# to a plain text file here, without header and without index
VACV_screen_human_targets_series = (
    siRNA_and_compounds_STRING_IDs_df["queryItem"]
)

VACV_screen_human_targets_series.to_csv(
    path_or_buf="VACV_screen_human_targets_gene_names.txt",
    index=False,
    header=False,
)

In [15]:
# Sanity check: number of entries in the "queryItem" column
print(siRNA_and_compounds_STRING_IDs_df["queryItem"].size)

18396
