In [1]:
import csv

import numpy as np
import pandas as pd
import dask.dataframe as dd
import requests



In [2]:
# Load the screen data
# The columns listed below have to be read as strings, which is why their
# data type is specified manually instead of being left to type inference
string_columns = [
    "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
    "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "Gene_Description",
    "ID",
    "ID_OnTarget_Ensembl_GRCh38_release_87",
    "ID_OnTarget_Merge",
    "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "ID_OnTarget_RefSeq_20170215",
    "ID_manufacturer",
    "Name_alternatives",
    "PLATE_QUALITY_DESCRIPTION",
    "RefSeq_ID_OnTarget_RefSeq_20170215",
    "Seed_sequence_common",
    "WELL_QUALITY_DESCRIPTION",
    "siRNA_error",
    "siRNA_number",
    "Precursor_Name",
]

# Every listed column is mapped to `str`
dtype_dict = dict.fromkeys(string_columns, str)

# Despite its ".csv" extension, the file is read as tab-separated
main_csv_df = dd.read_csv(
    "VacciniaReport_20170223-0958_ZScored_conc_and_NaN_adjusted.csv",
    dtype=dtype_dict,
    sep="\t",
)

In [3]:
# According to the feature "WellType", eight different types of
# experiments have been conducted, which are the following:
# 1.) COMPOUND: refers to small molecules, i.e. low-molecular weight
# organic compounds; note that for small molecules, the feature "Name"
# represents the name of the small compound, whereas the proteins
# targeted by the small molecules are specified by the feature
# "Name_alternatives"
# 2.) CONTROL
# 3.) ESIRNA, i.e. endoribonuclease-prepared siRNA, which differs from
# the conventional siRNA in that it is not chemically synthesised, but
# generated by enzymatic digestion of long dsRNA molecules
# 4.) MIRNA_INHIBITOR
# 5.) MIRNA_MIMIC
# 6.) Not available
# 7.) POOLED_SIRNA, i.e. transfection of mammalian cells has been
# performed with an ensemble of four different siRNAs
# 8.) SIRNA, i.e. transfection of mammalian cells has been performed
# with only one single type of siRNA molecules

In [4]:
# Extract the target genes knocked down via single siRNAs, pooled siRNAs
# and esiRNAs
# Testing membership with isin() replaces three equality checks combined
# via "|" (element-wise or), which would each require parentheses because
# "|" binds more tightly than "=="
knockdown_well_types = ["SIRNA", "POOLED_SIRNA", "ESIRNA"]

target_name_series_single_pooled_siRNA_and_esiRNA = main_csv_df.loc[
    main_csv_df["WellType"].isin(knockdown_well_types),
    "Name"
]

# Entries of the feature "Name" that read "Not available" are skipped
# Duplicates are deliberately kept at this point: the list is converted
# into a set later on, which removes them anyway
target_names_single_pooled_siRNA_and_esiRNA = []

for target_name in target_name_series_single_pooled_siRNA_and_esiRNA:
    if target_name == "Not available":
        continue
    target_names_single_pooled_siRNA_and_esiRNA.append(target_name)

In [5]:
# Extract target genes knocked down by low-molecular weight organic
# compounds
target_name_series_compounds = main_csv_df.loc[
    main_csv_df["WellType"] == "COMPOUND"
]["Name_alternatives"]


def _split_compound_target_names(raw_name):
    """Split one "Name_alternatives" entry into individual target names.

    The names of genes targeted by small molecules require special
    processing for three reasons: Firstly, some entries encompass
    multiple names separated by commas or the ampersand sign. Secondly,
    related proteins are separated from each other via forward slashes;
    this only occurs as "HER1/2" and as "Aurora / CDK". Thirdly, in the
    case of TGF-beta, a question mark occurs in lieu of the Greek
    letter and is replaced with the word "beta".

    Parameters
    ----------
    raw_name : object
        One entry of the "Name_alternatives" column.

    Returns
    -------
    list of str
        The individual names with surrounding whitespace removed. Empty
        fragments are dropped, and a non-string entry (e.g. a missing
        value) yields an empty list instead of raising a TypeError.
    """
    if not isinstance(raw_name, str):
        return []

    cleaned = raw_name
    if "TGF" in cleaned:
        cleaned = cleaned.replace("?", "beta")
    cleaned = cleaned.replace("HER1/2", "HER1,HER2").replace(" / ", ",")

    # Treat the ampersand as just another separator so that entries
    # containing both commas and ampersands are split completely
    fragments = cleaned.replace("&", ",").split(",")

    # Splitting "A, B" leaves a leading space on " B"; strip it so that
    # identical names compare equal when the list is turned into a set
    return [fragment.strip() for fragment in fragments if fragment.strip()]


target_names_compounds = []

for name in target_name_series_compounds:
    target_names_compounds += _split_compound_target_names(name)

# As the list is later converted into a set and sets do not allow
# duplicates anyway, removing duplicates here is not necessary

In [6]:
# Convert both name lists into sets; sets do not hold duplicate values,
# so every target name is retained exactly once
siRNA_target_name_set = set(target_names_single_pooled_siRNA_and_esiRNA)
compound_target_name_set = set(target_names_compounds)

# Union of the targets of single/pooled siRNAs, esiRNAs and compounds
target_names_siRNA_and_compounds = list(
    siRNA_target_name_set | compound_target_name_set
)

# Unique target names for single/pooled siRNAs and esiRNAs on their own
target_names_siRNA = list(siRNA_target_name_set)

In [8]:
# Map the target names of siRNAs and compounds to STRING identifiers via
# the "get_string_ids" method of the STRING API
string_api_url = "https://version-12-0.string-db.org/api"
output_format = "tsv"
method = "get_string_ids"

# Set parameters
params = {
    "identifiers": "\r".join(target_names_siRNA_and_compounds), # Protein list
    "species": 9606, # Species NCBI identifier, in this case human
    "limit": 1, # Only one (best) identifier per input protein
    "echo_query": 1, # See your input identifiers in the output
    "caller_identity": "Elvis Jagger Abdul-Jabbar"
}

# Construct the URL
request_url = "/".join([string_api_url, output_format, method])

# Call STRING
# Without a timeout, requests waits forever if the server stops
# responding; allow 10 s to connect and 300 s between received bytes
results = requests.post(request_url, data=params, timeout=(10, 300))

assert results.status_code == 200, "The request was not successful!"

In [8]:
# Save the result to a CSV file
# with statements are preferred for the purpose of working with files as
# they automatically take care of closing files, even in case of errors/
# exceptions
# The csv module requires files to be opened with newline=""; otherwise,
# an additional "\r" is written on platforms using "\r\n" line endings,
# which shows up as a blank row after each record
with open("siRNA_and_compounds_STRING_IDs.csv", "w", newline="") as f:
    # By using a tab in lieu of a comma as delimiter, a TSV instead of a
    # CSV is generated, technically speaking
    csv_writer = csv.writer(f, delimiter="\t")

    # The first line of the response is the header row; it is written to
    # the file as well
    for line in results.text.strip().split("\n"):
        split_line = line.split("\t")
        # Keep the first and the third column of each response line
        input_identifier, string_identifier = split_line[0], split_line[2]
        csv_writer.writerow([input_identifier, string_identifier])

In [9]:
# Map the target names to STRING IDs (Ensembl IDs) for single/pooled
# siRNAs and esiRNAs separately
params = {
    "identifiers": "\r".join(target_names_siRNA), # Protein list
    "species": 9606, # Species NCBI identifier, in this case human
    "limit": 1, # Only one (best) identifier per input protein
    "echo_query": 1, # See your input identifiers in the output
    "caller_identity": "Elvis Jagger Abdul-Jabbar"
}

# Construct the URL
request_url = "/".join([string_api_url, output_format, method])

# Call STRING
# Without a timeout, requests waits forever if the server stops
# responding; allow 10 s to connect and 300 s between received bytes
results = requests.post(request_url, data=params, timeout=(10, 300))

assert results.status_code == 200, "The request was not successful!"

In [10]:
# Also save the results for single/pooled siRNAs and esiRNAs to a CSV
# file
# The csv module requires files to be opened with newline=""; otherwise,
# an additional "\r" is written on platforms using "\r\n" line endings,
# which shows up as a blank row after each record
with open(
    "single_pooled_siRNA_and_esiRNA_STRING_IDs.csv", "w", newline=""
) as f:
    # The tab delimiter makes this a TSV file, technically speaking
    csv_writer = csv.writer(f, delimiter="\t")

    # The first line of the response is the header row; it is written to
    # the file as well
    for line in results.text.strip().split("\n"):
        split_line = line.split("\t")
        # Keep the first and the third column of each response line
        input_identifier, string_identifier = split_line[0], split_line[2]
        csv_writer.writerow([input_identifier, string_identifier])

In [9]:
# Load the STRING identifiers from the csv file
# The csv module expects files to be opened with newline=""
with open("siRNA_and_compounds_STRING_IDs.csv", "r", newline="") as f:
    csv_reader = csv.reader(f, delimiter="\t")
    # Skip the header row as it does not contain any actual data; the
    # default of None prevents a StopIteration in case of an empty file
    next(csv_reader, None)

    # Each row in the CSV file is retrieved as a list the elements of
    # which are the tab-separated values, so collecting the rows results
    # in a list of lists, i.e. a nested list
    # Blank rows are retrieved as empty lists and are left out
    csv_per_row_list = [row for row in csv_reader if row]

# The second column of each row holds the STRING identifier
# Picking the column directly also works when no data rows are present
# (the result is an empty list) and avoids assigning to `_`, which
# IPython uses for the output of the previously executed cell
siRNA_and_compounds_string_IDs = [row[1] for row in csv_per_row_list]

In [10]:
# Apart from the ordinary STRING database, which focuses on intraspecies
# protein-protein interactions (PPI), there is the Viruses.STRING
# database, introduced in October 2018, which focuses on virus-virus and
# virus-host PPI
# Author's note: contrary to the claim on the website, this resource
# cannot be accessed via a REST API, which is why the result file is
# downloaded from the website and loaded from disk here
# Only the two columns holding the external identifiers of the
# interaction partners are loaded
cols_to_use = ["node1_external_id", "node2_external_id"]

string_interaction_VACV_human_df = pd.read_csv(
    "string_interactions_VACV_homo_sapiens.tsv",
    usecols=cols_to_use,
    sep="\t",
)

In [22]:
# Flag every target of the screen that takes part in a PPI with a
# vaccinia virus (VACV) protein according to Viruses.STRING
# The NCBI identifier of vaccinia virus is 10245; it is prepended to the
# identifiers of VACV proteins and separated from them by a dot
# Only the prefix is compared, as the digits "10245" may also occur
# within the identifier of a human protein (e.g. "...ENSP00000310245")
vacv_id_prefix = "10245."

# Map each STRING ID to the position of its first occurrence in
# `siRNA_and_compounds_string_IDs`; this is the position list.index()
# would return, but looking it up does not require scanning the list for
# every row of the data frame
first_idx_by_string_ID = {}
for idx, string_ID in enumerate(siRNA_and_compounds_string_IDs):
    first_idx_by_string_ID.setdefault(string_ID, idx)

# Initialise a list the elements of which are zeros
# The amount of zeros in the list corresponds to the length of the list
# `siRNA_and_compounds_string_IDs`
int_with_VACV_siRNA_and_compounds = [0] * len(siRNA_and_compounds_string_IDs)

# Iterate over the row labels and the two identifier columns in parallel
for i, int_partner_1, int_partner_2 in zip(
    string_interaction_VACV_human_df.index,
    string_interaction_VACV_human_df["node1_external_id"],
    string_interaction_VACV_human_df["node2_external_id"]
):
    # Determine whether the first or the second interaction partner
    # occurs in the list of human proteins targeted by siRNAs and
    # compounds; the first partner takes precedence
    if int_partner_1 in first_idx_by_string_ID:
        human_prot_ID, other_prot_ID = int_partner_1, int_partner_2
    elif int_partner_2 in first_idx_by_string_ID:
        human_prot_ID, other_prot_ID = int_partner_2, int_partner_1
    else:
        continue

    # The other interaction partner has to belong to vaccinia virus in
    # order for the criterion of host-pathogen PPI to be met
    if not str(other_prot_ID).startswith(vacv_id_prefix):
        continue

    # Report the hit; the ID is printed twice so that the output of this
    # cell stays the same as before (row label, ID, ID, blank line)
    print(i)
    print(human_prot_ID)
    print(human_prot_ID)
    print()

    # Alter the entry of `int_with_VACV_siRNA_and_compounds` at the
    # position of the human protein from 0 to 1
    human_prot_idx = first_idx_by_string_ID[human_prot_ID]
    int_with_VACV_siRNA_and_compounds[human_prot_idx] = 1

9
9606.ENSP00000376943
9606.ENSP00000376943

10
9606.ENSP00000256458
9606.ENSP00000256458

18
9606.ENSP00000415941
9606.ENSP00000415941

19
9606.ENSP00000419425
9606.ENSP00000419425

21
9606.ENSP00000400175
9606.ENSP00000400175

22
9606.ENSP00000393312
9606.ENSP00000393312

23
9606.ENSP00000425561
9606.ENSP00000425561

79
9606.ENSP00000376943
9606.ENSP00000376943

81
9606.ENSP00000256458
9606.ENSP00000256458



In [15]:
# Count the entries that have been switched from 0 to 1, i.e. the
# positions flagged as interacting with a VACV protein
n_recorded_interactions = np.count_nonzero(int_with_VACV_siRNA_and_compounds)

summary_prefix = (
    "Amount of human proteins involved in interspecies PPIs recorded "
    "by Viruses.STRING: "
)
print(summary_prefix + str(n_recorded_interactions))

Amount of human proteins involved in interspecies PPIs recorded by Viruses.STRING: 7


In [16]:
# In total, 13 virus-host PPIs between VACV and Homo sapiens are
# recorded
# So far, it has only been checked whether the targeted genes in the
# VACV screen are involved in interspecies PPI at all; thus, multiple
# interactions involving one and the same human protein are not captured
# It is therefore investigated whether there indeed is a discrepancy
# between the human proteins involved in interspecies PPI according to
# Viruses.STRING and the VACV screen or whether all human interaction
# partners captured by Viruses.STRING also occur in the VACV screen, but
# are involved in multiple interactions
# The TSV file from Viruses.STRING harbours both intra- and interspecies
# PPI; since we are only interested in interspecies PPI occurring
# between VACV and Homo sapiens, we extract them from the TSV file
cols_to_use = [
    "#node1",
    "node2",
    "node1_external_id",
    "node2_external_id"
]

string_interaction_VACV_human_df = pd.read_csv(
    "string_interactions_VACV_homo_sapiens.tsv",
    delimiter="\t",
    usecols=cols_to_use
)

# The NCBI identifier of VACV is 10245, while that of Homo sapiens is
# 9606; it is prepended to the protein identifier and separated from it
# by a dot
# Only the prefix is compared, as the digits of one taxon identifier may
# also occur within a protein identifier of the other species (e.g.
# "9606.ENSP00000310245" contains "10245")
# Missing identifiers are treated as not matching (na=False)
node1_ids = string_interaction_VACV_human_df["node1_external_id"]
node2_ids = string_interaction_VACV_human_df["node2_external_id"]

node1_is_human = node1_ids.str.startswith("9606.", na=False)
node2_is_human = node2_ids.str.startswith("9606.", na=False)
node1_is_VACV = node1_ids.str.startswith("10245.", na=False)
node2_is_VACV = node2_ids.str.startswith("10245.", na=False)

# The criterion for an interspecies PPI is met when one interaction
# partner originates from VACV, whereas the other originates from Homo
# sapiens
string_interspecies_interactions = string_interaction_VACV_human_df.loc[
    (node1_is_human & node2_is_VACV)
    |
    (node1_is_VACV & node2_is_human)
]

assert len(string_interspecies_interactions) == 13, (
    "Row extraction of the Pandas DataFrame has not been successful!"
)

In [17]:
# Show the extracted interspecies PPI as a plain-text table
print(string_interspecies_interactions)

      #node1    node2 node1_external_id     node2_external_id
9   VACWR040  BCL2L11    10245.F1_VACCW  9606.ENSP00000376943
10  VACWR039    IRAK2    10245.K7_VACCW  9606.ENSP00000256458
11  VACWR040     BAK1    10245.F1_VACCW  9606.ENSP00000353878
12  VACWR178    TRAF6   10245.A52_VACCW  9606.ENSP00000337853
13  VACWR039    DDX3X    10245.K7_VACCW  9606.ENSP00000382840
18        C3      C4B   10245.VCP_VACCW  9606.ENSP00000415941
19  VACWR030     PPIA    10245.M1_VACCW  9606.ENSP00000419425
21  VACWR050     RHOA   10245.F11_VACCW  9606.ENSP00000400175
22  VACWR101    FGFR1    10245.H3_VACCW  9606.ENSP00000393312
23  VACWR085    EIF4E    10245.G7_VACCW  9606.ENSP00000425561
79  VACWR028  BCL2L11    10245.N1_VACCW  9606.ENSP00000376943
80  VACWR039    TRAF6    10245.K7_VACCW  9606.ENSP00000337853
81  VACWR178    IRAK2   10245.A52_VACCW  9606.ENSP00000256458


In [None]:
# The following human interaction partners involved in interspecies PPI
# are also part of the VACV screen:
# BCL2L11 (reliable measurement)
# IRAK2 (reliable measurement)
# BAK1 (reliable measurement)
# TRAF6 (reliable measurement)
# DDX3X (reliable measurement)
# C4B (unreliable measurement, but values are available)
# PPIA (unreliable measurement, but values are available)
# RHOA (measurement quality unknown, but values are available)
# FGFR1 (reliable measurement)
# EIF4E (reliable measurement)
#
# STRING ID and Viruses.STRING ID do not match for the following
# proteins:
# BAK1 has a STRING ID of ENSP00000363591 and a Viruses.STRING ID of
# ENSP00000353878
# TRAF6 has a STRING ID of ENSP00000433623 and a Viruses.STRING ID of
# ENSP00000337853
# DDX3X has a STRING ID of ENSP00000494040 and a Viruses.STRING ID of
# ENSP00000382840
#
# The discrepancy between the 13 interspecies PPI recorded by
# Viruses.STRING and the 7 human proteins flagged in the VACV screen
# can be explained as follows:
# Three proteins, namely BCL2L11, TRAF6 and IRAK2, are involved in two
# interspecies PPI each, so that the 13 PPI involve only 10 different
# human proteins (13 - 3 = 10)
# Apart from that, for three of these proteins, namely BAK1, TRAF6 and
# DDX3X, the STRING ID and the Viruses.STRING ID do not match for some
# reason, so that they are not found via their ID (10 - 3 = 7)
#
# In conclusion, the interspecies PPI recorded by Viruses.STRING involve
# 10 different human interaction partners. Now, the following problem
# emerged: For 3 of those 10 human interaction partners, the measurement
# either is erroneous/of poor quality or the measurement quality is
# unknown; nonetheless, measurement values are available for C4B, PPIA
# and RHOA