In [1]:
import csv

import numpy as np
import dask.dataframe as dd
import requests



In [2]:
# Read the screen data from disk
# For a number of columns the data type has to be stated explicitly;
# every column listed below is read as a string
string_typed_columns = [
    "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
    "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "Gene_Description",
    "ID",
    "ID_OnTarget_Ensembl_GRCh38_release_87",
    "ID_OnTarget_Merge",
    "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "ID_OnTarget_RefSeq_20170215",
    "ID_manufacturer",
    "Name_alternatives",
    "PLATE_QUALITY_DESCRIPTION",
    "RefSeq_ID_OnTarget_RefSeq_20170215",
    "Seed_sequence_common",
    "WELL_QUALITY_DESCRIPTION",
    "siRNA_error",
    "siRNA_number",
    "Precursor_Name",
]
dtype_dict = dict.fromkeys(string_typed_columns, str)

# Despite the ".csv" extension, the file is parsed with a tab as the
# field separator
main_csv_df = dd.read_csv(
    "VacciniaReport_20170223-0958_ZScored_conc_and_NaN_adjusted.csv",
    dtype=dtype_dict,
    sep="\t",
)

In [3]:
# According to the feature "WellType", eight different types of
# experiments have been conducted, which are the following:
# 1.) COMPOUND: refers to small molecules, i.e. low-molecular weight
# organic compounds; note that for small molecules, the feature "Name"
# represents the name of the small compound, whereas the proteins
# targeted by the small molecules are specified by the feature
# "Name_alternatives"
# 2.) CONTROL
# 3.) ESIRNA, i.e. endoribonuclease-prepared siRNA, which differs from
# the conventional siRNA in that it is not chemically synthesised, but
# generated by enzymatic digestion of long dsRNA molecules
# 4.) MIRNA_INHIBITOR
# 5.) MIRNA_MIMIC
# 6.) Not available
# 7.) POOLED_SIRNA, i.e. transfection of mammalian cells has been
# performed with an ensemble of four different siRNAs
# 8.) SIRNA, i.e. transfection of mammalian cells has been performed
# with a single type of siRNA molecule

In [4]:
# Collect the target genes of all wells in which the knockdown was
# carried out with a single siRNA, a pool of siRNAs or an esiRNA
# isin() performs the membership test element-wise, so the three well
# types do not have to be compared one by one and combined with "|"
target_name_series_single_pooled_siRNA_and_esiRNA = main_csv_df.loc[
    main_csv_df["WellType"].isin(["SIRNA", "POOLED_SIRNA", "ESIRNA"])
]["Name"]

# The placeholder "Not available" occurs among the entries of the
# feature "Name" and is dropped here
# Duplicates are deliberately left in place: the list is converted into
# a set later on, which discards them anyway, so np.unique() is not
# needed
target_names_single_pooled_siRNA_and_esiRNA = [
    target_name
    for target_name in target_name_series_single_pooled_siRNA_and_esiRNA
    if target_name != "Not available"
]

In [5]:
# Extract target genes knocked down by low-molecular weight organic
# compounds
target_name_series_compounds = main_csv_df.loc[
    main_csv_df["WellType"] == "COMPOUND"
]["Name_alternatives"]


def _split_compound_target_names(raw_name):
    """Split one "Name_alternatives" entry into individual target names.

    The entries require special processing for three reasons:

    1. Some entries encompass multiple names separated by commas or the
       ampersand sign.
    2. Related proteins are separated by forward slashes. The forward
       slash only occurs in two situations, the first of which is
       "HER1/2" and the second of which is "Aurora / CDK".
    3. In the case of TGF-beta, a question mark occurs in lieu of the
       Greek letter and is replaced with the word "beta".

    Parameters
    ----------
    raw_name : object
        One entry of the "Name_alternatives" column.

    Returns
    -------
    list of str
        The individual target names, stripped of surrounding whitespace.
        The list is empty if `raw_name` is not a string (e.g. a missing
        value) or contains no name at all.
    """
    # Missing values are represented by non-string objects (NaN); there
    # is nothing to split in that case
    if not isinstance(raw_name, str):
        return []

    name = raw_name
    if "TGF" in name:
        name = name.replace("?", "beta")

    # Turn both slash notations into comma-separated lists
    name = name.replace("HER1/2", "HER1,HER2").replace(" / ", ",")

    # Treat the ampersand as just another separator so that entries
    # mixing commas and ampersands are split completely
    pieces = name.replace("&", ",").split(",")

    # Separators are usually followed by a blank (e.g. "A, B"); without
    # stripping, " B" and "B" would count as two different names
    return [piece.strip() for piece in pieces if piece.strip()]


target_names_compounds = []

for name in target_name_series_compounds:
    target_names_compounds.extend(_split_compound_target_names(name))

# As the list is later converted into a set and sets do not allow
# duplicates anyway, applying np.unique() is not necessary

In [6]:
# Now, determine the union of both lists
# Creating the union does not include duplicates
# The iteration order of a set of strings is arbitrary and may change
# from one kernel run to the next (string hashes are randomised per
# process), so the union is sorted in order to obtain the same list, and
# hence the same request and the same output file, on every run
target_names_siRNA_and_compounds = sorted(
    set(target_names_single_pooled_siRNA_and_esiRNA)
    |
    set(target_names_compounds)
)

In [7]:
# Map the target names to STRING identifiers via the STRING REST API
string_api_url = "https://version-12-0.string-db.org/api"
output_format = "tsv"
method = "get_string_ids"

# Time limits in seconds: the first value bounds establishing the
# connection, the second one bounds waiting for data from the server
# Without a timeout, requests waits indefinitely and an unresponsive
# server would stall the notebook
request_timeout = (10, 300)

# Set parameters
params = {
    "identifiers": "\r".join(target_names_siRNA_and_compounds), # Protein list
    "species": 9606, # Species NCBI identifier, in this case human
    "limit": 1, # Only one (best) identifier per input protein
    "echo_query": 1, # See your input identifiers in the output
    "caller_identity": "Elvis Jagger Abdul-Jabbar"
}

# Construct the URL
request_url = "/".join([string_api_url, output_format, method])

# Call STRING
results = requests.post(request_url, data=params, timeout=request_timeout)

# An explicit check is used instead of an assert statement, as asserts
# are removed when Python runs with optimisations enabled; the status
# code is included in the message to ease troubleshooting
if results.status_code != 200:
    raise RuntimeError(
        "The request was not successful! (HTTP status code: {})".format(
            results.status_code
        )
    )

In [9]:
# Save the result to a CSV file
# with statements are preferred for the purpose of working with files as
# they automatically take care of closing files, even in case of errors/
# exceptions
# The csv module requires files to be opened with newline=""; otherwise,
# an additional "\r" is written at the end of each row on platforms that
# translate "\n" into "\r\n"
with open("siRNA_and_compounds_STRING_IDs.csv", "w", newline="") as f:
    # By using a tab in lieu of a comma as delimiter, a TSV instead of a
    # CSV is generated, technically speaking
    csv_writer = csv.writer(f, delimiter="\t")

    # Every line of the response is written, including the first one
    # (presumably the column header returned by STRING; the code loading
    # this file skips the first row)
    for line in results.text.strip().split("\n"):
        split_line = line.split("\t")
        # Only the first and the third column are kept, i.e. the input
        # identifier and the STRING identifier
        input_identifier, string_identifier = split_line[0], split_line[2]
        csv_writer.writerow([input_identifier, string_identifier])

In [16]:
csv_per_row_list = []

# Load the STRING identifiers from the csv file
# The csv module requires files to be opened with newline=""
with open("siRNA_and_compounds_STRING_IDs.csv", "r", newline="") as f:
    csv_reader = csv.reader(f, delimiter="\t")
    # Skip the header row as it does not contain any actual data
    # The default value prevents a StopIteration in case of an empty
    # file
    next(csv_reader, None)

    for row in csv_reader:
        # Each row in the CSV file is retrieved as a list the elements
        # of which are the tab-separated values
        # Thus, appending the list of each row to `csv_per_row_list`
        # results in a list of lists, i.e. a nested list
        # Blank lines are returned as empty lists and are skipped
        if row:
            csv_per_row_list.append(row)

# Extract the second column, i.e. the STRING identifiers, as a tuple
# In contrast to unpacking the result of zip(*csv_per_row_list), this
# also works when the file contains no data rows, and it does not
# assign to `_`, which IPython uses for the most recent cell output
siRNA_and_compounds_string_IDs = tuple(row[1] for row in csv_per_row_list)