In [1]:
import numpy as np
import dask.dataframe as dd



In [2]:
# Load the screen data
# Bear in mind that for certain columns, the data type has to be
# manually specified
dtype_dict = {
    "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87": str,
    "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB": str,
    "Gene_Description": str,
    "ID": str,
    "ID_OnTarget_Ensembl_GRCh38_release_87": str,
    "ID_OnTarget_Merge": str,
    "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB": str,
    "ID_OnTarget_RefSeq_20170215": str,
    "ID_manufacturer": str,
    "Name_alternatives": str,
    "PLATE_QUALITY_DESCRIPTION": str,
    "RefSeq_ID_OnTarget_RefSeq_20170215": str,
    "Seed_sequence_common": str,
    "WELL_QUALITY_DESCRIPTION": str,
    "siRNA_error": str,
    "siRNA_number": str,
    "Precursor_Name": str
}

main_csv_df = dd.read_csv(
    "VacciniaReport_20170223-0958_ZScored_conc_and_NaN_adjusted.csv",
    sep="\t",
    dtype=dtype_dict
)

In [None]:
# According to the feature "WellType", eight different types of
# experiments have been conducted, which are the following:
# 1.) COMPOUND: refers to small molecules, i.e. low-molecular weight
# organic compounds; note that for small molecules, the feature "Name"
# represents the name of the small compound, whereas the proteins
# targeted by the small molecules are specified by the feature
# "Name_alternatives"
# 2.) CONTROL
# 3.) ESIRNA, i.e. endoribonuclease-prepared siRNA, which differs from
# the conventional siRNA in that it is not chemically synthesised, but
# generated by enzymatic digestion of long dsRNA molecules
# 4.) MIRNA_INHIBITOR
# 5.) MIRNA_MIMIC
# 6.) Not available
# 7.) POOLED_SIRNA, i.e. transfection of mammalian cells has been
# performed with an ensemble of four different siRNAs
# 8.) SIRNA, i.e. transfection of mammalian cells has been performed
# with only one single type of siRNA molecules

In [3]:
# Extract target genes knocked down via single siRNAs, pooled siRNAs and
# esiRNAs
# Bear in mind that due operator precedence, i.e. "|" (logical or)
# having precedence over equality checks, the equality checks must be
# surrounded by parentheses
target_name_series_single_pooled_siRNA_and_esiRNA = main_csv_df.loc[
    (main_csv_df["WellType"] == "SIRNA")
    |
    (main_csv_df["WellType"] == "POOLED_SIRNA")
    |
    (main_csv_df["WellType"] == "ESIRNA")
]["Name"]

# Some entries of the feature "Name" are "Not available", those need to
# be filtered out
# As the list is later converted into a set and sets do not allow
# duplicates anyway, applying np.unique() is not necessary
target_names_single_pooled_siRNA_and_esiRNA = [
    name for name
    in np.unique(target_name_series_single_pooled_siRNA_and_esiRNA)
    if name != "Not available"
]

In [6]:
# Extract target genes knocked down by low-molecular weight organic
# compounds
target_name_series_compounds = main_csv_df.loc[
    main_csv_df["WellType"] == "COMPOUND"
]["Name_alternatives"]

# The names of genes targeted by small molecules require special
# processing for three reasons: Firstly, some entries encompass multiple
# names separated by commata or the ampersand sign. Secondly, in case of
# proteins belonging to the same family, such as HER1 and HER2, or
# otherwise related proteins, e.g. Aurora kinase and CDK, the individual
# proteins are separated from each other via forward slashes. Thirdly,
# in the case of TGF-beta, a question mark occurs in lieu of the
# corresponding Greek letter. Hence, the question mark has to be
# replaced with the word "beta".
# Unfortunately, iterable unpacking cannot be used in conjunction with
# list comprehensions, which is why for loops must be resorted to
target_names_compounds = []

for name in target_name_series_compounds:
    if "TGF" in name:
        name = name.replace("?", "beta")
    # The forward slash only occurs in two situations, the first of
    # which is "HER1/2" and the second if which is "Aurora / CDK"
    if "1/2" in name:
        name = name.replace("HER1/2", "HER1,HER2")
    if " / " in name:
        name = name.replace(" / ", ",")
    if "," in name:
        target_names_compounds += name.split(",")
    elif "&" in name:
        target_names_compounds += name.split("&")
    else:
        target_names_compounds.append(name)

# As the list is later converted into a set and sets do not allow
# duplicates anyway, applying np.unique() is not necessary
#target_names_compounds = np.unique(target_names_compounds).tolist()

In [7]:
# Now, determine the union of both lists
# Creating the union does not include duplicates
target_names_siRNA_and_compounds = list(
    set(target_names_single_pooled_siRNA_and_esiRNA)
    |
    set(target_names_compounds)
)

In [8]:
print(target_names_siRNA_and_compounds)

['ANO10', 'C1orf151', 'CCDC83', 'LOC126536', 'GDF15', 'GDI2', 'RP9P', 'XPO6', 'C15orf60', 'LOC151171', 'RBM38', 'MRPL15', 'HNRPDL', 'NRSN2', 'FAM123A', 'ZFR2', 'HPX', 'MTA2', 'PICALM', 'ACSF2', 'DBF4', 'MRPL18', 'ATP6V0B', 'OR8K5', 'HBE1', 'PPP1R13B', 'EML5', 'GTPBP6', 'PDE1A', 'OR5H6', 'ACOT9', 'RAB5C', 'PRPH', 'TTK', 'KCNK3', 'SLC2A11', 'LRRC33', 'PCDP1', 'PANX2', 'CDC26', 'C1QL1', 'C9orf24', 'TAF13', 'TIAM1', 'GCN1L1', 'ACOT4', 'C10orf111', 'TFF2', 'SEC11A', 'UGT2B15', 'FOXD1', 'C2orf73', 'KRTAP4-1', 'ALOX5AP', 'AKNA', 'ARID4A', 'KRT36', 'C1orf204', 'PANK3', 'PRKG2', 'OR8I2', 'RAD54L2', 'C9orf82', 'TFE3', 'PCDHGA5', 'ISG20', 'ARMCX4', 'LNX2', 'C20orf11', 'GPAA1', 'CLIP2', 'CASC1', 'KLF14', 'LTN1', 'ZNF530', 'HSF4', 'NALCN', 'POMC', 'CRHR1', 'G6PC', 'CCDC114', 'FAM133A', 'CCR5', 'MGST2', 'ELTD1', 'CIDEB', 'BEAN1', 'HSD17B4', 'TMEM39A', 'KCTD13', 'C20orf202', 'MAP3K15', 'IRX1', 'ACCN4', 'SLC16A14', 'RNF10', 'CC2D1B', 'OSTalpha', 'CSNK1A1L', 'HUNK', 'SPRR2G', 'TMEM98', 'STX5', 'SDHC', 