In [10]:
# The purpose of this Jupyter notebook is to perform an exploration of
# the humongous CSV file
# In detail, for each column, its unique values are determined
# The aim is to identify the individual sub-screens comprised in the
# humongous CSV file, i.e. the kinome-wide screen, the full genome
# screen, etc.

# Due to the CSV file's enormous size, it cannot be loaded by Pandas as
# this would cause the kernel to die
# Instead, Dask is used, which has specifically been devised for
# handling large quantities of data
import numpy as np
import dask.dataframe as dd

In [11]:
# The data type of the columns listed below has to be specified
# manually; all of them are read as strings
string_columns = (
    "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
    "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "Gene_Description",
    "ID",
    "ID_OnTarget_Ensembl_GRCh38_release_87",
    "ID_OnTarget_Merge",
    "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "ID_OnTarget_RefSeq_20170215",
    "ID_manufacturer",
    "Name_alternatives",
    "PLATE_QUALITY_DESCRIPTION",
    "RefSeq_ID_OnTarget_RefSeq_20170215",
    "Seed_sequence_common",
    "WELL_QUALITY_DESCRIPTION",
    "siRNA_error",
    "siRNA_number",
    "Precursor_Name",
)
# Every listed column name is mapped to the type str, in the order given
# above
dtype_dict = dict.fromkeys(string_columns, str)

# The file is tab-separated despite its ".csv" extension
main_csv_df = dd.read_csv(
    "VacciniaReport_20170223-0958_ZScored_conc_and_NaN_adjusted.csv",
    dtype=dtype_dict,
    sep="\t",
)

In [3]:
# Collect the unique values of every column of the CSV file; the i-th
# entry of "unique_values_per_column" belongs to the i-th entry of
# "column_names"
column_names = list(main_csv_df.columns)
unique_values_per_column = [
    np.unique(main_csv_df[column_name]).tolist()
    for column_name in column_names
]

In [6]:
# Persist the unique values of all columns of the CSV file in a separate
# text file: one section per column, with consecutive sections being
# separated by a blank line
sections = [
    f"Unique values for the column \"{column_name}\":\n"
    f"{unique_values_list}"
    for column_name, unique_values_list in zip(
        column_names, unique_values_per_column
    )
]

# The with statement takes care of closing the file, even if an
# error/exception occurs while writing
# NOTE(review): the file name contains the typo "unqiue"; it is
# deliberately left untouched so that existing references to the file
# remain valid
with open("unqiue_values_for_all_columns.txt", "w") as f:
    f.write("\n\n".join(sections))

In [None]:
# Interesting features/columns potentially facilitating the navigation
# through this humongous table are:
# "PLATE_QUALITY_STATUS" in conjunction with
# "PLATE_QUALITY_DESCRIPTION", as these features make it possible to
# distinguish reliable measurements from unreliable ones
# "PLATE_TYPE", as this feature allows to differentiate between the
# plate types "CheckerBoard", "MockPlate" and "ScreeningPlate"
# "Experiment", as this feature allows to differentiate between
# different experiments
# "GENESET", as this feature allows to differentiate between different
# target sets, such as the entire genome or the kinome
# "WellType", as this feature allows to distinguish control wells from
# actual test wells and other well types
# "WELL_QUALITY_DESCRIPTION" and "WELL_QUALITY_STATUS", as those
# features make it possible to distinguish reliable measurements from
# unreliable ones
# "Manufacturer", as this feature allows to filter by the manufacturer
# "Name" and "Name_alternatives", as those features make it possible to
# identify the gene being knocked down
# "siRNA_error", as this feature allows to distinguish reliable
# measurements from unreliable ones
# "siRNA_number", as up to six different siRNAs have apparently been
# used
# "Concentration [pmol]", as this feature allows to investigate whether
# increasing the concentration also increases the gene suppression
# "Gene_Symbol" and "Accession", as these features make it possible to
# identify the individual genes being knocked down
# -> What is meant by "MD5", as in "Sequence_antisense_5_3_MD5"?
# "Gene_Description", as this feature gives further details regarding
# the gene being knocked down
# -> What is meant by "HMB", as in
# "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB"?
# -> What is meant by "intersection" and "merge"?
# -> What is meant by "Precursor_Name"?

In [12]:
# To start with, the kinome screen is subjected to scrutiny
# Only rows that belong to the gene set "Kinome" and, at the same time,
# have the well type "CONTROL" are retained
is_kinome = main_csv_df["GENESET"] == "Kinome"
is_control_well = main_csv_df["WellType"] == "CONTROL"
kinome_control_df = main_csv_df.loc[is_kinome & is_control_well]

In [13]:
# Restrict the kinome controls to rows the feature "PLATE_TYPE" of
# which has the value "ScreeningPlate"
is_screening_plate = kinome_control_df["PLATE_TYPE"] == "ScreeningPlate"
kinome_control_no_chequerboard_df = kinome_control_df.loc[
    is_screening_plate
]

In [14]:
# Determine, for each of the six kinome experiments, how many control
# wells located on screening plates it encompasses
# The filtering logic is written down once and applied to every
# experiment name in turn instead of being repeated per experiment
experiment_names = (
    "VACCINIA-AU-K1",
    "VACCINIA-AU-K2",
    "VACCINIA-DP-K1",
    "VACCINIA-DP-K2",
    "VACCINIA-DU-K1",
    "VACCINIA-QU-K1",
)

# Maps each experiment name to the Dask DataFrame holding exclusively
# the rows of that experiment
wells_per_experiment = {
    experiment_name: kinome_control_no_chequerboard_df.loc[
        kinome_control_no_chequerboard_df["Experiment"] == experiment_name
    ]
    for experiment_name in experiment_names
}

# Note that len() forces Dask to evaluate the respective filtered
# DataFrame, which is why this loop may take a while
for experiment_name, experiment_df in wells_per_experiment.items():
    print(
        f"Amount of wells involved in {experiment_name}: "
        f"{len(experiment_df)}"
    )

# The per-experiment DataFrames remain available under their individual
# names
AU_K1_df = wells_per_experiment["VACCINIA-AU-K1"]
AU_K2_df = wells_per_experiment["VACCINIA-AU-K2"]
DP_K1_df = wells_per_experiment["VACCINIA-DP-K1"]
DP_K2_df = wells_per_experiment["VACCINIA-DP-K2"]
DU_K1_df = wells_per_experiment["VACCINIA-DU-K1"]
QU_K1_df = wells_per_experiment["VACCINIA-QU-K1"]

Amount of wells involved in VACCINIA-AU-K1: 1326
Amount of wells involved in VACCINIA-AU-K2: 1326
Amount of wells involved in VACCINIA-DP-K1: 437
Amount of wells involved in VACCINIA-DP-K2: 437
Amount of wells involved in VACCINIA-DU-K1: 1748
Amount of wells involved in VACCINIA-QU-K1: 1736


In [9]:
# According to the feature "Experiment", there are six different
# experiments
# The amount of rows, i.e. wells, each of these six experiments
# encompasses is determined in order to find out how many wells/384
# well plates each individual experiment involved
# The counts refer to "kinome_control_df", i.e. to the control wells of
# the kinome screen irrespective of the plate type
# Note that the numbers reflect the state prior to filtering out
# unreliable measurements only if this cell is run before the cell that
# discards wells with the "WELL_QUALITY_STATUS" "BAD", as that cell
# overwrites "kinome_control_df"
experiment_names = (
    "VACCINIA-AU-K1",
    "VACCINIA-AU-K2",
    "VACCINIA-DP-K1",
    "VACCINIA-DP-K2",
    "VACCINIA-DU-K1",
    "VACCINIA-QU-K1",
)

# Maps each experiment name to the Dask DataFrame holding exclusively
# the rows of that experiment
wells_per_experiment = {
    experiment_name: kinome_control_df.loc[
        kinome_control_df["Experiment"] == experiment_name
    ]
    for experiment_name in experiment_names
}

# Note that len() forces Dask to evaluate the respective filtered
# DataFrame, which is why this loop may take a while
for experiment_name, experiment_df in wells_per_experiment.items():
    print(
        f"Amount of wells involved in {experiment_name}: "
        f"{len(experiment_df)}"
    )

# The per-experiment DataFrames remain available under their individual
# names
AU_K1_df = wells_per_experiment["VACCINIA-AU-K1"]
AU_K2_df = wells_per_experiment["VACCINIA-AU-K2"]
DP_K1_df = wells_per_experiment["VACCINIA-DP-K1"]
DP_K2_df = wells_per_experiment["VACCINIA-DP-K2"]
DU_K1_df = wells_per_experiment["VACCINIA-DU-K1"]
QU_K1_df = wells_per_experiment["VACCINIA-QU-K1"]

Amount of wells involved in VACCINIA-AU-K1: 1326
Amount of wells involved in VACCINIA-AU-K2: 1710
Amount of wells involved in VACCINIA-DP-K1: 1205
Amount of wells involved in VACCINIA-DP-K2: 1205
Amount of wells involved in VACCINIA-DU-K1: 2900
Amount of wells involved in VACCINIA-QU-K1: 2888


In [None]:
# For the control wells of the kinome screen, the unique values of each
# column/feature are determined and saved to a text file
column_names = list(kinome_control_df.columns)
unique_values_per_column = [
    np.unique(kinome_control_df[column_name]).tolist()
    for column_name in column_names
]

# One section per column; consecutive sections are separated by a blank
# line
sections = [
    f"Unique values for the column \"{column_name}\":\n"
    f"{unique_values_list}"
    for column_name, unique_values_list in zip(
        column_names, unique_values_per_column
    )
]

# The with statement takes care of closing the file, even in case of
# exceptions/errors
with open("unique_values_kinome_control.txt", "w") as f:
    f.write("\n\n".join(sections))

In [4]:
# Now that the control wells of the kinome subset have been chosen, the
# individual control types are determined
# To this end, the unique values of the "Name" column are determined
# Prior to this, unreliable measurements are discarded by excluding rows
# the value of which for the feature "WELL_QUALITY_STATUS" is "BAD"
# Note that "kinome_control_df" is overwritten here, so that every cell
# run afterwards operates on the filtered rows
kinome_control_df = kinome_control_df.loc[
    kinome_control_df["WELL_QUALITY_STATUS"] != "BAD"
]

# Ensure that filtering by "WELL_QUALITY_STATUS" removes all other
# unreliable measurements
# Each call of np.unique forces Dask to evaluate the respective column,
# which is why the unique values are determined only once per column
plate_quality_values = np.unique(
    kinome_control_df["PLATE_QUALITY_STATUS"]
).tolist()
siRNA_error_values = np.unique(kinome_control_df["siRNA_error"]).tolist()

# Two separate assertions are used so that a failure reveals which of
# the two expectations has been violated
assert "BAD" not in plate_quality_values, (
    "Rows with the \"PLATE_QUALITY_STATUS\" \"BAD\" remain after "
    "filtering by \"WELL_QUALITY_STATUS\""
)
# "Not available" must be the one and only value of "siRNA_error"
assert siRNA_error_values == ["Not available"], (
    f"Unexpected values for \"siRNA_error\": {siRNA_error_values}"
)

In [8]:
# Determine the distinct values of the "Name" column of the kinome
# controls and display them as a plain Python list
name_values = np.unique(kinome_control_df["Name"])
unique_values_name_column = name_values.tolist()
print(unique_values_name_column)

['ARPC3', 'ATP6V1A', 'Abi1', 'AllStars Hs Cell Death siRNA', 'CDC42', 'CDH4', 'Cyclohexamine - translation inhibition in eukaryotes', 'Cytosine Arabinoside - incorporates into DNA and inhibits DNA replication', 'FRAP1', 'GFP Duplex III', 'ITGB1', 'Kif11', 'MAP3K7', 'MET', 'MOCK', 'Not available', 'ON-TARGETplus Non-targeting Pool', 'PAK1', 'PI4KB', 'PSMA6', 'PSMC3', 'PXN', 'RAC1', 'SCRAMBLED', 'Silencer_Select_Negative_Control_1', 'Silencer_Select_Negative_Control_2', 'TSG101', 'UNKNOWN', 'egfp']


In [None]:
# 'ARPC3', x
# 'ATP6V1A', x
# 'Abi1', x
# 'AllStars Hs Cell Death siRNA', x
# 'CDC42', x
# 'CDH4', x
# 'Cyclohexamine - translation inhibition in eukaryotes', x
# 'Cytosine Arabinoside - incorporates into DNA and inhibits DNA replication', x
# 'FRAP1', x
# 'GFP Duplex III', x
# 'ITGB1', x
# 'Kif11', x
# 'MAP3K7', x
# 'MET', x
# 'MOCK', x
# 'Not available',
# 'ON-TARGETplus Non-targeting Pool',
# 'PAK1', x
# 'PI4KB', x
# 'PSMA6', x
# 'PSMC3', x
# 'PXN', x
# 'RAC1', x
# 'SCRAMBLED', x
# 'Silencer_Select_Negative_Control_1', x
# 'Silencer_Select_Negative_Control_2', x
# 'TSG101', x
# 'UNKNOWN', x
# 'egfp' x

In [12]:
# Report the amount of distinct values for the three columns "Name",
# "ID_openBIS" and "PublicationLink_material" of the kinome controls
for column in ("Name", "ID_openBIS", "PublicationLink_material"):
    n_unique_values = len(np.unique(kinome_control_df[column]))
    print(
        f"Amount of unique values in \"{column}\": "
        f"{n_unique_values}"
    )

Amount of unique values in "Name": 29
Amount of unique values in "ID_openBIS": 27
Amount of unique values in "PublicationLink_material": 27


In [None]:
# The column "Name" is inconsistent with the naming of the controls and
# also sometimes lacks names where the other two columns do not
# Hence, the column "ID_openBIS" is preferred for the purpose of
# identifying control types

In [5]:
# According to the feature "Experiment", there are six different
# experiments
# The amount of rows they encompass is determined in order to find out
# how many wells/384 well plates each individual experiment involved
# Note that these are the amounts of rows after filtering out
# unreliable measurements, provided that the cell discarding wells with
# the "WELL_QUALITY_STATUS" "BAD" has been run beforehand
experiment_names = (
    "VACCINIA-AU-K1",
    "VACCINIA-AU-K2",
    "VACCINIA-DP-K1",
    "VACCINIA-DP-K2",
    "VACCINIA-DU-K1",
    "VACCINIA-QU-K1",
)

# Maps each experiment name to the Dask DataFrame holding exclusively
# the rows of that experiment
wells_per_experiment = {
    experiment_name: kinome_control_df.loc[
        kinome_control_df["Experiment"] == experiment_name
    ]
    for experiment_name in experiment_names
}

# Note that len() forces Dask to evaluate the respective filtered
# DataFrame, which is why this loop may take a while
for experiment_name, experiment_df in wells_per_experiment.items():
    print(
        f"Amount of wells involved in {experiment_name}: "
        f"{len(experiment_df)}"
    )

# The per-experiment DataFrames remain available under their individual
# names
AU_K1_df = wells_per_experiment["VACCINIA-AU-K1"]
AU_K2_df = wells_per_experiment["VACCINIA-AU-K2"]
DP_K1_df = wells_per_experiment["VACCINIA-DP-K1"]
DP_K2_df = wells_per_experiment["VACCINIA-DP-K2"]
DU_K1_df = wells_per_experiment["VACCINIA-DU-K1"]
QU_K1_df = wells_per_experiment["VACCINIA-QU-K1"]

Amount of wells involved in VACCINIA-AU-K1: 1326
Amount of wells involved in VACCINIA-AU-K2: 1710
Amount of wells involved in VACCINIA-DP-K1: 976
Amount of wells involved in VACCINIA-DP-K2: 1205
Amount of wells involved in VACCINIA-DU-K1: 2900
Amount of wells involved in VACCINIA-QU-K1: 2504


In [None]:
"""
Questions to ask Artur:
1.) The two columns "ID_openBIS" and "PublicationLink_material"
consistently use the name "SCRAMBLED", whereas the column "Name" uses
both "SCRAMBLED" and "ON-TARGETplus Non-targeting Pool" for this control
type; are they the same? Apart from that, according to the alternative
name of "SCRAMBLED", the scrambled siRNAs consist of both targeting and
non-targeting siRNAs. This, however, contradicts what Artur told me
about the scrambled siRNAs, namely that they are entirely comprised of
non-targeting siRNAs. -> ...
2.) In the case of the kinome controls, the feature "siRNA_number"
consistently has the value 1, probably indicating that one single type
of siRNA was used. However, on scrutinising the feature
"Sequences_sense_5_3", it becomes apparent that multiple siRNA sequences
(4) are listed, thereby contradicting the value for "siRNA_number". The
precise amount of siRNAs is not mentioned in the manuscript either. So
how many siRNAs were employed for the individual controls? -> According
to the product information provided by Dharmacon, the term "SMARTpool",
which we also have here, indicates that a mixture of 4 siRNAs provided
as a single reagent were employed. While actually 4 individual
siRNAs have been employed, one single reagent comprising these 4 siRNAs
has been used. Hence, the value 1 of the feature "siRNA_number"
presumably refers to the single reagent comprising the 4 individual
siRNAs rather than to the 4 different siRNAs themselves.
3.) Speaking of siRNAs, the letter "u" probably represents overhang
nucleotides, doesn't it?
4.) The feature "REPLICATE" has the unique values 1 and 2, indicating
that only duplicates instead of triplicates have been made; is that
correct? -> indicates biological replicate
5.) From the feature "Experiment", it emerges that many of the
individual experiments have been performed twice, corroborating the
assumption that merely duplicates instead of triplicates have been made.
But what do the individual experiment names mean? Apart from that, there
are two experiments for which no duplicate seems to have been performed,
they are VACCINIA-DU-K1 and VACCINIA-QU-K1. -> just come to terms with
it
6.) What is the meaning behind the following features:
Sequence_antisense_5_3_MD5,
Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB,
ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB,
RefSeq_ID_OnTarget_RefSeq_20170215, ID_OnTarget_RefSeq_20170215,
Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87,
ID_OnTarget_Ensembl_GRCh38_release_87,
ID_OnTarget_Merge, ID_OnTarget_Intersection, Precursor_Accession,
Precursor_Name. -> ignore for now; look like particular databases; they
will be useful at a later point
7.) For many wells, the Z-scored values are NaN although the respective
non-standardised values are different from NaN. I have to subject this
to further scrutiny! -> Chequerboard is not included in Z' score
computation, hence the value NaN!

Notes for me:
1.) "ARAC" is equivalent to "Cytosine Arabinoside - incorporates into
DNA and inhibits DNA replication".
2.) Experiments have been conducted in 384 well plates. Hence, the wells
range from A1 to P24.
3.) Apparently, two 384 well plates were employed for the first
experiment, which is suggested by the following observations: The
experiment VACCINIA-DP-K1 encompasses 768 rows, which equals exactly
twice the amount of wells a single 384 well plate has. For each of those
rows, the feature "REPLICATE" has the value 1. Note that as the first row is
populated by the feature names, the experiment VACCINIA-DP-K1 ranges
from row 2 to 769. Accordingly, the second replicate of the first
experiment, VACCINIA-DP-K2, ranges from row 770 to 1537. For those 768
rows, the feature "REPLICATE" has the value 2, indicating that they are
a technical replicate of the first 768 rows.

The experiment VACCINIA-QU, however, seems to involve three 384 well
plates, as the first replicate, VACCINIA-QU-K1, ranges from row 1538 to
row 2689, i.e. spans 1152 rows, which equals 3 times 384. The feature
"REPLICATE" has the value 1. Accordingly, the second replicate of the
second experiment,
VACCINIA-QU-K2, ranges from ... to .... For those ... rows, the feature ...
"""

In [None]:
# Apparently, many different control types are available
# In the following, each of them is briefly elaborated on and it is also
# checked whether the intensity values behave in the expected manner
# ABI1
#
# ALLSTARDEATH
#
# ARAC
#
# ARPC3
#
# ATP6V1A represents a subunit of the enzyme vacuolar ATPase, which
# effects the acidification of eukaryotic intracellular organelles;
# siRNA-mediated knockdown of ATP6V1A prevents virus entry, which is
# plausible insofar as both mature virions (MV) and extracellular
# virions (EV) of vaccinia viruses rely on the acidification of the
# macropinosome for fusion with the macropinosome membrane to take place
# As any fluorescence signal originates from the expression of viral
# genes, it is expected that cells transfected with siRNAs targeting
# ATP6V1A and subsequently exposed to vaccinia virus do not exhibit any
# fluorescence; a glance at ... reveals that ...
#
# CDC42 is the abbreviation for cell division control protein 42 homolog
# and, as its name already suggests, is involved in the regulation of
# the cell cycle; in contrast to other viruses, such as retroviruses,
# Vaccinia virus does not incorporate its genome into the host cell's
# genome and does therefore not rely on host cell replication for the
# production of progeny constituents; instead, replication exclusively
# takes place in the cytoplasm; hence, while knockdown of CDC42
# presumably does not inhibit Vaccinia virus replication altogether, it
# is conceivable that the knockdown at least impairs viral replication
# as less host cells are available
#
# CDH4
#
# CHX
#
# EGFP
#
# FRAP1
#
# GFP
#
# ITGB1
#
# Kif11
#
# MAP3K7
#
# MET
#
# MOCK
#
# PAK1
#
# PI4KB is the abbreviation for the enzyme phosphatidylinositol 4-kinase
# beta which, as its name already suggests, catalyses the
# phosphorylation of phosphatidylinositol at the D-4 position; PI4KB
# is known to be hijacked by many positive-sense single-stranded RNA
# viruses in order to facilitate their replication within the host cell;
# however, Vaccinia virus has a double-stranded DNA genome, which is why
# it is unclear whether knockdown of PI4KB impedes virus replication
# also in the case of Vaccinia virus
#
# PSMA6
#
# PSMC3
#
# PXN
#
# RAC1
#
# SCRAMBLED
#
#
#
#
#
#
#
#

In [None]:
# For controls, only 1 siRNA was used, whereas for non-control targets,
# six different siRNAs were employed individually and together with each
# other, which is referred to as "pooled"
# Many wells have NaN as values, they must be filtered out!