In [2]:
# The purpose of this Jupyter notebook is to perform an exploration of
# the humongous CSV file
# In detail, for each column, its unique values are determined
# The aim is to identify the individual sub-screens comprised in the
# humongous CSV file, i.e. the kinome-wide screen, the full genome
# screen, etc.

# Due to the CSV file's enormous size, loading it with Pandas would
# cause the kernel to die
# Dask is used instead, as it has been specifically devised for
# handling large quantities of data
import numpy as np
import dask.dataframe as dd



In [3]:
# The data type of the columns listed below has to be stated explicitly
# when reading the file; every one of them is read as a string
dtype_dict = {
    string_column: str
    for string_column in (
        "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
        "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
        "Gene_Description",
        "ID",
        "ID_OnTarget_Ensembl_GRCh38_release_87",
        "ID_OnTarget_Merge",
        "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
        "ID_OnTarget_RefSeq_20170215",
        "ID_manufacturer",
        "Name_alternatives",
        "PLATE_QUALITY_DESCRIPTION",
        "RefSeq_ID_OnTarget_RefSeq_20170215",
        "Seed_sequence_common",
        "WELL_QUALITY_DESCRIPTION",
        "siRNA_error",
        "siRNA_number",
        "Precursor_Name",
    )
}

# Despite its ".csv" extension, the file is tab-separated; the columns
# listed in `dtype_dict` are read with the data types given there
main_csv_df = dd.read_csv(
    "VacciniaReport_20170223-0958_ZScored_conc_and_NaN_adjusted.csv",
    dtype=dtype_dict,
    sep="\t",
)

In [3]:
column_names = list(main_csv_df.columns)

# For each column, determine its unique values and collect the
# respective lists (one list per column, in column order)
# The deduplication itself is performed by Dask via `Series.unique()`,
# so that only the distinct values of a column are brought into memory
# Handing the Dask series directly to `np.unique` would instead convert
# the entire column, i.e. every single row, to a NumPy array first
# `np.unique` is still applied to the deduplicated values in order to
# return them in sorted order
unique_values_per_column = [
    np.unique(main_csv_df[column_name].unique().compute()).tolist()
    for column_name in column_names
]

In [6]:
# Write the unique values of every column to a separate text file
# Each column contributes one entry consisting of a header line followed
# by its list of unique values; entries are separated by one blank line
# A with statement is used as it closes the file automatically, even if
# an error/exception occurs
# NOTE(review): the file name below contains a typo ("unqiue"); it is
# deliberately left unchanged as other code may refer to the file by
# this name
with open("unqiue_values_for_all_columns.txt", "w") as f:
    f.write(
        "\n\n".join(
            f"Unique values for the column \"{column_name}\":\n"
            f"{unique_values_list}"
            for column_name, unique_values_list in zip(
                column_names, unique_values_per_column
            )
        )
    )

In [None]:
# Interesting features/columns potentially facilitating the navigation
# through this humongous table are:
# "PLATE_QUALITY_STATUS" in conjunction with
# "PLATE_QUALITY_DESCRIPTION", as these features allow to distinguish
# reliable measurements from unreliable ones
# "PLATE_TYPE", as this feature allows to differentiate between the
# plate types "CheckerBoard", "MockPlate" and "ScreeningPlate"
# "Experiment", as this feature allows to differentiate between
# different experiments
# "GENESET", as this feature allows to differentiate between different
# target sets, such as the entire genome or the kinome
# "WellType", as this feature allows to distinguish control wells from
# actual test wells and other well types
# "WELL_QUALITY_DESCRIPTION" and "WELL_QUALITY_STATUS", as those
# features allow to distinguish reliable measurements from unreliable
# ones
# "Manufacturer", as this feature allows to filter by the manufacturer
# "Name" and "Name_alternatives", as those features allow to identify
# the gene being knocked down
# "siRNA_error", as this feature allows to distinguish reliable
# measurements from unreliable ones
# "siRNA_number", as up to six different siRNAs have apparently been
# used
# "Concentration [pmol]", as this feature allows to investigate whether
# increasing the concentration also increases the gene suppression
# "Gene_Symbol" and "Accession", as these features allow to identify the
# individual genes being knocked down
# -> What is meant by "MD5", as in "Sequence_antisense_5_3_MD5"?
# "Gene_Description", as this feature gives further details regarding
# the gene being knocked down
# -> What is meant by "HMB", as in
# "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB"?
# -> What is meant by "intersection" and "merge"?
# -> What is meant by "Precursor_Name"?

In [4]:
# To start with, the kinome screen is subjected to scrutiny
# Only the rows belonging to the kinome screen (GENESET "Kinome") whose
# well type is "CONTROL" are retained
kinome_control_df = main_csv_df.loc[
    (main_csv_df["GENESET"] == "Kinome")
    &
    (main_csv_df["WellType"] == "CONTROL")
]

# For the control wells of the kinome screen, the unique values of each
# column/feature are determined and saved to a text file
# Note that `column_names` and `unique_values_per_column` are
# (re)assigned here
column_names = list(kinome_control_df.columns)

# The deduplication itself is performed by Dask via `Series.unique()`,
# so that only the distinct values of a column are brought into memory
# Handing the Dask series directly to `np.unique` would instead convert
# the entire column, i.e. every single row, to a NumPy array first
# `np.unique` is still applied to the deduplicated values in order to
# return them in sorted order
unique_values_per_column = [
    np.unique(kinome_control_df[column_name].unique().compute()).tolist()
    for column_name in column_names
]

# Each column contributes one entry consisting of a header line followed
# by its list of unique values; entries are separated by one blank line
# A with statement is used as it closes the file automatically, even if
# an exception/error occurs
with open("unique_values_kinome_control.txt", "w") as f:
    f.write(
        "\n\n".join(
            f"Unique values for the column \"{column_name}\":\n"
            f"{unique_values_list}"
            for column_name, unique_values_list in zip(
                column_names, unique_values_per_column
            )
        )
    )