In [1]:
# Import required libraries
import csv

import numpy as np
import pandas as pd
import dask.dataframe as dd



In [None]:
# Path of the input report; despite the ".csv" extension, the cells
# further down parse it as tab-delimited (delimiter="\t")
file_path = "VacciniaReport_20170223-0958_ZScored.csv"


In [2]:
# Several columns of the tab-delimited report contain entries of mixed
# data types, which impedes the processing by Dask or Pandas
# Hence, the raw contents of these columns are collected here so that
# each of them can subsequently be standardised to a single data type
# The affected columns are exactly the ones named in `feature_list`
feature_list = [
    "Concentration", "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
    "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "Gene_Description", "ID", "ID_OnTarget_Ensembl_GRCh38_release_87",
    "ID_OnTarget_Merge", "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "ID_OnTarget_RefSeq_20170215", "ID_manufacturer",
    "Name_alternatives", "PLATE_QUALITY_DESCRIPTION",
    "RefSeq_ID_OnTarget_RefSeq_20170215", "Seed_sequence_common",
    "WELL_QUALITY_DESCRIPTION", "siRNA_error", "siRNA_number",
]

# One empty list per affected column, in the order of `feature_list`
# The number of lists is derived from `feature_list` itself so that the
# two cannot drift apart when a column is added or removed
column_lists = [[] for _ in feature_list]

# The with statement takes care of closing the file
file_path = "VacciniaReport_20170223-0958_ZScored.csv"
# newline="" is what the csv module asks for when it is handed a file
# object; line breaks embedded in quoted fields then reach the parser
# untouched
with open(file_path, newline="") as f:
    # DictReader yields one dictionary per row, keyed by column name,
    # which allows the affected columns to be picked out by name
    csv_reader = csv.DictReader(f, delimiter="\t")

    # Append the value of every affected column to its dedicated list
    for row in csv_reader:
        for column_values, feature in zip(column_lists, feature_list):
            column_values.append(row[feature])

In [4]:
# Write the unique values of every affected column to a text file
# Each column contributes its name, a colon and a line break, followed
# by the list of its unique values
# Every section but the first is preceded by two line breaks, i.e. the
# sections are separated by a blank line
# The with statement closes the file once the block is left
with open("unique_values_per_feature.txt", "w") as f:
    for position, values in enumerate(column_lists):
        prefix = "\n\n" if position != 0 else ""
        f.write(
            prefix + feature_list[position] + ":\n"
            +
            # Casting the NumPy array to a plain list makes sure that
            # none of its entries is elided in the string form
            str(list(np.unique(values)))
        )

In [3]:
# Each feature/column is dealt with one at a time
# The entries of "Concentration" combine a number with the
# concentration unit, in this case "pmol", separated by a space
# Only the numeric part is retained here; the unit is moved into the
# column name when the adjusted file is assembled
# The column is looked up by name rather than by a hard-coded position
# so that this cell does not depend on the order of `feature_list`
raw_concentration_list = column_lists[feature_list.index("Concentration")]

# Empty entries become NaN, all other entries are cut at the first space
concentration_list = [
    entry.split(" ")[0] if entry != "" else np.nan
    for entry in raw_concentration_list
]

# np.unique turns the mixed str/float list into an array of strings,
# which is why the missing value shows up as "nan"
assert list(np.unique(concentration_list)) == ["0.4", "1.6", "nan"]

In [4]:
# With respect to the feature
# "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87", it becomes apparent
# that its entries are comprised of both digits and alphabetic
# characters
# Hence, the data type of the respective column will later be manually
# set to `str`
# The same applies to
# "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
# "Gene_Description", "ID", "ID_OnTarget_Ensembl_GRCh38_release_87",
# "ID_OnTarget_Merge", "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
# "ID_OnTarget_RefSeq_20170215", "ID_manufacturer", "Name_alternatives",
# "PLATE_QUALITY_DESCRIPTION", "RefSeq_ID_OnTarget_RefSeq_20170215",
# "WELL_QUALITY_DESCRIPTION", "siRNA_error" and "siRNA_number"
# Regarding the feature "Seed_sequence_common", the check below shows
# which individual characters occur in its entries
# According to the original author, Pandas nevertheless expects floats
# for this column (the cause is not visible here - TODO confirm), so
# its data type is manually set to `str` as well
# The column is looked up by name rather than by a hard-coded position
# so that this cell does not depend on the order of `feature_list`
seed_sequence_common_list = column_lists[
    feature_list.index("Seed_sequence_common")
]

# Flatten all entries into one list of single characters
single_char_list = [
    char for entry in seed_sequence_common_list for char in entry
]

print(np.unique(single_char_list))

[' ' 'A' 'C' 'G' 'U']


In [5]:
# Now, assemble a new file with the adjusted column for "Concentration"
# As a first step, the column name of "Concentration" is altered to
# "Concentration [pmol]", which requires the original column names
file_path = "VacciniaReport_20170223-0958_ZScored.csv"
# newline="" is what the csv module asks for when it is handed a file
# object
with open(file_path, newline="") as f:
    csv_reader = csv.DictReader(f, delimiter="\t")

    # The column names are the dictionary keys of any row, so only the
    # first data row is fetched
    # Turning the whole reader into a list would load every row of the
    # report into memory just to discard all but one of them
    first_row = next(csv_reader, None)
    if first_row is None:
        # Same exception type as indexing into an empty list of rows
        raise IndexError(f"{file_path} does not contain any data rows")
    first_line_dict = dict(first_row)

In [7]:
# The keys of the first row are the column names of the report
columns_names = [*first_line_dict]

# Look up where "Concentration" sits and put "Concentration [pmol]" in
# its place, leaving the order of all other columns untouched
name_index = columns_names.index("Concentration")
columns_names[name_index] = "Concentration [pmol]"

# The old name must be gone and the new one must be present
assert "Concentration" not in columns_names
assert "Concentration [pmol]" in columns_names

In [12]:
# Finally, assemble the new tab-delimited file
# Both handles are opened with newline="", which is what the csv module
# asks for when it is handed a file object
with open(
    "VacciniaReport_20170223-0958_ZScored_conc_adjusted.csv",
    "w",
    newline=""
) as write_file, open(
    "VacciniaReport_20170223-0958_ZScored.csv", "r", newline=""
) as read_file:
    csv_reader = csv.DictReader(read_file, delimiter="\t")

    # Incorporate the new column name for "Concentration", i.e.
    # "Concentration [pmol]"
    fieldnames = columns_names
    csv_writer = csv.DictWriter(
        write_file, fieldnames=fieldnames, delimiter="\t"
    )

    csv_writer.writeheader()
    # The "Concentration" column is the only column the values of which
    # have to be rewritten; every other value is copied from the reader
    # `concentration_list` comes first in zip() so that an exhausted
    # list does not swallow a surplus reader row before the check below
    rows_written = 0
    for conc_value, reader_row in zip(concentration_list, csv_reader):
        csv_writer.writerow(
            {
                column_name: (
                    reader_row[column_name]
                    if column_name != "Concentration [pmol]"
                    else conc_value
                )
                for column_name in columns_names
            }
        )
        rows_written += 1

    # zip() stops silently at the shorter of its inputs, which would
    # leave a truncated output file behind without any error
    # Hence, verify that both the list and the reader are used up
    if (
        rows_written != len(concentration_list)
        or next(csv_reader, None) is not None
    ):
        raise ValueError(
            "Row count of the input file does not match the number of "
            "adjusted concentration values; the output file is incomplete"
        )

In [16]:
# As a last step, fix the data types of the mixed-type columns
# According to the original author, loading the file via Pandas causes
# the kernel to crash due to the file's size, which is why the adjusted
# file is loaded via Dask instead
# Every column named below is read as `str`; the data types of all
# remaining columns are not specified here
dtype_dict = dict.fromkeys(
    [
        "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
        "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
        "Gene_Description",
        "ID",
        "ID_OnTarget_Ensembl_GRCh38_release_87",
        "ID_OnTarget_Merge",
        "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
        "ID_OnTarget_RefSeq_20170215",
        "ID_manufacturer",
        "Name_alternatives",
        "PLATE_QUALITY_DESCRIPTION",
        "RefSeq_ID_OnTarget_RefSeq_20170215",
        "Seed_sequence_common",
        "WELL_QUALITY_DESCRIPTION",
        "siRNA_error",
        "siRNA_number",
    ],
    str,
)

# The adjusted file is tab-delimited, just like the original report
main_csv_df = dd.read_csv(
    "VacciniaReport_20170223-0958_ZScored_conc_adjusted.csv",
    dtype=dtype_dict,
    sep="\t",
)

In [17]:
# Display the first rows of the Dask dataframe loaded from the adjusted
# file as a quick visual check
main_csv_df.head()

Unnamed: 0,Barcode,PlateType,PLATE_QUALITY_STATUS,PLATE_QUALITY_DESCRIPTION,BEE.RELEASE.STATUS,PLATE_TYPE,BATCH,Space,Group,Experiment,...,dInfectionDT_eIndex_nZScore,dIntensity_cPathogen_eMean_oCells_nZScore,dIntensity_cPathogen_eMean_oNuclei_nZScore,dIntensity_cPathogen_eMean_oPeriNuclei_nZScore,dIntensity_cPathogen_eMean_oVoronoiCells_nZScore,dIntensity_cLatePathogen_eMean_oCells_nZScore,dIntensity_cLatePathogen_eMean_oNuclei_nZScore,dIntensity_cLatePathogen_eMean_oPeriNuclei_nZScore,dIntensity_cLatePathogen_eMean_oVoronoiCells_nZScore,eCount_oCells_nZScore
0,BB01-1M,PLATE,UNKNOWN,,,CheckerBoard,unknown,INFECTX_PUBLISHED,VACCINIA_TEAM,VACCINIA-DP-K1,...,,,,,,,,,,
1,BB01-1M,PLATE,UNKNOWN,,,CheckerBoard,unknown,INFECTX_PUBLISHED,VACCINIA_TEAM,VACCINIA-DP-K1,...,,,,,,,,,,
2,BB01-1M,PLATE,UNKNOWN,,,CheckerBoard,unknown,INFECTX_PUBLISHED,VACCINIA_TEAM,VACCINIA-DP-K1,...,,,,,,,,,,
3,BB01-1M,PLATE,UNKNOWN,,,CheckerBoard,unknown,INFECTX_PUBLISHED,VACCINIA_TEAM,VACCINIA-DP-K1,...,,,,,,,,,,
4,BB01-1M,PLATE,UNKNOWN,,,CheckerBoard,unknown,INFECTX_PUBLISHED,VACCINIA_TEAM,VACCINIA-DP-K1,...,,,,,,,,,,
