In [2]:
# Import required libraries
import csv

import numpy as np
import pandas as pd
import dask.dataframe as dd

In [3]:
# Multiple columns of the TSV file contain entries with mixed data types,
# which impedes the processing by Dask or Pandas.
# Hence, the contents of the respective columns are standardised to only
# one data type. As a first step, the raw values of exactly these columns
# are collected here so that later cells can inspect them one at a time.
# The affected columns are the ones named in `feature_list`.
feature_list = [
    "Concentration", "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
    "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "Gene_Description", "ID", "ID_OnTarget_Ensembl_GRCh38_release_87",
    "ID_OnTarget_Merge", "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "ID_OnTarget_RefSeq_20170215", "ID_manufacturer",
    "Name_alternatives", "PLATE_QUALITY_DESCRIPTION",
    "RefSeq_ID_OnTarget_RefSeq_20170215", "Seed_sequence_common",
    "WELL_QUALITY_DESCRIPTION", "siRNA_error", "siRNA_number",
]

# One empty list per affected column: `column_lists[i]` receives the raw
# values of `feature_list[i]`. Deriving the count from `feature_list`
# (instead of a hard-coded 17) keeps both in step when a column is added
# or removed.
column_lists = [[] for _ in feature_list]

# NOTE: despite its ".csv" extension, the file is parsed as tab-separated
file_path = "VacciniaReport_20170223-0958_ZScored.csv"

# The csv module asks for files to be opened with `newline=""` so that it
# can do its own newline handling (otherwise line breaks embedded in
# quoted fields are not interpreted correctly).
# The with statement takes care of closing the file.
with open(file_path, newline="") as f:
    # DictReader maps each row to a dictionary keyed by the column names
    # of the header row, so a column's value is retrieved by its name
    csv_reader = csv.DictReader(f, delimiter="\t")

    # Append every row's value of each affected column to its list
    for row in csv_reader:
        for values, feature in zip(column_lists, feature_list):
            values.append(row[feature])

In [4]:
# Save the unique values of the affected columns to a file so that they
# can be inspected by eye
# Each section consists of the column name, a colon and, on the next
# line, the list of that column's unique values
# `ndarray.tolist()` converts the NumPy array to an ordinary list of
# built-in Python objects, which ensures that all entries are displayed;
# `str(list(array))` would instead render every entry as
# `np.str_('...')` under NumPy >= 2
unique_value_sections = [
    f"{feature}:\n{np.unique(content).tolist()}"
    for feature, content in zip(feature_list, column_lists)
]

# Sections are separated by one blank line, without a trailing newline
# Bear in mind that with statements are preferred for the purpose of
# opening files as they automatically take care of closing them
with open("unique_values_per_feature.txt", "w") as f:
    f.write("\n\n".join(unique_value_sections))

In [11]:
# The columns are handled one after another, starting with
# "Concentration" (the first entry of the collected columns)
# Its non-empty entries consist of a number followed by a space and the
# concentration unit, in this case "pmol"
# Only the part in front of the first space is kept, i.e. the unit is
# dropped; note that the number remains a string at this point
# Empty entries are replaced by NaN
concentration_list = [
    np.nan if raw_value == "" else raw_value.partition(" ")[0]
    for raw_value in column_lists[0]
]

# Sanity check: only these concentrations (plus missing values) may occur
# NumPy coerces the NaN to the string "nan" since all other entries are
# strings
assert np.unique(concentration_list).tolist() == ["0.4", "1.6", "nan"]

In [3]:
# With respect to the feature "ID", it becomes apparent that its entries
# are comprised of both digits and letters/alphabetic characters
# Hence, the data type of the respective column will later be manually
# set to `str`
# The same applies to ID_manufacturer, Name_alternatives and
# WELL_QUALITY_DESCRIPTION
# Regarding the feature "Seed_sequence_common", only the four RNA
# nucleotides A, C, G and U as well as a single space character occur
# However, for some strange reason, Pandas expects the entries to be
# floats
# Thus, manually setting the data type to `str` is also required for
# this column
# The column is looked up by name rather than by a hard-coded position:
# position 4 of `feature_list` is "ID", not "Seed_sequence_common"
seed_sequence_common_list = column_lists[
    feature_list.index("Seed_sequence_common")
]

# Break every entry up into its individual characters
single_char_list = []
for entry in seed_sequence_common_list:
    single_char_list.extend(entry)

# Show which distinct characters occur in the column
print(np.unique(single_char_list))

[' ' 'A' 'C' 'G' 'U']


TypeError: list.append() takes exactly one argument (3 given)