In [None]:
"""
The Python script "gene_ID_and_off_gene_symbol_check.py" has been
successfully run on the Hemera HPC cluster. However, on throwing a
glance at the output file and subsequently scrutinising the
"ID_manufacturer" column in the CSV file, several issues emerged.

The first is that the database query failed for four gene IDs, namely
644862, 441848, 441931 and 441860.

The second is that for some perturbation agents, be they siRNA, pooled
siRNA or esiRNA, more than one target gene is listed. In both the column
"ID_manufacturer" and "Name", the individual entries are separated by
semicolons.

The third is that for some perturbation agents, the entry in both
"ID_manufacturer" and "Name" is "Not available". However, as the total
number of perturbation agents this applies to is manageable and the
catalogue number along with the manufacturer is provided, the target
genes are manually looked up.

As to the first two issues, however, postprocessing is accomplished in
an automated manner.

Apart from that, some records have been discontinued in the NCBI
database, such as the record corresponding to the gene ID 441848. Such
discontinued records contain the sentence "This record was
discontinued". It is decided at a later time how discontinued records
are dealt with.
"""

In [13]:
import time
import string

import numpy as np
import pandas as pd
from biotite.database import entrez

In [2]:
# Load the screen data
# For the columns listed below, the data type is not left to the type
# inference of pandas, but is explicitly set to `str`
string_typed_columns = [
    "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
    "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "Gene_Description",
    "ID",
    "ID_OnTarget_Ensembl_GRCh38_release_87",
    "ID_OnTarget_Merge",
    "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "ID_OnTarget_RefSeq_20170215",
    "ID_manufacturer",
    "Name_alternatives",
    "PLATE_QUALITY_DESCRIPTION",
    "RefSeq_ID_OnTarget_RefSeq_20170215",
    "Seed_sequence_common",
    "WELL_QUALITY_DESCRIPTION",
    "siRNA_error",
    "siRNA_number",
    "Precursor_Name"
]
dtype_dict = dict.fromkeys(string_typed_columns, str)

# Why pandas rather than Dask: by default, the index labels of a Dask
# DataFrame do not monotonically increase from 0, but restart at 0 for
# each partition (Dask subdivides a DataFrame into partitions in order
# to handle large data sets in a memory-efficient way, https://
# docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.reset_
# index.html)
# The resulting duplicated index labels might raise the `ValueError:
# cannot reindex on an axis with duplicate labels` error
# As loading the entire data set into a pandas DataFrame is feasible,
# this is preferred (this has not been possible in the very beginning,
# which is why Dask was used in the first place)
main_csv_path = (
    "VacciniaReport_20170223-0958_ZScored_conc_and_NaN_adjusted.csv"
)
main_csv_df = pd.read_csv(main_csv_path, sep="\t", dtype=dtype_dict)

# Retain only the wells harbouring single siRNAs, pooled siRNAs or
# esiRNAs
# `isin` is equivalent to combining one equality check per well type
# via the element-wise OR operator "|"
perturbation_well_types = ["SIRNA", "POOLED_SIRNA", "ESIRNA"]
is_perturbation_well = main_csv_df["WellType"].isin(
    perturbation_well_types
)
single_pooled_siRNA_and_esiRNA_df = main_csv_df[is_perturbation_well]

In [3]:
# Count the perturbation agents the target genes of which are not
# specified, i.e. those with the placeholder "Not available" in the
# "ID_manufacturer" column
ID_manufacturer_series = single_pooled_siRNA_and_esiRNA_df[
    "ID_manufacturer"
]

target_not_specified_mask = ID_manufacturer_series == "Not available"
n_target_not_specified = int(target_not_specified_mask.sum())

summary_message = (
    "Amount of perturbation agents for which the target genes are not "
    f"specified: {n_target_not_specified}"
)
print(summary_message)

Amount of perturbation agents for which the target genes are not specified: 22


In [4]:
# Now, address the first two issues in an automated manner (failed
# database query for four gene IDs and the listing of several target
# genes separated by semicolons)
# To this end, the CSV file with the updated gene IDs and official gene
# symbols is also loaded
# Unfortunately, when saving the updated Pandas DataFrame to a CSV file,
# the separator has not been specified, which is why the default
# separator (comma) has been used
# This, however, conflicts with the usage of commata in some entries
# Thus, prior to loading the updated CSV file, the commata have to be
# replaced with tab stops while distinguishing actual delimiters from
# commata which are part of entries
# To this end, columns containing commata in their entries have to be
# identified


def excel_column_label(numeric_index):
    """Return the Excel-style column label of a zero-based column index.

    Excel indexes columns with upper case letters instead of numbers:
    0 -> "A", 25 -> "Z", 26 -> "AA", 701 -> "ZZ", 702 -> "AAA", ...

    Parameters
    ----------
    numeric_index : int
        Zero-based column index.

    Returns
    -------
    str
        The corresponding alphabetical column label.

    Raises
    ------
    ValueError
        If `numeric_index` is negative.
    """
    if numeric_index < 0:
        raise ValueError("The column index must be non-negative.")
    label = ""
    # The labels form a bijective base-26 numeral system, i.e. there is
    # no "zero" digit, which is why one-based counting is used and 1 is
    # subtracted prior to each division
    remaining = numeric_index + 1
    while remaining > 0:
        remaining, letter_index = divmod(remaining - 1, 26)
        label = string.ascii_uppercase[letter_index] + label
    return label


feature_names = main_csv_df.columns
features_with_commata = []
features_with_commata_indices = []

for i, feature_name in enumerate(feature_names):
    feature_series = main_csv_df[feature_name]
    if feature_series.dtype != "object":
        continue
    # Bear in mind that in order to check for the presence of a
    # substring, "pandas.Series.str.contains" has to be used rather than
    # "pandas.Series.isin" as the latter only verifies complete matches
    # `regex=False` searches for a literal comma and `na=False` treats
    # missing entries as not containing a comma
    if feature_series.str.contains(",", regex=False, na=False).any():
        features_with_commata.append(feature_name)
        features_with_commata_indices.append(i)

# For the sake of convenience, each numeric column index is mapped to
# the corresponding alphabetical index used by Excel
# The mapping covers every column of the data set rather than a fixed
# number of columns
numeric_alphabetic_index_dict = {
    numeric_index: excel_column_label(numeric_index)
    for numeric_index in range(len(feature_names))
}

# `default` prevents `max` from raising a ValueError in case no column
# contains commata at all
max_feature_name_length = max(
    map(len, features_with_commata), default=len("Feature name")
)

print(
    "The following features/columns contain commata in their entries:\n",
    "Feature name".ljust(max_feature_name_length + 1),
    "Numeric index".ljust(14),
    "Alphabetical Index\n",
    "-" * (max_feature_name_length + 1 + 14 + len("Alphabetical index")),
    sep="",
    end=""
)

for i, feature_name in zip(
    features_with_commata_indices, features_with_commata
):
    print(
        "\n",
        feature_name.ljust(max_feature_name_length + 1),
        str(i).ljust(14),
        numeric_alphabetic_index_dict[i],
        sep="",
        end=""
    )

The following features/columns contain commata in their entries:
Feature name      Numeric index Alphabetical Index
--------------------------------------------------
Name_alternatives 30            AE
Gene_Description  62            BK

In [5]:
# Inspect the columns immediately following the two columns the entries
# of which contain commata (indices 30 and 62, hence 31 and 63 here)
# Ideally, their entries exhibit common characteristics that can be
# leveraged for the distinction between actual delimiters and commata
# belonging to entries
following_series_1, following_series_2 = (
    main_csv_df[feature_names[following_index]]
    for following_index in (31, 63)
)

unique_vals_series_1 = np.unique(following_series_1)
unique_vals_series_2 = np.unique(following_series_2)

print(unique_vals_series_1, unique_vals_series_2, sep="\n")

['MultipleTargets' 'NoTargets' 'Not available' 'OK' 'POOLED_SIRNA_ERROR'
 'TargetMismatch' 'Unknown']
['ENST00000000233;ENST00000463733'
 'ENST00000000233;ENST00000463733;ENST00000415666;ENST00000467281;ENST00000489673;ENST00000459680'
 'ENST00000000233;ENST00000463733;ENST00000415666;ENST00000489673;ENST00000464403'
 ... 'ENST00000515849;ENST00000302763;ENST00000355078' 'ENST00000516084'
 'Not available']


In [6]:
# The vast majority of the entries of the second investigated column
# begin with the sequence "ENST"
# Check whether this holds true for every single unique entry or whether
# there are exceptions
all_entries_start_with_ENST = all(
    entry[:4] == "ENST" for entry in unique_vals_series_2
)
print(all_entries_start_with_ENST)

False


In [7]:
# As not all entries start with the "ENST" sequence, the deviating ones
# are extracted and examined
# A boolean mask marks the deviating entries, which are then selected
# via NumPy's boolean indexing
lacks_ENST_prefix_mask = np.array(
    [entry[:4] != "ENST" for entry in unique_vals_series_2],
    dtype=bool
)
outcast_vals_arr = unique_vals_series_2[lacks_ENST_prefix_mask]
print(outcast_vals_arr)

['Not available']


In [None]:
# It becomes apparent that the entries of the columns following the two
# comma-containing columns indeed exhibit common characteristics which
# can be leveraged in order to distinguish actual delimiters from
# commata belonging to entries:
# - the column following "Name_alternatives" only adopts the values
#   listed in `siRNA_error_options`
# - the entries of the column following "Gene_Description" either start
#   with "ENST" or are "Not available"
# NOTE(review): pandas' `to_csv` is documented to quote fields
# containing the delimiter by default; if the updated file has been
# saved that way, `pd.read_csv` with the default separator might parse
# it directly - verify before relying on this conversion
# NOTE(review): this cell writes "adjusted_file.csv", whereas the
# following cell loads "..._updated_with_tab_stops.csv"; presumably the
# output file has been renamed manually - confirm

# List comprising unique values of column 31, i.e. the column following
# column 30 ("Name_alternatives")
siRNA_error_options = [
    "MultipleTargets",
    "NoTargets",
    "Not available",
    "OK",
    "POOLED_SIRNA_ERROR",
    "TargetMismatch",
    "Unknown"
]

# Zero-based indices of the two columns the entries of which may contain
# commata
NAME_ALTERNATIVES_COLUMN_INDEX = 30
GENE_DESCRIPTION_COLUMN_INDEX = 62


def _split_retaining_commata(line):
    """Split `line` at every comma while retaining the commata.

    The built-in split method discards the separation character, which
    is why it is re-inserted after each fragment; the trailing surplus
    comma is removed. In the returned list, fragments are located at
    even indices and commata at odd indices.
    """
    return [
        token for fragment in line.split(",") for token in (fragment, ",")
    ][:-1]


def _find_entry_commata(
    tokens, first_candidate_index, starts_next_column, column_name,
    line_number
):
    """Return the indices of commata belonging to one column's entry.

    Starting at `first_candidate_index`, every second token (i.e. every
    fragment) is inspected until `starts_next_column` recognises it as
    an entry of the following column. Each fragment inspected before
    that still belongs to the comma-containing entry, which is why the
    comma preceding it is recorded.

    Parameters
    ----------
    tokens : list of str
        Output of `_split_retaining_commata`.
    first_candidate_index : int
        Index in `tokens` of the first fragment following the first
        fragment of the comma-containing entry.
    starts_next_column : callable
        Takes a fragment and returns True if it is an entry of the
        column following the comma-containing column.
    column_name : str
        Name of the comma-containing column (used in the error message).
    line_number : int
        One-based number of the line (used in the error message).

    Returns
    -------
    list of int
        Indices in `tokens` of the commata that are part of the entry.

    Raises
    ------
    ValueError
        If the end of the line is reached without encountering an entry
        of the following column (the original code raised a bare
        IndexError without any hint at the offending line).
    """
    entry_commata = []
    candidate_index = first_candidate_index
    while True:
        if candidate_index >= len(tokens):
            raise ValueError(
                f"Line {line_number}: no entry of the column following "
                f"\"{column_name}\" has been found; the line does not "
                "match the expected layout."
            )
        if starts_next_column(tokens[candidate_index]):
            return entry_commata
        entry_commata.append(candidate_index - 1)
        candidate_index += 2


def _replace_delimiter_commata_with_tabs(line, line_number):
    """Return `line` with all delimiter commata replaced by tab stops.

    Commata which are part of entries in the columns "Name_alternatives"
    and "Gene_Description" are left untouched. The line ending is
    retained as it is part of the last fragment.
    """
    tokens = _split_retaining_commata(line)

    # As fragments and commata alternate in `tokens`, the first fragment
    # of column c has index 2 * c as long as no preceding entry contains
    # commata
    # The column has at least one fragment, which is why the first
    # fragment to inspect is the subsequent one (index 2 * c + 2)
    name_commata = _find_entry_commata(
        tokens,
        2 * NAME_ALTERNATIVES_COLUMN_INDEX + 2,
        lambda fragment: fragment in siRNA_error_options,
        "Name_alternatives",
        line_number
    )

    # Each comma within the "Name_alternatives" entry shifts all
    # subsequent tokens by two positions
    description_commata = _find_entry_commata(
        tokens,
        2 * GENE_DESCRIPTION_COLUMN_INDEX + 2 + 2 * len(name_commata),
        lambda fragment: (
            fragment == "Not available" or fragment[:4] == "ENST"
        ),
        "Gene_Description",
        line_number
    )

    entry_commata = set(name_commata) | set(description_commata)

    # Commata are located at the odd indices of `tokens`
    for comma_index in range(1, len(tokens), 2):
        if comma_index not in entry_commata:
            tokens[comma_index] = "\t"

    return "".join(tokens)


# Iterate through the lines, modify them accordingly and write the
# adjusted lines to a new output file
# Note that reading all the lines into memory at once via `.readlines()`
# provokes an Out Of Memory error, which is why the file lines are
# iterated over on the fly
with open(
    "Vaccinia_Report_NCBI_Gene_IDs_and_official_gene_symbols_updated.csv",
    "r"
) as prior_tab_intro_file, open(
    "adjusted_file.csv", "w", newline=""
) as post_tab_intro_file:
    for i, line in enumerate(prior_tab_intro_file):
        # The first line represents the header, i.e. contains the column
        # names; thus, all of its commata represent actual delimiters
        if i == 0:
            post_tab_intro_file.write(line.replace(",", "\t"))
            continue

        # Iterating over a file object does not trim line endings, which
        # is why the newline character does not have to be added
        post_tab_intro_file.write(
            _replace_delimiter_commata_with_tabs(line, line_number=i + 1)
        )

In [10]:
# Load the updated, tab-separated CSV file with the same explicit column
# data types as the original screen data
updated_csv_path = (
    "Vaccinia_Report_NCBI_Gene_IDs_and_official_gene_symbols_"
    "updated_with_tab_stops.csv"
)
updated_main_csv_df = pd.read_csv(
    updated_csv_path, sep="\t", dtype=dtype_dict
)

In [28]:
# I have a hunch that I might have made a mistake: I assumed that the
# two columns "ID_manufacturer" and "ID" refer to the same thing
# This, however, is not necessarily the case, as it is also conceivable
# that siRNA directed against a specific target has been employed in
# order to knock down another, maybe structurally related target
# Therefore, it is investigated whether this indeed is the case or
# whether the siRNAs have exclusively been used for the targets
# specified by the manufacturer
# To this end, for each unique value of the column "ID_manufacturer",
# the amount of unique values occurring in the column "Name" in
# conjunction with it is determined
# Bear in mind that the original, non-updated CSV file has to be used
# for this purpose
# A single group-by pass replaces filtering the entire DataFrame once
# per manufacturer ID
# `dropna=False` counts a missing name as a value of its own instead of
# silently ignoring it
n_names_per_manufacturer_ID = main_csv_df.groupby(
    "ID_manufacturer"
)["Name"].nunique(dropna=False)

# The group keys are sorted, just as the output of `np.unique`
unique_manufacturer_IDs = n_names_per_manufacturer_ID.index.to_numpy()

non_unique_list = n_names_per_manufacturer_ID[
    n_names_per_manufacturer_ID > 1
].index.tolist()

print(
    "Amount of manufacturer IDs occurring in conjunction with more "
    f"than one value in the \"Name\" column: {len(non_unique_list)}"
)

Amount of manufacturer IDs occurring in conjunction with more than one value in the "Name" column: 1229


In [31]:
# Persist the manufacturer IDs co-occurring with multiple "Name" values,
# one ID per line and without a trailing newline
with open(
    "manufacturer_IDs_co-occurring_with_multiple_names.txt", "w"
) as output_file:
    output_file.write("\n".join(non_unique_list))

In [None]:
# Closer scrutiny of the affected manufacturer IDs reveals that the
# occurrence of multiple "Name" values stems from the usage of aliases
# for one and the same gene
# Thus, this issue does not have to be further addressed, as it had been
# resolved by the database query anyway

In [None]:
# Unfortunately, another issue arose: I erroneously assumed that the two
# columns "ID_manufacturer" and "ID" refer to the same thing
# This, however, is not the case, which I found out in the following
# way: For some IDs appearing in "ID_manufacturer", the "Name" column
# can adopt more than one value; for instance, this is the case for the
# ID 441848, which appears in conjunction with the values
# "LOC441842" and "LOC441848" in the "Name" column

In [21]:
# Eventually, address the four gene IDs for which the database query
# failed, which are 644862, 441848, 441931 and 441860
failed_gene_IDs = ["644862", "441848", "441931", "441860"]


def extract_official_gene_symbol(NCBI_entry_str):
    """Extract the gene symbol from the text of an NCBI Gene entry.

    The first non-blank line of the entry is expected to consist of the
    prefix "1. " followed by the gene symbol, e.g. "1. RPS28P3". The
    prefix encompasses three characters and is sliced off.

    This parsing step previously resided after an unconditional
    `continue` in the loop below, i.e. it was never executed, and it
    retained the prefix (`[:3]`) instead of removing it (`[3:]`).

    Parameters
    ----------
    NCBI_entry_str : str
        Text of a single NCBI Gene entry.

    Returns
    -------
    str
        The first non-blank line without the leading "1. ".

    Raises
    ------
    ValueError
        If the entry does not contain any non-blank line.
    """
    non_blank_lines = [
        line for line in NCBI_entry_str.split("\n") if line.strip()
    ]
    if not non_blank_lines:
        raise ValueError("The NCBI entry does not contain any text.")
    return non_blank_lines[0][len("1. "):]


for failed_gene_ID in failed_gene_IDs:
    # Determine the gene name(s) provided by the data set
    # Note that more than one name may occur for one and the same gene
    # ID, which is why uniqueness is deliberately not asserted
    name_in_data_set = np.unique(
        updated_main_csv_df.loc[
            updated_main_csv_df["ID_manufacturer"] == failed_gene_ID,
            "Name"
        ]
    )
    print(name_in_data_set)
    print()

['LOC441842']

['LOC441842' 'LOC441848']

['LOC441842' 'VN1R17P']

['LOC441842' 'LOC441860']



In [22]:
# Retrieve the NCBI Gene entry of each gene ID for which the database
# query failed and display it as plain text
failed_gene_IDs = ["644862", "441848", "441931", "441860"]

for failed_gene_ID in failed_gene_IDs:
    # Pause prior to each request (presumably in order not to exceed the
    # NCBI request rate limit - TODO confirm)
    time.sleep(2)

    # With `file_name=None`, the entry is not written to disk; its text
    # is obtained from the returned object via `.getvalue()`
    NCBI_entry_str = entrez.fetch_single_file(
        uids=[failed_gene_ID],
        file_name=None,
        db_name="gene",
        ret_type="",
        ret_mode="text"
    ).getvalue()

    # Each entry is followed by a blank line
    print(NCBI_entry_str, end="\n\n")


1. RPS28P3
Official Symbol: RPS28P3 and Name: ribosomal protein S28 pseudogene 3 [Homo sapiens (human)]
Other Aliases: RPS28_2_172
Chromosome: 1; Location: 1q42.13
Annotation: Chromosome 1 NC_000001.11 (228134782..228136793, complement)
ID: 644862




1. LOC441848
similar to zinc finger protein 113 [Homo sapiens (human)]
Chromosome: 19; Location: 19q13.11
This record was discontinued.
ID: 441848




1. VN1R17P
Official Symbol: VN1R17P and Name: vomeronasal 1 receptor 17 pseudogene [Homo sapiens (human)]
Other Aliases: GPCR
Other Designations: putative G-protein coupled receptor
Chromosome: 1; Location: 1q44
Annotation: Chromosome 1 NC_000001.11 (247237224..247237850, complement)
ID: 441931




1. LOC441860
similar to dJ54B20.4 (novel KRAB box containing C2H2 type zinc finger protein) [Homo sapiens (human)]
Chromosome: 19; Location: 19q13.41
This record was discontinued.
ID: 441860



