In [1]:
"""
The Python script "gene_ID_and_off_gene_symbol_check.py" has been
successfully run on the Hemera HPC cluster. However, on throwing a
glance at the output file and subsequently scrutinising the
"ID_manufacturer" column in the CSV file, several issues emerged.

The first is that the database query failed for four gene IDs, namely
644862, 441848, 441931 and 441860.

The second is that for some perturbation agents, be they siRNA, pooled
siRNA or esiRNA, more than one target gene is listed. In both the column
"ID_manufacturer" and "Name", the individual entries are separated by
semicolons.

The third is that for some perturbation agents, the entry in both
"ID_manufacturer" and "Name" is "Not available". However, as the
total amount of perturbation agents this applies to is manageable and
the catalogue number along with the manufacturer is provided, the
target genes are manually looked up.

As to the first two issues, however, postprocessing is accomplished in
an automated manner.

Apart from that, some records have been discontinued in the NCBI
database, such as the record corresponding to the gene ID 441848. Such
discontinued records contain the sentence "This record was
discontinued". It is decided at a later time how discontinued records
are dealt with.
"""

'\nThe Python script "gene_ID_and_off_gene_symbol_check.py" has been\nsuccessfully run on the Hemera HPC cluster. However, on throwing a\nglance at the output file and subsequently scutinising the\n"ID_manufacturer" column in the CSV file, several issues emerged.\n\nThe first is that the database query failed for four gene IDs, namely\n644862, 441848, 441931 and 441860.\n\nThe second is that for some perturbation agents, be they siRNA, pooled\nsiRNA or esiRNA, more than one target gene is listed. In both the column\n"ID_manufacturer" and "Name", the individual entries are separated by\nsemicolons.\n\nThe third is that for some perturbation agents, the entry in both\n"ID_manufacturer" and "Name" is "Not available". However, as both the\ntotal amount of perturbation agents this applies to is manageable and\nthe catalogue number of along with the manufacturer is provided, the\ntarget genes are manually looked up.\n\nAs to the first two issues, however, postprocessing is accomplished in\na

In [2]:
import time
import string

import numpy as np
import pandas as pd
from biotite.database import entrez

In [3]:
# Read the screen data into memory
# The data type of the columns listed below has to be specified
# manually; all of them are read as strings
string_columns = (
    "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
    "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "Gene_Description",
    "ID",
    "ID_OnTarget_Ensembl_GRCh38_release_87",
    "ID_OnTarget_Merge",
    "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "ID_OnTarget_RefSeq_20170215",
    "ID_manufacturer",
    "Name_alternatives",
    "PLATE_QUALITY_DESCRIPTION",
    "RefSeq_ID_OnTarget_RefSeq_20170215",
    "Seed_sequence_common",
    "WELL_QUALITY_DESCRIPTION",
    "siRNA_error",
    "siRNA_number",
    "Precursor_Name",
)
dtype_dict = {column_name: str for column_name in string_columns}

# Why Pandas rather than Dask: the integer index labels of a Dask
# DataFrame do not increase monotonically from 0, but restart at 0 for
# each partition (Dask subdivides a DataFrame into partitions in order
# to handle large data sets in a memory-efficient way, https://
# docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.reset_
# index.html)
# The resulting duplicated index labels may provoke the error
# `ValueError: cannot reindex on an axis with duplicate labels`
# As the entire data set fits into a Pandas DataFrame, Pandas is
# preferred (this has not been possible in the very beginning, which is
# why Dask was used in the first place)
main_csv_df = pd.read_csv(
    "VacciniaReport_20170223-0958_ZScored_conc_and_NaN_adjusted.csv",
    sep="\t",
    dtype=dtype_dict
)

# Restrict the data to the wells harbouring single siRNAs, pooled siRNAs
# or esiRNAs; a membership test replaces the chain of equality checks
perturbation_well_types = ["SIRNA", "POOLED_SIRNA", "ESIRNA"]
single_pooled_siRNA_and_esiRNA_df = main_csv_df.loc[
    main_csv_df["WellType"].isin(perturbation_well_types)
]

In [4]:
# Count the perturbation agents whose target genes are not specified,
# i.e. whose "ID_manufacturer" entry is the placeholder "Not available"
ID_manufacturer_series = single_pooled_siRNA_and_esiRNA_df.loc[
    :, "ID_manufacturer"
]

target_missing_mask = ID_manufacturer_series.eq("Not available")
n_target_not_specified = int(target_missing_mask.sum())

summary_message = (
    "Amount of perturbation agents for which the target genes are not "
    f"specified: {n_target_not_specified}"
)
print(summary_message)

Amount of perturbation agents for which the target genes are not specified: 22


In [5]:
# Now, address the first two issues in an automated manner (failed
# database query for four gene IDs and the listing of several target
# genes separated by semicolons)
# To this end, the CSV file with the updated gene IDs and official gene
# symbols is also loaded
# Unfortunately, when saving the updated Pandas DataFrame to a CSV file,
# the separator has not been specified, which is why the default
# separator (comma) has been used
# This, however, conflicts with the usage of commata in some entries
# Thus, prior to loading the updated CSV file, the commata acting as
# delimiters have to be replaced with tab stops, whereas commata which
# are part of entries must be retained
# To this end, columns containing commata in their entries have to be
# identified
feature_names = main_csv_df.columns
features_with_commata = []
features_with_commata_indices = []

for i, feature_name in enumerate(feature_names):
    feature_series = main_csv_df[feature_name]
    # Only columns of data type "object" are searched; all other
    # columns are skipped
    if feature_series.dtype != "object":
        continue
    # "pandas.Series.str.contains" performs a substring search, whereas
    # "pandas.Series.isin" would only detect complete matches
    # `regex=False` searches for the literal comma and `na=False` treats
    # missing entries as not containing a comma
    if feature_series.str.contains(",", regex=False, na=False).any():
        features_with_commata.append(feature_name)
        features_with_commata_indices.append(i)


def _excel_column_label(numeric_index):
    """Return the Excel column label for a zero-based column index.

    Excel indexes columns with upper case letters instead of numbers,
    i.e. 0 -> "A", 25 -> "Z", 26 -> "AA", 27 -> "AB" and so forth. In
    contrast to a precomputed look-up table, the conversion works for
    arbitrarily large indices.

    Raises a ValueError for negative indices.
    """
    if numeric_index < 0:
        raise ValueError("The column index must not be negative.")
    label = ""
    remaining = numeric_index + 1
    while remaining > 0:
        remaining, letter_index = divmod(remaining - 1, 26)
        label = string.ascii_uppercase[letter_index] + label
    return label


# For the sake of convenience, the numeric indices are mapped to the
# corresponding alphabetical (Excel) indices
# The mapping covers every column of the DataFrame and at least the 104
# labels ("A" to "CZ") of the previous hard-coded table
alphabet_list = list(string.ascii_uppercase)
n_labels = max(len(feature_names), 4 * len(alphabet_list))
alphabetic_indices_list = [
    _excel_column_label(numeric_index) for numeric_index in range(n_labels)
]
numeric_alphabetic_index_dict = dict(enumerate(alphabetic_indices_list))

# `default` prevents `max` from raising a ValueError in case no column
# contains commata
max_feature_name_length = max(
    map(len, features_with_commata), default=len("Feature name")
)
# The first table column must be wide enough for its own header
name_column_width = max(
    max_feature_name_length, len("Feature name")
) + 1

print(
    "The following features/columns contain commata in their entries:\n",
    "Feature name".ljust(name_column_width),
    "Numeric index".ljust(14),
    "Alphabetical Index\n",
    "-" * (name_column_width + 14 + len("Alphabetical index")),
    sep="",
    end=""
)

for i, feature_name in zip(
    features_with_commata_indices, features_with_commata
):
    print(
        "\n",
        feature_name.ljust(name_column_width),
        str(i).ljust(14),
        _excel_column_label(i),
        sep="",
        end=""
    )

The following features/columns contain commata in their entries:
Feature name      Numeric index Alphabetical Index
--------------------------------------------------
Name_alternatives 30            AE
Gene_Description  62            BK

In [6]:
# Inspect the columns located immediately to the right of the two
# comma-containing columns, i.e. the columns at positions 31 and 63
# If their entries share common characteristics, these can be used to
# tell actual delimiters apart from commata belonging to entries
following_series_1, following_series_2 = (
    main_csv_df[feature_names[position]] for position in (31, 63)
)

unique_vals_series_1 = np.unique(following_series_1.to_numpy())
unique_vals_series_2 = np.unique(following_series_2.to_numpy())

for unique_vals in (unique_vals_series_1, unique_vals_series_2):
    print(unique_vals)

['MultipleTargets' 'NoTargets' 'Not available' 'OK' 'POOLED_SIRNA_ERROR'
 'TargetMismatch' 'Unknown']
['ENST00000000233;ENST00000463733'
 'ENST00000000233;ENST00000463733;ENST00000415666;ENST00000467281;ENST00000489673;ENST00000459680'
 'ENST00000000233;ENST00000463733;ENST00000415666;ENST00000489673;ENST00000464403'
 ... 'ENST00000515849;ENST00000302763;ENST00000355078' 'ENST00000516084'
 'Not available']


In [7]:
# The vast majority of the entries of the second investigated column
# begin with the sequence "ENST"
# Check whether this holds true for every single entry or whether
# exceptions exist
all_entries_start_with_ENST = all(
    entry.startswith("ENST") for entry in unique_vals_series_2
)
print(all_entries_start_with_ENST)

False


In [8]:
# Some entries do not start with the "ENST" sequence
# They are extracted by means of a boolean mask (NumPy boolean
# indexing) and printed for examination
lacks_ENST_prefix = np.array(
    [not entry.startswith("ENST") for entry in unique_vals_series_2],
    dtype=bool
)
outcast_vals_arr = unique_vals_series_2[lacks_ENST_prefix]
print(outcast_vals_arr)

['Not available']


In [None]:
# The columns following the two comma-containing columns exhibit common
# characteristics, which are leveraged in order to distinguish actual
# delimiters from commata belonging to entries
# NOTE(review): if the input file was written by `DataFrame.to_csv` with
# its default quoting, entries containing commata are enclosed in double
# quotes and `pd.read_csv` might be able to parse the file directly --
# confirm before relying on this manual conversion

# Zero-based positions of the two columns the entries of which contain
# commata ("Name_alternatives" and "Gene_Description")
NAME_ALTERNATIVES_COLUMN = 30
GENE_DESCRIPTION_COLUMN = 62

# List comprising unique values of column 31, i.e. the column following
# column 30 ("Name_alternatives")
siRNA_error_options = [
    "MultipleTargets",
    "NoTargets",
    "Not available",
    "OK",
    "POOLED_SIRNA_ERROR",
    "TargetMismatch",
    "Unknown"
]


def _is_column_31_entry(fragment):
    """Tell whether `fragment` is a valid entry of column 31."""
    return fragment in siRNA_error_options


def _is_column_63_entry(fragment):
    """Tell whether `fragment` is a valid entry of column 63, i.e. is
    either "Not available" or starts with "ENST"."""
    return fragment == "Not available" or fragment[:4] == "ENST"


def _merge_entry_fragments(fragments, column, is_next_column_entry):
    """Re-join the fragments of the entry located at position `column`.

    Splitting a line at every comma tears entries containing commata
    into several fragments. Starting at position `column`, all fragments
    up to (but excluding) the first fragment recognised by
    `is_next_column_entry` belong to one and the same entry and are
    joined with commata again. The entry itself comprises at least one
    fragment, which is why the search starts at `column + 1`.

    Returns a new list; raises an IndexError if no fragment is
    recognised as entry of the following column.
    """
    end = column + 1
    while not is_next_column_entry(fragments[end]):
        end += 1
    return (
        fragments[:column]
        + [",".join(fragments[column:end])]
        + fragments[end:]
    )


# Iterate through the lines, modify them accordingly and write the
# adjusted lines to a new output file
# Note that reading all the lines into memory at once via `.readlines()`
# provokes an Out Of Memory error, which is why the file lines are
# iterated over on the fly
# The output file carries the name under which the tab-separated file
# is loaded in the subsequent cell, so that the notebook can be run
# from top to bottom without manually renaming files
with open(
    "Vaccinia_Report_NCBI_Gene_IDs_and_official_gene_symbols_updated.csv",
    "r"
) as prior_tab_intro_file, open(
    "Vaccinia_Report_NCBI_Gene_IDs_and_official_gene_symbols_"
    "updated_with_tab_stops.csv",
    "w",
    newline=""
) as post_tab_intro_file:
    for line_number, line in enumerate(prior_tab_intro_file):
        # The first line represents the header, i.e. contains the column
        # names; thus, all commata represent actual delimiters and are
        # simply replaced with tab stops
        if line_number == 0:
            post_tab_intro_file.write(line.replace(",", "\t"))
            continue

        fragments = line.split(",")
        try:
            # First, deal with column 30 ("Name_alternatives")
            fragments = _merge_entry_fragments(
                fragments, NAME_ALTERNATIVES_COLUMN, _is_column_31_entry
            )
            # As the fragments of column 30 have been merged, all
            # positions up to column 62 ("Gene_Description") now
            # coincide with the true column positions, so that no
            # offset has to be taken into account
            fragments = _merge_entry_fragments(
                fragments, GENE_DESCRIPTION_COLUMN, _is_column_63_entry
            )
        except IndexError as error:
            raise ValueError(
                f"Line {line_number + 1} does not match the expected "
                "column layout."
            ) from error

        # All remaining boundaries between list elements are actual
        # delimiters and are written as tab stops
        # Iterating over a file does not trim line endings, which is why
        # the last fragment still carries the newline character (\n)
        post_tab_intro_file.write("\t".join(fragments))

In [9]:
# Load the CSV file with the updated gene IDs and official gene symbols
# in its tab-separated form, using the same manually specified data
# types as for the original screen data
updated_csv_path = (
    "Vaccinia_Report_NCBI_Gene_IDs_and_official_gene_symbols_"
    "updated_with_tab_stops.csv"
)
updated_main_csv_df = pd.read_csv(
    updated_csv_path, sep="\t", dtype=dtype_dict
)

In [10]:
# I have a hunch that I might have made a mistake: I assumed that the
# two columns "ID_manufacturer" and "ID" refer to the same thing
# This, however, is not necessarily the case, as it is also conceivable
# that siRNA directed against a specific target has been employed in
# order to knock down another, maybe structurally related target
# Therefore, it is investigated whether this indeed is the case or
# whether the siRNAs have exclusively been used for the targets
# specified by the manufacturer
# To this end, for each unique value of the column "ID_manufacturer",
# the amount of unique values occurring in the column "Name" in
# conjunction with that "ID_manufacturer" value is determined
# Bear in mind that the original, non-updated CSV file has to be used
# for this purpose
unique_manufacturer_IDs = np.unique(main_csv_df["ID_manufacturer"])

# A single grouped pass over the DataFrame replaces the loop which
# scanned the entire DataFrame once per unique manufacturer ID
# The groups are sorted by manufacturer ID, i.e. the resulting list has
# the same order as `unique_manufacturer_IDs`
n_unique_names_per_ID = main_csv_df.groupby(
    "ID_manufacturer", sort=True
)["Name"].nunique()

non_unique_list = n_unique_names_per_ID.index[
    n_unique_names_per_ID > 1
].to_list()

print(
    "Amount of manufacturer IDs occurring in conjunction with more "
    f"than one value in the \"Name\" column: {len(non_unique_list)}"
)

Amount of manufacturer IDs occurring in conjunction with more than one value in the "Name" column: 1229


In [None]:
# Write the manufacturer IDs co-occurring with multiple "Name" values to
# a text file, one ID per line and without a trailing newline
output_path = "manufacturer_IDs_co-occurring_with_multiple_names.txt"
with open(output_path, "w") as f:
    f.write("\n".join(non_unique_list))

In [11]:
# Closer scrutiny of the affected manufacturer IDs reveals that the
# occurrence of multiple "Name" values stems from the usage of aliases
# for one and the same gene
# Thus, this issue does not have to be further addressed, as it had been
# resolved by the database query anyway
# Apart from that, intentionally targeting genes other than those the
# respective siRNA is directed against defies the whole point of this
# research endeavour, which is to reliably filter out off-target effects

# Moreover, throwing a closer glance at the four gene IDs for which the
# database query failed reveals that for all four gene IDs, one wrong
# record has consistently been retrieved, which is LOC441842 (gene ID
# 441842)
# Thus, the database query is simply repeated for those four gene IDs
# Yet another observation is that even though the respective gene IDs
# are not listed in the output file of the SLURM job, some rows have an
# error message as "Name" value
# Two error messages could be identified thus far, which are
# "ror%3A+External+viewer+error%3A+Empty+Response.+Bytes+read%3A+0+
# Status%3A+Timeout" as well as "OCTYPE html PUBLIC "-//W3C//DTD XHTML
# 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> "
# The database query also has to be repeated for those rows

# Deal with the four gene IDs listed in the output file of the SLURM job
# Conveniently enough, the database query only has to be retried for one
# of them (the first) since the other three have multiple occurrences in
# the CSV file, all but one of which co-occur with the correct "Name"
# value
# This allows to simply manually set the "Name" value for those three
# gene IDs
failed_gene_IDs = ["644862", "441848", "441931", "441860"]
# Maps the position within `failed_gene_IDs` to the correct "Name" value
correct_name_value_dict = {
    1: "LOC441848",
    2: "VN1R17P",
    3: "LOC441860"
}

for i, failed_gene_ID in enumerate(failed_gene_IDs):
    if i == 0:
        try:
            NCBI_entry = entrez.fetch_single_file(
                uids=[failed_gene_ID],
                file_name=None,
                db_name="gene",
                ret_type="",
                ret_mode="text"
            )
        except Exception:
            # Without a retrieved record, there is nothing to parse;
            # re-raise the actual error instead of running into a
            # NameError on the undefined `NCBI_entry` further below
            print("Database query failed!")
            raise
        NCBI_entry_string = NCBI_entry.getvalue()

        # Remove blank lines from the string retrieved from the NCBI
        # entry
        NCBI_entry_string_list = [
            entry_line
            for entry_line in NCBI_entry_string.split("\n")
            if entry_line != ""
        ]
        if not NCBI_entry_string_list:
            raise ValueError(
                "The NCBI record retrieved for gene ID "
                f"{failed_gene_ID} is empty."
            )

        # Following the removal of empty strings, the official gene
        # symbol is represented by the first list element, but it is
        # preceded by the string "1. ", which encompasses three
        # characters
        # Hence, the first list element has to be sliced accordingly
        official_gene_symbol = NCBI_entry_string_list[0][3:]
        correct_name = official_gene_symbol
    else:
        correct_name = correct_name_value_dict[i]

    updated_main_csv_df.loc[
        updated_main_csv_df["ID_manufacturer"] == failed_gene_ID,
        "Name"
    ] = correct_name

In [12]:
# List every unique value of the "Name" column, one per line, so that
# potential error messages beyond the two mentioned above can be spotted
unique_names = np.unique(updated_main_csv_df["Name"].to_numpy())
if unique_names.size > 0:
    print(*unique_names, sep="\n")

A1BG
A1CF
A2M
A2ML1
A3GALT2
A4GALT
A4GNT
AAAS
AACS
AACSL
AADAC
AADACL2
AADACL4
AADACP1
AADAT
AAGAB
AAK1
AAMDC
AAMP
AANAT
AAR2
AARD
AARS1
AARS2
AARSD1
AASDH
AASDHPPT
AASS
AATBC
AATF
AATK
ABAT
ABCA1
ABCA10
ABCA11P
ABCA12
ABCA13
ABCA17P
ABCA2
ABCA3
ABCA3P1
ABCA4
ABCA5
ABCA6
ABCA7
ABCA8
ABCA9
ABCB1
ABCB10
ABCB11
ABCB4
ABCB5
ABCB6
ABCB7
ABCB8
ABCB9
ABCC1
ABCC10
ABCC11
ABCC12
ABCC13
ABCC2
ABCC3
ABCC4
ABCC5
ABCC6
ABCC8
ABCC9
ABCD1
ABCD2
ABCD3
ABCD4
ABCE1
ABCF1
ABCF2
ABCF3
ABCG1
ABCG2
ABCG4
ABCG5
ABCG8
ABHD1
ABHD10
ABHD11
ABHD12
ABHD12B
ABHD13
ABHD14A
ABHD14B
ABHD15
ABHD16A
ABHD16B
ABHD17A
ABHD17AP8
ABHD17B
ABHD17C
ABHD18
ABHD2
ABHD3
ABHD4
ABHD5
ABHD6
ABHD8
ABI1
ABI2
ABI3
ABI3BP
ABITRAM
ABITRAMP1
ABL1
ABL2
ABLIM1
ABLIM2
ABLIM3
ABO
ABR
ABRA
ABRACL
ABRAXAS1
ABRAXAS2
ABT-869Linifanib
ABT1
ABTB1
ABTB2
ABTB3
AC-220
ACAA1
ACAA2
ACACA
ACACB
ACAD10
ACAD11
ACAD8
ACAD9
ACADL
ACADM
ACADS
ACADSB
ACADVL
ACAN
ACAP1
ACAP2
ACAP3
ACAT1
ACAT2
ACBD3
ACBD4
ACBD5
ACBD6
ACBD7
ACCS
ACCSL
ACD
ACE
ACE2
ACER1
ACER2
ACE

In [13]:
# A closer examination of the unique "Name" values indeed revealed that
# a third, hitherto unnoticed error message exists
# The three identified error messages are as follows:
# OCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
# ror%3A+External+viewer+error%3A+Empty+Response.+Bytes+read%3A+0+Status%3A+Timeout
# ror%3A+External+viewer+error%3A+Empty+Response.+Bytes+read%3A+0+Status%3A+Unknown

In [14]:
# Count the rows the "Name" value of which is an error message
name_series = updated_main_csv_df["Name"]

# Two of the three error messages share the substring below; as the plus
# sign is supposed to be searched for as a literal character, the
# `regex` keyword argument has to be set to False
has_viewer_error = name_series.str.contains(
    "External+viewer+error",
    regex=False
)
# The third error message is recognised by its HTML doctype remnant
has_doctype_error = name_series.str.contains(
    "OCTYPE html PUBLIC",
    regex=False
)

n_error_message_rows = np.count_nonzero(
    has_viewer_error | has_doctype_error
)

error_row_message = (
    "Amount of rows the \"Name\" value of which has been assigned to "
    f"an error message: {n_error_message_rows}"
)
print(error_row_message)

Amount of rows the "Name" value of which has been assigned to an error message: 1553


In [15]:
# Count the error message rows which additionally belong to the
# single/pooled siRNA and esiRNA subset; if the count equals the total
# amount of error message rows, all of them are comprised in the subset
name_series = updated_main_csv_df["Name"]
is_error_message = (
    name_series.str.contains("External+viewer+error", regex=False)
    |
    name_series.str.contains("OCTYPE html PUBLIC", regex=False)
)
is_in_subset = updated_main_csv_df["WellType"].isin(
    ["SIRNA", "POOLED_SIRNA", "ESIRNA"]
)

n_error_message_rows_in_subset = np.count_nonzero(
    is_error_message & is_in_subset
)

subset_message = (
    "Amount of rows in the single/pooled siRNA and esiRNA subset the "
    "\"Name\" value of which has been assigned to an error message: "
    f"{n_error_message_rows_in_subset}"
)
print(subset_message)

Amount of rows in the single/pooled siRNA and esiRNA subset the "Name" value of which has been assigned to an error message: 1553


In [16]:
# Determine the rows the "Name" value of which has been assigned to an
# error message
# `regex=False` searches for the plus sign as a literal character and
# `na=False` treats missing names as not being an error message
name_series = updated_main_csv_df["Name"]
is_error_message = (
    name_series.str.contains(
        "External+viewer+error",
        regex=False,
        na=False
    )
    |
    name_series.str.contains(
        "OCTYPE html PUBLIC",
        regex=False,
        na=False
    )
)

# The row positions (rather than the index labels) are stored, as the
# rows are accessed via `.iloc`, which is purely position-based; for the
# default integer index, positions and labels coincide
error_indices = np.flatnonzero(is_error_message.to_numpy()).tolist()

# As many gene IDs occur multiple times in the data set, it is checked
# whether the database query has been successful for the respective gene
# ID at another location; otherwise, the database query is repeated
# The amount of unique non-error names per gene ID is computed once in a
# single grouped pass instead of scanning the entire DataFrame for each
# error row
n_non_error_names_per_ID = (
    updated_main_csv_df.loc[~is_error_message]
    .groupby("ID_manufacturer")["Name"]
    .nunique()
)

error_gene_IDs = updated_main_csv_df["ID_manufacturer"].iloc[
    error_indices
]
# Gene IDs exclusively co-occurring with error messages are absent from
# the grouped result and therefore yield 0
n_unique_non_error_names_list = [
    int(n_non_error_names_per_ID.get(gene_ID, 0))
    for gene_ID in error_gene_IDs
]

print(all(n <= 1 for n in n_unique_non_error_names_list))

False


In [17]:
# Flag the error rows the gene ID of which co-occurs with more than one
# unique non-error "Name" value even after the standardisation, and
# count them
non_unique_boolean_list = (
    np.asarray(n_unique_non_error_names_list) > 1
).tolist()
n_non_unique_non_error_name_after_update = sum(non_unique_boolean_list)

non_unique_message = (
    "Amount of gene IDs co-occurring with more than unique non-error "
    "\"Name\"\nvalues even after the database query: "
    f"{n_non_unique_non_error_name_after_update}"
)
print(non_unique_message)

Amount of gene IDs co-occurring with more than unique non-error "Name"
values even after the database query: 1


In [18]:
# A single gene ID is affected; look up which one it is via the position
# of the first flagged error row
idx = non_unique_boolean_list.index(True)
trouble_row = error_indices[idx]
trouble_gene_ID = updated_main_csv_df["ID_manufacturer"].iloc[
    trouble_row
]
print(f"The gene ID causing trouble is {trouble_gene_ID}.")

The gene ID causing trouble is 3832.


In [19]:
# List the individual "Name" values gene ID 3832 co-occurs with
is_trouble_gene = updated_main_csv_df["ID_manufacturer"] == "3832"
trouble_names = np.unique(
    updated_main_csv_df.loc[is_trouble_gene, "Name"]
)

if trouble_names.size > 0:
    print(*trouble_names, sep="\n")

KIF11
Kif11
ror%3A+External+viewer+error%3A+Empty+Response.+Bytes+read%3A+0+Status%3A+Timeout


In [20]:
# KIF11 is the official gene symbol for gene ID 3832, which is why the
# spelling containing lowercase letters is replaced with the uppercase
# one (the inconsistency arises because the database query has been
# confined to the single/pooled siRNA and esiRNA subset of the VACV
# screen)
has_lowercase_spelling = updated_main_csv_df["Name"].eq("Kif11")
updated_main_csv_df.loc[has_lowercase_spelling, "Name"] = "KIF11"

In [21]:
# Now, verify that all gene IDs co-occur with only one unique "Name"
# value (apart from error messages)
# The error message mask does not depend on the manufacturer ID, which
# is why it is computed only once instead of once per manufacturer ID
# `regex=False` searches for the plus sign as a literal character and
# `na=False` treats missing names as not being an error message
name_series = updated_main_csv_df["Name"]
is_error_message = (
    name_series.str.contains(
        "External+viewer+error",
        regex=False,
        na=False
    )
    |
    name_series.str.contains(
        "OCTYPE html PUBLIC",
        regex=False,
        na=False
    )
)

# Amount of unique non-error names per manufacturer ID, determined in a
# single grouped pass over the DataFrame
n_non_error_names_per_ID = (
    updated_main_csv_df.loc[~is_error_message]
    .groupby("ID_manufacturer")["Name"]
    .nunique()
)

# Manufacturer IDs without any non-error name are absent from the
# grouped result and therefore yield 0
non_unique_list = [
    manufacturer_ID
    for manufacturer_ID in unique_manufacturer_IDs
    if n_non_error_names_per_ID.get(manufacturer_ID, 0) > 1
]

In [22]:
# Show how many manufacturer IDs are still affected, followed by the
# affected IDs themselves
print(len(non_unique_list), non_unique_list, sep="\n")

5
['2475', '5298', 'MIMAT0001630', 'MIMAT0015081', 'Not available']


In [23]:
# The problem still persists with the following gene IDs:
# 2475, 5298, MIMAT0001630, MIMAT0015081 and Not available
# Determine the different names for each of them except "Not available"
# The placeholder is removed by value rather than by position, so that
# running this cell repeatedly does not discard actual gene IDs
non_unique_list = [
    manufacturer_ID
    for manufacturer_ID in non_unique_list
    if manufacturer_ID != "Not available"
]

for gene_ID in non_unique_list:
    names = np.unique(
        updated_main_csv_df.loc[
            updated_main_csv_df["ID_manufacturer"] == gene_ID, "Name"
        ]
    )
    print(
        f"Unique names co-occurring with gene ID {gene_ID}: "
        f"{names}"
    )

Unique names co-occurring with gene ID 2475: ['FRAP1' 'MTOR']
Unique names co-occurring with gene ID 5298: ['PI4KB' 'PIK4CB']
Unique names co-occurring with gene ID MIMAT0001630: ['HSA-MIR-323B-5P' 'HSA-MIR-453']
Unique names co-occurring with gene ID MIMAT0015081: ['HSA-MIR-548X' 'HSA-MIR-548X-3P']


In [24]:
# Assign the official gene symbols MTOR (gene ID 2475) and PI4KB (gene
# ID 5298) to all rows of the respective gene ID
# The two remaining IDs are left untouched as they represent miRNAs
official_symbol_by_gene_ID = {"2475": "MTOR", "5298": "PI4KB"}

for gene_ID, off_gene_symbol in official_symbol_by_gene_ID.items():
    has_gene_ID = updated_main_csv_df["ID_manufacturer"] == gene_ID
    updated_main_csv_df.loc[has_gene_ID, "Name"] = off_gene_symbol

In [25]:
# Persist the adjustments made so far as a tab-separated file without
# the index column
intermediate_csv_path = "Vaccinia_Report_intermediate_postprocessing.csv"
updated_main_csv_df.to_csv(intermediate_csv_path, sep="\t", index=False)

In [None]:
# Eventually, address the failed database queries
# To this end, a Python script is created, which is executed on the
# Hemera HPC cluster as this task requires roughly an hour

In [None]:
# Following the standardisation of the "Name" values via database
# queries and the repeated query for those rows for which the initial
# query failed, the phenomenon of ... should no longer be observable