In [None]:
"""
The Python script "gene_ID_and_off_gene_symbol_check.py" has been
successfully run on the Hemera HPC cluster. However, on glancing at the
output file and subsequently scrutinising the "ID_manufacturer" column
in the CSV file, several issues emerged.

The first is that the database query failed for four gene IDs, namely
644862, 441848, 441931 and 441860.

The second is that for some perturbation agents, be they siRNA, pooled
siRNA or esiRNA, more than one target gene is listed. In both the column
"ID_manufacturer" and "Name", the individual entries are separated by
semicolons.

The third is that for some perturbation agents, the entry in both
"ID_manufacturer" and "Name" is "Not available". However, as the total
number of perturbation agents this applies to is manageable and both the
catalogue number and the manufacturer are provided, the target genes are
looked up manually.

As to the first two issues, however, postprocessing is accomplished in
an automated manner.

Apart from that, some records have been discontinued in the NCBI
database, such as the record corresponding to the gene ID 441848. Such
discontinued records contain the sentence "This record was
discontinued". It is decided at a later time how discontinued records
are dealt with.
-> Artur says: Take them in separate subtable, analyse them at later time
-> Look for discussion forums online dealing with such issues, maybe open
a thread myself
"""

In [1]:
import time
import string

import numpy as np
import pandas as pd
from biotite.database import entrez
from biotite.database import uniprot

In [2]:
# Read the screen data into memory
# The columns listed below have to be read as strings explicitly, as
# their data type cannot be left to automatic inference
string_typed_columns = (
    "Ensembl_ID_OnTarget_Ensembl_GRCh38_release_87",
    "Ensembl_ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "Gene_Description",
    "ID",
    "ID_OnTarget_Ensembl_GRCh38_release_87",
    "ID_OnTarget_Merge",
    "ID_OnTarget_NCBI_HeLa_phs000643_v3_p1_c1_HMB",
    "ID_OnTarget_RefSeq_20170215",
    "ID_manufacturer",
    "Name_alternatives",
    "PLATE_QUALITY_DESCRIPTION",
    "RefSeq_ID_OnTarget_RefSeq_20170215",
    "Seed_sequence_common",
    "WELL_QUALITY_DESCRIPTION",
    "siRNA_error",
    "siRNA_number",
    "Precursor_Name",
)
dtype_dict = dict.fromkeys(string_typed_columns, str)

# Why Pandas rather than Dask: the integer index labels of a Dask
# DataFrame do not increase monotonically from 0, but restart at 0 for
# each partition (Dask subdivides a DataFrame into partitions in order
# to handle large data sets in a memory-efficient way, see
# https://docs.dask.org/en/stable/generated/
# dask.dataframe.DataFrame.reset_index.html)
# The resulting duplicated index labels may provoke
# `ValueError: cannot reindex on an axis with duplicate labels`
# As the entire data set fits into a Pandas DataFrame, Pandas is
# preferred (this had not been possible in the very beginning, which is
# why Dask was used in the first place)
main_csv_df = pd.read_csv(
    "VacciniaReport_20170223-0958_ZScored_conc_and_NaN_adjusted.csv",
    sep="\t",
    dtype=dtype_dict
)

# Retain only the wells treated with a perturbation agent, i.e. single
# siRNA, pooled siRNA or esiRNA
perturbation_agent_well_types = ["SIRNA", "POOLED_SIRNA", "ESIRNA"]
single_pooled_siRNA_and_esiRNA_df = main_csv_df.loc[
    main_csv_df["WellType"].isin(perturbation_agent_well_types)
]

In [None]:
# Count the perturbation agents without a specified target gene, i.e.
# those whose "ID_manufacturer" entry is the placeholder "Not available"
ID_manufacturer_series = single_pooled_siRNA_and_esiRNA_df.loc[
    :, "ID_manufacturer"
]

target_missing_mask = ID_manufacturer_series.eq("Not available")
n_target_not_specified = np.count_nonzero(target_missing_mask)

print(
    "Amount of perturbation agents for which the target genes are not "
    f"specified: {n_target_not_specified}"
)

In [None]:
# Now, address the first two issues in an automated manner (failed
# database query for four gene IDs and the listing of several target
# genes separated by semicolons)
# To this end, the CSV file with the updated gene IDs and official gene
# symbols is also loaded
# Unfortunately, when saving the updated Pandas DataFrame to a CSV file,
# the separator has not been specified, which is why the default
# separator (a comma) has been used
# This, however, conflicts with the usage of commata in some entries
# Thus, prior to loading the updated CSV file, the commata acting as
# delimiters have to be replaced with tab stops, whereas commata that
# are part of entries must be left alone
# To this end, columns containing commata in their entries have to be
# identified


def _excel_column_label(column_index):
    """Convert a zero-based column index into Excel's letter label.

    The mapping is 0 -> "A", 25 -> "Z", 26 -> "AA", 103 -> "CZ",
    701 -> "ZZ", 702 -> "AAA" and so forth, i.e. it works for an
    arbitrary amount of columns.
    """
    label = ""
    remaining = column_index + 1
    while remaining > 0:
        remaining, letter_position = divmod(remaining - 1, 26)
        label = string.ascii_uppercase[letter_position] + label
    return label


feature_names = main_csv_df.columns
features_with_commata = []
features_with_commata_indices = []

for i, feature_name in enumerate(feature_names):
    feature_series = main_csv_df[feature_name]
    # Only string columns (stored with the "object" data type) can
    # contain commata
    if feature_series.dtype != "object":
        continue
    # "pandas.Series.str.contains" checks for the presence of a
    # substring, whereas "pandas.Series.isin" only verifies complete
    # matches between column entries and query strings
    # `na=False` treats missing entries as not containing a comma, so
    # that the result is a plain boolean Series
    if feature_series.str.contains(",", regex=False, na=False).any():
        features_with_commata.append(feature_name)
        features_with_commata_indices.append(i)

# Excel uses upper case letters instead of numbers in order to index
# columns
# Hence, for the sake of convenience, the numeric indices are
# simultaneously mapped to the corresponding alphabetical indices
# The table covers every column of the data set; at least the 104
# labels "A" to "CZ" are always provided
alphabet_list = list(string.ascii_uppercase)
n_alphabetic_indices = max(len(feature_names), 104)
alphabetic_indices_list = [
    _excel_column_label(numeric_index)
    for numeric_index in range(n_alphabetic_indices)
]
numeric_alphabetic_index_dict = dict(enumerate(alphabetic_indices_list))

# `default=0` keeps the cell from crashing in case no column contains
# commata at all; in that case, merely the table header is printed
max_feature_name_length = max(
    map(len, features_with_commata), default=0
)

print(
    "The following features/columns contain commata in their entries:\n",
    "Feature name".ljust(max_feature_name_length + 1),
    "Numeric index".ljust(14),
    "Alphabetical Index\n",
    "-" * (max_feature_name_length + 1 + 14 + len("Alphabetical index")),
    sep="",
    end=""
)

# Each table row is preceded by a line break instead of being followed
# by one, so that the output does not end with an empty line
for i, feature_name in zip(
    features_with_commata_indices, features_with_commata
):
    print(
        "\n",
        feature_name.ljust(max_feature_name_length + 1),
        str(i).ljust(14),
        numeric_alphabetic_index_dict[i],
        sep="",
        end=""
    )

In [None]:
# Inspect the columns located immediately behind the two columns whose
# entries contain commata
# Ideally, their entries share characteristics that allow telling
# actual delimiters apart from commata belonging to entries
successor_column_positions = (31, 63)
following_series_1, following_series_2 = (
    main_csv_df[feature_names[column_position]]
    for column_position in successor_column_positions
)

unique_vals_series_1 = np.unique(following_series_1)
unique_vals_series_2 = np.unique(following_series_2)

print(unique_vals_series_1, unique_vals_series_2, sep="\n")

In [None]:
# The vast majority of the entries of the second investigated column
# begin with the sequence "ENST"
# Check whether this holds true for all entries without exception
print(
    all(entry.startswith("ENST") for entry in unique_vals_series_2)
)

In [None]:
# Not every entry starts with the "ENST" sequence
# The exceptions are extracted via NumPy's boolean indexing and examined
lacks_ENST_prefix = [
    not entry.startswith("ENST") for entry in unique_vals_series_2
]
outcast_vals_arr = unique_vals_series_2[lacks_ENST_prefix]
print(outcast_vals_arr)

In [None]:
# The entries of the columns following the comma-containing columns
# indeed share characteristics that allow distinguishing actual
# delimiters from commata belonging to entries

# Unique values of column 31, i.e. the column following column 30
# ("Name_alternatives")
siRNA_error_options = [
    "MultipleTargets",
    "NoTargets",
    "Not available",
    "OK",
    "POOLED_SIRNA_ERROR",
    "TargetMismatch",
    "Unknown",
]

# The file is processed line by line, as reading all lines into memory
# at once via `.readlines()` provokes an Out Of Memory error
# NOTE(review): the next cell reads
# "Vaccinia_Report_NCBI_Gene_IDs_and_official_gene_symbols_updated_with_
# tab_stops.csv" rather than "adjusted_file.csv"; presumably the output
# file was renamed by hand -- confirm
with open(
    "Vaccinia_Report_NCBI_Gene_IDs_and_official_gene_symbols_updated.csv",
    "r"
) as prior_tab_intro_file, open(
    "adjusted_file.csv", "w", newline=""
) as post_tab_intro_file:
    for line_number, line in enumerate(prior_tab_intro_file):
        # The first line is the header holding the column names; all of
        # its commata are delimiters and can be replaced wholesale
        if line_number == 0:
            post_tab_intro_file.write(line.replace(",", "\t"))
            continue

        # Splitting at each and every comma also tears apart entries
        # that contain commata themselves
        # `separators[k]` is the character to be placed between
        # `fields[k]` and `fields[k + 1]`; it defaults to a tab stop and
        # is reset to a comma wherever the comma belongs to an entry
        fields = line.split(",")
        separators = ["\t"] * (len(fields) - 1)

        # Column 30 ("Name_alternatives", zero-based) has at least one
        # entry, which is why the first field that may represent the
        # subsequent column is field 31
        # As long as the inspected field is not a valid value of that
        # subsequent column, it still belongs to "Name_alternatives",
        # and so does the comma in front of it
        field_position = 31
        n_entry_commata = 0
        while fields[field_position] not in siRNA_error_options:
            separators[field_position - 1] = ","
            n_entry_commata += 1
            field_position += 1

        # The same procedure is applied to column 62
        # ("Gene_Description"), the successor of which either is "Not
        # available" or starts with "ENST"
        # The start position is shifted by the amount of entry commata
        # found so far
        field_position = 63 + n_entry_commata
        while not (
            fields[field_position] == "Not available"
            or fields[field_position].startswith("ENST")
        ):
            separators[field_position - 1] = ","
            field_position += 1

        # Reassemble the line; the line ending is still attached to the
        # last field, so no newline character has to be added
        post_tab_intro_file.write(
            "".join(
                field + separator
                for field, separator in zip(fields, separators + [""])
            )
        )

In [9]:
# Load the CSV file with the updated gene IDs and official gene symbols,
# in which the delimiters are tab stops
tab_separated_csv_path = (
    "Vaccinia_Report_NCBI_Gene_IDs_and_official_gene_symbols_"
    "updated_with_tab_stops.csv"
)
updated_main_csv_df = pd.read_csv(
    tab_separated_csv_path, sep="\t", dtype=dtype_dict
)

In [None]:
# I have a hunch that I might have made a mistake: I assumed that the
# two columns "ID_manufacturer" and "ID" refer to the same thing
# This, however, is not necessarily the case, as it is also conceivable
# that siRNA directed against a specific target has been employed in
# order to knock down another, maybe structurally related target
# Therefore, it is investigated whether this indeed is the case or
# whether the siRNAs have exclusively been used for the targets
# specified by the manufacturer
# To this end, the amount of unique "Name" values is determined for
# each unique value of the column "ID_manufacturer"
# Bear in mind that the original, non-updated CSV file has to be used
# for this purpose
unique_manufacturer_IDs = np.unique(main_csv_df["ID_manufacturer"])

# A single grouped aggregation replaces filtering the entire data set
# once per manufacturer ID
# The groups are sorted by their key, i.e. they appear in the same
# order as in `unique_manufacturer_IDs`
n_names_per_manufacturer_ID = main_csv_df.groupby(
    "ID_manufacturer", sort=True
)["Name"].nunique(dropna=False)

non_unique_list = n_names_per_manufacturer_ID.index[
    n_names_per_manufacturer_ID > 1
].to_list()

print(
    "Amount of manufacturer IDs occurring in conjunction with more "
    f"than one value in the \"Name\" column: {len(non_unique_list)}"
)

In [None]:
# Write the manufacturer IDs co-occurring with multiple "Name" values
# to a file, one ID per line and without a trailing line break
with open("manufacturer_IDs_co-occurring_with_multiple_names.txt", "w") as f:
    f.write("\n".join(non_unique_list))

In [11]:
# Closer scrutiny of the affected manufacturer IDs reveals that the
# occurrence of multiple "Name" values stems from the usage of aliases
# for one and the same gene
# Thus, this issue does not have to be further addressed, as it had been
# resolved by the database query anyway
# Apart from that, intentionally targeting genes other than those the
# respective siRNA is directed against defies the whole point of this
# research endeavour, which is to reliably filter out off-target effects

# Moreover, throwing a closer glance at the four gene ID for which the
# database query failed reveals that for all four gene IDs, one wrong
# record has consistently been retrieved, which is LOC441842 (gene ID
# 441842)
# Thus, the database query is simply repeated for those four gene IDs
# Yet another observation is that even though the respective gene IDs
# are not listed in the output file of the SLURM job, some rows have an
# error message as "Name" value
# Two error messages could be identified thus far, which are
# "ror%3A+External+viewer+error%3A+Empty+Response.+Bytes+read%3A+0+
# Status%3A+Timeout" as well as "OCTYPE html PUBLIC "-//W3C//DTD XHTML
# 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> "
# The database query also has to be repeated for those rows

# Deal with the four gene IDs listed in the output file of the SLURM job
# Conveniently enough, the database query only has to be retried for one
# of them (the first) since the other three have multiple occurrences in
# the CSV file, all but one of which co-occur with the correct "Name"
# value
# This allows to simply manually set the "Name" value for those three
# gene IDs
failed_gene_IDs = ["644862", "441848", "441931", "441860"]
# Keys are the positions of the gene IDs within `failed_gene_IDs`
correct_name_value_dict = {
    1: "LOC441848",
    2: "VN1R17P",
    3: "LOC441860"
}

for i, failed_gene_ID in enumerate(failed_gene_IDs):
    if i == 0:
        try:
            NCBI_entry = entrez.fetch_single_file(
                uids=[failed_gene_ID],
                file_name=None,
                db_name="gene",
                ret_type="",
                ret_mode="text"
            )
        except Exception:
            # Without a response, there is nothing to parse; the rows
            # of this gene ID are left untouched instead of running
            # into an undefined (or stale) `NCBI_entry`
            print("Database query failed!")
            continue

        # Remove blank lines from the string retrieved from the NCBI
        # entry
        NCBI_entry_string_list = [
            entry_line
            for entry_line in NCBI_entry.getvalue().split("\n")
            if entry_line != ""
        ]
        if not NCBI_entry_string_list:
            # An empty response does not contain a gene symbol either
            print("Database query failed!")
            continue

        # Following the removal of empty strings, the official gene
        # symbol is represented by the first list element, but it is
        # preceded by the string "1. ", which encompasses three
        # characters
        # Hence, the first list element has to be sliced accordingly
        official_gene_symbol = NCBI_entry_string_list[0][3:]

        updated_main_csv_df.loc[
            updated_main_csv_df["ID_manufacturer"] == failed_gene_ID,
            "Name"
        ] = official_gene_symbol
    else:
        correct_name = correct_name_value_dict[i]
        updated_main_csv_df.loc[
            updated_main_csv_df["ID_manufacturer"] == failed_gene_ID,
            "Name"
        ] = correct_name

In [None]:
# List the unique values of the "Name" column, one per line, in order
# to spot potential further error messages beyond the two mentioned
# above
unique_names = np.unique(updated_main_csv_df["Name"])
if len(unique_names) > 0:
    print(*unique_names, sep="\n")

In [13]:
# A closer examination of the unique "Name" values indeed revealed that
# a third, hitherto unnoticed error message exists
# The three identified error messages are as follows:
# OCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
# ror%3A+External+viewer+error%3A+Empty+Response.+Bytes+read%3A+0+Status%3A+Timeout
# ror%3A+External+viewer+error%3A+Empty+Response.+Bytes+read%3A+0+Status%3A+Unknown

In [None]:
# Count the rows whose "Name" value is an error message rather than a
# gene symbol
# Two of the three error messages share the substring
# "External+viewer+error"; `regex=False` is required in order to treat
# the plus signs as literal characters
name_is_viewer_error = updated_main_csv_df["Name"].str.contains(
    "External+viewer+error", regex=False
)
name_is_html_residue = updated_main_csv_df["Name"].str.contains(
    "OCTYPE html PUBLIC", regex=False
)
n_error_message_rows = np.count_nonzero(
    name_is_viewer_error | name_is_html_residue
)

print(
    "Amount of rows the \"Name\" value of which has been assigned to "
    f"an error message: {n_error_message_rows}"
)

In [None]:
# Check whether all those 1553 rows carrying an error message as "Name"
# value are also comprised in the single/pooled siRNA and esiRNA subset
name_is_error_message = (
    updated_main_csv_df["Name"].str.contains(
        "External+viewer+error", regex=False
    )
    | updated_main_csv_df["Name"].str.contains(
        "OCTYPE html PUBLIC", regex=False
    )
)
well_is_perturbation_agent = updated_main_csv_df["WellType"].isin(
    ["SIRNA", "POOLED_SIRNA", "ESIRNA"]
)
n_error_message_rows_in_subset = np.count_nonzero(
    name_is_error_message & well_is_perturbation_agent
)

print(
    "Amount of rows in the single/pooled siRNA and esiRNA subset the "
    "\"Name\" value of which has been assigned to an error message: "
    f"{n_error_message_rows_in_subset}"
)

In [None]:
# Determine the index labels of rows the "Name" value of which has been
# assigned to an error message
# The mask is computed once up front; `regex=False` makes the plus signs
# literal characters
name_is_error_message = (
    updated_main_csv_df["Name"].str.contains(
        "External+viewer+error", regex=False
    )
    | updated_main_csv_df["Name"].str.contains(
        "OCTYPE html PUBLIC", regex=False
    )
)
error_indices = updated_main_csv_df.index[name_is_error_message].to_list()

# As many gene IDs occur multiple times in the data set, it is checked
# whether the database query has been successful for the respective gene
# ID at another location; otherwise, the database query is repeated
# To this end, the amount of unique non-error "Name" values is
# determined per gene ID in one go (the tilde operator inverts boolean
# arrays)
n_non_error_names_per_gene_ID = (
    updated_main_csv_df.loc[~name_is_error_message]
    .groupby("ID_manufacturer")["Name"]
    .nunique(dropna=False)
)

# The list is aligned with `error_indices`, i.e. it contains one count
# per error row
n_unique_non_error_names_list = []
for idx in error_indices:
    # `error_indices` holds index labels, hence label-based access
    gene_ID = updated_main_csv_df.at[idx, "ID_manufacturer"]
    # Gene IDs lacking any non-error "Name" value are absent from the
    # grouped result and therefore counted as 0
    n_unique_names = int(n_non_error_names_per_gene_ID.get(gene_ID, 0))
    n_unique_non_error_names_list.append(n_unique_names)

print(all([n <= 1 for n in n_unique_non_error_names_list]))

In [None]:
# Count the error rows whose gene ID co-occurs with more than one unique
# non-error "Name" value even after the standardisation
non_unique_boolean_list = [
    n_names > 1 for n_names in n_unique_non_error_names_list
]
n_non_unique_non_error_name_after_update = sum(non_unique_boolean_list)

print(
    "Amount of gene IDs co-occurring with more than unique non-error "
    "\"Name\"\nvalues even after the database query: "
    f"{n_non_unique_non_error_name_after_update}"
)

In [None]:
# Only one gene ID causes trouble; the precise gene ID is determined
# `list.index` yields the position of the first flagged error row
idx = non_unique_boolean_list.index(True)
# `error_indices` holds index labels, which is why the row is accessed
# by label rather than by position
trouble_gene_ID = updated_main_csv_df.at[
    error_indices[idx], "ID_manufacturer"
]
print(
    f"The gene ID causing trouble is {trouble_gene_ID}."
)

In [None]:
# List the individual names gene ID 3832 co-occurs with, one per line
is_gene_ID_3832 = updated_main_csv_df["ID_manufacturer"] == "3832"
trouble_names = np.unique(updated_main_csv_df.loc[is_gene_ID_3832, "Name"])

if trouble_names.size > 0:
    print(*trouble_names, sep="\n")

In [20]:
# The official gene symbol for gene ID 3832 is KIF11; hence, the
# spelling "Kif11" is replaced with the uppercase equivalent
# (the reason this inconsistency is observed is that the database query
# has been confined to the single/pooled siRNA and esiRNA subset of the
# VACV screen)
name_is_mixed_case_KIF11 = updated_main_csv_df["Name"].eq("Kif11")
updated_main_csv_df.loc[name_is_mixed_case_KIF11, "Name"] = "KIF11"

In [21]:
# Now, verify that all gene IDs co-occur with only one unique "Name"
# value (apart from error messages)
# The row filters do not depend on the individual gene ID, which is why
# they are computed once instead of once per gene ID
name_is_error_message = (
    updated_main_csv_df["Name"].str.contains(
        "External+viewer+error", regex=False
    )
    | updated_main_csv_df["Name"].str.contains(
        "OCTYPE html PUBLIC", regex=False
    )
)
# Only the manufacturer IDs of the original data set are considered
ID_is_known = updated_main_csv_df["ID_manufacturer"].isin(
    unique_manufacturer_IDs
)

# The groups are sorted by their key, i.e. they appear in the same
# order as in `unique_manufacturer_IDs`
n_names_per_manufacturer_ID = (
    updated_main_csv_df.loc[~name_is_error_message & ID_is_known]
    .groupby("ID_manufacturer", sort=True)["Name"]
    .nunique(dropna=False)
)

non_unique_list = n_names_per_manufacturer_ID.index[
    n_names_per_manufacturer_ID > 1
].to_list()

In [None]:
# Amount of affected gene IDs, followed by the gene IDs themselves
print(len(non_unique_list), non_unique_list, sep="\n")

In [None]:
# The problem still persists with the following gene IDs:
# 2475, 5298, MIMAT0001630, MIMAT0015081 and Not available
# Determine the different names for each of them except "Not available"
# The placeholder is removed by value rather than by position, so that
# re-running this cell does not discard further gene IDs
non_unique_list = [
    gene_ID for gene_ID in non_unique_list if gene_ID != "Not available"
]

for gene_ID in non_unique_list:
    names = np.unique(
        updated_main_csv_df.loc[
            updated_main_csv_df["ID_manufacturer"] == gene_ID, "Name"
        ]
    )
    print(
        f"Unique names co-occurring with gene ID {gene_ID}: "
        f"{names}"
    )

In [24]:
# The official gene symbols for the gene IDs 2475 and 5298 are MTOR and
# PI4KB, respectively
# The two remaining IDs are not addressed as they represent miRNAs
official_symbol_by_gene_ID = {"2475": "MTOR", "5298": "PI4KB"}

for gene_ID, off_gene_symbol in official_symbol_by_gene_ID.items():
    rows_with_gene_ID = updated_main_csv_df["ID_manufacturer"] == gene_ID
    updated_main_csv_df.loc[rows_with_gene_ID, "Name"] = off_gene_symbol

In [25]:
# Persist the adjustments made so far as a tab-separated CSV file
# without the index column
intermediate_csv_path = "Vaccinia_Report_intermediate_postprocessing.csv"
updated_main_csv_df.to_csv(intermediate_csv_path, sep="\t", index=False)

In [None]:
# Eventually, address the failed database queries
# To this end, a Python script is created, which is executed on the
# Hemera HPC cluster as this task requires roughly an hour

In [None]:
# Unfortunately, it has turned out that there is an error in the script
# performing the gene ID and official gene symbol check
# To be more precise, in the case of records having been replaced
# altogether with a new one, the new gene ID is comprised in the
# penultimate list element following the splitting of the entry string
# and the removal of blank lines
# However, instead of the penultimate, i.e. second to last list element,
# the very last has inadvertently been retrieved, thereby retaining the
# old, invalid gene ID and name
# Thus, the NCBI database query regrettably has to be
# repeated
# This time, however, in an effort to reduce the execution time, it is
# not iterated over each and every line of the CSV file
# Instead, the fact that many gene IDs occur multiple times is leveraged
# by iterating over the gene IDs and thereby modifying multiple rows at
# once

In [8]:
# Read the most recent tab-separated CSV file into a Pandas DataFrame
latest_csv_path = "Vaccinia_Report_correct_indexing.csv"
csv_df = pd.read_csv(latest_csv_path, sep="\t", dtype=dtype_dict)

In [14]:
# As after the first NCBI database query, the query was not successful
# for a certain amount of gene IDs
# In this case, there are 13 gene IDs for which the query failed
# The query is simply repeated for those 13 gene IDs
failed_gene_IDs = []
failure_string = "Database query wasn't successful for gene ID"

with open("NCBI_DB_query_correct_list_indexing.out", "r") as f:
    for line in f:
        if (failure_string in line) and ("Not available" not in line):
            # Subsequent to splitting the line using the space character
            # as separator, the gene ID is comprised in the very last
            # list element; however, the gene ID is followed by a dot,
            # which is removed by slicing
            failed_gene_ID = line.split()[-1][:-1]
            failed_gene_IDs.append(failed_gene_ID)

assert len(failed_gene_IDs) == 13, (
    "Something went wrong while gathering the gene IDs for which the "
    "database queries failed!"
)

ID_change_str = "This record was replaced with GeneID:"

for failed_gene_ID in failed_gene_IDs:
    for _ in range(3):
        try:
            NCBI_entry = entrez.fetch_single_file(
                uids=[failed_gene_ID],
                file_name=None,
                db_name="gene",
                ret_type="",
                ret_mode="text"
            )
            break
        except:
            time.sleep(1)
    else:
        print(
            "Database query wasn't successful for gene ID "
            f"{failed_gene_ID}."
        )
        continue
    

    NCBI_entry_string = NCBI_entry.getvalue()

    # Different approaches are necessary depending on whether merely
    # the official gene symbol was altered, whereas the NCBI Gene ID
    # remained unchanged, or the record has been replaced altogether
    # with an entirely new ID
    if ID_change_str in NCBI_entry_string:
        # The respective record has been replaced altogether with a
        # new ID
        # Hence, the new ID is retrieved and used to query NCBI's
        # gene database
        NCBI_entry_string_list = NCBI_entry_string.split("\n")
        # For some strange reason, the string retrieved from the
        # NCBI entry contains blank lines; they are removed
        while "" in NCBI_entry_string_list:
            NCBI_entry_string_list.remove("")
        
        # The new ID is comprised in the penultimate list element
        # and conveniently enough separated from the preceding
        # string by a space character
        new_gene_ID = NCBI_entry_string_list[-2].split()[-1]
        # As the gene ID currently dealt with may well occur outside the
        # subset, the gene ID update is conducted for the entire data
        # set
        csv_df.loc[
            csv_df["ID_manufacturer"] == failed_gene_ID,
            ["ID", "ID_manufacturer"]
        ] = new_gene_ID

        time.sleep(1)
        for _ in range(3):
            try:
                NCBI_entry = entrez.fetch_single_file(
                    uids=[new_gene_ID],
                    file_name=None,
                    db_name="gene",
                    ret_type="",
                    ret_mode="text"
                )
                break
            except:
                time.sleep(1)
        else:
            print(
                "Querying the database wasn't successful for the "
                f"updated gene ID {new_gene_ID} (formerly gene ID "
                f"{failed_gene_ID})."
            )
            continue

        NCBI_entry_string = NCBI_entry.getvalue()
        NCBI_entry_string_list = NCBI_entry_string.split("\n")
        while "" in NCBI_entry_string_list:
            NCBI_entry_string_list.remove("")
        
        # The official gene symbol is comprised in the first list
        # element, but is preceded by the string "1. ", which
        # encompasses three characters
        official_gene_symbol = NCBI_entry_string_list[0][3:]

        # Again, the official gene symbol update is performed for the
        # entire data set
        csv_df.loc[
            csv_df["ID_manufacturer"] == new_gene_ID,
            "Name"
        ] = official_gene_symbol
    else:
        # Remove blank lines from the string retrieved from the NCBI
        # entry
        NCBI_entry_string_list = NCBI_entry_string.split("\n")
        while "" in NCBI_entry_string_list:
            NCBI_entry_string_list.remove("")
        
        # Following the removal of empty strings, the official gene
        # symbol is represented by the first list element, but it is
        # preceded by the string "1. ", which encompasses three
        # characters
        # Hence, the first list element has to be sliced accordingly
        official_gene_symbol = NCBI_entry_string_list[0][3:]
        csv_df.loc[
            csv_df["ID_manufacturer"] == failed_gene_ID,
            "Name"
        ] = official_gene_symbol

        # The corresponding cell of the "ID" feature is populated with
        # the NCBI Gene ID harboured by the "ID_manufacturer" feature
        # (remember that cells of the "ID" feature are not continuously
        # populated)
        csv_df.loc[
            csv_df["ID_manufacturer"] == failed_gene_ID,
            "ID"
        ] = failed_gene_ID

# Finally, persist the updated Pandas DataFrame as a tab-separated file
# (the row index is not written)
csv_df.to_csv(
    "Vaccinia_Report_correct_indexing_failer_queries_repeated.csv",
    index=False,
    sep="\t"
)

In [None]:
# As always, some rows carry an error message instead of a gene symbol
# in their "Name" cell
# Such rows are recognised by either of two characteristic substrings,
# which are matched literally (i.e. not as regular expressions); the
# amount of affected rows is determined
name_column = csv_df["Name"]
is_error_message_row = (
    name_column.str.contains("External+viewer+error", regex=False)
    | name_column.str.contains("OCTYPE html PUBLIC", regex=False)
)
n_error_message_rows = np.count_nonzero(is_error_message_row)

print(
    "Amount of rows the \"Name\" value of which has been assigned to "
    f"an error message: {n_error_message_rows}"
)

In [None]:
# Repeat the database queries for the rows having error messages as
# "Name" values
def _fetch_NCBI_gene_entry(gene_ID, n_attempts=3):
    """
    Fetch the text record of a gene from NCBI's gene database.

    As simply suspending code execution does not prevent the occurrence
    of server-side errors altogether, the query is attempted up to
    `n_attempts` times, pausing for one second after each failed
    attempt.

    Parameters
    ----------
    gene_ID
        The NCBI Gene ID the database is queried with.
    n_attempts : int, optional
        The maximum amount of query attempts.

    Returns
    -------
    entry_str : str or None
        The content of the retrieved record or `None` if none of the
        attempts succeeded.
    """
    for _ in range(n_attempts):
        try:
            NCBI_entry = entrez.fetch_single_file(
                uids=[gene_ID],
                file_name=None,
                db_name="gene",
                ret_type="",
                ret_mode="text"
            )
        except Exception:
            # Deliberately broad, as any failed query is simply retried
            # Contrary to a bare `except`, however, KeyboardInterrupt
            # and SystemExit are not swallowed, so that the loop can
            # still be aborted manually
            time.sleep(1)
        else:
            return NCBI_entry.getvalue()
    return None


def _non_blank_lines(entry_str):
    """
    Split a retrieved record into its lines and discard the blank ones.
    """
    return [line for line in entry_str.split("\n") if line != ""]


# Both substring checks yield boolean Series, which are combined
# element-wise before the affected index labels are retrieved
is_error_message_row = (
    csv_df["Name"].str.contains("External+viewer+error", regex=False)
    | csv_df["Name"].str.contains("OCTYPE html PUBLIC", regex=False)
)
error_indices = csv_df.loc[is_error_message_row].index

for idx in error_indices:
    # `error_indices` harbours index labels, which is why label-based
    # access is used throughout (for both reading and writing)
    NCBI_Gene_ID = csv_df.at[idx, "ID_manufacturer"]

    # Query NCBI's gene database with the Gene ID currently dealt with
    # Code execution is suspended for one second in order to avoid
    # server-side errors
    time.sleep(1)
    NCBI_entry_str = _fetch_NCBI_gene_entry(NCBI_Gene_ID)
    if NCBI_entry_str is None:
        print(
            "Database query wasn't successful for the row with index "
            f"{idx}."
        )
        continue

    # For some strange reason, the string retrieved from the NCBI entry
    # contains blank lines; they are removed
    NCBI_entry_str_list = _non_blank_lines(NCBI_entry_str)

    # Different approaches are necessary depending on whether merely
    # the official gene symbol was altered, whereas the NCBI Gene ID
    # remained unchanged, or the record has been replaced altogether
    # with an entirely new ID
    if ID_change_str in NCBI_entry_str:
        # The respective record has been replaced altogether with a new
        # ID
        # The new ID is comprised in the penultimate list element and
        # conveniently enough separated from the preceding string by a
        # space character
        new_gene_ID = NCBI_entry_str_list[-2].split()[-1]
        csv_df.at[idx, "ID"] = new_gene_ID
        csv_df.at[idx, "ID_manufacturer"] = new_gene_ID

        # The new ID is used to query NCBI's gene database; again, code
        # execution is suspended for one second beforehand
        time.sleep(1)
        NCBI_entry_str = _fetch_NCBI_gene_entry(new_gene_ID)
        if NCBI_entry_str is None:
            print(
                "Querying the database wasn't successful for the "
                f"updated gene ID {new_gene_ID} (row {idx})."
            )
            continue

        NCBI_entry_str_list = _non_blank_lines(NCBI_entry_str)
    else:
        # The Gene ID is unchanged; the corresponding cell of the "ID"
        # feature is populated with the NCBI Gene ID harboured by the
        # "ID_manufacturer" feature (remember that cells of the "ID"
        # feature are not continuously populated)
        csv_df.at[idx, "ID"] = NCBI_Gene_ID

    # Following the removal of blank lines, the official gene symbol is
    # represented by the first list element, but it is preceded by the
    # string "1. ", which encompasses three characters
    official_gene_symbol = NCBI_entry_str_list[0][3:]
    csv_df.at[idx, "Name"] = official_gene_symbol

csv_df.to_csv(
    "Vaccinie_Report_correct_indexing_error_messages_fixed.csv",
    sep="\t",
    index=False
)

In [None]:
# Reload the file written after the repeated queries and determine how
# many rows still have an error message as "Name" value
new_csv_df = pd.read_csv(
    "Vaccinie_Report_correct_indexing_error_messages_fixed.csv",
    dtype=dtype_dict,
    sep="\t"
)

reloaded_names = new_csv_df["Name"]
still_error_message_row = (
    reloaded_names.str.contains("External+viewer+error", regex=False)
    | reloaded_names.str.contains("OCTYPE html PUBLIC", regex=False)
)
n_error_messages = np.count_nonzero(still_error_message_row)

print(n_error_messages)

In [None]:
# In addition, the amount of rows having multiple gene IDs is determined
# Such rows are recognised by a (literally matched) semicolon in their
# "ID_manufacturer" cell
has_multiple_IDs = csv_df["ID_manufacturer"].str.contains(
    ";", regex=False
)
n_multiple_IDs = np.count_nonzero(has_multiple_IDs)

print(f"Amount of rows having multiple gene IDs: {n_multiple_IDs}")

In [None]:
# A separate procedure for rows having multiple gene IDs has been
# implemented in the function `update_gene_ID_and_off_gene_symbol` in
# the file `gene_ID_and_off_gene_symbol_check.py`
# Via this function, the database query has been repeated from the very
# beginning
# Astonishingly enough, no database queries whatsoever failed
# Yet, as usual, some rows have error messages as "Name" entry
# Load the updated CSV file into a Pandas DataFrame and determine the
# total amount of rows having an error message as "Name" value
csv_df = pd.read_csv(
    "Vaccinia_Report_NCBI_Gene_IDs_and_official_gene_symbols_updated.csv",
    sep="\t",
    dtype=dtype_dict
)

# The mask flagging error messages is computed once and reused for both
# counts below
has_error_message = (
    csv_df["Name"].str.contains("External+viewer+error", regex=False)
    | csv_df["Name"].str.contains("OCTYPE html PUBLIC", regex=False)
)
n_error_message_rows = np.count_nonzero(has_error_message)

print(
    "Total amount of rows in the updated CSV file having an error "
    f"message as \"Name\" value: {n_error_message_rows}"
)

# Now, determine how many of those rows harbour multiple gene IDs
# Rows with multiple gene IDs are identified via the presence of a
# semicolon in the "ID_manufacturer" column, not in the "Name" column:
# for the rows in question, "Name" contains the text of an error
# message rather than gene names, so that a semicolon in it does not
# say anything about the amount of gene IDs
has_multiple_IDs = csv_df["ID_manufacturer"].str.contains(
    ";", regex=False
)
n_error_message_rows_with_multi_ID = np.count_nonzero(
    has_error_message & has_multiple_IDs
)

print(
    "Amount of rows with multiple gene IDs having an error message "
    f"as \"Name\" value: {n_error_message_rows_with_multi_ID}"
)

In [None]:
# Instead of iterating over each of the affected rows individually, a
# more economical approach consists of iterating over the unique gene
# IDs of the affected rows
error_message_mask = (
    csv_df["Name"].str.contains("External+viewer+error", regex=False)
    | csv_df["Name"].str.contains("OCTYPE html PUBLIC", regex=False)
)
unique_gene_IDs_error_message_rows = csv_df.loc[
    error_message_mask, "ID_manufacturer"
].unique()

print(
    "Amount of unique gene IDs of rows having error messages as "
    f"\"Name\" value: {len(unique_gene_IDs_error_message_rows)}"
)

In [13]:
# Error messages as "Name" value turn out to occur exclusively among
# rows with a single gene ID, which renders addressing the issue
# considerably easier
# Assuming roughly one second per database query, repeating the query
# for the 62 affected gene IDs takes slightly more than one minute in
# total
# For this reason, the repeated query is run locally instead of
# remotely
from CSV_file_utils import mend_error_messages_single_IDs

csv_df = mend_error_messages_single_IDs(csv_df)

# Write the resulting Pandas DataFrame to a tab-separated file
csv_df.to_csv(
    "Vaccinia_Report_updated_and_errors_fixed.csv",
    index=False,
    sep="\t"
)

In [30]:
# The "Name" values have been standardised via database queries and the
# queries that initially failed have been repeated
# Consequently, a gene ID should no longer co-occur with more than one
# unique name; the file is reloaded in order to verify this
csv_df = pd.read_csv(
    "Vaccinia_Report_updated_and_errors_fixed.csv",
    dtype=dtype_dict,
    sep="\t"
)

In [None]:
# Collect the unique manufacturer gene IDs of all wells harbouring
# single siRNAs, pooled siRNAs or esiRNAs
unique_gene_IDs = csv_df.loc[
    csv_df["WellType"].isin(["SIRNA", "POOLED_SIRNA", "ESIRNA"]),
    "ID_manufacturer"
].unique()

# Record every gene ID occurring in conjunction with more than one
# distinct "Name" value; the names are looked up across the entire data
# set and the placeholder "Not available" is left out
non_unique_list = []
for unique_gene_ID in unique_gene_IDs:
    if unique_gene_ID != "Not available":
        names_series = csv_df.loc[
            csv_df["ID_manufacturer"] == unique_gene_ID, "Name"
        ]
        if names_series.nunique(dropna=False) > 1:
            non_unique_list.append(unique_gene_ID)

print(
    "Amount of manufacturer IDs occurring in conjunction with more "
    f"than one unique value in the \"Name\" column: {len(non_unique_list)}"
)

In [None]:
# Retrieve the "Name" values of all single siRNA, pooled siRNA and
# esiRNA wells
# Note that when several equality checks are combined with the `|`
# operator, each check has to be parenthesised, since `|` binds more
# tightly than `==`; the membership test used here avoids this pitfall
is_siRNA_well = csv_df["WellType"].isin(
    ["SIRNA", "POOLED_SIRNA", "ESIRNA"]
)
single_pooled_siRNA_and_esiRNA_names = csv_df.loc[is_siRNA_well, "Name"]

# Count the wells whose target gene is given as "Not available"
n_name_not_specified = np.count_nonzero(
    single_pooled_siRNA_and_esiRNA_names == "Not available"
)

print(
    "Amount of perturbation agents for which the target genes are not "
    f"specified: {n_name_not_specified}"
)

In [7]:
# Since no entries whatsoever are found on the respective website for
# many of the affected 22 perturbation agents and their quality status
# is also unknown, they are neglected from now on
# Of those 22 perturbation agents with unknown targets, 20 are single
# siRNAs from Ambion (purchased by Thermo Fisher) and 2 are esiRNAs from
# Sigma
# Among the 20 agents from Ambion, there are 10 unique catalogue numbers

In [31]:
"""
The import statements below are explained on the following pages:
https://discourse.mcneel.com/t/python-import-module-not-updating-after-changed/92458
https://support.enthought.com/hc/en-us/articles/204469240-Jupyter-IPython-After-editing-a-module-changes-are-not-effective-without-kernel-restart
"""
import CSV_file_utils
# `reload` is taken from `importlib`, as the `imp` module has been
# deprecated since Python 3.4 and was removed in Python 3.12
from importlib import reload
# Re-execute the module so that edits made to `CSV_file_utils.py` take
# effect without restarting the kernel
reload(CSV_file_utils)

# Restrict the data set to wells harbouring single siRNAs, pooled siRNAs
# or esiRNAs
single_pooled_siRNA_and_esiRNA_df = csv_df.loc[
    (csv_df["WellType"] == "SIRNA")
    |
    (csv_df["WellType"] == "POOLED_SIRNA")
    |
    (csv_df["WellType"] == "ESIRNA")
]

# The function is accessed via the module object so that the freshly
# reloaded definition is used
valid_siRNA_csv_df = CSV_file_utils.extract_valid_and_named_targets_from_df(
    single_pooled_siRNA_and_esiRNA_df
)

In [None]:
# Persist the subset of valid perturbation agents as a tab-separated
# file (the row index is not written)
valid_siRNA_csv_df.to_csv(
    "VACV_Report_only_valid_single_pooled_siRNA_and_esiRNA.csv",
    index=False,
    sep="\t"
)

In [3]:
# Load the subset of valid single/pooled siRNAs and esiRNAs; the column
# data types are specified manually via `dtype_dict`
siRNA_df = pd.read_csv(
    "VACV_Report_only_valid_single_pooled_siRNA_and_esiRNA.csv",
    dtype=dtype_dict,
    sep="\t"
)

In [4]:
# Some rows list more than one gene, which is actually not supposed to
# be the case as it means that more than one gene is targeted
# Such composite entries are recognised by a semicolon in their "Name"
# cell; their unique values are printed one per line
has_composite_name = siRNA_df["Name"].str.contains(";")
multiple_names_entries = siRNA_df.loc[has_composite_name, "Name"]

unique_multi_name_entries = multiple_names_entries.unique()

for multi_name_entry in unique_multi_name_entries:
    print(multi_name_entry)

NME2P1;NME2
MAP2K2;LOC407835
LOC729033;LOC729737
RPSAP19;RPSAP29
RPSAP29;RPSA
NUTM2D;NUTM2E
GGT3P;GGT1
LOC729218;LOC729737
LOC730076;LOC441666;LOC441666
CTSLP1;CTSLP4
DUX5;DUX4L1
PMS2P2;PMS2P13
LOC112268070;LOC728417
GOLGA6B;GOLGA6L4
STAG3L1;STAG3


In [1]:
# The following observations can be made for the 15 entries containing
# multiple gene IDs:
#
# NME2P1 (283458) and NME2 (4831): Both genes occur individually as well
# as in conjunction with each other; they are both involved in
# nucleoside diphosphate kinase activity; NME2P1 occurs in conjunction
# with three different Qiagen catalogue numbers, all three of which
# indeed are directed against NME2P1; NME2 occurs in conjunction with
# six Qiagen catalogue numbers, all six of which indeed are directed
# against either NME2 or NME1-NME2; the composite entry "NME2P1;NME2"
# occurs in conjunction with one Qiagen catalogue number directed
# against NME1-NME2; all three (NME2P1, NME2 and NME1-NME2) are distinct
# genes and proteins, respectively
#
# MAP2K2 (5605) and LOC407835 (407835): Both genes occur individually as
# well as in conjunction with one another; the latter is a pseudogene of
# the former; the composite entry "MAP2K2;LOC407835" occurs in
# conjunction with one Qiagen catalogue number directed against MAP2K2;
# MAP2K2 occurs in conjunction with six different Qiagen catalogue
# numbers, each of which indeed is directed against MAP2K2; LOC407835
# occurs in conjunction with five different Qiagen catalogue numbers,
# four of which indeed are directed against LOC407835
#
# LOC729033 (729033) and LOC729737 (729737): Both genes occur
# individually as well as in conjunction with each other; information as
# to their functions/roles is not given; the composite entry
# "LOC729033;LOC729737" occurs in conjunction with one Qiagen catalogue
# number directed against LINC00265; LOC729033 occurs in conjunction
# with two different Qiagen catalogue numbers, neither of which is
# directed against LOC729033 (at least according to the website);
# LOC729737 occurs in conjunction with five different Qiagen catalogue
# numbers, none of which are directed against LOC729737
#
# RPSAP19 (730029) and RPSAP29 (389141): Both genes occur individually
# as well as in conjunction with each other; both are ribosomal protein
# pseudogenes; the composite entry "RPSAP19;RPSAP29" occurs in
# conjunction with one Qiagen catalogue number directed against
# LOC388954; RPSAP19 occurs in conjunction with one Qiagen catalogue
# number directed against LOC388954; RPSAP29 occurs in conjunction with
# three Qiagen catalogue numbers, none of which are directed against
# RPSAP29
#
# RPSAP29 (389141) and RPSA (3921): Both genes occur individually as
# well as in conjunction with each other; the composite entry
# "RPSAP29;RPSA" occurs in conjunction with one Qiagen catalogue number
# directed against LOC389848; RPSAP29 has already been investigated
# before; RPSA occurs in conjunction with three different Qiagen
# catalogue numbers, all three of which indeed are directed against RPSA
#
# NUTM2D (728130) and NUTM2E (283008): Both genes occur individually as
# well as in conjunction with each other; the composite entry
# "NUTM2D;NUTM2E" occurs in conjunction with two different Qiagen
# catalogue numbers, both of which are directed against LOC387697;
# NUTM2D occurs in conjunction with four different Qiagen catalogue
# numbers, one of which is not directed against NUTM2D, but against
# LOC387697; NUTM2E occurs in conjunction with six Qiagen catalogue
# numbers, none of which are directed against NUTM2E
#
# GGT3P (2679) and GGT1 (2678): Both genes occur individually as well as
# in conjunction with each other; the composite entry "GGT3P;GGT1"
# occurs in conjunction with one Qiagen catalogue number directed
# against LOC440802; GGT3P occurs in conjunction with five Qiagen
# catalogue numbers, only two of which are directed against GGT3P; GGT1
# occurs in conjunction with three Qiagen catalogue numbers, all three
# of which are directed against GGT1
#
# LOC729218 (729218) and LOC729737 (729737): Both genes occur
# individually as well as in conjunction with each other; the composite
# entry "LOC729218;LOC729737" occurs in conjunction with one Qiagen
# catalogue number directed against LOC653056; LOC729218 occurs in
# conjunction with four different Qiagen catalogue numbers, none of
# which are directed against LOC729218; LOC729737 occurs in conjunction
# with five different Qiagen catalogue numbers, none of which are
# directed against LOC729737
#
# LOC730076 (730076) and LOC441666 (441666): Both genes occur
# individually as well as in conjunction with each other; the composite
# entry "LOC730076;LOC441666" occurs in conjunction with one Qiagen
# catalogue number directed against LOC391445; LOC730076 occurs in
# conjunction with one Qiagen catalogue number directed against
# LOC391445; LOC441666 occurs in conjunction with three different Qiagen
# catalogue numbers, only one of which is directed against LOC441666
#
# CTSLP1 (118945) and CTSLP4 (644496): Both genes occur individually as
# well as in conjunction with each other; the composite entry
# "CTSLP1;CTSLP4" occurs in conjunction with two different Qiagen
# catalogue numbers, both of which are directed against LOC340736;
# CTSLP1 occurs in conjunction with one Qiagen catalogue number directed
# against LOC340736; CTSLP4 occurs in conjunction with one Qiagen
# catalogue number directed against LOC441555
#
# DUX5 (26581) and DUX4L1 (22947): Both genes occur individually as well
# as in conjunction with each other; the composite entry "DUX5;DUX4L1"
# occurs in conjunction with one Qiagen catalogue number directed
# against DUX5; DUX5 occurs in conjunction with two different Qiagen
# catalogue numbers, both of which are directed against DUX5; DUX4L1
# occurs in conjunction with three different Qiagen catalogue numbers,
# all of which are directed against DUX4L1
#
# PMS2P2 (5380) and PMS2P13 (441259): Both genes occur individually as
# well as in conjunction with each other; the composite entry
# "PMS2P2;PMS2P13" occurs in conjunction with one Qiagen catalogue
# number directed against LOC402554; PMS2P2 occurs in conjunction with
# two different Qiagen catalogue numbers, both of which are directed
# against PMS2P2; PMS2P13 occurs in conjunction with two different
# Qiagen catalogue numbers, both of which are directed against LOC402554
#
# LOC112268070 (112268070) and LOC728417 (728417): Both genes occur
# individually as well as in conjunction with each other; the composite
# entry "LOC112268070;LOC728417" occurs in conjunction with one Qiagen
# catalogue number directed against LOC440469; LOC112268070 occurs in
# conjunction with two Qiagen catalogue numbers, both of which are
# directed against LOC440469; LOC728417 occurs in conjunction with two
# different Qiagen catalogue numbers, both of which are not directed
# against LOC728417
#
# GOLGA6B (55889) and GOLGA6L4 (643707): Both genes occur individually
# as well as in conjunction with each other; both are part of the golgin
# gene family, i.e. duplicons the protein products of which localise in
# the Golgi apparatus; the composite entry "GOLGA6B;GOLGA6L4" occurs in
# conjunction with one Qiagen catalogue number directed against GOLGA6C;
# GOLGA6B occurs in conjunction with five different Qiagen catalogue
# numbers, three of which indeed are directed against GOLGA6B and two of
# which are directed against GOLGA6C; GOLGA6L4 occurs in conjunction
# with nine different Qiagen catalogue numbers, only three of which
# indeed are directed against GOLGA6L4
#
# STAG3L1 (54441) and STAG3 (10734): Both genes occur individually as
# well as in conjunction with each other; both are involved in sister
# chromatid cohesion; the composite entry "STAG3L1;STAG3" occurs in
# conjunction with one Qiagen catalogue number directed against 
# LOC441254 (which is similar to, yet distinct from STAG3); STAG3L1
# occurs in conjunction with seven different Qiagen catalogue numbers,
# three of which indeed are directed against STAG3L1; STAG3 occurs in
# conjunction with three different Qiagen catalogue numbers, all three
# of which indeed are directed against STAG3

In [5]:
# For each multi-gene entry, it is investigated whether the constituent
# entries also occur individually
# Bear in mind that using the "in" operator directly on a Pandas Series
# checks the query value against the Series' index labels rather than
# against its values
# Hence, the values are explicitly retrieved; this is done once, prior
# to the loop, and they are stored in a set so that each membership test
# is a constant-time lookup instead of a scan of the entire column
individual_names = set(siRNA_df["Name"])

individual_occurrence = [
    constituent_name in individual_names
    for multi_name_entry in unique_multi_name_entries
    for constituent_name in multi_name_entry.split(";")
]

print(
    "All constituent gene names occurring in multi-gene entires also "
    f"occur individually: {all(individual_occurrence)}"
)

All constituent gene names occurring in multi-gene entires also occur individually: True


In [6]:
# Find out which manufacturers the composite entries, i.e. the entries
# with a semicolon in their "Name" cell, stem from
is_composite_entry = siRNA_df["Name"].str.contains(";")
unique_composite_manufacturers = siRNA_df.loc[
    is_composite_entry, "Manufacturer"
].unique()

print(unique_composite_manufacturers)

['Qiagen']


In [8]:
# Some Qiagen catalogue numbers were found to exhibit inconsistencies
# between the target gene assigned to them in the TSV file and that
# specified on the website
# Therefore, the congruence between the target gene specified in the TSV
# file and that specified on the website is rigorously examined for the
# catalogue numbers of all vendors

# To this end, a TSV file is created for each vendor (Ambion, Dharmacon,
# Qiagen, Sigma) encompassing information as to whether or not the
# aforementioned congruence exists
# The TSV file contains the following columns for each vendor:
# -> Catalogue number
# -> Target specified on website
# -> Target specified in TSV file
# -> Alternative names of target specified in TSV file
# -> Congruence between TSV file and website

# Composite entries, i.e. entries consisting of multiple gene names
# separated by semicolons have already been subjected to close scrutiny
# and are thus discarded
only_single_entries_df = siRNA_df[
    ~siRNA_df["Name"].str.contains(";", regex=False)
]

manufacturers = ["Ambion", "Dharmacon", "Qiagen", "Sigma"]

# The per-vendor intermediate results are collected in dictionaries
# keyed by the vendor name
manufacturer_dfs = {}
unique_cat_num_target_pairs_per_manufacturer = {}

for manufacturer in manufacturers:
    manufacturer_df = only_single_entries_df[
        only_single_entries_df["Manufacturer"] == manufacturer
    ]

    # When trying to obtain unique values across multiple columns, i.e.
    # unique combinations of values across multiple columns, the
    # well-known ".unique()" method cannot be used as it only works for
    # Series
    # Instead, the ".drop_duplicates()" method has to be employed, which
    # works for both Series and DataFrames
    unique_cat_num_target_pairs = manufacturer_df[
        ["Catalog_number", "Name"]
    ].drop_duplicates()

    # If the amount of unique (catalogue number, name) pairs exceeds the
    # amount of unique catalogue numbers, at least one catalogue number
    # is paired with more than one name
    assert (
        len(unique_cat_num_target_pairs)
        ==
        len(manufacturer_df["Catalog_number"].unique())
    ), (
        f"Some {manufacturer} catalogue numbers have been assigned to "
        "multiple gene targets or vice versa!"
    )

    # Create the additional columns to be concatenated to the DataFrame
    # later on
    # The placeholder column shares the index of the pairs it belongs to
    # so that a later column-wise concatenation aligns row by row
    # TODO: The remaining columns listed above and the creation of the
    # TSV file itself are yet to be implemented
    website_target_col = pd.Series(
        data=["To be determined"] * len(unique_cat_num_target_pairs),
        index=unique_cat_num_target_pairs.index,
        dtype=str
    )

    manufacturer_dfs[manufacturer] = manufacturer_df
    unique_cat_num_target_pairs_per_manufacturer[manufacturer] = (
        unique_cat_num_target_pairs
    )

# The per-vendor results remain accessible under individual names as
# well; the order corresponds to that of `manufacturers`
ambion_df, dharmacon_df, qiagen_df, sigma_df = (
    manufacturer_dfs[vendor] for vendor in manufacturers
)
(
    ambion_unique_cat_num_target_pairs,
    dharmacon_unique_cat_num_target_pairs,
    qiagen_unique_cat_num_target_pairs,
    sigma_unique_cat_num_target_pairs
) = (
    unique_cat_num_target_pairs_per_manufacturer[vendor]
    for vendor in manufacturers
)

In [8]:
# Investigate the coverage of the single/pooled siRNA and esiRNA subset,
# i.e. whether it covers the entire human genome or only a smaller
# fraction
from CSV_file_utils import determine_coverage

coverage_single_pooled_siRNA_esiRNA_all_manufacs = determine_coverage(
    siRNA_df
)

print(
    "Amount of genes covered by the single/pooled siRNA and esiRNA "
    f"subset: {coverage_single_pooled_siRNA_esiRNA_all_manufacs}"
)
print()

# Additionally, the coverage of the individual manufacturers and their
# individual siRNA types is determined
manufacturer_col = siRNA_df["Manufacturer"]
well_type_col = siRNA_df["WellType"]

# Exclusively single siRNAs were obtained from Ambion and Qiagen
ambion_df = siRNA_df.loc[manufacturer_col == "Ambion"]
coverage_ambion = determine_coverage(ambion_df)

qiagen_df = siRNA_df.loc[manufacturer_col == "Qiagen"]
coverage_qiagen = determine_coverage(qiagen_df)

# Exclusively esiRNAs were obtained from Sigma
sigma_df = siRNA_df.loc[manufacturer_col == "Sigma"]
coverage_sigma = determine_coverage(sigma_df)

# Both single and pooled siRNAs have been obtained from Dharmacon
# As neither of the two modalities has composite entries, they cannot be
# told apart via the presence of semicolons; the "WellType" column is
# consulted instead
# Bear in mind that the equality checks have to be parenthesised when
# combined with `&`, since `&` binds more tightly than `==`
is_dharmacon = manufacturer_col == "Dharmacon"

dharmacon_single_df = siRNA_df.loc[
    is_dharmacon & (well_type_col == "SIRNA")
]
dharmacon_single_coverage = determine_coverage(dharmacon_single_df)

dharmacon_pooled_df = siRNA_df.loc[
    is_dharmacon & (well_type_col == "POOLED_SIRNA")
]
dharmacon_pooled_coverage = determine_coverage(dharmacon_pooled_df)

Amount of genes covered by the single/pooled siRNA and esiRNA subset: 20665



In [9]:
# Assemble the rows of a plain-text table summarising the coverage
# The first row serves as header; every cell is a string so that its
# printed width can be measured
column_names = ["Manufacturer", "siRNA type", "Gene coverage"]
ambion_row = ["Ambion", "single siRNAs", str(coverage_ambion)]
qiagen_row = ["Qiagen", "single siRNAs", str(coverage_qiagen)]
sigma_row = ["Sigma", "esiRNAs", str(coverage_sigma)]
dharmacon_single_row = [
    "Dharmacon", "single siRNAs", str(dharmacon_single_coverage)
]
# The manufacturer cell is left empty, as this row belongs to Dharmacon
# as well
dharmacon_pooled_row = [
    "", "pooled siRNAs", str(dharmacon_pooled_coverage)
]

rows_bundled = [
    column_names,
    ambion_row,
    qiagen_row,
    sigma_row,
    dharmacon_single_row,
    dharmacon_pooled_row
]

# Transposing the rows via `zip` yields the columns, the width of each
# of which is given by its longest cell
max_length_per_column = [
    max(len(cell) for cell in column_vals)
    for column_vals in zip(*rows_bundled)
]

# Per column, a run of underscores (separator line) and a run of spaces
# (empty buffer line) of the respective column width
horizontal_line_list = [
    column_width * "_" for column_width in max_length_per_column
]
horizontal_buffer_list = [
    column_width * " " for column_width in max_length_per_column
]

# Each row element is centred in accordance with the width of the
# column it belongs to
def central_alignment(string, width):
    """
    Centre `string` within a field of `width` characters.

    The string is padded with spaces on both sides; if it is already at
    least `width` characters long, it is returned unchanged.
    """
    return str.center(string, width)

# Print the table row by row; every cell is centred within its column
# and the cells are separated by " | "
for row_index, row in enumerate(rows_bundled):
    aligned_cells = [
        central_alignment(cell, column_width)
        for cell, column_width in zip(row, max_length_per_column)
    ]
    print(" | ".join(aligned_cells))
    # The header row is followed by a line of underscores and a line
    # of blanks, setting it apart from the table body
    if row_index == 0:
        print(" | ".join(horizontal_line_list))
        print(" | ".join(horizontal_buffer_list))

Manufacturer |   siRNA type  | Gene coverage
____________ | _____________ | _____________
             |               |              
   Ambion    | single siRNAs |      1327    
   Qiagen    | single siRNAs |     20225    
   Sigma     |    esiRNAs    |      252     
 Dharmacon   | single siRNAs |      709     
             | pooled siRNAs |     17962    


In [11]:
# The gene IDs are now to be mapped to UniProt IDs by means of the
# mapping GUI on https://www.uniprot.org/id-mapping
# For this purpose, the unique gene IDs contained in the CSV file are
# printed so that they can be pasted into the GUI
gene_IDs = siRNA_df["ID_manufacturer"]

# Composite entries list several gene IDs separated by semicolons; they
# are broken up into their individual gene IDs
single_gene_IDs = []
for ID_entry in gene_IDs:
    single_gene_IDs.extend(ID_entry.split(";"))

# "np.unique()" removes duplicates and returns the IDs in sorted order
unique_gene_IDs = np.unique(single_gene_IDs)

for unique_ID in unique_gene_IDs:
    print(unique_ID)

1
10
100
1000
10000
10001
10002
10003
100037417
100038246
10004
10005
10006
10007
10008
10009
1001
10010
100101120
100101629
10011
100124537
100124700
100128124
100128252
100128285
100128378
100128385
100128537
100128553
100128554
100128569
100128731
100128762
100128782
100128788
100128927
100129028
100129066
100129075
100129128
100129271
100129347
100129460
100129482
100129550
100129583
100129603
100129645
100129792
100129842
10013
100130302
100130348
100130370
100130417
100130418
100130613
100130691
100130733
100130758
100130873
100130934
100130958
100130967
100131137
100131187
100131213
100131250
100131439
100131551
100131755
100131814
100131827
100131897
100131902
100131997
100132031
100132057
100132062
100132066
100132103
100132247
100132285
100132288
100132341
100132396
100132406
100132417
100132476
100132510
100132565
100132708
100132832
100132948
100132963
100132979
100133284
100133941
100134934
100134938
100137047
100137049
10014
100144748
10015
10016
100169851
10017
100170765

In [12]:
# The mapping is performed with the following settings: "From database:
# Genome annotation databases/GeneID" (Gene ID is the same as NCBI Gene
# ID) and "To database: UniProt/UniProtKB/Swiss-Prot" (Swiss-Prot
# contains only reviewed entries)
# As a result, 18,385 IDs were mapped to 18,478 results, i.e. some gene
# IDs have been mapped to multiple UniProt IDs, which has to be
# subjected to closer scrutiny
# Apart from that, 2,280 IDs were not mapped; it is likely that entries
# for those gene IDs are available in UniProt TrEMBL

# Determine the gene IDs that have been mapped to multiple UniProt IDs
# To this end, the TSV result file from the UniProt mapping tool is
# loaded into a Pandas DataFrame
uniprot_mapping_df = pd.read_csv(
    "idmapping_VACV_screen_genes_to_UniProt_only_reviewed.tsv",
    sep="\t"
)

# The query gene IDs are contained in the "From" column; ".unique()"
# returns them in order of first appearance
unique_gene_IDs = uniprot_mapping_df["From"].unique()

# The amount of rows per gene ID is counted once for the entire column
# instead of scanning the whole column anew for each individual gene ID,
# which would scale quadratically with the amount of gene IDs
n_mappings_per_gene_ID = (
    uniprot_mapping_df["From"].value_counts().to_dict()
)

# A gene ID occurring in more than one row has been mapped to multiple
# UniProt IDs; the order of first appearance is preserved
multiple_mappings_list = [
    gene_ID for gene_ID in unique_gene_IDs
    if n_mappings_per_gene_ID.get(gene_ID, 0) > 1
]

In [17]:
# Report how many gene IDs have been mapped to more than one UniProt ID
print(
    "Amount of gene IDs mapped to multiple UniProt IDs:",
    len(multiple_mappings_list)
)

# Show the first six of them for manual inspection
print(multiple_mappings_list[:6])

Amount of gene IDs mapped to multiple UniProt IDs: 73
[100134938, 101060321, 1029, 10326, 10407, 10627]


In [None]:
# The examination of the first six gene IDs mapped to multiple UniProt
# IDs reveals the following:
#
# 100134938 (UPK3BL1): Has been mapped to the protein encoded by UPK3BL1
# as well as to the protein encoded by UPK3BL2, the proteins are
# identical
#
# 101060321 (TBC1D3G): Has been mapped to the protein encoded by TBC1D3G
# as well as to that encoded by TBC1D3C; the two proteins have the same
# length and differ only in a couple of amino acids
#
# 1029 (CDKN2A): Has been mapped to two proteins, both of which are
# encoded by CDKN2A; they are generated by alternative splicing, i.e.
# are isoforms
#
# 10326 (SIRPB1): As for 1029, 10326 has been mapped to two different
# isoforms generated by alternative splicing
#
# 10407 (SPAG11B): Has been mapped to the protein encoded by SPAG11B as
# well as to the protein encoded by SPAG11A; the two protein sequences
# are similar to each other
#
# 10627 (MYL12A): Has been mapped to the protein encoded by MYL12A as
# well as to the protein encoded by MYL12B; the two proteins differ from
# each other significantly in sequence composition

In [1]:
# A couple of the 2,280 gene IDs for which mapping failed are subjected
# to closer examination
#
# 644790: Querying NCBI reveals that it is a pseudogene; it has no
# UniProt entry
#
# 145858: Querying NCBI reveals that it is ncRNA and thus not translated
# into a protein
#
# 442661: Querying NCBI reveals that it is a pseudogene; it has no
# UniProt entry
#
# 128854: Querying NCBI reveals that it is a pseudogene; nevertheless,
# a UniProt entry exists
#
# 619190: Querying NCBI reveals that it is a pseudogene; it has no
# UniProt entry
#
# 338667: Querying NCBI reveals that it is a validated protein coding
# gene; a UniProt entry exists
#
# 442075: Querying NCBI reveals that it is ncRNA and thus not translated
# into a protein
#
# 391827: Querying NCBI reveals that it is a pseudogene; it has no
# UniProt entry
#
# 391276: Querying NCBI reveals that it is a pseudogene; it has no
# UniProt entry
#
# 2210: Querying NCBI reveals that it is a pseudogene; nevertheless, a
# UniProt entry exists
#
# 389112: Querying NCBI reveals that it is a pseudogene; it has no
# UniProt entry
#
# 642389: Querying NCBI reveals that it is a pseudogene; it has no
# UniProt entry
#
# 730076: Querying NCBI reveals that it is a pseudogene; it has no
# UniProt entry

In [29]:
# Trying to map the UniProt IDs of the human interaction partners of the
# VACV-human interactions to gene IDs revealed that some proteins are
# mapped to multiple gene IDs
# Upon closer scrutiny, it emerged that the affected proteins are
# histones, which are encoded by multiple genes
# Thus, for each of those histones, one gene is selected
# If the histones occur in the screen, the gene name used there will be
# employed
histone_UniProt_ids = [
    "P04908", "P0C0S8", "P62805", "P62807", "P68431", "P84243",
    "Q6FI13", "Q71DI3"
]

# Expected amount of genes per histone, in the same order as the UniProt
# IDs above
histone_num_genes_list = [2, 5, 14, 5, 10, 2, 2, 3]

assert len(histone_UniProt_ids) == len(histone_num_genes_list), (
    "Each histone UniProt ID requires exactly one expected gene count!"
)

# Not populated in this cell
histone_gene_ids = []

df_gene_names = siRNA_df["Name"].to_list()
# Membership is tested once per gene name, which is why the names are
# additionally stored in a set offering constant-time lookup
# Bear in mind that composite entries ("A;B") are only matched as a
# whole, not by their individual gene names
screen_gene_names = set(df_gene_names)

for histone_UniProt_id, expected_num_genes in zip(
    histone_UniProt_ids, histone_num_genes_list
):
    # As no target path is specified in the "uniprot.fetch()" call
    # below, the function returns a StringIO/BytesIO object the content
    # of which can be accessed via the ".getvalue()" method
    UniProt_entry = uniprot.fetch(histone_UniProt_id, format="txt")
    UniProt_entry_str = UniProt_entry.getvalue()

    # Conveniently enough, each row in the returned string begins with
    # a descriptive abbreviation
    # As the information we are interested in are the gene names, rows
    # beginning with the abbreviation "GN" are extracted
    gene_rows_list = [
        # Remove the "GN   " preceding each row
        row[5:] for row in UniProt_entry_str.split("\n")
        if row.startswith("GN")
    ]

    # Entries of individual genes begin with "Name="
    # Thus, rows beginning with this string are extracted
    gene_name_rows_list = [
        row for row in gene_rows_list if row.startswith("Name=")
    ]

    assert len(gene_name_rows_list) == expected_num_genes, (
        "There is a mismatch between the amount of genes for "
        f"{histone_UniProt_id} and the amount of retrieved rows!"
    )

    # In order to obtain the gene name, two successive string splits
    # have to be performed; for the first split, the space character
    # is used as separator, whereas for the second split, the
    # equality sign is used as separator
    # If the name is not followed by an evidence tag, i.e. the row
    # reads "Name=X; ..." rather than "Name=X {ECO:...}; ...", the first
    # token ends with the terminating semicolon, which has to be removed
    # lest the comparison with the screen's gene names fail
    gene_names = [
        row.split()[0].split("=")[1].rstrip(";")
        for row in gene_name_rows_list
    ]

    # Amount of the histone's gene names occurring in the screen
    n_gene_names_used = sum(
        name in screen_gene_names for name in gene_names
    )
    print(n_gene_names_used)

2
4
0
5
10
2
1
1


In [7]:
# A closer look at a couple of the gene IDs for which the mapping failed
# shows that for some of them, UniProt entries indeed do not exist,
# whereas others should have been mapped to existing UniProt IDs
# The large amount of query gene IDs was suspected to cause the error,
# which is why the mapping was repeated with only two gene IDs known to
# have reviewed UniProt entries; as this failed as well, the mapping is
# repeated with the following settings:
# "From database: UniProt/Gene Name",
# "To database: UniProt/UniProtKB/Swiss-Prot",
# "Restrict by organism: Homo sapiens [9606]"
# To this end, the unique gene names are printed

# Some entries of the "Name" column are composite entries, the
# individual gene names of which are separated by semicolons; they are
# broken up into their individual gene names
gene_name_pool = []
for name_entry in siRNA_df["Name"].unique():
    gene_name_pool.extend(name_entry.split(";"))

# "np.unique()" removes duplicates and sorts the names
unique_gene_names = np.unique(gene_name_pool).tolist()

for gene_name in unique_gene_names:
    print(gene_name)

A1BG
A1CF
A2M
A2ML1
A3GALT2
A4GALT
A4GNT
AAAS
AACS
AACSP1
AADAC
AADACL2
AADACL4
AADACP1
AADAT
AAGAB
AAK1
AAMDC
AAMP
AANAT
AAR2
AARD
AARS1
AARS2
AARSD1
AASDH
AASDHPPT
AASS
AATBC
AATF
AATK
ABAT
ABCA1
ABCA10
ABCA11P
ABCA12
ABCA13
ABCA15P
ABCA17P
ABCA2
ABCA3
ABCA3P1
ABCA4
ABCA5
ABCA6
ABCA7
ABCA8
ABCA9
ABCB1
ABCB10
ABCB11
ABCB4
ABCB5
ABCB6
ABCB7
ABCB8
ABCB9
ABCC1
ABCC10
ABCC11
ABCC12
ABCC13
ABCC2
ABCC3
ABCC4
ABCC5
ABCC6
ABCC8
ABCC9
ABCD1
ABCD2
ABCD3
ABCD4
ABCE1
ABCF1
ABCF2
ABCF3
ABCG1
ABCG2
ABCG4
ABCG5
ABCG8
ABHD1
ABHD10
ABHD11
ABHD12
ABHD12B
ABHD13
ABHD14A
ABHD14B
ABHD15
ABHD16A
ABHD16B
ABHD17A
ABHD17AP1
ABHD17AP4
ABHD17AP8
ABHD17B
ABHD17C
ABHD18
ABHD2
ABHD3
ABHD4
ABHD5
ABHD6
ABHD8
ABI1
ABI2
ABI3
ABI3BP
ABITRAM
ABITRAMP1
ABL1
ABL2
ABLIM1
ABLIM2
ABLIM3
ABO
ABR
ABRA
ABRACL
ABRAXAS1
ABRAXAS2
ABT1
ABTB1
ABTB2
ABTB3
ACAA1
ACAA2
ACACA
ACACB
ACAD10
ACAD11
ACAD8
ACAD9
ACADL
ACADM
ACADS
ACADSB
ACADVL
ACAN
ACAP1
ACAP2
ACAP3
ACAT1
ACAT2
ACBD3
ACBD4
ACBD5
ACBD6
ACBD7
ACCS
ACCSL
ACD
ACE
ACE2
ACER1
ACER

In [None]:
# As a result, 18,955 IDs were mapped to 19,417 results, i.e. some gene
# names have been mapped to multiple UniProt IDs
# Apart from that, 1,710 IDs were not mapped
# It becomes apparent that in comparison to the mapping of gene IDs to
# UniProt IDs, the mapping of gene names to UniProt IDs succeeded for
# more genes

In [None]:
# Now, in order to further narrow down the genes for which to perform
# the database query, a second mapping on the UniProt website is
# performed involving the 1,710 genes for which the previous mapping
# failed
# It is assumed that many of those 1,710 genes have unreviewed UniProt
# entries, i.e. entries in the TrEMBL section of UniProt
# Therefore, the mapping for the 1,710 genes is performed with the
# following settings:
# "From database: UniProt/Gene Name",
# "To database: UniProt/UniProtKB",
# "Restrict by organism: Homo sapiens [9606]"

# As a result, 51 IDs were mapped to 72 results, i.e. some gene names
# have been mapped to multiple UniProt IDs
# As the amount of results is manageable, all multiple mappings have
# been manually investigated
# The manual investigation revealed that some genes have been mapped to
# multiple UniProt entries with exactly the same sequence, while others
# have been mapped to UniProt entries with sequences of different
# lengths
# In the case of UniProt entries with identical sequences, simply the
# first is chosen, while the remaining ones are discarded
# However, in the case of UniProt entries with sequences of different
# lengths, the longest sequence is chosen, while the remaining ones are
# discarded
# The rationale behind retaining the longest sequence is the following:
# One approach to off-target filtering consists of employing a PPI
# prediction model, which relies on the amino acid sequences of the
# proteins in question; by retaining the longest sequence, the chance of
# identifying significant interactions increases; in off-target
# filtering, filtering too many correct candidates out is probably more
# dramatic than retaining too many incorrect candidates (?)
# Only keeping one UniProt ID is also not problematic with regard to the
# alternative approach to off-target filtering, which consists of
# employing the interaction information deposited in the STRING database
# STRING also accepts gene names as query term, obviating the necessity
# to manually curate a UniProt ID
# Apart from that, 1,659 IDs were not mapped
# For those 1,659 gene names, database queries have to be performed in
# order to determine the precise reason for them not giving rise to a
# protein

In [7]:
# Composite entries, i.e. siRNAs targeting multiple genes at once, are
# disregarded for the time being
# They are recognised by the semicolon separating the individual gene
# names in the "Name" column; only rows without semicolon are retained
only_single_entries_df = siRNA_df.loc[
    ~siRNA_df["Name"].str.contains(";")
]

# The remaining single entries are written to a dedicated TSV file
only_single_entries_df.to_csv(
    path_or_buf=(
        "VACV_Report_only_valid_single_pooled_siRNA_and_esiRNA_single_"
        "entries_only.tsv"
    ),
    sep="\t",
    index=False
)

In [3]:
# Reload the TSV file harbouring exclusively single entries
# The data types of certain columns are manually specified by means of
# "dtype_dict"
# "pd.read_table()" uses the tab character as separator by default
only_single_entries_df = pd.read_table(
    (
        "VACV_Report_only_valid_single_pooled_siRNA_and_esiRNA_single_"
        "entries_only.tsv"
    ),
    dtype=dtype_dict
)

In [4]:
# As a first step towards selecting one UniProt entry for genes mapped
# to multiple UniProt entries as described above, the result file of
# the mapping is loaded
# "pd.read_table()" uses the tab character as separator by default
UniProt_TrEMBL_mappings = pd.read_table(
    "idmapping_VACV_screen_unmapped_gene_names_to_UniProt_IDs.tsv"
)

In [15]:
# Determine the gene names mapped to multiple UniProt TrEMBL IDs
# Gene names are contained in the "From" column
unique_gene_names = UniProt_TrEMBL_mappings["From"].unique()

# The amount of rows per gene name is counted once for the entire
# column instead of scanning the column anew for each gene name
n_mappings_per_name = (
    UniProt_TrEMBL_mappings["From"].value_counts().to_dict()
)

multiple_mappings_names = [
    name for name in unique_gene_names
    if n_mappings_per_name.get(name, 0) > 1
]

TrEMBL_entries_to_keep = []

for name in multiple_mappings_names:
    # As a first step, retrieve all UniProt TrEMBL entries the
    # respective gene name has been mapped to
    mapping_candidates = UniProt_TrEMBL_mappings[
        UniProt_TrEMBL_mappings["From"] == name
    ]

    # The entry with the longest protein is chosen; if more than one
    # entry has the largest sequence length, the first of them is taken
    # This rule also covers the case of all entries having the same
    # length, in which simply the first entry is chosen, so that no
    # separate branch is required for it
    # "argmax()" returns the integer position of the first occurrence
    # of the maximum, which is why the ".iloc" indexer (position-based)
    # rather than plain square brackets (Index-based) has to be used
    longest_position = mapping_candidates["Length"].to_numpy().argmax()

    TrEMBL_entries_to_keep.append(
        mapping_candidates["Entry"].iloc[longest_position]
    )

assert len(multiple_mappings_names) == len(TrEMBL_entries_to_keep), (
    "A mistake occurred during the selection of the UniProt entries!"
)

In [16]:
# One UniProt entry has been chosen for each gene mapped to multiple
# UniProt entries; the rejected entries are now removed from the
# DataFrame
# As the gene names and the respectively chosen UniProt entries share
# the same ordering, they are iterated over jointly via "zip()"
for gene_name, UniProt_ID in zip(
    multiple_mappings_names, TrEMBL_entries_to_keep
):
    # Rows are discarded by retaining the desired ones, which are all
    # rows with a gene name other than the current one as well as the
    # row having both the current gene name and the chosen UniProt
    # entry
    is_current_gene = (
        UniProt_TrEMBL_mappings["From"] == gene_name
    )
    is_chosen_entry = (
        UniProt_TrEMBL_mappings["Entry"] == UniProt_ID
    )

    # Bear in mind that the element-wise operators "~", "&" and "|" bind
    # more tightly than comparison operators, which is why the
    # comparisons above are evaluated beforehand
    UniProt_TrEMBL_mappings = UniProt_TrEMBL_mappings[
        ~is_current_gene | (is_current_gene & is_chosen_entry)
    ]

# Exactly one row per gene name is supposed to remain
assert len(UniProt_TrEMBL_mappings) == len(unique_gene_names), (
    "Something went wrong while discarding undesired rows!"
)

In [17]:
# The reduced Pandas DataFrame replaces the content of the TSV file it
# has originally been loaded from
UniProt_TrEMBL_mappings.to_csv(
    path_or_buf=(
        "idmapping_VACV_screen_unmapped_gene_names_to_UniProt_IDs.tsv"
    ),
    sep="\t",
    index=False
)

In [22]:
# In order to keep track of which genes encode proteins and which ones
# do not, e.g. since they merely encode ncRNA, an additional column is
# inserted into the TSV file bearing the name "Protein-coding gene"
# If the respective gene indeed gives rise to a protein for which a
# UniProt entry exists, its value is set to "Yes"
# However, if the respective gene is merely transcribed into ncRNA, its
# value is set to "ncRNA"
# The third possibility known thus far is that a gene is actually a
# pseudogene for which a UniProt entry does not exist; in this case,
# the respective gene's value is set to "Pseudogene without UniProt
# entry"
# Further possibilities for not giving rise to a protein with UniProt
# entry are potentially unearthed while performing database queries for
# all the genes for which mapping to UniProt IDs failed

# The "Protein-coding gene" column is supposed to be inserted after the
# "Name" column; to this end, the index of the "Name" column is
# determined
insertion_index = only_single_entries_df.columns.to_list().index("Name")

# "DataFrame.insert()" places the new column AT the given position and
# shifts the column previously located there to the right; hence, in
# order for the new column to end up after "Name", the position
# following that of "Name" has to be passed
# Apart from that, "DataFrame.insert()" raises a ValueError if the
# column already exists, which is why the insertion is skipped when the
# cell is run a second time
if "Protein-coding gene" not in only_single_entries_df.columns:
    only_single_entries_df.insert(
        loc=insertion_index + 1,
        column="Protein-coding gene",
        value="Value not set"
    )

In [37]:
# Now, the unmapped gene names are dealt with
# In order to keep track of the reason for not having any UniProt entry,
# a Pandas DataFrame is created comprising four columns
# The first column bears the name "Gene_name" and, as its name already
# suggests, harbours the names of the genes for which mapping failed,
# whereas the second column is named "Gene_ID" and contains the NCBI
# gene IDs
# The third column bears the name "Gene_type" and specifies the gene
# type
# The fourth column is named "UniProt_entry_available" and indicates
# whether at least one UniProt entry is available or not
# The DataFrame is created from a dictionary of lists

# For the purpose of working with files, the "with" context manager is
# preferred as it automatically takes care of closing files, even in
# case of errors/exceptions
with open(
    "VACV_screen_unmapped_gene_names_to_UniProt_IDs_unmapped_genes.txt",
    "r"
) as f:
    # Conveniently enough, each individual gene name occupies one line
    # Surrounding whitespace including the trailing newline character
    # is removed and blank lines are skipped, as an empty gene name
    # cannot be looked up in the screen data
    unmapped_gene_names = [
        line.strip() for line in f if line.strip()
    ]


gene_type = ["Value not set"] * len(unmapped_gene_names)
UniProt_entry_available = ["Value not set"] * len(unmapped_gene_names)

# Retrieve the NCBI gene ID for each gene
# As all entries of the same gene are supposed to have the same gene
# ID, simply the first row of the respective gene is taken
# The lookup table is built once instead of filtering the entire
# DataFrame anew for each individual gene name
first_row_per_gene = only_single_entries_df.drop_duplicates(
    subset="Name"
)
gene_ID_by_name = dict(
    zip(
        first_row_per_gene["Name"],
        first_row_per_gene["ID_manufacturer"]
    )
)

# Gene names absent from the screen data are reported by name rather
# than surfacing as an out-of-bounds indexing error
names_without_gene_ID = [
    gene_name for gene_name in unmapped_gene_names
    if gene_name not in gene_ID_by_name
]
assert not names_without_gene_ID, (
    "The following gene names do not occur in the screen data: "
    f"{names_without_gene_ID}"
)

gene_IDs = [
    gene_ID_by_name[gene_name] for gene_name in unmapped_gene_names
]

unmapped_genes_reasons_df = pd.DataFrame(
    {
        "Gene_name": unmapped_gene_names,
        "Gene_ID": gene_IDs,
        "Gene_type": gene_type,
        "UniProt_entry_available": UniProt_entry_available
    }
)

In [38]:
# Write the Pandas DataFrame to a tab-separated file; the Index is not
# written
unmapped_genes_reasons_df.to_csv(
    path_or_buf="unmapped_genes_info.tsv", sep="\t", index=False
)

In [None]:
# In addition to determining the coverage, a column is introduced into
# the Pandas DataFrame representing the UniProt ID equivalents of the
# gene IDs
# To this end, the unique gene IDs are printed and pasted into the ID
# mapping GUI on https://www.uniprot.org/id-mapping
# The mapping settings are as follows: "From database: Genome
# annotation databases/GeneID" (Gene ID is the same as NCBI Gene ID) and
# "To database: UniProt/UniProtKB" (choosing UniProtKB includes both
# UniProtKB sections, i.e. UniProtKB/Swiss-Prot and UniProtKB/TrEMBL)