In [1]:
"""
The purpose of this Jupyter notebook is to set the STRING ID for
proteins that indeed have one.

The necessity to do so arises from the fact that in the initial VACV
report file, official gene symbols were not specified for all targets.
Thus, by querying the NCBI database, the official gene symbols were
retrieved and set. The STRING IDs, on the other hand, were merely set to
"Not available" as it was assumed that they would not be of further
relevance. As it turned out, they indeed are of relevance as STRING uses
these very STRING IDs to specify PPI pairs in its downloadable files.

Moreover, closer scrutiny revealed that in the case of genes a STRING ID
has been assigned to in the TSV file, some assignments are incorrect.
Therefore, the STRING ID update is not confined to gene names with an
associated STRING ID "Not available" or "unknown", but encompasses the
entire VACV report TSV file.

Rather than querying the UniProt database, which is often accompanied by
connection issues, the information for all human proteins is downloaded
from UniProt as XML file. This very XML file is queried in lieu of the
online database so as to retrieve the STRING ID.
"""

'\nThe purpose of this Jupyter notebook is to set the STRING ID for\nproteins that indeed have one.\n\nThe necessity to do so arises from the fact that in the initial VACV\nreport file, official gene symbols were not specified for all targets.\nThus, by querying the NCBI database, the official gene symbols were\nretrieved and set. The STRING IDs, on the other hand, were merely set to\n"Not available" as it was assumed that they would not be of further\nrelevance. As it turned out, they indeed are of relevance as STRING uses\nthese very STRING IDs to specify PPI pairs in its downloadable files.\n\nMoreover, closer scrutiny revealed that in the case of genes a STRING ID\nhas been assigned to in the TSV file, some assignments are incorrect.\nTherefore, the STRING ID update is not confined to gene names with an\nassociated STRING ID "Not available" or "unknown", but encompasses the\nentire VACV report TSV file.\n\nRather than querying the UniProt database, which is often accompanied by\nco

In [2]:
import xml.etree.ElementTree as ET

# Local UniProt XML export that is queried in lieu of the online database
# The file name is split across two literals to respect the line length
path_to_xml_file = (
    "uniprotkb_organism_id_9606_AND_reviewed_2025_"
    "02_14.xml"
)

# Parse the whole file into memory; `root` is the document's top-level
# element, which the subsequent cell searches for entries
tree = ET.ElementTree(file=path_to_xml_file)
root = tree.getroot()

In [3]:
# Dictionary mapping each official (primary) gene symbol to the STRING
# ID of the UniProt entry it belongs to
gene_name_string_id_dict = {}

# Namespace prefix shared by all element tags queried below
UNIPROT_NS = "{http://uniprot.org/uniprot}"

# Not all entries are associated with a STRING ID
# Iterate over the entry elements themselves and skip the ones lacking a
# STRING cross-reference; selecting "the parent of any element with
# type='STRING'" regardless of its tag and then indexing the first
# `dbReference` unconditionally would raise an IndexError as soon as
# some other element happened to carry that attribute value
for entry in root.iter(f"{UNIPROT_NS}entry"):
    # Use an XPath expression to find the STRING ID among the direct
    # `dbReference` children of the entry
    string_id_subentries = entry.findall(
        f"./{UNIPROT_NS}dbReference[@type='STRING']"
    )
    if not string_id_subentries:
        continue

    # Only the first STRING cross-reference of an entry is used
    string_id = string_id_subentries[0].attrib["id"]

    # Similarly, use an XPath expression to find the official gene
    # symbol
    # Multiple subentries containing the official gene symbol may occur
    # as sometimes, several distinct genes give rise to one and the same
    # protein
    # One prominent example of this scenario are histones
    off_gene_symbol_entries = entry.findall(
        f"./{UNIPROT_NS}gene/{UNIPROT_NS}name[@type='primary']"
    )

    # Finally, populate the dictionary; for each of the genes, a
    # separate key-value pair is added
    # NOTE(review): if the same primary symbol occurs in more than one
    # entry, the entry appearing last in the file silently wins
    for off_gene_symbol_entry in off_gene_symbol_entries:
        off_gene_symbol = off_gene_symbol_entry.text

        gene_name_string_id_dict[off_gene_symbol] = string_id

In [4]:
# Persist the gene symbol -> STRING ID mapping as a pickle file so that
# it can be reloaded without parsing the XML file again
import pickle

# The file is opened through the `with` context manager, which closes it
# automatically, even if an error/exception occurs while writing
with open(
    file="dictionary_mapping_off_gene_symbol_to_STRING_ID.pkl",
    mode="wb",
) as f:
    pickle.dump(gene_name_string_id_dict, f)

In [5]:
# Restore the gene symbol -> STRING ID mapping from the pickle file
# Unpickling can execute arbitrary code, so only load files that were
# produced by a trusted source
import pickle

with open(
    file="dictionary_mapping_off_gene_symbol_to_STRING_ID.pkl",
    mode="rb",
) as f:
    gene_name_string_id_dict = pickle.load(f)

In [6]:
# Now, turn to the VACV report TSV file
# In addition to replacing occurrences of "Not available" and "unknown"
# in the `ID_String` column with the actual STRING ID, if available, all
# other STRING IDs are updated as well
import pandas as pd

path_to_VACV_report_tsv = (
    "../VACV_Report_only_valid_single_pooled_siRNA_and_esiRNA_single_"
    "entries_only_without_Qiagen_mismatches.tsv"
)

# `low_memory=False` makes pandas infer each column's type from the
# whole file instead of chunk by chunk
# NOTE(review): the saved run of this cell emitted a warning on this
# call, presumably pandas' DtypeWarning about mixed-type columns --
# confirm; chunk-wise inference can leave a column with inconsistently
# typed values, which matters here as the DataFrame is later written
# back to a TSV file
VACV_report_df = pd.read_csv(
    path_to_VACV_report_tsv,
    sep="\t",
    low_memory=False
)

  VACV_report_df = pd.read_csv(


In [7]:
# Determine the unique gene names (in order of first appearance)
gene_names = VACV_report_df["Name"].unique()

# Gene names without an associated STRING ID are kept track of in a
# separate list
genes_not_in_dict = [
    gene_name for gene_name in gene_names
    if gene_name not in gene_name_string_id_dict
]

# Update the `ID_String` column in one vectorised pass rather than
# building a boolean mask over the entire DataFrame once per unique gene
# name, which scales with (number of names) x (number of rows)
# Rows with a missing name are left untouched, as a comparison against a
# missing value never matches any row
has_name = VACV_report_df["Name"].notna()

# When intending to modify the values of a Pandas DataFrame in-place,
# `chained indexing` must be avoided, i.e. consecutive indexing in
# separate indexing operations, as it may operate on a temporary copy
# rather than on the original DataFrame
# Instead, both the row and column location of the cells to modify are
# specified in a single `.loc` indexing operation
# Names absent from the dictionary are assigned "Not available"
VACV_report_df.loc[has_name, "ID_String"] = (
    VACV_report_df.loc[has_name, "Name"]
    .map(
        lambda gene_name: gene_name_string_id_dict.get(
            gene_name, "Not available"
        )
    )
    .to_numpy()
)

n_genes_not_mapped = len(genes_not_in_dict)
total_n_gene_names = len(gene_names)

# Report the gene names for which no STRING ID could be retrieved
if n_genes_not_mapped > 0:
    print(
        f"{n_genes_not_mapped:,} out of {total_n_gene_names:,} "
        "gene names could not be mapped to a STRING ID."
    )
    print("These gene names are the following:")
    for unmapped_name in genes_not_in_dict:
        print(unmapped_name)

2,298 out of 20,653 gene names could not be mapped to a STRING ID.
These gene names are the following:
TUBB7P
PNLIPRP2
USP17L9P
REXO1L1P
RHD
CENPJ
LDHB
KLRA1P
SEC11B
HBG1
GSTT2
AQP4
NAT8B
JMJD7-PLA2G4B
GBA3
DUX1
GPR89B
VEGFA
GPX1
ABHD17AP1
PLSCR2
AKR7L
NMNAT3
HLA-DQA1
MDH1
AKR1C6P
CBS
OPA3
LPAL2
HSPA1B
BORCS8-MEF2B
SULT1A4
TMPRSS11F
GSTT1
UGT2A1
VDR
AMY1A
DDX12P
TMPRSS7
AMD1
VNN3P
MORF4
VN1R5
VN1R10P
NPY4R
OPRL1
ADGRE4P
OPN1MW
OPRK1
CALM3
GUCY2EP
ACP2
MAPK10
CALM2
NUDT4
SACM1L
PRSS42P
FOLH1B
SEC11A
MASP1
CLCA3P
GLRA4
UBE2L3
UBE2NL
UBE2D3
UBE2V1
PKD1L2
LOC644006
FBXL21P
FBXL9P
FBXO44
PLEKHM1P1
ZNF547
TRIM6-TRIM34
MARCHF8
TRIM64C
UBTFL7
KRTAP5-9
KDM1B
TRIM51EP
BIRC8
C10orf71
BTN2A3P
LINC02915
ZNF138
IGF2-AS
HMGB1P1
MIR1-1HG
CDR1
SPMIP5
PI4KAP2
PMS2P5
OR2W5P
PBOV1
XKRYP7
OR10J3
CDRT15P3
TSPY2
LINC02870
SIGLECL1
TTC6
LINC00174
LINC00452
H2AC12
SERPINA13P
TCP10L3
KRTAP4-7
KRTAP5-3
GUSBP2
GAGE2C
PPIAP80
PAK6-AS1
GAGE6
LINC00696
WASH5P
ANKRD20A5P
ZNF580
POTEA
TXLNGY
SPRR2B
LRTOMT
FCGR1CP
BAGE

In [8]:
# As emerges from the output of the previous code cell, there indeed are
# gene names for which no STRING ID could be retrieved from the
# dictionary
# Upon closer scrutiny, it became apparent that this stems from a
# multitude of reasons
# One of them is that no UniProt entry exists as the respective genes
# are pseudogenes or merely give rise to non-coding RNA
# For other genes, a UniProt entry may exist, but they nevertheless do
# not have an associated STRING ID, which may be attributable to the
# fact that no PPIs involving the respective gene have been deposited in
# STRING yet

In [9]:
# Finally, write the updated DataFrame back to the TSV file it was read
# from, i.e. the input file is overwritten
# Values are tab-separated and the row index is not written
VACV_report_df.to_csv(
    path_or_buf=path_to_VACV_report_tsv,
    index=False,
    sep="\t",
)