In [1]:
"""
The purpose of this Jupyter notebook is to process the Dharmacon pooled
Genome 1 and Genome 2 subset of the VACV screen.
"""

'\nThe purpose of this Jupyter notebook is to process the Dharmacon pooled\nGenome 1 and Genome 2 subset of the VACV screen.\n'

In [5]:
import os
import math

import numpy as np
import pandas as pd
from biotite.sequence.io import fasta

#### Extraction of Dharmacon Pooled Genome 1 & 2 Screening Plates Subset

In [32]:
# Location of the complete VACV screen report; despite the `.csv`
# extension, the file is read as tab-separated below
# NOTE(review): this is an absolute, machine-specific path - consider
# making it configurable
path_to_entire_vacv_screen = (
    "/Users/jacobanter/Documents/Code/VACV_screen/VacciniaReport_"
    "20170223-0958_ZScored_conc_and_NaN_adjusted.csv"
)

# Load the whole screen into a single DataFrame
entire_vacv_screen_df = pd.read_csv(path_to_entire_vacv_screen, sep="\t")

  entire_vacv_screen_df = pd.read_csv(


In [33]:
# Restrict the screen to the Dharmacon pooled genome 1 and genome 2
# experiments and keep screening plates only (i.e. no checkerboard
# plates)
# `query()` is used rather than boolean indexing so that the DataFrame
# name does not have to be repeated for every condition
dp_subset_query = (
    "(Experiment == 'VACCINIA-DP-G1' or Experiment == 'VACCINIA-DP-G2') "
    "and "
    "PLATE_TYPE == 'ScreeningPlate'"
)

dp_g1_g2_subset_df = entire_vacv_screen_df.query(dp_subset_query)

In [34]:
# Guard against a faulty extraction: the subset is expected to comprise
# exactly 43,776 rows
n_subset_rows = len(dp_g1_g2_subset_df)

assert n_subset_rows == 43776, (
    "Something went wrong while extracting the subset from the screen!"
)

In [35]:
# Persist the Dharmacon pooled subset as a tab-separated file; the
# DataFrame index is not written
dp_g1_g2_subset_df.to_csv(
    "Dharmacon_pooled_G1_G2_screening_plates_subset.tsv",
    index=False,
    sep="\t"
)

#### Determining Unique Gene IDs

In [20]:
# Read the previously saved Dharmacon pooled subset back from disk
path_to_dharmacon_pooled_subset = (
    "Dharmacon_pooled_G1_G2_screening_plates_subset.tsv"
)

dp_g1_g2_subset_df = pd.read_csv(path_to_dharmacon_pooled_subset, sep="\t")

In [21]:
# Collect the distinct entries of the "ID_manufacturer" column; these
# are the IDs NCBI Entrez is queried with
unique_ids = pd.unique(dp_g1_g2_subset_df["ID_manufacturer"])

In [22]:
# Report the number of distinct "ID_manufacturer" values; note that this
# count includes any non-numeric placeholder values
print(f"There are {len(unique_ids):,} unique gene ids.")

There are 18,042 unique gene ids.


In [23]:
# Print every unique ID that cannot be interpreted as an integer
for candidate_id in unique_ids:
    try:
        int(candidate_id)
    except ValueError:
        # Not an integer-like value, hence report it
        print(candidate_id)

Not available


In [24]:
# In order to download information from the NCBI Entrez gene database,
# the gene IDs are put into a text file with one gene ID per line
# Note that the unique IDs may comprise some special values, such as
# "Not available" or missing values
# These are omitted by keeping only entries accepted by `int()`
valid_gene_ids = []

for unique_id in unique_ids:
    try:
        int(unique_id)
    except (TypeError, ValueError):
        continue

    # String IDs are written exactly as they appear in the column;
    # anything else is converted so that the write below cannot fail on
    # a non-string value
    valid_gene_ids.append(
        unique_id if isinstance(unique_id, str) else str(int(unique_id))
    )

# Joining the filtered IDs with newlines guarantees that the file never
# ends with a stray newline, even when the last unique ID is one of the
# omitted special values
# Bear in mind that in the context of working with files, the `with`
# context manager is preferred as it automatically takes care of closing
# files, even in case of errors/exceptions
with open("gene_ids.txt", "w") as f:
    f.write("\n".join(valid_gene_ids))

#### Processing NCBI Entrez Gene Database Files from the FTP File Server

In [None]:
# Downloading files directly from the NCBI Entrez FTP file server turned
# out to be far more convenient than using the `datasets` CLI
# Two files have been downloaded: `gene_info.gz` and `gene_history.gz`
# `gene_history` is needed to identify withdrawn or replaced gene IDs
# Both files cover all species and therefore have to be restricted to
# human entries (taxonomic ID 9606) afterwards

# Start with `gene_info`, which is read as a tab-separated file
gene_info_df = pd.read_csv("gene_info", sep="\t")

  gene_info_df = pd.read_csv(


In [9]:
# Keep only the human entries (taxonomic ID 9606)
is_human_gene_info = gene_info_df["#tax_id"] == 9606
gene_info_df = gene_info_df[is_human_gene_info]

In [12]:
# Persist the human-only `gene_info` rows as a tab-separated file
# without the index
gene_info_df.to_csv("gene_info_human_9606.tsv", index=False, sep="\t")

In [13]:
# Next, read `gene_history` as a tab-separated file
gene_history_df = pd.read_csv("gene_history", sep="\t")

In [14]:
# Keep only the human entries (taxonomic ID 9606)
is_human_gene_history = gene_history_df["#tax_id"] == 9606
gene_history_df = gene_history_df[is_human_gene_history]

In [15]:
# Persist the human-only `gene_history` rows as a tab-separated file
# without the index
gene_history_df.to_csv(
    "gene_history_human_9606.tsv", index=False, sep="\t"
)

In [16]:
# Two further files have been downloaded for mapping genes to UniProt
# accessions, i.e. proteins: `gene2accession.gz` and
# `gene_refseq_uniprotkb_collab.gz`
# Like the two previous files, they have to be restricted to human
# entries (taxonomic ID 9606) afterwards
gene_refseq_uniprotkb_collab_df = pd.read_csv(
    "gene_refseq_uniprotkb_collab", sep="\t"
)

In [19]:
# Keep only the human entries; note that this file names its taxonomy
# column "NCBI_tax_id" rather than "#tax_id"
is_human_collab = gene_refseq_uniprotkb_collab_df["NCBI_tax_id"] == 9606
gene_refseq_uniprotkb_collab_df = gene_refseq_uniprotkb_collab_df[
    is_human_collab
]

In [21]:
# Persist the human-only rows as a tab-separated file without the index
gene_refseq_uniprotkb_collab_df.to_csv(
    "gene_refseq_uniprotkb_collab_human_9606.tsv",
    index=False,
    sep="\t"
)

In [22]:
# Finally, read `gene2accession` as a tab-separated file
gene2accession_df = pd.read_csv("gene2accession", sep="\t")

  gene2accession_df = pd.read_csv(


In [24]:
# Keep only the human entries (taxonomic ID 9606)
is_human_gene2accession = gene2accession_df["#tax_id"] == 9606
gene2accession_df = gene2accession_df[is_human_gene2accession]

In [26]:
# Persist the human-only `gene2accession` rows as a tab-separated file
# without the index
gene2accession_df.to_csv(
    "gene2accession_human_9606.tsv", index=False, sep="\t"
)

#### Inserting Columns into the DataFrame

In [2]:
# Start from the saved Dharmacon pooled subset
dp_subset_tsv_path = "Dharmacon_pooled_G1_G2_screening_plates_subset.tsv"

dp_g1_g2_subset_df = pd.read_csv(dp_subset_tsv_path, sep="\t")

In [3]:
# The following columns are supposed to be inserted into the DataFrame:
# `Gene_type`, which, as its name already suggests, indicates the type
# of the respective gene
# `UniProt_IDs`, which stores the UniProt ID(s) associated with the
# respective gene
# `Withdrawn_by_NCBI`, which, as its name already implies, indicates
# whether the gene ID is still valid or not
# They are supposed to be inserted immediately after the
# "Name_alternatives" column, in the order given below
# All of them are initialised with the placeholder "Value not set"
new_columns = ("Gene_type", "UniProt_IDs", "Withdrawn_by_NCBI")

# Determine the index right after the "Name_alternatives" column
columns_list = dp_g1_g2_subset_df.columns.to_list()

insertion_index = columns_list.index("Name_alternatives") + 1

for offset, new_column in enumerate(new_columns):
    # `DataFrame.insert()` raises a `ValueError` for an already existing
    # column; skipping such columns allows this cell to be re-run on a
    # DataFrame that has already been extended
    if new_column in dp_g1_g2_subset_df.columns:
        continue

    dp_g1_g2_subset_df.insert(
        loc=insertion_index + offset,
        column=new_column,
        value="Value not set"
    )

#### Conducting the TSV File Update

In [4]:
from importlib import reload

import NCBI_Entrez_utils as utils

reload(utils)

<module 'NCBI_Entrez_utils' from '/Users/jacobanter/Documents/Code/VACV_screen/Processing_Dharmacon_pooled_genome_1_and_2_subset/NCBI_Entrez_utils.py'>

In [5]:
# Collect the input files for the project-local lookup helper; the
# arguments are passed positionally in exactly this order
path_to_gene_info = "gene_info_human_9606.tsv"
path_to_gene_history = "gene_history_human_9606.tsv"
path_to_gene2accession = "gene2accession_human_9606.tsv"
path_to_refseq_uniprotkb_collab = (
    "gene_refseq_uniprotkb_collab_human_9606.tsv"
)
path_to_sec_ac = "sec_ac.txt"
path_to_human_proteome_fasta = (
    "uniprotkb_organism_id_9606_2025_09_19_all_human_prots_Swiss-Prot_"
    "and_TrEMBL_uniprot_acc_headers.fasta"
)

data_updater = utils.NCBI_Entrez_data_lookup(
    path_to_gene_info,
    path_to_gene_history,
    path_to_gene2accession,
    path_to_refseq_uniprotkb_collab,
    path_to_sec_ac,
    path_to_human_proteome_fasta
)

In [6]:
# Run the gene ID and gene symbol check of the project-local lookup
# helper on the extended subset
# NOTE(review): presumably this fills the `Gene_type` and
# `Withdrawn_by_NCBI` columns - confirm against `NCBI_Entrez_utils.py`
updated_dp_g1_g2_subset_df = data_updater.check_gene_id_and_symbol(
    dp_g1_g2_subset_df
)

In [7]:
# Pass the result of the previous step to the UniProt ID lookup of the
# project-local helper
# NOTE(review): presumably this fills the `UniProt_IDs` column - confirm
# against `NCBI_Entrez_utils.py`
updated_dp_g1_g2_subset_df = data_updater.add_uniprot_ids(
    updated_dp_g1_g2_subset_df
)

In [8]:
# Persist the updated subset as a tab-separated file without the index
updated_dp_g1_g2_subset_df.to_csv(
    "Dharmacon_pooled_G1_G2_screening_plates_subset_updated.tsv",
    index=False,
    sep="\t"
)

#### Construction of PPI Pairs for Screen Refinement

In [20]:
# Read the updated subset back from disk into a Pandas DataFrame
path_to_updated_subset = (
    "Dharmacon_pooled_G1_G2_screening_plates_subset_updated.tsv"
)

dp_g1_g2_subset_df = pd.read_csv(path_to_updated_subset, sep="\t")

In [10]:
# Build the list of distinct UniProt accessions occurring in the
# Dharmacon pooled subset, step by step:
# 1. Some `UniProt_IDs` entries are `NaN`; these are dropped
uniprot_id_entries = dp_g1_g2_subset_df["UniProt_IDs"].dropna()

# 2. Many entries are composite entries using semicolons as separator;
#    they are split and expanded to one accession per row
individual_accs = uniprot_id_entries.str.split(";").explode()

# 3. Accessions occurring multiple times are reduced to their first
#    occurrence
dp_accs_list = individual_accs.drop_duplicates().to_list()

In [11]:
# Report the number of distinct UniProt accessions extracted above
print(
    f"There are {len(dp_accs_list):,} UniProtKB protein accessions in "
    "the Dharmacon pooled G1/G2\nscreening plates subset of the screen."
)

There are 43,639 UniProtKB protein accessions in the Dharmacon pooled G1/G2
screening plates subset of the screen.


In [17]:
# Read the FASTA file containing all 440 VACV WR proteins
path_to_VACV_WR_fasta = (
    "uniprotkb_organism_id_10254_2025_06_19_all_VACV_WR_prots_uniprot_"
    "only_header.fasta"
)

VACV_WR_fasta = fasta.FastaFile.read(
    path_to_VACV_WR_fasta
)

In [18]:
# The headers of the VACV WR FASTA file serve as the VACV WR UniProt
# accessions; collect them into a list
VACV_WR_uniprot_accs = [
    vacv_header for vacv_header in VACV_WR_fasta.keys()
]

In [14]:
# With both accession lists at hand, every human protein from the screen
# is paired with each of the 440 VACV WR proteins
# A Pandas cross join (also called a Cartesian product) yields exactly
# these combinations

# Wrap each list in a single-column DataFrame
human_df = pd.DataFrame({"human_protein": dp_accs_list})
vacv_wr_df = pd.DataFrame({"VACV_WR_protein": VACV_WR_uniprot_accs})

# Cross join: one row per (human protein, VACV WR protein) combination
human_vacv_ppi_pairs_df = pd.merge(human_df, vacv_wr_df, how="cross")

In [15]:
# Persist all PPI pairs as a tab-separated file without the index
path_to_ppi_pairs = (
    "Dharmacon_pooled_G1_G2_screening_plates_subset_human-VACV_WR_"
    "PPI_pairs.tsv"
)

human_vacv_ppi_pairs_df.to_csv(path_to_ppi_pairs, sep="\t", index=False)

#### Splitting the PPI Pairs TSV File into Chunks

In [7]:
# Read the TSV file harbouring the PPI pairs back from disk
path_to_ppi_pairs = (
    "Dharmacon_pooled_G1_G2_screening_plates_subset_human-VACV_WR_"
    "PPI_pairs.tsv"
)

human_vacv_ppi_pairs_df = pd.read_csv(path_to_ppi_pairs, sep="\t")

In [8]:
# Each row of the DataFrame is one human-VACV WR PPI pair
n_ppi_pairs = human_vacv_ppi_pairs_df.shape[0]

print(f"Number of human-VACV WR PPI pairs: {n_ppi_pairs:,}")

Number of human-VACV WR PPI pairs: 19,201,160


In [9]:
# In total, there are roughly 19,200,000 PPI pairs
# The maximum time for HPC jobs using GPUs is 48 hours, which may be too
# short
# Thus, the PPI pairs are split into chunks each encompassing 5 million
# PPI pairs (6 hours of the 48 hours are reserved for loading the FASTA
# file and the embeddings; the batch size is 64; one batch takes roughly
# 1 second; 42 hours can be used for actual inference;
# 42 hours * 60 * 60 = 151,200 seconds = 151,200 batches;
# 151,200 batches * 64 = 9,676,800 PPI pairs ≈ 9,600,000 PPI pairs;
# thus, up to 9,000,000 PPI pairs per chunk would be possible)
ppi_pair_chunks_dir_path = "PPI_pair_chunks"

# `exist_ok=True` turns the call into a no-op if the directory is
# already present, which replaces the separate existence check and its
# check-then-create gap
os.makedirs(ppi_pair_chunks_dir_path, exist_ok=True)

In [10]:
# Number of PPI pairs per chunk
CHUNK_SIZE = 5_000_000

# Total number of chunks, i.e. the pair count divided by the chunk size
# and rounded up (integer ceiling division)
n_chunks = -(-n_ppi_pairs // CHUNK_SIZE)

print(f"In total, there are {n_chunks} chunks.")

In total, there are 4 chunks.


In [None]:
# Write each chunk of PPI pairs to its own TSV file inside the chunk
# directory; the zero-based chunk number becomes part of the file name
for i in range(n_chunks):
    chunk_start = i * CHUNK_SIZE
    chunk_stop = chunk_start + CHUNK_SIZE

    # Slicing beyond the last row is safe with `.iloc`, so the final
    # chunk simply holds the remaining rows
    current_subset = human_vacv_ppi_pairs_df.iloc[chunk_start:chunk_stop]

    chunk_file_name = (
        "Dharmacon_pooled_G1_G2_screening_plates_subset_human-"
        f"VACV_WR_PPI_pairs_chunk_{i}.tsv"
    )
    chunk_file_path = os.path.join(
        ppi_pair_chunks_dir_path, chunk_file_name
    )

    # Bear in mind that xCAPT5 expects TSV files not to have a header
    current_subset.to_csv(
        chunk_file_path,
        sep="\t",
        header=False,
        index=False
    )

#### FASTA File Generation

In [21]:
# Create a FASTA file comprising the 440 VACV WR protein sequences as
# well as the sequences of the human proteins in the Dharmacon pooled
# subset
# The object starts out empty and is populated in the following cells
human_Dharmacon_and_VACV_WR_prots_fasta = fasta.FastaFile()

In [22]:
# Copy every VACV WR entry (header and sequence) into the combined FASTA
# file
for vacv_header, vacv_seq in VACV_WR_fasta.items():
    human_Dharmacon_and_VACV_WR_prots_fasta[vacv_header] = vacv_seq

In [23]:
# The human protein sequences stem from a UniProt download of all
# Swiss-Prot and TrEMBL sequences for Homo sapiens (tax ID 9606)
# Only canonical protein sequences were downloaded, i.e. no isoforms
# The download was conducted on September 19th 2025
path_to_all_human_prots_fasta = (
    "uniprotkb_organism_id_9606_2025_09_19_all_human_prots_Swiss-Prot_"
    "and_TrEMBL.fasta"
)

# Read the FASTA file
all_human_prots_fasta = fasta.FastaFile.read(path_to_all_human_prots_fasta)

In [16]:
# The downloaded FASTA file still carries the default UniProt headers
# A second FASTA file is built whose headers consist of the UniProt
# accession only
all_human_prots_simple_header_fasta = fasta.FastaFile()

for full_header, sequence in all_human_prots_fasta.items():
    # The header elements are separated by pipes/vertical bars, and the
    # UniProt accession is the second element
    header_elements = full_header.split("|")
    all_human_prots_simple_header_fasta[header_elements[1]] = sequence

In [17]:
# Write the FASTA file with accession-only headers to disk
path_to_simple_header_fasta = (
    "uniprotkb_organism_id_9606_2025_09_19_all_human_prots_Swiss-Prot_"
    "and_TrEMBL_uniprot_acc_headers.fasta"
)

all_human_prots_simple_header_fasta.write(path_to_simple_header_fasta)

In [7]:
# Read the FASTA file with accession-only headers; note that this
# rebinds `all_human_prots_fasta`
path_to_simple_header_fasta = (
    "uniprotkb_organism_id_9606_2025_09_19_all_human_prots_Swiss-Prot_"
    "and_TrEMBL_uniprot_acc_headers.fasta"
)

all_human_prots_fasta = fasta.FastaFile.read(path_to_simple_header_fasta)

In [30]:
# Make sure that every human protein of the Dharmacon pooled subset has
# an entry in the human proteome FASTA file
# The FASTA headers are put into a set as membership tests on sets are
# faster
uniprot_accs_in_fasta = set(all_human_prots_fasta.keys())

# One Boolean per accession: is it present in the FASTA file?
coverage_list = list(
    map(uniprot_accs_in_fasta.__contains__, dp_accs_list)
)

assert all(coverage_list), (
    "Not all human proteins in the Dharmacon pooled subset are covered "
    "by the FASTA file!"
)

In [31]:
# Look up the sequence of every human accession in the human proteome
# FASTA file and store it in the combined FASTA file under that
# accession
for human_prot_acc in dp_accs_list:
    human_prot_seq = all_human_prots_fasta[human_prot_acc]
    human_Dharmacon_and_VACV_WR_prots_fasta[human_prot_acc] = human_prot_seq

In [32]:
# Perform a sanity check: the combined FASTA file must contain one entry
# per VACV WR protein (440) plus one entry per human accession
# The human count is derived from `dp_accs_list` instead of being
# hard-coded, so the check stays valid if the subset is regenerated
expected_n_entries = 440 + len(dp_accs_list)

assert (
    len(human_Dharmacon_and_VACV_WR_prots_fasta)
    ==
    expected_n_entries
), "Something went wrong while populating the FASTA file!"

In [33]:
# The sanity check has been passed, so the combined FASTA file is
# written to disk
path_to_combined_fasta = (
    "VACV_WR_and_Dharmacon_pooled_G1_G2_screening_plates_human_prots.fasta"
)

human_Dharmacon_and_VACV_WR_prots_fasta.write(path_to_combined_fasta)

#### Generating PPI Pairs File and FASTA File for Missing UniProt Accessions from PPI Data Set

In [19]:
# The Jupyter notebook
# `evaluation_of_xCAPT5_performance_on_DP_G1_G2_subset.ipynb` revealed
# that a number of UniProt accessions present in the combined PPI data
# set are absent from the screen subset TSV file
# Consequently, no PPI predictions have been made for these accessions
# so far, and they are handled in a separate step
# For this purpose, a PPI pairs TSV file and a FASTA file are generated
# As with the previous PPI pair generation, the pairing is performed in
# a combinatorial manner using a cross join
vacv_wr_df = pd.DataFrame(
    VACV_WR_uniprot_accs,
    columns=["VACV_WR_protein"]
)

# Create a list comprising the missing UniProt accessions from the
# combined data set
missing_uniprot_accs_list = [
    # Missing UniProt accessions from confirmed positive PPI instances
    "V9GZ56", "Q99729", "A8MUS3", "E9PDI4", "Q93086", "Q8WWI1",
    "A0A2R8Y5A3", "F8VVA7", "F8WBV6", "H3BSR6",
    # Missing UniProt accessions from reliable negative PPI instances
    "F5GYR3", "F8WE32", "U3KQ75", "G3V5S9", "G3V2M5", "F8VRX4",
    "D6RJF7", "A0A8V8TMR1", "O60531", "E9PPY3", "J3QR28", "A0JLS5",
    "B4DHA6", "B4DRX8", "F8WC81", "D6RC52", "B1AMU7", "R4GNH9",
    "F8WDT8", "Q05CW7", "B3KVX2", "Q5VXM9", "U3KQN5", "F8W8T7",
    "D6RC74", "Q96GC8", "A0A087WWQ2", "B4DQC7", "V9GYP5", "D6RC60",
    "B4E098", "C9J6C5", "C9JJU7", "B4E263", "B3KWS1", "A2VDI1",
    "B7ZAU8", "Q8N7L7", "Q96ES5", "D6RBR7", "D6R8Y9", "D6R9C8",
    "F6VJE8", "J3QR85", "J3KSR7", "A0A8V8TPK8", "A0A8V8TQT0",
    "A0A8V8TP28", "A0A8V8TPD4", "F8WFE7", "B4DXL4", "M0R0P1", "M0R2U2",
    "M0R1H0", "M0R2B0", "M0QYK9", "A0A8I5KT77", "B4DMU5", "C9JZT7",
    "A0PJ56", "H0YBV6", "B3KPN5", "B2RE66", "B4DM91", "F5GWN9",
    "B4DP15", "A0A1U9X8U3", "A8K806", "A8K9A1", "B4DNI0", "H0Y6G3",
    "A0A0G2JJL1", "A0A140T9L0", "H7BZ72", "B3KN82", "A0A1B0GTK2",
    "P0DW28", "Q05DN1", "A8MYC1", "A0A0A0MQS4", "A5D904", "B4DPI9",
    "H0Y9L8", "B4DW33", "E7EX70", "O60747", "B4DHR2", "A0PJ87",
    "I3L3U9", "I3L234", "B4E303", "B4E074", "A0A0A0MRH0", "Q5VU10",
    "B4DJR3", "H0Y9Y4", "E9PKP7", "B4DNQ1", "E9PLY7", "A0A7I2V506",
    "A0A7I2V5M5", "A0A7I2V2U7", "A0A7I2V349", "A0A7I2V699", "B7Z284",
    "B7Z9G4", "U3KQ48", "A0A8Q3WK70", "A0A8Q3WKH7", "A0A994J4Y2",
    "Q53GY5", "V9GYY5", "Q4G0D9", "Q6DKJ9", "A0A075B729", "A0A8I5QKX4",
    "H7C446", "A0A8Q3SHT6"
]

# Wrap the missing accessions in a single-column DataFrame
missing_uniprot_accs_df = pd.DataFrame(
    {"human_protein": missing_uniprot_accs_list}
)

# Cross join: one row per (missing human protein, VACV WR protein)
# combination
missing_human_accs_vacv_ppi_pairs_df = pd.merge(
    missing_uniprot_accs_df, vacv_wr_df, how="cross"
)

In [20]:
# Write the PPI pairs of the missing accessions as a tab-separated file
# Neither the index nor a header is written; bear in mind that xCAPT5
# expects TSV files not to have a header
path_to_missing_ppi_pairs = (
    "missing_human_UniProt_accs_from_combined_PPI_data_set-VACV_WR_"
    "PPI_pairs.tsv"
)

missing_human_accs_vacv_ppi_pairs_df.to_csv(
    path_to_missing_ppi_pairs,
    sep="\t",
    header=False,
    index=False
)

In [21]:
# Next, the accompanying FASTA file is generated, starting from an empty
# one
missing_human_accs_vacv_wr_fasta = fasta.FastaFile()

# Copy the 440 VACV WR entries (header and sequence) into it
for vacv_header, vacv_seq in VACV_WR_fasta.items():
    missing_human_accs_vacv_wr_fasta[vacv_header] = vacv_seq

In [22]:
# Before the sequences of the missing human UniProt accessions are
# added, make sure that each of them has an entry in the FASTA file
# comprising the entire human proteome
# The FASTA headers are put into a set as membership tests on sets are
# faster
uniprot_accs_in_human_proteome = set(all_human_prots_fasta.keys())

# One Boolean per missing accession: is it present in the proteome?
missing_human_prots_coverage_list = list(
    map(
        uniprot_accs_in_human_proteome.__contains__,
        missing_uniprot_accs_list
    )
)

assert all(missing_human_prots_coverage_list), (
    "Not all missing human proteins are covered by the FASTA file "
    "comprising the entire human proteome!"
)

In [23]:
# The coverage check has been passed, so the sequence of every missing
# human protein is looked up in the human proteome FASTA file and stored
# under its accession
for human_prot_acc in missing_uniprot_accs_list:
    human_prot_seq = all_human_prots_fasta[human_prot_acc]
    missing_human_accs_vacv_wr_fasta[human_prot_acc] = human_prot_seq

In [24]:
# Sanity check on the number of FASTA entries: one per VACV WR protein
# (440) plus one per missing human accession
expected_n_missing_fasta_entries = 440 + len(missing_uniprot_accs_list)

assert (
    len(missing_human_accs_vacv_wr_fasta)
    == expected_n_missing_fasta_entries
), "Something went wrong while populating the FASTA file!"

In [25]:
# Write the FASTA file for the missing human proteins to disk
path_to_missing_prots_fasta = (
    "VACV_WR_and_missing_human_prots_from_combined_PPI_data_set.fasta"
)

missing_human_accs_vacv_wr_fasta.write(path_to_missing_prots_fasta)

#### Incorporating Missing UniProt Accessions from the PPI Data Set

In [None]:
# The missing human UniProt accessions still need to be added to the
# `UniProt_IDs` column of the screen subset TSV file
# For this purpose, the following approach is devised: A dictionary
# mapping gene names to the missing human UniProt accessions is created
# Using the Pandas DataFrame `.apply()` method, the missing human
# UniProt accessions are added to the `UniProt_IDs` entries of the
# corresponding genes; the method implemented for this purpose uses the
# abovementioned dictionary
missing_gene_name_to_human_acc_dict = {
    # Missing UniProt accessions from confirmed positive PPI instances
    "LSM4": ["V9GZ56"],
    "HNRNPAB": ["Q99729"],
    "RPL23A": ["A8MUS3"],
    "LAD1": ["E9PDI4"],
    "P2RX5": ["Q93086"],
    "LMO7": ["Q8WWI1"],
    "CTNNB1": ["A0A2R8Y5A3"],
    "COPZ1": ["F8VVA7"],
    "SERF2": ["F8WBV6"],
    "CX3CL1": ["H3BSR6"],
    # Missing UniProt accessions from reliable negative PPI instances
    "FBL": ['M0R0P1', 'M0R2U2', 'M0R1H0', 'M0R2B0'],
    "UTP25": ['B3KVX2'],
    "GTPBP4": ['O60747', 'B4DHR2'],
    "NEK11": ['D6RJF7'],
    "FRG1": ['E9PLY7'],
    "PHF8": ['A0A8I5KT77'],
    "RPS9": ['A5D904'],
    "ZNF501": ['B2RE66'],
    "POLR1E": ['B4DW33', 'E7EX70'],
    "TAF1B": ['F8WE32', 'U3KQ75'],
    "POP4": ['A8MYC1', 'A0A0A0MQS4'],
    "RIOX2": ['H0Y9L8'],
    "FCF1": ['G3V5S9', 'G3V2M5'],
    "RPS19BP1": ['F8WFE7'],
    "NOP14": ['Q96GC8'],
    "RBM10": ['P0DW28'],
    "DGCR8": ['A0A994J4Y2', 'Q53GY5'],
    "BOP1": ['Q4G0D9', 'Q6DKJ9', 'A0A075B729'],
    "MPHOSPH10": ['U3KQ48', 'A0A8Q3WK70', 'A0A8Q3WKH7'],
    "PRKDC": ['A0A8V8TMR1'],
    "NOP16": ['D6RC60', 'B4E098'],
    "PPAN": ['H7C446'],
    "NOP2": ['F5GYR3'],
    "NOL6": ['B3KPN5'],
    "NCL": ['A0A7I2V506', 'A0A7I2V5M5', 'A0A7I2V2U7', 'A0A7I2V349', 'A0A7I2V699'],
    "RPF2": ['Q5VXM9', 'U3KQN5'],
    "SLX9": ['C9JJU7'],
    "NLE1": ['B4E303', 'B4E074', 'A0A0A0MRH0'],
    "NHP2": ['D6RC52'],
    "NOL11": ['J3QR28'],
    "WDR46": ['B4DP15', 'A0A1U9X8U3', 'A8K806', 'A8K9A1', 'B4DNI0', 'H0Y6G3', 'A0A0G2JJL1', 'A0A140T9L0'],
    "RRP8": ['E9PPY3'],
    "WDR75": ['F8WC81'],
    "EMG1": ['A0A087WWQ2', 'B4DQC7', 'V9GYP5'],
    "DEDD2": ['M0QYK9'],
    "RSL1D1": ['A0PJ87', 'I3L3U9', 'I3L234'],
    "DDX56": ['F8WDT8'],
    "ZBTB11": ['A0A8I5QKX4'],
    "UTP14A": ['O60531'],
    "GNL2": ['B4DPI9'],
    "HEATR1": ['B4E263', 'B3KWS1', 'A2VDI1', 'B7ZAU8', 'Q8N7L7', 'Q96ES5'],
    "NOP58": ['H7BZ72', 'B3KN82'],
    "TBL3": ['A0JLS5'],
    "NOL12": ['V9GYY5'],
    "ZCCHC7": ['Q05DN1'],
    "GNL3": ['B4DMU5', 'C9JZT7'],
    "RPP30": ['Q5VU10', 'B4DJR3'],
    "NOL10": ['A0A8Q3SHT6'],
    "RPA1": ['B7Z284'],
    "LIN28B": ['A0A1B0GTK2'],
    "NOL8": ['B4DM91', 'F5GWN9'],
    "RPS3A": ['H0Y9Y4'],
    "EXOSC1": ['B1AMU7', 'R4GNH9'],
    "UTP18": ['F6VJE8', 'J3QR85', 'J3KSR7'],
    "NAT10": ['Q05CW7'],
    "NOP10": ['A0A8V8TPK8', 'A0A8V8TQT0', 'A0A8V8TP28', 'A0A8V8TPD4'],
    "EBNA1BP2": ['B4DHA6', 'B4DRX8'],
    "DDX54": ['F8VRX4'],
    "ZNF330": ['D6RBR7', 'D6R8Y9', 'D6R9C8'],
    "NOC3L": ['B4DXL4'],
    "UTP6": ['B7Z9G4'],
    "SDAD1": ['F8W8T7', 'D6RC74'],
    "UBTF": ['E9PKP7', 'B4DNQ1'],
    "MAK16": ['A0PJ56', 'H0YBV6'],
    "NIFK": ['C9J6C5']
}

def append_uniprot_acc(row, gene_name_to_accs=None):
    """
    Return the `UniProt_IDs` entry of a row, extended by the missing
    UniProt accessions registered for the row's gene name.

    Accessions already contained in the entry are not appended again,
    which makes it safe to apply this function more than once.

    Parameters
    ----------
    row: mapping or Pandas Series
        Must provide the keys `"Name"` (gene name) and `"UniProt_IDs"`
        (semicolon-separated UniProt accessions or NaN).
    gene_name_to_accs: dict, optional
        Maps gene names to lists of UniProt accessions. If omitted, the
        notebook-level `missing_gene_name_to_human_acc_dict` is used.

    Returns
    -------
    str or float
        The unaltered entry if nothing has to be appended (this may be
        NaN), otherwise a semicolon-separated string of accessions.
    """
    if gene_name_to_accs is None:
        gene_name_to_accs = missing_gene_name_to_human_acc_dict

    gene_name = row["Name"]
    uniprot_id_entry = row["UniProt_IDs"]

    # Retrieve the UniProt accessions to be appended
    uniprot_accs_to_be_appended = gene_name_to_accs.get(gene_name)
    if not uniprot_accs_to_be_appended:
        # Leave the `UniProt_IDs` entry unaltered
        return uniprot_id_entry

    # The current entry is NaN (which is a float)
    if pd.isna(uniprot_id_entry):
        return ";".join(uniprot_accs_to_be_appended)

    # The current entry is a string comprising at least one UniProt
    # accession
    # Only accessions not yet present are appended; otherwise, applying
    # the function a second time would duplicate them
    accs_already_present = set(uniprot_id_entry.split(";"))
    new_accs = [
        acc for acc in uniprot_accs_to_be_appended
        if acc not in accs_already_present
    ]
    if not new_accs:
        return uniprot_id_entry

    return uniprot_id_entry + ";" + ";".join(new_accs)

# Apply the function row by row and overwrite the `UniProt_IDs` column
# with the returned entries
extended_uniprot_ids = dp_g1_g2_subset_df.apply(
    append_uniprot_acc,
    axis="columns"
)
dp_g1_g2_subset_df["UniProt_IDs"] = extended_uniprot_ids

In [17]:
# Persist the DataFrame with the extended `UniProt_IDs` column as a
# tab-separated file without the index
path_to_subset_with_missing_ids = (
    "Dharmacon_pooled_G1_G2_screening_plates_subset_with_missing_"
    "UniProt_IDs.tsv"
)

dp_g1_g2_subset_df.to_csv(
    path_to_subset_with_missing_ids,
    sep="\t",
    index=False
)

#### Z-Scoring of Intensities

In [None]:
# The last step involves Z-scoring the intensity...