In [1]:
"""
The purpose of this Jupyter notebook is to ...
"""

'\nThe purpose of this Jupyter notebook is to ...\n'

In [1]:
import pandas as pd
from biotite.sequence.io import fasta

#### Intersection Between Human Proteins in Confirmed PPIs and the<br>Dharmacon Pooled Subset (Not Specific for a Data Set Split)

In [2]:
# As a first step, determine the intersection between the confirmed
# positive PPIs and the human proteins present in the DP G1/G2 subset
# To this end, the corresponding files need to be loaded

# Load the updated TSV file of the subset into a Pandas DataFrame
dp_g1_g2_subset_path = (
    "Dharmacon_pooled_G1_G2_screening_plates_subset_updated.tsv"
)
dp_g1_g2_subset_df = pd.read_csv(
    dp_g1_g2_subset_path,
    sep="\t"
)

In [3]:
# Extract the unique human UniProt accessions of the subset
# Don't forget to remove NaN
human_prots_in_subset_list = dp_g1_g2_subset_df[
    "UniProt_IDs"
].dropna().to_list()

# Intermediate conversion to a set removes possible duplicates
human_prots_in_subset_list = list(set([
    uniprot_acc
    for entry in human_prots_in_subset_list
    for uniprot_acc in entry.split(";")
]))

In [4]:
# Load the confirmed HVIDB PPIs
pos_ppis_path = "all_HVIDB_VACV_WR_interactions.csv"
pos_ppis_df = pd.read_csv(pos_ppis_path)

In [5]:
# In the CSV file containing the confirmed PPI pairs, the PPI pairs are
# stored in the `Human-virus PPI` column
# The individual entries consist of a human UniProt accession and a VACV
# WR UniProt accession separated by a hyphen, i.e. they have the
# following format:
# <human UniProt accession>-<VACV WR UniProt accession>
# Extract the human interaction partners of the confirmed positive
# interactions
ppi_pairs_list = pos_ppis_df["Human-virus PPI"].to_list()
# Intermediate conversion to a set removes possible duplicates
human_prots_in_HVIDB = list(set([
    ppi_pair.split("-")[0]
    for ppi_pair in ppi_pairs_list
]))

In [6]:
# Finally, determine the intersection between the human proteins
# involved in confirmed positive PPIs and the human proteins
# interrogated in the screen subset
common_uniprot_accs = set.intersection(
    set(human_prots_in_subset_list), set(human_prots_in_HVIDB)
)

In [7]:
print(common_uniprot_accs)
print(
    f"Number of common human UniProt accessions: {len(common_uniprot_accs):,}"
)

{'O00585', 'O75113', 'B2R4S9', 'Q16695', 'Q92878', 'Q9H2G4', 'Q13227', 'Q00839', 'Q13753', 'Q8WXH0', 'G5E9I4', 'Q7Z406', 'P12273', 'O00391', 'P22531', 'P23025', 'Q76M96', 'Q9Y6K9', 'Q15057', 'Q8TAQ2', 'P28838', 'P55327', 'P09651', 'P33176', 'O00571', 'P63010', 'P22362', 'P20671', 'Q96E22', 'P42338', 'Q9NRJ3', 'P55265', 'Q9Y258', 'Q93077', 'Q9BQS8', 'Q14444', 'P04908', 'Q9UBB5', 'Q9Y5Q9', 'Q9HCC0', 'Q96RQ9', 'P42336', 'P06748', 'Q13794', 'Q8IUR0', 'Q9Y618', 'Q7Z7A1', 'O15444', 'Q05639', 'Q92187', 'O75152', 'Q9HCU9', 'Q9BYP7', 'Q9NP74', 'Q9Y3B3', 'Q07866', 'Q96DA0', 'P84243', 'Q9C0F3', 'Q13283', 'B8ZZN6', 'O14920', 'P61978', 'Q15853', 'D9ZGF2', 'Q3ZCM7', 'O00339', 'P27986', 'P20042', 'A3KPC7', 'P13861', 'P04792', 'O15111', 'P20700', 'P62807', 'Q8IWX8', 'O14654', 'P63165', 'F4ZW62', 'A6NFX8', 'Q8TDL5', 'Q05086', 'Q96JH7', 'P04083', 'P62805', 'P05166', 'P24001', 'P35325', 'O95400', 'Q9H6S1', 'Q9P0K7', 'Q9H334', 'Q00325', 'Q8WTS1', 'Q16543', 'Q5JRA6', 'P68371', 'P52179', 'Q9UMR2', 'Q9BYE4',

In [8]:
# Regarding the human UniProt accessions of confirmed positive PPIs,
# there is the possibility that some UniProt accessions have been
# replaced or withdrawn/retracted altogether
# Therefore, it is checked whether all human UniProt accessions of
# confirmed positive interactions occur in the FASTA file for the
# bullet-proof data set
# Load the FASTA file
path_to_bullet_proof_fasta = "bullet-proof_data_set.fasta"

bullet_proof_fasta = fasta.FastaFile.read(path_to_bullet_proof_fasta)

In [9]:
presence_list = [
    human_acc in list(bullet_proof_fasta.keys())
    for human_acc in human_prots_in_HVIDB
]

assert all(presence_list), (
    "Not all human proteins involved in positive confirmed "
    "interactions occur in the FASTA file!"
)

In [10]:
# While all human UniProt accessions indeed are covered by the FASTA
# file, there still is the possibility that some of them are outdated
# Thus, using the NCBI Entrez database file
# gene_refseq_uniprotkb_collab_human_9606.tsv, it is investigate whether
# they are still valid
human_uniprot_collab_path = "gene_refseq_uniprotkb_collab_human_9606.tsv"

human_uniprot_collab_df = pd.read_csv(
    human_uniprot_collab_path,
    sep="\t"
)

In [11]:
valid_uniprot_accs = human_uniprot_collab_df[
    "UniProtKB_protein_accession"
].drop_duplicates().dropna().to_list()

In [12]:
validity_list = [
    human_acc in valid_uniprot_accs
    for human_acc in human_prots_in_HVIDB
]

if not all(validity_list):
    print(
        "Not all human UniProt accessions involved in confirmed "
        "positive PPIs are covered\nby the NCBI Entrez database file!\n"
    )

for human_acc in human_prots_in_HVIDB:
    if human_acc not in valid_uniprot_accs:
        print(human_acc)

Not all human UniProt accessions involved in confirmed positive PPIs are covered
by the NCBI Entrez database file!

F8WBV6
A0A2R8Y5A3
Q8WWI1
H3BSR6
V9GZ56
Q5VTE0
Q93086
Q99729
F8VVA7
A8MUS3
E9PDI4


In [13]:
# As it turns out, 11 human UniProt accessions are not covered by the
# NCBI Entrez database file
# This, however, does not necessarily imply that the corresponding
# accessions are outdated
# Thus, manual UniProt database lookups are necessary
#
# Q5VTE0: is still valid; probably does not occur in the NCBI file since
# it could be the product of a pseudogene
#
# V9GZ56: is still valid, is part of UniProt TrEMBL (unreviewed); the
# corresponding gene is also present in the subset (LSM4)
#
# Q99729: is still valid, is part of UniProt Swiss-Prot (reviewed); the
# corresponding gene is also present in the subset (HNRNPAB)
#
# A8MUS3: is still valid, is part of UniProt TrEMBL (unreviewed); the
# corresponding gene is also present in the subset (RPL23A)
#
# E9PDI4: is still valid, is part of UniProt TrEMBL (unreviewed); the
# corresponding gene is also present in the subset (LAD1)
#
# Q93086: is still valid, is part of UniProt Swiss-Prot (reviewed); the
# corresponding gene is also present in the subset (P2RX5)
#
# Q8WWI1: is still valid, is part of UniProt Swiss-Prot (reviewed); the
# corresponding gene is also present in the subset (LMO7)
#
# A0A2R8Y5A3: is still valid, is part of UniProt TrEMBL (unreviewed);
# the corresponding gene is also present in the subset (CTNNB1)
#
# F8VVA7: is still valid, is part of UniProt TrEMBL (unreviewed); the
# corresponding gene is also present in the subset (COPZ1)
#
# F8WBV6: is still valid, is part of UniProt TrEMBL (unreviewed); the
# corresponding gene is also present in the subset (SERF2)
#
# H3BSR6: is still valid, is part of UniProt TrEMBL (unreviewed); the
# corresponding gene is also present in the subset (CX3CL1)

# This increases the number of common human UniProt accessions by 10

print(
    "Taking the human UniProt accessions not present in the NCBI "
    "files into account,\nthe number of common human UniProt "
    "accessions increases to 332."
)

Taking the human UniProt accessions not present in the NCBI files into account,
the number of common human UniProt accessions increases to 332.


#### Intersection Between Human Proteins in Negative PPIs and<br>the Dharmacon Pooled Subset (Not Specific for a Data Set Split)

In [14]:
# Also compute the intersection between human UniProt accessions
# involved in reliable negative PPIs and the human proteins present in
# the subset
# To this end, the corresponding TSV file comprising all PPIs must be
# loaded
path_to_all_PPIs = "entire_bullet-proof_ppi_data_set.tsv"

all_PPIs_df = pd.read_csv(
    path_to_all_PPIs,
    sep="\t"
)

In [15]:
# Extract human UniProt accessions involved in negative PPI instances
neg_PPIs_df = all_PPIs_df[
    all_PPIs_df["label"] == 0
]

human_prots_in_neg_PPIs = neg_PPIs_df[
    "Human_prot"
].drop_duplicates().to_list()

In [16]:
# Investigate whether all human UniProt accessions involved in negative
# PPIs are also present in the
# `gene_refseq_uniprotkb_collab_human_9606.tsv` file
validity_list_neg_PPIs = [
    human_acc in valid_uniprot_accs
    for human_acc in human_prots_in_neg_PPIs
]

if not all(validity_list_neg_PPIs):
    print(
        "Not all human UniProt accessions involved in negative PPIs "
        "are covered\nby the NBCI Entrez database file!\n"
    )

n_neg_human_accs_not_covered = (
    len(validity_list_neg_PPIs) - sum(validity_list_neg_PPIs)
)
print(
    "Number of human UniProt accessions involved in negative PPIs\nnot "
    f"covered: {n_neg_human_accs_not_covered}\n"
)

for human_acc in human_prots_in_neg_PPIs:
    if human_acc not in valid_uniprot_accs:
        print(human_acc)

Not all human UniProt accessions involved in negative PPIs are covered
by the NBCI Entrez database file!

Number of human UniProt accessions involved in negative PPIs
not covered: 124

A0A8I5QKX4
Q4G0D9
Q6DKJ9
A0JLT1
Q05BI1
A0A1B0GTK2
B4DP15
A0A1U9X8U3
A8K806
A8K9A1
B4DNI0
H0Y6G3
A0A0G2JJL1
A0A140T9L0
Q96GC8
A0A8I5KT77
U3KQ48
A0A8Q3WK70
A0A8Q3WKH7
B4DM91
Q6AI50
F5GWN9
D6RC52
B4E303
B4E074
A0A0A0MRH0
A0A7I2V506
A0A7I2V5M5
A0A7I2V2U7
A0A7I2V349
A0A7I2V699
B4E263
B3KWS1
A2VDI1
B7ZAU8
A8MPP1
A0A8V8TMR1
B7Z284
O60747
A0JLS5
B3KPN5
G3V0G3
Q05CW7
B4DHR2
E9PKP7
B4DNQ1
Q8N7L7
A0A8Q3SHT6
D6RC60
F8WDT8
B4DHA6
B4DRX8
B4DMU5
C9JZT7
O60531
F8WC81
E9PLY7
F8VRX4
C9JJU7
H7BZ72
E9PPY3
H7C446
A0A087WWQ2
B4DQC7
V9GYP5
F5GYR3
F8WE32
G3V5S9
Q5VU10
B4DJR3
B4DSM4
B4DPI9
A0PJ87
I3L3U9
M0R0P1
D6RBR7
J3QR28
H0Y9Y4
Q05DN1
B3KVX2
B7Z9G4
Q96ES5
B1AMU7
R4GNH9
D6R8Y9
D6R9C8
B4DW33
E7EX70
A0PJ56
C9J6C5
M0R2U2
M0R1H0
M0R2B0
B4E098
H0YBV6
A8MYC1
D6RJF7
F8W8T7
A0A8V8TPK8
A0A8V8TQT0
A0A8V8TP28
A0A8V8TPD4
G3V2M5
F8WFE7
F6V

In [17]:
# ID mapping has been conducted for these 124 human UniProt accessions
# in order to determine the genes encoding them
# Load the resulting TSV file
path_to_neg_PPI_human_prots_id_mapping = (
    "idmapping_2025_09_30_neg_human_UniProt_accs_not_present_in_NCBI_"
    "Entrez_UniProt_collab_file.tsv"
)

id_mapping_neg_PPIs_human_prots_df = pd.read_csv(
    path_to_neg_PPI_human_prots_id_mapping,
    sep="\t"
)

##### Adding Missing Gene Names to the ID Mapping TSV File

In [18]:
# Unfortunately, some gene name entries are NaN
# They have to be determined manually
# Create a dictionary mapping the UniProt accessions to their gene names
acc_to_gene_name_dict = {
    "B4DP15": "WDR46",
    "A0A1U9X8U3": "WDR46",
    "A8K806": "WDR46",
    "A8K9A1": "WDR46",
    "B4DNI0": "WDR46",
    "B4DM91": "NOL8",
    "B4E303": "NLE1",
    "B4E074": "NLE1",
    "B4E263": "HEATR1",
    "B3KWS1": "HEATR1",
    "B7ZAU8": "HEATR1",
    "B7Z284": "RPA1",
    "O60747": "GTPBP4",
    "B3KPN5": "NOL6",
    "B4DHR2": "GTPBP4",
    "B4DNQ1": "UBTF",
    "Q8N7L7": "HEATR1",
    "B4DHA6": "EBNA1BP2",
    "B4DRX8": "EBNA1BP2",
    "O60531": "UTP14A",
    "A0A087WWQ2": "EMG1",
    "B4DQC7": "EMG1",
    "V9GYP5": "EMG1",
    "B4DJR3": "RPP30",
    "B4DSM4": "MKI67IP",
    "B4DPI9": "GNL2",
    "B3KVX2": "UTP25",
    "B7Z9G4": "UTP6",
    "B4DW33": "POLR1E",
    "B4E098": "NOP16",
    "A8MYC1": "POP4",
    "Q53GY5": "DGCR8",
    "B4DXL4": "NOC3L",
    "V9GYY5": "NOL12",
    "B3KN82": "NOP58",
    "B2RE66": "ZNF501"
}

In [32]:
# Update the ID mapping TSV file with the manually determined gene names
for uniprot_acc, gene_name in acc_to_gene_name_dict.items():
    id_mapping_neg_PPIs_human_prots_df.loc[
        id_mapping_neg_PPIs_human_prots_df["From"] == uniprot_acc,
        "Gene Names"
    ] = gene_name

In [33]:
# Perform a sanity check to ensure that all NaN entries have been
# replaced
assert not any(
    id_mapping_neg_PPIs_human_prots_df["Gene Names"].isna().to_list()
), "Not all NaN entries have been replaced with gene names!"

In [34]:
# Finally, overwrite the ID mapping TSV file with the updated Pandas
# DataFrame
id_mapping_neg_PPIs_human_prots_df.to_csv(
    "idmapping_2025_09_30_neg_human_UniProt_accs_not_present_in_NCBI_"
    "Entrez_UniProt_collab_file.tsv",
    sep="\t",
    index=False
)

##### Determining Common Genes Between Negative PPI Instances and the<br>Dharmacon Pooled Subset for PPI Pair Generation

In [19]:
# Load the updated ID mapping TSV file
id_mapping_neg_PPIs_human_prots_df = pd.read_csv(
    "idmapping_2025_09_30_neg_human_UniProt_accs_not_present_in_NCBI_"
    "Entrez_UniProt_collab_file.tsv",
    sep="\t"
)

In [20]:
# Now, determine the intersection of genes between the screen subset and
# the ID mapping TSV file
# This is necessary in order to create PPI pairs (logically, the missing
# UniProt accessions were not present in the `UniProt_IDs` column of the
# subset TSV file and thus not taken into account during PPI pair
# generation)

# Set of genes in the ID mapping TSV file
id_mapping_neg_PPIs_human_prots_genes_set = set(
    id_mapping_neg_PPIs_human_prots_df["Gene Names"].to_list()
)

# Set of genes in the subset
dp_g1_g2_subset_genes_set = set(
    dp_g1_g2_subset_df["Name"].dropna().drop_duplicates().to_list()
)

common_genes_neg_PPIs_screen_subset = set.intersection(
    id_mapping_neg_PPIs_human_prots_genes_set,
    dp_g1_g2_subset_genes_set
)

In [21]:
# Create a dictionary mapping the gene names to a list of UniProt
# accessions in the ID mapping TSV file
# To this end, the `defaultdict` class is used
from collections import defaultdict

id_mapping_gene_name_to_uniprot_acc_dict = defaultdict(list)

for _, row in id_mapping_neg_PPIs_human_prots_df.iterrows():
    gene_name = row["Gene Names"]
    uniprot_acc = row["Entry"]
    id_mapping_gene_name_to_uniprot_acc_dict[gene_name].append(
        uniprot_acc
    )

In [22]:
# For each common gene, print the respective gene and the corresponding
# UniProt accession in a dictionary-like format, i.e.
# "<UniProt accession>": "<gene name>"
for common_gene in common_genes_neg_PPIs_screen_subset:
    for acc in id_mapping_gene_name_to_uniprot_acc_dict[common_gene]:
        print(f"\"{acc}\": \"{common_gene}\",")

"F8WE32": "TAF1B",
"U3KQ75": "TAF1B",
"F8WDT8": "DDX56",
"C9J6C5": "NIFK",
"D6RJF7": "NEK11",
"A0A1B0GTK2": "LIN28B",
"A0A8I5KT77": "PHF8",
"B1AMU7": "EXOSC1",
"R4GNH9": "EXOSC1",
"U3KQ48": "MPHOSPH10",
"A0A8Q3WK70": "MPHOSPH10",
"A0A8Q3WKH7": "MPHOSPH10",
"F5GYR3": "NOP2",
"F6VJE8": "UTP18",
"J3QR85": "UTP18",
"J3KSR7": "UTP18",
"Q5VU10": "RPP30",
"B4DJR3": "RPP30",
"J3QR28": "NOL11",
"B4DW33": "POLR1E",
"E7EX70": "POLR1E",
"B7Z9G4": "UTP6",
"A0PJ56": "MAK16",
"H0YBV6": "MAK16",
"Q05CW7": "NAT10",
"V9GYY5": "NOL12",
"G3V5S9": "FCF1",
"G3V2M5": "FCF1",
"B2RE66": "ZNF501",
"Q5VXM9": "RPF2",
"U3KQN5": "RPF2",
"M0QYK9": "DEDD2",
"B4DP15": "WDR46",
"A0A1U9X8U3": "WDR46",
"A8K806": "WDR46",
"A8K9A1": "WDR46",
"B4DNI0": "WDR46",
"H0Y6G3": "WDR46",
"A0A0G2JJL1": "WDR46",
"A0A140T9L0": "WDR46",
"B4E303": "NLE1",
"B4E074": "NLE1",
"A0A0A0MRH0": "NLE1",
"Q4G0D9": "BOP1",
"Q6DKJ9": "BOP1",
"A0A075B729": "BOP1",
"P0DW28": "RBM10",
"E9PKP7": "UBTF",
"B4DNQ1": "UBTF",
"B4DM91": "NOL8",
"F5GWN9": "NO

In [23]:
# Additionally, also print common genes and their UniProt accessions in
# the following format: <gene name>: <list of UniProt accessions>
for common_gene in common_genes_neg_PPIs_screen_subset:
    uniprot_accs_list = id_mapping_gene_name_to_uniprot_acc_dict[common_gene]
    print(f"\"{common_gene}\": ", uniprot_accs_list, ",", sep="")

"TAF1B": ['F8WE32', 'U3KQ75'],
"DDX56": ['F8WDT8'],
"NIFK": ['C9J6C5'],
"NEK11": ['D6RJF7'],
"LIN28B": ['A0A1B0GTK2'],
"PHF8": ['A0A8I5KT77'],
"EXOSC1": ['B1AMU7', 'R4GNH9'],
"MPHOSPH10": ['U3KQ48', 'A0A8Q3WK70', 'A0A8Q3WKH7'],
"NOP2": ['F5GYR3'],
"UTP18": ['F6VJE8', 'J3QR85', 'J3KSR7'],
"RPP30": ['Q5VU10', 'B4DJR3'],
"NOL11": ['J3QR28'],
"POLR1E": ['B4DW33', 'E7EX70'],
"UTP6": ['B7Z9G4'],
"MAK16": ['A0PJ56', 'H0YBV6'],
"NAT10": ['Q05CW7'],
"NOL12": ['V9GYY5'],
"FCF1": ['G3V5S9', 'G3V2M5'],
"ZNF501": ['B2RE66'],
"RPF2": ['Q5VXM9', 'U3KQN5'],
"DEDD2": ['M0QYK9'],
"WDR46": ['B4DP15', 'A0A1U9X8U3', 'A8K806', 'A8K9A1', 'B4DNI0', 'H0Y6G3', 'A0A0G2JJL1', 'A0A140T9L0'],
"NLE1": ['B4E303', 'B4E074', 'A0A0A0MRH0'],
"BOP1": ['Q4G0D9', 'Q6DKJ9', 'A0A075B729'],
"RBM10": ['P0DW28'],
"UBTF": ['E9PKP7', 'B4DNQ1'],
"NOL8": ['B4DM91', 'F5GWN9'],
"UTP25": ['B3KVX2'],
"ZBTB11": ['A0A8I5QKX4'],
"RRP8": ['E9PPY3'],
"SDAD1": ['F8W8T7', 'D6RC74'],
"PRKDC": ['A0A8V8TMR1'],
"TBL3": ['A0JLS5'],
"NOP10": ['A0A8V

##### Computing Intersections

In [24]:
# Compute the intersection between the human UniProt accessions involved
# in negative PPIs and the human proteins present in the subset
common_uniprot_accs_neg_PPIs = set.intersection(
    set(human_prots_in_subset_list), set(human_prots_in_neg_PPIs)
)

In [25]:
print(common_uniprot_accs_neg_PPIs)
print(
    "Number of common human UniProt accessions (without missing "
    "UniProt accessions): "
    f"{len(common_uniprot_accs_neg_PPIs)}"
)

{'Q8N3Z6', 'E7EW05', 'Q4FZ45', 'B7ZAN7', 'B8ZZ47', 'Q9BZE4', 'Q9P0T8', 'Q9NV06', 'Q15269', 'D6RIC3', 'P52272', 'Q96G21', 'Q14146', 'B4DYZ1', 'Q9H0S4', 'Q9BVJ6', 'E7ENR5', 'Q5U5Z3', 'B3KQ21', 'A0A7I2V3F3', 'A0A994J7C7', 'Q7Z2T5', 'Q68CQ4', 'B9A008', 'A3F769', 'Q8NI36', 'A0JLQ5', 'H7C2Q8', 'Q9UL41', 'Q3B726', 'Q9NW13', 'A0A6M8YDW1', 'P49715', 'Q9NVN8', 'Q9BXY0', 'O43159', 'Q9UL42', 'Q9NYV6', 'B2RD09', 'B2R9F5', 'A0A8J8YW94', 'A0A2R8Y6A4', 'Q5J7U2', 'Q9BRU9', 'Q76FK4', 'Q9H8H0', 'Q99848', 'P17480', 'Q53ES5', 'Q9NY61', 'B4DT66', 'Q08E77', 'Q13823', 'O95625', 'A0AAG2UWQ9', 'Q53T94', 'P78316', 'Q9H633', 'B1ALV0', 'Q96HI0', 'Q9Y3C1', 'Q8N4P8', 'D6RCB9', 'J3KNP2', 'Q9GZS1', 'H0Y714', 'A0A590UJW4', 'A0A087WXF8', 'B9EG90', 'Q9NVX2', 'P78345', 'A3F768', 'Q4ZG72', 'P56182', 'Q15572', 'Q8ND90', 'Q8IZU1', 'Q5T8A7', 'E9PS41', 'A2A2V2', 'Q9UHA3', 'Q969H6', 'A0A8I5KX72', 'B1AMU4', 'A8K6D2', 'Q13428', 'Q92979', 'Q5T0F3', 'Q5VU11', 'Q5TAP6', 'Q96P11', 'O15213', 'A8K330', 'P36954', 'O15226', 'B3KX63', 'Q9

In [26]:
# Additionally, also determine the intersection for the missing UniProt
# accessions
# This has already been done above with the generation of the
# `id_mapping_gene_name_to_uniprot_acc_dict` dict and the determination
# of common genes
n_common_missing_hum_uniprot_accs_neg_PPIs = 0

for common_gene in common_genes_neg_PPIs_screen_subset:
    n_common_uniprot_accs = len(
        id_mapping_gene_name_to_uniprot_acc_dict[common_gene]
    )
    n_common_missing_hum_uniprot_accs_neg_PPIs += n_common_uniprot_accs

In [27]:
print(
    "Number of common missing UniProt accessions: "
    f"{n_common_missing_hum_uniprot_accs_neg_PPIs:,}"
)

Number of common missing UniProt accessions: 118


#### Computing the Intersection for the Test Set

In [28]:
# The Dharmacon pooled subset TSV file has been updated with the missing
# UniProt accessions from both confirmed positive PPIs and reliable
# negative PPIs
# Load the updated subset TSV file
path_to_latest_subset_tsv_file = (
    "Dharmacon_pooled_G1_G2_screening_plates_subset_with_missing_"
    "UniProt_IDs_Z-scored.tsv"
)

dp_g1_g2_subset_df = pd.read_csv(
    path_to_latest_subset_tsv_file,
    sep="\t"
)

In [29]:
# Once again, compute the intersection between the screen subset human
# UniProt accessions and the human UniProt accessions from the combined
# PPI data set
human_prots_in_subset_list = dp_g1_g2_subset_df[
    "UniProt_IDs"
].dropna().to_list()

# Intermediate conversion to a set removes possible duplicates
human_prots_in_subset_list = list(set([
    uniprot_acc
    for entry in human_prots_in_subset_list
    for uniprot_acc in entry.split(";")

]))

In [30]:
# Intersection between the screen subset human UniProt accessions and
# the human proteins involved in confirmed positive PPIs
common_uniprot_accs_pos_ppis = set.intersection(
    set(human_prots_in_subset_list), set(human_prots_in_HVIDB)
)

In [32]:
assert (
    len(common_uniprot_accs_pos_ppis)
    ==
    (322 + 10)
), "Something went wrong while updating the subset TSV file!"

In [33]:
# Intersection between the screen subset human UniProt accessions and
# the human proteins involved in reliable negative PPIs
common_uniprot_accs_neg_ppis = set.intersection(
    set(human_prots_in_subset_list), set(human_prots_in_neg_PPIs)
)

In [34]:
assert (
    len(common_uniprot_accs_neg_ppis)
    ==
    (196 + 118)
), "Something went wrong while updating the subset TSV file!"

In [39]:
# Now that the sanity checks have successfully been passed, finally
# determine the intersection between the screen subset human UniProt
# accessions and the test set human UniProt accessions
# To this end, the test set has to be loaded into a Pandas DataFrame
path_to_test_set = (
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_"
    "with_nucleolus_neg_instances/new_combined_data_set_creation/data_"
    "set_files/bullet-proof_test_set.tsv"
)

test_set_df = pd.read_csv(
    path_to_test_set,
    sep="\t"
)

In [41]:
# Extract the unique human UniProt accessions from the test set
test_set_human_uniprot_accs = test_set_df[
    "Human_prot"
].drop_duplicates().to_list()

In [43]:
test_set_common_human_uniprot_accs = set.intersection(
    set(human_prots_in_subset_list), set(test_set_human_uniprot_accs)
)

In [45]:
print(
    "Number of common human UniProt accessions shared between the "
    "screen subset\nand the test set: "
    f"{len(test_set_common_human_uniprot_accs):,}"
)

Number of common human UniProt accessions shared between the screen subset
and the test set: 62
