In [1]:
"""
The purpose of this Jupyter notebook is to generate an interaction
matrix based on PPI data obtained from the STRING database. The
interaction matrix is a symmetric matrix with its row and column
positions corresponding to one protein each. It is a binary matrix, i.e.
exclusively populated with the values 0 and 1, indicating the absence or
the presence of a PPI, respectively.

Moreover, a confidence score matrix is generated. The confidence score
matrix is similar to the binary interaction matrix, but differs from it
in that it is populated with confidence scores rather than just the
value 1 in case of a recorded interaction.
"""

'\nThe purpose of this Jupyter notebook is to generate an interaction\nmatrix based on PPI data obtained from the STRING database. The\ninteraction matrix is a symmetric matrix with its row and column\npositions corresponding to one protein each. It is a binary matrix, i.e.\nexclusively populated with the values 0 and 1, indicating the absence or\nthe presence of a PPI, respectively.\n\nMoreover, a confidence score matrix is generated. The confidence score\nmatrix is similar to the binary interaction matrix, but differs from it\nin that it is populated with confidence scores rather than just the\nvalue 1 in case of a recorded interaction.\n'

In [2]:
import numpy as np
import pandas as pd

In [3]:
# As a preliminary step, it is investigated whether all human proteins
# involved in the human-VACV WR PPIs deposited in HVIDB are also covered
# by the VACV WR screen at hand
# It must be noted that the TSV file with PPI pairs contains UniProt
# accession IDs, which are not available in every screen TSV file
# However, STRING IDs are available
# Thus, the UniProt accessions of the human proteins involved in HVIDB
# PPIs are converted to STRING IDs via the ID mapping service provided
# by UniProt (settings:
# From database: UniProtKB AC/ID
# To database: Protein-protein interaction databases/STRING)
path_to_HVIDB_VACV_WR_PPIs = (
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_"
    "with_nucleolus_neg_instances/VACV_WR_pos_and_nucleolus_prots_neg_"
    "PPI_instances.tsv"
)

HVIDB_VACV_WR_PPIs_df = pd.read_csv(
    path_to_HVIDB_VACV_WR_PPIs,
    sep="\t"
)

HVIDB_human_prot_IDs = np.unique(HVIDB_VACV_WR_PPIs_df["Human_prot"])

# Print the UniProt accessions one at a time in order to copy and paste
# them into the ID conversion service
for human_prot_ID in HVIDB_human_prot_IDs:
    print(human_prot_ID)

A0A075B749
A0A0D9SG04
A0A0J9YX62
A0A0U1RRM6
A0A1B0GTL5
A0A2R8Y5A3
A0A3B3IRW5
A0A3B3IS91
A0A3B3ISQ4
A0N0N7
A0N0Q3
A3KPC7
A4D1E9
A4FTV9
A6NFX8
A6NHQ2
A6NNZ2
A7MCY6
A8ASI8
A8MPP1
A8MUS3
B2R4P9
B2R4R0
B2R4S9
B2RDW1
B2ZZ89
B4DJ51
B4DLJ1
B8ZZN6
C9JQJ2
D9YZV4
D9ZGF2
E5KTA5
E9KL37
E9PDI4
F4ZW62
F8VVA7
F8VXC8
F8VZQ9
F8WBV6
G3V5R9
G5E9I4
H3BSR6
I0J062
J3QK89
K7EQ78
K7ERV3
O00151
O00159
O00206
O00327
O00339
O00391
O00410
O00444
O00459
O00488
O00499
O00515
O00541
O00566
O00567
O00571
O00585
O14578
O14654
O14746
O14920
O14933
O14980
O15020
O15021
O15037
O15055
O15111
O15131
O15160
O15182
O15213
O15226
O15265
O15381
O15444
O15446
O43143
O43159
O43187
O43390
O43521
O43681
O43815
O43818
O43823
O60287
O60506
O60729
O60814
O60832
O60936
O75081
O75113
O75151
O75152
O75190
O75312
O75376
O75398
O75607
O75618
O75665
O75683
O75691
O75817
O75818
O75934
O76021
O94763
O94818
O95059
O95400
O95453
O95478
O95551
O95568
O95602
O95625
O95707
O95793
O96004
O96028
P00451
P01106
P01579
P01903
P02511
P02775
P02776
P0395

In [4]:
# The ID conversion service suceeded for 737 of 800 UniProt accessions
# For the ones for which conversion failed, it is tried to identify the
# STRING ID manually
# The 63 UniProt accessions for which conversion failed are as follows:
# D9ZGF2: 9606.ENSP00000295550
# F8WBV6: 9606.ENSP00000387187
# Q5VTE0: actually has no STRING ID
# B2RDW1: 9606.ENSP00000272317
# P61578: actually has no STRING ID
# P61573: actually has no STRING ID
# F8VZQ9: 9606.ENSP00000337632
# E9KL37: 9606.ENSP00000344314
# P61572: actually has no STRING ID
# Q96KK5: 9606.ENSP00000366679
# E9PDI4: 9606.ENSP00000375829
# G3V5R9: 9606.ENSP00000414982
# V9GZ56: 9606.ENSP00000469468
# A3KPC7: 9606.ENSP00000366679
# B2R4R0: 9606.ENSP00000244537, 9606.ENSP00000347168, 9606.ENSP00000366974, 9606.ENSP00000367034, 9606.ENSP00000443017, 9606.ENSP00000462355, 9606.ENSP00000462667, 9606.ENSP00000479106, 9606.ENSP00000479461, 9606.ENSP00000479794, 9606.ENSP00000480960, 9606.ENSP00000481486, 9606.ENSP00000484789, 9606.ENSP00000489236
# K7ERV3: 9606.ENSP00000468425
# F8VVA7: 9606.ENSP00000449270
# F8VXC8: 9606.ENSP00000449396
# G5E9I4: 9606.ENSP00000396052
# A8MUS3: 9606.ENSP00000389103
# B8ZZN6: 9606.ENSP00000376077
# P61571: actually has no STRING ID
# A8MPP1: actually has no STRING ID
# A0A3B3IRW5: 9606.ENSP00000484803
# A0A0U1RRM6: 9606.ENSP00000355809
# P0DI83: 9606.ENSP00000413156
# B2R4P9: 9606.ENSP00000254810, 9606.ENSP00000355780
# P61574: actually has no STRING ID
# A0N0Q3: 9606.ENSP00000354416
# P0DW81: actually has no STRING ID
# A8ASI8: 9606.ENSP00000318822
# J3QK89: 9606.ENSP00000439856
# P0DW28: actually has no STRING ID
# Q6ZN40: 9606.ENSP00000351022
# B4DJ51: 9606.ENSP00000291295, 9606.ENSP00000499717, 9606.ENSP00000499797
# E5KTA5: 9606.ENSP00000242576
# F4ZW62: 9606.ENSP00000355011
# A0A0D9SG04: 9606.ENSP00000487041
# A0A3B3IS91: 9606.ENSP00000399851
# B2ZZ89: 9606.ENSP00000349259
# I0J062: 9606.ENSP00000479258
# A0A2R8Y5A3: 9606.ENSP00000495360
# P61579: actually has no STRING ID
# A4FTV9: 9606.ENSP00000351589, 9606.ENSP00000352119, 9606.ENSP00000352627, 9606.ENSP00000482431, 9606.ENSP00000482538
# P43631: actually has no STRING ID
# Q8N726: 9606.ENSP00000418915
# A6NFX8: 9606.ENSP00000419628
# H3BSR6: 9606.ENSP00000006053
# Q548T7: 9606.ENSP00000484803, 9606.ENSP00000497585
# A0A075B749: 9606.ENSP00000358363
# A0A3B3ISQ4: 9606.ENSP00000435585
# B4DLJ1: 9606.ENSP00000222305
# P35325: 9606.ENSP00000357744
# Q9UN81: actually has no STRING ID
# A0N0N7: 9606.ENSP00000006053
# B2R4S9: 9606.ENSP00000321744, 9606.ENSP00000348924, 9606.ENSP00000366962, 9606.ENSP00000445633, 9606.ENSP00000489317
# Q69383: actually has no STRING ID
# A0A1B0GTL5: 9606.ENSP00000258098
# K7EQ78: 9606.ENSP00000225328
# A0A0J9YX62: 9606.ENSP00000262177
# D9YZV4: 9606.ENSP00000351022
# P61576: actually has no STRING ID
# P61575: actually has no STRING ID
# These manually found STRING IDs are added to the ID conversion file

In [5]:
# Also verify that each UniProt accession in the screen TSV file is
# correct
# To this end, a file has been downloaded from the STRING database
# assigning STRING IDs to UniProt accessions (in that file, the UniProt
# accession is denoted by "UniProt_AC")
# The abovementioned file from STRING is a TSV file
path_to_STRING_ID_assignments_file = "9606.protein.aliases.v12.0.txt"
STRING_ID_assignments_df = pd.read_csv(
    path_to_STRING_ID_assignments_file,
    sep="\t"
)

In [6]:
# Also load the latest version of the screen TSV file
path_to_VACV_screen_report = (
    "/Users/jacobanter/Documents/Code/VACV_screen/VACV_Report_only_"
    "valid_single_pooled_siRNA_and_esiRNA_single_entries_only_without_"
    "Qiagen_mismatches.tsv"
)

VACV_screen_df = pd.read_csv(
    path_to_VACV_screen_report,
    sep="\t"
)

  VACV_screen_df = pd.read_csv(


In [7]:
# In the VACV screen TSV file, not all STRING IDs are assigned to
# UniProt accessions and not all UniProt accessions of protein-coding
# genes are assigned to STRING IDs
# In a first step, address STRING IDs lacking a UniProt accession
STRING_IDs_without_uniprot_ac = VACV_screen_df.loc[
    (VACV_screen_df["ID_String"] != "Not available")
    &
    (VACV_screen_df["UniProt_IDs"] == "Not available"),
    "ID_String"
].unique()

print(
    f"There are {len(STRING_IDs_without_uniprot_ac)} STRING IDs lacking "
    "a UniProt accession."
)

# For each of those STRING IDs lacking a UniProt accession, the UniProt
# accession is looked up
for string_id in STRING_IDs_without_uniprot_ac:
    uniprot_ac = STRING_ID_assignments_df.loc[
        (STRING_ID_assignments_df["#string_protein_id"] == string_id)
        &
        (STRING_ID_assignments_df["source"] == "UniProt_AC"),
        "alias"
    ]
    print(uniprot_ac)
    print()

There are 4 STRING IDs lacking a UniProt accession.
2866810    A0AV79
2866811    A0AV81
2866925    Q8N5S3
2866926    Q8N7V4
Name: alias, dtype: object

3508999    H3BNL1
Name: alias, dtype: object

3516707    A6NHZ3
3516735    B4DQM8
3516844    Q6DHV7
Name: alias, dtype: object

2645914    A8K9W3
2645938    D6W503
2645979    Q96LR7
Name: alias, dtype: object



In [8]:
# Some STRING IDs are mapped to multiple UniProt accessions in the
# STRING file
# Unfortunately, the primary accession had to be found out manually
lacking_uniprot_acs = [
    "Q8N5S3",
    "H3BNL1",
    "Q6DHV7",
    "Q96LR7"
]

for string_id, lacking_uniprot_ac in zip(
    STRING_IDs_without_uniprot_ac, lacking_uniprot_acs
):
    VACV_screen_df.loc[
        VACV_screen_df["ID_String"] == string_id,
        "UniProt_IDs"
    ] = lacking_uniprot_ac

In [9]:
# Now, UniProt accessions without an associated STRING ID are addressed
# As the screen TSV file also encompasses ncRNA, selection must be
# performed in conjunction with the `gene_type` column
uniprot_acs_without_string_id = VACV_screen_df.loc[
    (VACV_screen_df["ID_String"] == "Not available")
    &
    (VACV_screen_df["Gene_type"] == "protein-coding"),
    "UniProt_IDs"
].unique()

assert "Not available" not in uniprot_acs_without_string_id, (
    "Filtering of UniProt accessions has not been done properly!"
)

print(
    f"{len(uniprot_acs_without_string_id)} UniProt accessions do not "
    "have an associated STRING ID."
)

281 UniProt accessions do not have an associated STRING ID.


In [10]:
uniprot_acs_without_string_id_list = uniprot_acs_without_string_id.tolist()
uniprot_acs_with_available_string_ids = []

for uniprot_ac_entry in uniprot_acs_without_string_id_list:
    # Bear in mind that some genes give rise to multiple isoforms
    uniprot_acs = uniprot_ac_entry.split(";")
    current_string_id_list = []

    for uniprot_ac in uniprot_acs:
        # Extract the corresponding STRING ID
        string_id = STRING_ID_assignments_df.loc[
            (STRING_ID_assignments_df["source"] == "UniProt_AC")
            &
            (STRING_ID_assignments_df["alias"] == uniprot_ac),
            "#string_protein_id"
        ]
    
        if len(string_id) == 0:
            continue

        string_id = string_id.iloc[0]

        current_string_id_list.append(string_id)
    
    if len(current_string_id_list) == 0:
        continue
    
    string_id = ";".join(current_string_id_list)
    
    VACV_screen_df.loc[
        VACV_screen_df["UniProt_IDs"] == uniprot_ac_entry,
        "ID_String"
    ] = string_id

    uniprot_acs_with_available_string_ids.append(uniprot_ac_entry)

uniprot_acs_without_string_id_list = list(
    set(uniprot_acs_without_string_id_list)
    -
    set(uniprot_acs_with_available_string_ids)
)

print(
    f"There are still {len(uniprot_acs_without_string_id_list)} "
    "UniProt accessions without an associated STRING ID."
)

There are still 192 UniProt accessions without an associated STRING ID.


In [11]:
# For some UniProt accessions, the term "Ensembl_UniProt" is used in the
# STRING file
uniprot_acs_with_available_string_ids = []

for uniprot_ac_entry in uniprot_acs_without_string_id_list:
    uniprot_acs = uniprot_ac_entry.split(";")

    current_string_id_list = []

    for uniprot_ac in uniprot_acs:
        string_id = STRING_ID_assignments_df.loc[
            (STRING_ID_assignments_df["source"] == "Ensembl_UniProt")
            &
            (STRING_ID_assignments_df["alias"] == uniprot_ac),
            "#string_protein_id"
        ]

        if len(string_id) == 0:
            continue

        string_id = string_id.iloc[0]

        current_string_id_list.append(string_id)
    
    if len(current_string_id_list) == 0:
        continue
    
    string_id = ";".join(current_string_id_list)

    VACV_screen_df.loc[
        VACV_screen_df["UniProt_IDs"] == uniprot_ac_entry,
        "ID_String"
    ] = string_id

    uniprot_acs_with_available_string_ids.append(uniprot_ac_entry)

uniprot_acs_without_string_id_list = list(
    set(uniprot_acs_without_string_id_list)
    -
    set(uniprot_acs_with_available_string_ids)
)

print(
    f"There are still {len(uniprot_acs_without_string_id_list)} "
    "UniProt accessions without an associated STRING ID."
)

There are still 88 UniProt accessions without an associated STRING ID.


In [12]:
# Determine whether there are still UniProt accessions with STRING IDs
for uniprot_ac in uniprot_acs_without_string_id_list:
    if uniprot_ac in STRING_ID_assignments_df["alias"].values:
        print(uniprot_ac)

Q5TI25
Q9H7T3
Q6L8H1
A6NER0


In [13]:
# There indeed still are UniProt accessions with STRING IDs
# Upon closer scrutiny, it emerges that the remaining four UniProt
# accessions are given by the "Ensembl_HGNC_uniprot_ids" source in the
# STRING file
uniprot_acs_with_available_string_ids = []

for uniprot_ac_entry in uniprot_acs_without_string_id_list:
    uniprot_acs = uniprot_ac_entry.split(";")

    current_string_id_list = []

    for uniprot_ac in uniprot_acs:
        string_id = STRING_ID_assignments_df.loc[
            (STRING_ID_assignments_df["source"]
            ==
            "Ensembl_HGNC_uniprot_ids")
            &
            (STRING_ID_assignments_df["alias"] == uniprot_ac),
            "#string_protein_id"
        ]

        if len(string_id) == 0:
            continue

        string_id = string_id.iloc[0]

        current_string_id_list.append(string_id)
    
    if len(current_string_id_list) == 0:
        continue

    string_id = ";".join(current_string_id_list)

    VACV_screen_df.loc[
        VACV_screen_df["UniProt_IDs"] == uniprot_ac_entry,
        "ID_String"
    ] = string_id

    uniprot_acs_with_available_string_ids.append(uniprot_ac_entry)

uniprot_acs_without_string_id_list = list(
    set(uniprot_acs_without_string_id_list)
    -
    set(uniprot_acs_with_available_string_ids)
)

print(
    f"There are still {len(uniprot_acs_without_string_id_list)} "
    "UniProt accessions without an associated STRING ID."
)

There are still 84 UniProt accessions without an associated STRING ID.


In [14]:
assert not any([
    uniprot_ac in STRING_ID_assignments_df["alias"].values
    for uniprot_ac in uniprot_acs_without_string_id_list
]), "There are still UniProt accessions with STRING IDs!"

In [15]:
# Finally, overwrite the screen TSV file with the updated DataFrame
VACV_screen_df.to_csv(
    path_to_VACV_screen_report,
    sep="\t",
    header=True,
    index=False
)

In [60]:
# Re-load the VACV screen report TSV file
VACV_screen_df = pd.read_csv(
    path_to_VACV_screen_report,
    sep="\t"
)

  VACV_screen_df = pd.read_csv(


In [61]:
# Extract the STRING IDs and in a first step, check whether they indeed
# are deposited in the STRING database
# Bear in mind that some STRING ID entries are composite entries
VACV_screen_STRING_IDs = np.unique([
    string_id
    for string_id_entry in VACV_screen_df["ID_String"]
    for string_id in string_id_entry.split(";")
]).tolist()

if "Not available" in VACV_screen_STRING_IDs:
    VACV_screen_STRING_IDs.pop(
        VACV_screen_STRING_IDs.index("Not available")
    )

assert "Not available" not in VACV_screen_STRING_IDs, (
    "\"Not available\" has not been removed from the list!"
)

assert not any([
    ";" in string_id for string_id in VACV_screen_STRING_IDs
]), "There still are composite STRING ID entries!"

In [62]:
STRING_IDs_in_STRING_DB = STRING_ID_assignments_df[
    "#string_protein_id"
].unique().tolist()

n_screen_prots_in_STRING_DB = sum([
    screen_STRING_ID in STRING_IDs_in_STRING_DB
    for screen_STRING_ID in VACV_screen_STRING_IDs
])

print(
    f"{n_screen_prots_in_STRING_DB:,} out of {len(VACV_screen_STRING_IDs):,} "
    "STRING IDs in the VACV report are indeed part of the STRING database."
)

18,464 out of 18,464 STRING IDs in the VACV report are indeed part of the STRING database.


In [63]:
# A dictionary look is faster than DataFrame filtering
# Therefore, the DataFrame of the VACV screen is converted into a
# dictionary mapping the UniProt accessions to their respective STRING
# IDs (one STRING ID can be mapped to multiple UniProt accessions, they
# get lost during conversion to a dictionary; therefore, it is more
# convenient to check for the equality of STRING IDs rather than UniProt
# accessions)
screen_dict = VACV_screen_df.drop_duplicates(
    "UniProt_IDs"
).set_index(
    "UniProt_IDs"
)["ID_String"].to_dict()

# Perform sanity checks ensuring that no STRING IDs have been lost in
# conversion to dictionary
screen_dict_vals_separated = ";".join(list(screen_dict.values())).split(";")

assert all([
    screen_STRING_ID in screen_dict_vals_separated
    for screen_STRING_ID_entry in VACV_screen_STRING_IDs
    for screen_STRING_ID in screen_STRING_ID_entry.split(";")
]), "Some STRING IDs have been lost!"

In [64]:
# The same is done with the TSV file from the STRING database
string_dict = STRING_ID_assignments_df[
    (STRING_ID_assignments_df["source"] == "UniProt_AC")
    |
    (STRING_ID_assignments_df["source"] == "Ensembl_UniProt")
    |
    (STRING_ID_assignments_df["source"] == "Ensembl_HGNC_uniprot_ids")
].drop_duplicates(
    "alias"
).set_index("alias")["#string_protein_id"].to_dict()

assert all([
    screen_STRING_ID in string_dict.values()
    for screen_STRING_ID_entry in VACV_screen_STRING_IDs
    for screen_STRING_ID in screen_STRING_ID_entry.split(";")
]), "Some STRING IDs have been lost!"

In [65]:
# The verification of correct assignments is done in two steps
# In the first steps, only single, i.e. non-composite UniProt accessions
# are dealt with
non_composite_VACV_screen_uniprot_acs = VACV_screen_df["UniProt_IDs"][
    (~VACV_screen_df["UniProt_IDs"].str.contains(";"))
    &
    (VACV_screen_df["ID_String"] != "Not available")
].unique().tolist()

correct_assignments_non_composite = [
    screen_dict[uniprot_ac] == string_dict[uniprot_ac]
    for uniprot_ac in non_composite_VACV_screen_uniprot_acs
]

print(
    f"{len(correct_assignments_non_composite):,} out of "
    f"{len(non_composite_VACV_screen_uniprot_acs):,} non-composite "
    "UniProt accessions have correctly been\nassigned to their "
    "respective STRING ID."
)

18,427 out of 18,427 non-composite UniProt accessions have correctly been
assigned to their respective STRING ID.


In [66]:
# For the sake of simplicity, the column `ID_String` is simplified by
# summarising composite entries consisting of one and the same STRING ID
composite_string_ids_same_id = VACV_screen_df["ID_String"][
    (VACV_screen_df["ID_String"].str.contains(";"))
    &
    [
        composite_entry.split(";").count(composite_entry.split(";")[0])
        ==
        len(composite_entry.split(";"))
        for composite_entry in VACV_screen_df["ID_String"].tolist()
    ]
].unique().tolist()

for composite_string_id in composite_string_ids_same_id:
    single_string_id = composite_string_id.split(";")[0]
    VACV_screen_df.loc[
        VACV_screen_df["ID_String"] == composite_string_id,
        "ID_String"
    ] = single_string_id

# Overwrite the VACV screen TSV file with the updated DataFrame
VACV_screen_df.to_csv(
    path_to_VACV_screen_report,
    sep="\t",
    header=True,
    index=False
)

  (VACV_screen_df["ID_String"].str.contains(";"))


In [7]:
# Re-load the VACV screen report TSV file
VACV_screen_df = pd.read_csv(
    path_to_VACV_screen_report,
    sep="\t"
)

  VACV_screen_df = pd.read_csv(


In [71]:
# The screen dictionary has to be re-generated
screen_dict = VACV_screen_df.drop_duplicates(
    "UniProt_IDs"
).set_index(
    "UniProt_IDs"
)["ID_String"].to_dict()

In [90]:
# As a next step, address composite UniProt accessions
# For them, the verification of correct assignments is a bit intricate
# as in some cases, one STRING ID is present for each UniProt accession,
# whereas in other cases, only on STRING ID is present
# If the amount of STRING IDs equals the amount of UniProt accessions,
# there must be a correct assignment for each individual UniProt
# accession/STRING ID pair
# However, if only one STRING ID is present for multiple UniProt
# accessions, only one UniProt accession must correspond to that STRING
# ID
composite_VACV_screen_uniprot_acs = VACV_screen_df["UniProt_IDs"][
    (VACV_screen_df["UniProt_IDs"].str.contains(";"))
    &
    (VACV_screen_df["ID_String"] != "Not available")
].unique().tolist()

correct_assignments_composite = []

for composite_uniprot_ac in composite_VACV_screen_uniprot_acs:
    n_uniprot_acs = len(composite_uniprot_ac.split(";"))
    n_string_ids = len(screen_dict[composite_uniprot_ac].split(";"))

    if n_string_ids == 1:
        uniprot_acs = composite_uniprot_ac.split(";")
        n_acs = len(uniprot_acs)
        screen_string_id = screen_dict[composite_uniprot_ac]
        
        for i, uniprot_ac in enumerate(uniprot_acs):
            try:
                string_string_id = string_dict[uniprot_ac]
            except KeyError:
                continue
            if string_string_id == screen_string_id:
                correct_assignments_composite.append(True)
                break
            elif i == (n_acs - 1):
                correct_assignments_composite.append(False)
    else:
        # The amount of STRING IDs equals the amount of UniProt IDs
        correct_assignments_composite.append(True)
        # uniprot_acs = composite_uniprot_ac.split(";")
        # screen_string_ids = screen_dict[composite_uniprot_ac].split(";")
        
        # for uniprot_ac, screen_string_id in zip(
        #     uniprot_acs, screen_string_ids
        # ):
        #     # Look up the STRING ID that is assigned to the UniProt
        #     # accession by STRING
        #     string_string_id = string_dict[uniprot_ac]
        #     print(string_string_id)

print(
    f"{len(correct_assignments_composite)} out of "
    f"{len(composite_VACV_screen_uniprot_acs)} composite UniProt "
    "accessions have correctly been assigned to\ntheir respective "
    "STRING ID."
)

44 out of 44 composite UniProt accessions have correctly been assigned to
their respective STRING ID.


In [6]:
# The interaction matrix is supposed to be generated for the Qiagen
# subset of the VACV screen
# Therefore, as a first step, the VACV screen is loaded into a Pandas
# DataFrame and the unique gene names are determined

path_to_VACV_screen_report = (
    "../../VACV_Report_only_valid_single_pooled_siRNA_and_esiRNA_"
    "single_entries_only_without_Qiagen_mismatches.tsv"
)

VACV_screen_df = pd.read_csv(
    path_to_VACV_screen_report,
    sep="\t"
)

# Filter out the Qiagen subset
Qiagen_subset_VACV_screen_df = VACV_screen_df.loc[
    VACV_screen_df["Manufacturer"] == "Qiagen"
]

  VACV_screen_df = pd.read_csv(


In [7]:
# Not each and every gene name is mapped to a STRING ID for reasons
# elaborated on elsewhere (e.g. the fact of encoding merely ncRNA or a
# pseudogene)
# Thus, only genes with an associated STRING ID are filtered out from
# the Qiagen subset
Qiagen_subset_with_string_id_df = Qiagen_subset_VACV_screen_df.loc[
    Qiagen_subset_VACV_screen_df["ID_String"] != "Not available"
]

total_n_Qiagen_genes = len(
    np.unique(Qiagen_subset_VACV_screen_df["Name"])
)
n_Qiagen_genes_not_mapped = total_n_Qiagen_genes - len(
    np.unique(Qiagen_subset_with_string_id_df["Name"])
)

print(
    f"{n_Qiagen_genes_not_mapped:,} out of {total_n_Qiagen_genes:,} "
    "gene names could not be mapped to a STRING ID in the case of the "
    "Qiagen subset of the VACV screen."
)

2,219 out of 20,213 gene names could not be mapped to a STRING ID in the case of the Qiagen subset of the VACV screen.


In [8]:
# `np.unique` inherently performs sorting of the unique values
Qiagen_subset_gene_names, indices = np.unique(
    Qiagen_subset_with_string_id_df["Name"],
    return_index=True
)

Qiagen_subset_string_ids = Qiagen_subset_with_string_id_df[
    "ID_String"
].to_numpy()[indices]

# Verify that the unique gene names as well as the STRING IDs have the
# correct ordering
correct_alignment_list = []

for gene_name, string_id in zip(
    Qiagen_subset_gene_names, Qiagen_subset_string_ids
):
    current_gene_string_ids =  Qiagen_subset_with_string_id_df.loc[
        Qiagen_subset_with_string_id_df["Name"] == gene_name,
        "ID_String"
    ]

    # The filtered Pandas Series is supposed to encompass only one
    # STRING ID
    current_gene_string_ids = np.unique(current_gene_string_ids)

    assert len(current_gene_string_ids) == 1, (
        f"More than one STRING ID has been assigned to gene {gene_name}!"
    )

    aligned_string_id = current_gene_string_ids[0]

    if aligned_string_id == string_id:
        correct_alignment_list.append(True)

assert all(correct_alignment_list), (
    "Not all gene names are aligned with their correct STRING ID!"
)

In [None]:
# Now, determine how many HVIDB human proteins are also comprised in the
# Qiagen subset
# To this end, it must be noted that the TSV
n_HVIDB_human_prots_in_Qiagen_subset = sum([
    HVIDB_prot in Qiagen_subset_gene_names
    for HVIDB_prot in HVIDB_human_prot_IDs
])

print(
    f"{n_HVIDB_human_prots_in_Qiagen_subset} out of "
    f"{len(HVIDB_human_prot_IDs)} HVIDB human proteins are covered by "
    "the Qiagen subset of the VACV screen."
)

0 out of 800 HVIDB human proteins are covered by the Qiagen of the VACV screen.


In [18]:
print("UBE3A" in Qiagen_subset_gene_names)

True


In [5]:
# Now that both the unique gene names and their corresponding STRING IDs
# have been retrieved in the correct ordering, the actual interaction
# matrix is built
# To this end, the interaction data deposited in STRING has to be loaded
path_to_string_interaction_data = "9606.protein.links.v12.0.txt"

# Despite the file being a text file, it can be loaded into a Pandas
# DataFrame as it exhibits a tabular structure with a space as delimiter
string_interaction_data_df = pd.read_csv(
    path_to_string_interaction_data,
    sep=" "
)

In [16]:
n_Qiagen_genes_with_string_id = len(Qiagen_subset_gene_names)

interaction_matrix = np.zeros(
    shape=(n_Qiagen_genes_with_string_id, n_Qiagen_genes_with_string_id)
)

# Iterate over the DataFrame with PPI information and populate the
# interaction matrix
for _, row in string_interaction_data_df.iterrows():
    # Bear in mind that STRING uses its STRING IDs to list PPI pairs,
    # not the official gene symbols!
    int_partner_1 = row["protein1"]
    int_partner_2 = row["protein2"]

    if (
        (int_partner_1 in Qiagen_subset_string_ids)
        and
        (int_partner_2 in Qiagen_subset_string_ids)
    ):
        # Determine the current proteins' positions in the sorted array
        # of unique STRING IDs
        # Bear in mind that `np.nonzero()` returns a tuple of arrays
        # with the individual arrays harbouring the indices of elements
        # that are non-zero
        # Thus, the returned object must be indexed twice
        idx_1 = np.nonzero(Qiagen_subset_string_ids == int_partner_1)[0][0]
        idx_2 = np.nonzero(Qiagen_subset_string_ids == int_partner_2)[0][0]
        
        interaction_matrix[idx_1, idx_2] = 1
        interaction_matrix[idx_2, idx_1] = 1
    else:
        continue

In [28]:
# As the iteration over the file comprising the STRING PPI information
# took more than two hours, it is advisable to pickle, i.e. save the
# interaction matrix to a file
import pickle

# Bear in mind that in the context of working with files, the `with`
# context manager is preferred as it automatically takes care of closing
# files, even in case of errors/exceptions
with open("VACV_screen_Qiagen_subset_interaction_matrix.pkl", "wb") as f:
    # The interaction matrix is pickled along with the gene names and
    # STRING IDs
    pickle.dump(
        (
            Qiagen_subset_gene_names,
            Qiagen_subset_string_ids,
            interaction_matrix
        ),
        f
    )

In [None]:
# Load the pickled interaction matrix
import pickle

path_to_interaction_matrix = "VACV_screen_Qiagen_subset_interaction_"\
    "matrix.pkl"

with open(path_to_interaction_matrix, "rb") as f:
    gene_names, string_ids, interaction_matrix = pickle.load(f)

In [8]:
# Repeat the procedure for the entire VACV screen
# Filter out genes with an associated STRING ID
VACV_screen_with_string_id_df = VACV_screen_df.loc[
    VACV_screen_df["ID_String"] != "Not available"
]

total_n_VACV_screen_genes = len(np.unique(VACV_screen_df["Name"]))

n_VACV_screen_genes_not_mapped = total_n_VACV_screen_genes - len(
    np.unique(VACV_screen_with_string_id_df["Name"])
)

print(
    f"{n_VACV_screen_genes_not_mapped:,} out of "
    f"{total_n_VACV_screen_genes:,} gene names could not be mapped to "
    "a STRING ID\nin the case of the entire VACV WR screen."
)

2,100 out of 20,653 gene names could not be mapped to a STRING ID
in the case of the entire VACV WR screen.


In [9]:
# `np.unique` inherently performs sorting of the unique values
entire_screen_gene_names, indices = np.unique(
    VACV_screen_with_string_id_df["Name"],
    return_index=True
)

entire_screen_string_ids = VACV_screen_with_string_id_df[
    "ID_String"
].to_numpy()[indices]

In [10]:
# Account for the fact that some STRING ID entries are composite
# entries, i.e. comprise multiple entries separated by semicolons
# In detail, the following is done: Whenever a composite entry is
# encountered, it is split into the individual STRING IDs
# Simultaneously, the corresponding gene name is repeated as many times
# as there are individual STRING IDs in the composite entry
entire_screen_gene_names_list = entire_screen_gene_names.tolist()
entire_screen_string_ids_list = entire_screen_string_ids.tolist()

# Changing an iterable while iterating over it is not advisable
# Thus, a while loop is employed in conjunction with a loop break so
# that iterable counting starts from the beginning after each gene name
# insertion
while any([
    ";" in string_id_entry
    for string_id_entry in entire_screen_string_ids_list
]):
    for string_id_entry, gene_name in zip(
        entire_screen_string_ids_list, entire_screen_gene_names_list
    ):
        if ";" in string_id_entry:
            string_id_entry_indx = entire_screen_string_ids_list.index(
                string_id_entry
            )
            
            indiv_string_ids = string_id_entry.split(";")

            # First, replace the composite STRING ID with the first
            # individual STRING ID
            entire_screen_string_ids_list[
                string_id_entry_indx
            ] = indiv_string_ids[0]

            # Then, insert the remaining individual STRING IDs at the
            # following positions
            # Don't forget to simultaneously add gene name entries
            insertion_index = string_id_entry_indx
            for indiv_string_id in indiv_string_ids[1:]:
                insertion_index += 1

                entire_screen_string_ids_list.insert(
                    insertion_index, indiv_string_id
                )

                entire_screen_gene_names_list.insert(
                    insertion_index, gene_name
                )
            break

In [12]:
# Perform a couple of sanity checks to ensure the implementation is as
# intended

# The STRING ID and the gene name list should still have the same length
assert (
    len(entire_screen_string_ids_list)
    ==
    len(entire_screen_gene_names_list)
), "The STRING ID list and the gene name list don't have the same length!"

# Apart from that, verify that the mapping between STRING IDs and gene
# names is still correct
# This is done in two steps, the first of which involves non-composite
# STRING IDs
mapping_dict = {
    gene_name: string_id
    for gene_name, string_id in zip(
        entire_screen_gene_names, entire_screen_string_ids
    )
}

correct_mapping = []

for gene_name, string_id in zip(
    entire_screen_gene_names_list, entire_screen_string_ids_list
):
    correct_string_id = mapping_dict[gene_name]
    if ";" in correct_string_id:
        correct_mapping.append(
            any([
                string_id == correct_ind_string_id
                for correct_ind_string_id in correct_string_id.split(";")
            ])
        )
    else:
        correct_mapping.append(
            string_id == correct_string_id
        )

assert any(correct_mapping), (
    "Some mappings between gene name and STRING ID are incorrect for "
    "non-composite STRING IDs!"
)

In [13]:
# Determine how many HVIDB human proteins are comprised in the entire
# VACV screen
# Bear in mind that the HVIDB proteins are provided as UniProt
# accessions
# Also bear in mind that some UniProt accession entries are composite
# entries
entire_screen_uniprot_acs = np.unique([
    uniprot_ac
    for uniprot_entry in VACV_screen_df["UniProt_IDs"]
    for uniprot_ac in uniprot_entry.split(";")
])

n_HVIDB_human_prots_in_entire_screen = sum([
    HVIDB_prot in entire_screen_uniprot_acs
    for HVIDB_prot in HVIDB_human_prot_IDs
])

print(
    f"{n_HVIDB_human_prots_in_entire_screen} out of "
    f"{len(HVIDB_human_prot_IDs)} human proteins are covered by the "
    "VACV screen."
)

737 out of 800 human proteins are covered by the VACV screen.


In [15]:
# Investigate for how many of the HVIDB human proteins there are STRING
# IDs
uniprot_acs_for_screen_with_string_ids = np.unique(
    VACV_screen_with_string_id_df["UniProt_IDs"]
)

n_HVIDB_human_prots_with_string_ids = sum([
    HVIDB_prot in uniprot_acs_for_screen_with_string_ids
    for HVIDB_prot in HVIDB_human_prot_IDs
])

print(
    f"{n_HVIDB_human_prots_with_string_ids} out of "
    f"{len(HVIDB_human_prot_IDs)} human proteins have a STRING ID."
)

730 out of 800 human proteins have a STRING ID.


In [16]:
# Now that both the unique gene names and their corresponding STRING IDs
# have been retrieved in the correct ordering, the actual interaction
# matrix is built
# Additionally, a confidence score matrix is built
# To this end, the interaction data deposited in STRING has to be loaded
path_to_string_interaction_data = "9606.protein.links.v12.0.txt"

# Despite the file being a text file, it can be loaded into a Pandas
# DataFrame as it exhibits a tabular structure with a space as delimiter
string_interaction_data_df = pd.read_csv(
    path_to_string_interaction_data,
    sep=" "
)

In [27]:
n_entire_screen_string_ids = len(entire_screen_gene_names_list)

interaction_matrix = np.zeros(
    shape=(
        n_entire_screen_string_ids,
        n_entire_screen_string_ids
    ),
    dtype=np.uint8
)

confidence_score_matrix = np.zeros(
    shape=(
        n_entire_screen_string_ids,
        n_entire_screen_string_ids
    ),
    dtype=np.float32
)

In [28]:
# In a bid to speed up the population of both matrices, leveraging
# multiprocessing is considered
# While it is beyond question that multiprocessing may speed things up,
# it is decided in favour of addressing the bottlenecks of the previous
# implementation
# In detail, these bottlenecks are the following:
# ...
# Additionally, Numba is employed for just-in-time compilation
from numba import njit, prange

# Precompute the mappings from STRING IDs to matrix indices
string_id_to_index = {
    string_id: i for i, string_id in enumerate(entire_screen_string_ids_list)
}

# Convert the DataFrame to a NumPy array for fast iteration
string_interaction_rows_list = list(string_interaction_data_df.itertuples(index=False, name=None))

def convert_string_ids_to_indices(row, mapping_dict):
    """
    ...
    """
    int_partner_1, int_partner_2, confidence_score = row

    if (int_partner_1 in mapping_dict) and (int_partner_2 in mapping_dict):
        idx_1 = mapping_dict[int_partner_1]
        idx_2 = mapping_dict[int_partner_2]

        return [idx_1, idx_2, confidence_score]

filtered_indices_list = list(map(
    convert_string_ids_to_indices, string_interaction_rows_list,
    [string_id_to_index] * len(string_interaction_rows_list)
))

print(None in filtered_indices_list)

# Bear in mind that `map` always returns something for each element of
# the iterable, i.e. if the function being applied to the iterable does
# not return anything, `None` is returned
# Thus, `None` entries have to be removed
def remove_None(entry):
    if entry == None:
        return False
    else:
        return True

filtered_indices_list = list(filter(remove_None, filtered_indices_list))

print(None in filtered_indices_list)

filtered_indices = np.array(filtered_indices_list, dtype=np.float32)

print(filtered_indices.dtype)

# Finally, populate the two matrices by iterating over the filtered
# STRING interaction data array
# Leveraging Numba requires applying a decorator to a function; this
# function will then be compiled to machine code when it is called
@njit(parallel=True)
def populate_matrices(
    interaction_matrix, confidence_score_matrix, filtered_indices
):
    for i in prange(filtered_indices.shape[0]):
        idx_1 = int(filtered_indices[i, 0])
        idx_2 = int(filtered_indices[i, 1])
        confidence_score = filtered_indices[i, 2]

        interaction_matrix[idx_1, idx_2] = 1
        interaction_matrix[idx_2, idx_1] = 1

        confidence_score_matrix[idx_1, idx_2] = confidence_score
        confidence_score_matrix[idx_2, idx_1] = confidence_score

# Run the optimised function
populate_matrices(
    interaction_matrix, confidence_score_matrix, filtered_indices
)

True
False
float32


In [31]:
# Verify that the two matrices indeed are populated with non-zero values
assert (interaction_matrix > 0).any(), (
    "The interaction matrix (binary matrix) has not been correctly "
    "populated!"
)

assert (confidence_score_matrix > 0).any(), (
    "The confidence score matrix has not been correctly populated!"
)

In [32]:
# As a last step, pickle the two matrices
import pickle

with open("VACV_screen_interaction_matrix.pkl", "wb") as f:
    pickle.dump(
        (
            entire_screen_gene_names_list,
            entire_screen_string_ids_list,
            interaction_matrix
        ),
        f
    )

with open("VACV_screen_confidence_score_matrix.pkl", "wb") as f:
    pickle.dump(
        (
            entire_screen_gene_names_list,
            entire_screen_string_ids_list,
            confidence_score_matrix
        ),
        f
    )