In [1]:
"""
The purpose of this Jupyter notebook is to perform a train-validate-test
split for the VACV WR data set. This is done as follows: Hierarchical
clustering is performed for the 43 VACV WR proteins involved in ...
"""

'\nThe purpose of this Jupyter notebook is to perform a train-validate-test\nsplit for the VACV WR data set. This is done as follows: Hierarchical\nclustering is performed for the 43 VACV WR proteins involved in ...\n'

In [2]:
import pickle

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

### Retrieve PPI information

In [3]:
# Read the table of VACV WR-human interactions from the CSV file
path_to_PPIs_csv = "all_HVIDB_VACV_WR_interactions.csv"
all_VACV_WR_PPIs_df = pd.read_csv(path_to_PPIs_csv)

# Every entry of the "Human-virus PPI" column is split at "-"; the first
# piece is taken as the human partner and the second piece as the VACV
# WR partner
# NOTE(review): an identifier that itself contains "-" would be split
# at the wrong position -- confirm that no such identifiers occur
split_interaction_pairs = [
    interaction_pair.split("-")
    for interaction_pair in all_VACV_WR_PPIs_df["Human-virus PPI"]
]
human_int_partners = [pieces[0] for pieces in split_interaction_pairs]
VACV_WR_int_partners = [pieces[1] for pieces in split_interaction_pairs]

# Represent the PPIs as (human protein, VACV WR protein) tuples
ppi_list = list(zip(human_int_partners, VACV_WR_int_partners))

In [4]:
import networkx as nx

# Build an undirected graph with one edge per PPI and split it into its
# connected components
G = nx.Graph()
G.add_edges_from(ppi_list)
components = list(nx.connected_components(G))

# For each component, print its index followed by the amount of PPIs
# whose human partner belongs to the component; a blank line is printed
# after each component
for component_idx, component_prots in enumerate(components):
    n_ppis_in_component = sum(
        1
        for human_prot in human_int_partners
        if human_prot in component_prots
    )
    print(str(component_idx).ljust(5), n_ppis_in_component)
    print()

0     343

1     2

2     1

3     2

4     10

5     1

6     5

7     3

8     15

9     2

10    2

11    1

12    1

13    1

14    1

15    7

16    5

17    3

18    1

19    2

20    1

21    1

22    2



In [5]:
# Amount of PPIs per connected component, listed in component order
# (index 0 to 22) as printed by the preceding cell
component_ppi_counts = [
    343, 2, 1, 2, 10, 1, 5, 3, 15, 2, 2, 1, 1, 1, 1, 7, 5, 3, 1, 2, 1,
    1, 2
]
total_n_ppis = sum(component_ppi_counts)

# Indices of the components that make up each of the three sets
# The largest component (index 0) forms the training set
training_component_idxs = (0,)
validation_component_idxs = (4, 6, 1, 2, 5, 16, 3, 7, 9, 17)
test_component_idxs = (8, 10, 11, 12, 15, 13, 14, 18, 19, 22, 20, 21)

# Every component has to be assigned to exactly one set
assert (
    sorted(
        training_component_idxs
        + validation_component_idxs
        + test_component_idxs
    )
    ==
    list(range(len(component_ppi_counts)))
)

n_training_ppis = sum(
    component_ppi_counts[idx] for idx in training_component_idxs
)
n_validation_ppis = sum(
    component_ppi_counts[idx] for idx in validation_component_idxs
)
n_test_ppis = sum(
    component_ppi_counts[idx] for idx in test_component_idxs
)

assert n_training_ppis + n_validation_ppis + n_test_ppis == total_n_ppis

print(
    f"Total amount of PPIs: {total_n_ppis}"
)
print()
print(
    "The portion of the PPIs assigned to the training set is "
    f"{n_training_ppis/total_n_ppis:.3} "
    f"({n_training_ppis} of {total_n_ppis})."
)

# The sum of the remaining PPIs is 69; thus, an attempt is made to
# assign approximately 35 PPIs to the remaining two sets each
# (validation and test set)
# The total is taken from `total_n_ppis` rather than being hard-coded so
# that all three messages stay consistent with the component counts
print(
    "The portion of the PPIs assigned to the validation set is "
    f"{n_validation_ppis/total_n_ppis:.3} "
    f"({n_validation_ppis} of {total_n_ppis})."
)
print(
    "The portion of the PPIs assigned to the test set is "
    f"{n_test_ppis/total_n_ppis:.3} "
    f"({n_test_ppis} of {total_n_ppis})."
)

Total amount of PPIs: 412

The portion of the PPIs assigned to the training set is 0.833 (343 of 412).
The portion of the PPIs assigned to the validation set is 0.0825 (34 of 412).
The portion of the PPIs assigned to the test set is 0.085 (35 of 412).


In [6]:
# Multiplying every set by the same factor (balance between positive and
# negative PPIs) leaves the proportions of the three sets unchanged
training_factor = 2
val_factor = 2
test_factor = 2

# Scaled set sizes in the order training, validation, test
scaled_set_sizes = (
    343 * training_factor,
    34 * val_factor,
    35 * test_factor,
)
scaled_total = sum(scaled_set_sizes)

for scaled_set_size in scaled_set_sizes:
    print(scaled_set_size / scaled_total)

0.8325242718446602
0.0825242718446602
0.08495145631067962


In [7]:
# When generating twice as many negative PPI instances for each set,
# the target proportions of 70/10/20 are satisfied fairly accurately
# NOTE(review): the set sizes 230, 33 and 66 are not derived anywhere in
# this notebook -- confirm where they originate from
alternative_set_sizes = (230, 33, 66)

for alternative_set_size in alternative_set_sizes:
    print(
        (alternative_set_size * 2)
        /
        (sum(alternative_set_sizes) * 2)
    )

0.6990881458966566
0.10030395136778116
0.2006079027355623


In [8]:
# Total of the three set sizes 230, 33 and 66
print(sum((230, 33, 66)))

329


In [10]:
# Distribute the connected components across the three sets
# Note that each component stems from the PPI graph and therefore holds
# the proteins of both interaction partners (human and VACV WR)
# The largest component (343 PPIs) constitutes the training set
VACV_WR_prots_training_set = [prot for prot in components[0]]

# Validation set: the components listed below encompass the following
# amounts of PPIs (in this order):
# 10, 5, 2, 1, 1, 5, 2, 3, 2, 3
validation_component_indices = (4, 6, 1, 2, 5, 16, 3, 7, 9, 17)
VACV_WR_prots_validation_set = [
    prot
    for component_idx in validation_component_indices
    for prot in components[component_idx]
]

# Test set: the components listed below encompass the following amounts
# of PPIs (in this order):
# 15, 2, 1, 1, 7, 1, 1, 1, 2, 2, 1, 1
test_component_indices = (8, 10, 11, 12, 15, 13, 14, 18, 19, 22, 20, 21)
VACV_WR_prots_test_set = [
    prot
    for component_idx in test_component_indices
    for prot in components[component_idx]
]

### Hierarchical clustering of the 328 human nucleolus proteins

In [12]:
# Hierarchical clustering of the 328 human nucleolus proteins is based
# on their percent identity matrix, which is stored in a pickle file
# together with the list of protein identifiers
path_to_percent_ident_mat_eligible_nucleolus_prots = (
    "percent_identity_matrix_by_clustal_omega_eligible_human_nucleolus_"
    "proteins_max_length_1700_AAs.pkl"
)

# NOTE(review): unpickling executes arbitrary code; only load this file
# if it stems from a trusted source
with open(
    path_to_percent_ident_mat_eligible_nucleolus_prots, "rb"
) as pickle_file:
    unpickled_content = pickle.load(pickle_file)

# The pickled object unpacks into the identifier list and the matrix
nucleolus_identifier_list, percent_ident_mat_nucleolus = unpickled_content

In [13]:
# The percent identity matrix must have 328 entries along both of its
# first two axes
assert all(
    percent_ident_mat_nucleolus.shape[axis] == 328
    for axis in (0, 1)
)

print(percent_ident_mat_nucleolus.shape)

(328, 328)


In [14]:
# Hierarchical clustering operates on distances rather than on
# similarities; the distance is therefore defined as
# 100 - percent identity
nucleolus_distance_matrix = 100 - percent_ident_mat_nucleolus

# The smallest distance must be 0 and the largest distance must be 100
assert np.min(nucleolus_distance_matrix) == 0
assert np.max(nucleolus_distance_matrix) == 100

# Convert the square matrix to the condensed form expected by `linkage`
nucleolus_condensed_dist_mat = squareform(nucleolus_distance_matrix)

# Build the cluster hierarchy with average linkage
Z = linkage(nucleolus_condensed_dist_mat, "average")

# Cut the hierarchy at a distance threshold of 92 to obtain flat
# clusters
cluster_labels = fcluster(Z, 92, "distance")

In [15]:
# Print the distinct cluster labels followed by the size of each cluster
unique_labels, label_counts = np.unique(cluster_labels, return_counts=True)
print(unique_labels)
print()
for label, label_count in zip(unique_labels, label_counts):
    print(str(label).ljust(5), label_count)

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14]

1     3
2     8
3     53
4     48
5     17
6     6
7     11
8     172
9     2
10    4
11    1
12    1
13    1
14    1


In [18]:
# The clusters are distributed manually across the three sets (train,
# validation and test)
# The aim is to mirror the proportions of the VACV WR proteins, i.e. 83%
# for the training set (272 proteins) and 8.5% for the validation and
# test set each (28 proteins)
nucleolus_identifier_arr = np.array(nucleolus_identifier_list)

# Cluster labels making up each set
training_cluster_labels = [8, 3, 4]
validation_cluster_labels = [6, 7, 9, 10, 11, 12, 13, 14]
test_cluster_labels = [1, 2, 5]

# Select the identifiers whose cluster label belongs to the respective
# set
nucleolus_prots_training_set = nucleolus_identifier_arr[
    np.isin(cluster_labels, training_cluster_labels)
]

nucleolus_prots_validation_set = nucleolus_identifier_arr[
    np.isin(cluster_labels, validation_cluster_labels)
]

nucleolus_prots_test_set = nucleolus_identifier_arr[
    np.isin(cluster_labels, test_cluster_labels)
]

In [19]:
# Print the amount of nucleolus proteins per set (training, validation,
# test)
for nucleolus_prots_of_set in (
    nucleolus_prots_training_set,
    nucleolus_prots_validation_set,
    nucleolus_prots_test_set,
):
    print(len(nucleolus_prots_of_set))

273
27
28


### Construction of the PPI data set/the negative PPI instances

In [20]:
# Assemble the positive PPIs of each set (training, validation, test)
# as three parallel columns: human partner, VACV WR partner and label
# A PPI belongs to a set if its VACV WR partner has been assigned to
# that set
human_partner_arr = np.array(human_int_partners)
VACV_WR_partner_arr = np.array(VACV_WR_int_partners)

# Training set
in_training_set = np.isin(VACV_WR_partner_arr, VACV_WR_prots_training_set)
train_set_human_col = human_partner_arr[in_training_set]
train_set_VACV_col = VACV_WR_partner_arr[in_training_set]

assert len(train_set_human_col) == len(train_set_VACV_col)

# All PPIs gathered so far are positive instances (label 1)
train_set_label_col = [1] * len(train_set_human_col)

# Validation set
in_validation_set = np.isin(
    VACV_WR_partner_arr, VACV_WR_prots_validation_set
)
validation_set_human_col = human_partner_arr[in_validation_set]
validation_set_VACV_col = VACV_WR_partner_arr[in_validation_set]

assert len(validation_set_human_col) == len(validation_set_VACV_col)

validation_set_label_col = [1] * len(validation_set_human_col)

# Test set
in_test_set = np.isin(VACV_WR_partner_arr, VACV_WR_prots_test_set)
test_set_human_col = human_partner_arr[in_test_set]
test_set_VACV_col = VACV_WR_partner_arr[in_test_set]

assert len(test_set_human_col) == len(test_set_VACV_col)

test_set_label_col = [1] * len(test_set_human_col)

In [22]:
# The three columns of every set must have the same length, which must
# equal the expected amount of positive PPIs of that set
for human_col, VACV_col, label_col, expected_n_ppis in (
    (train_set_human_col, train_set_VACV_col, train_set_label_col, 343),
    (
        validation_set_human_col,
        validation_set_VACV_col,
        validation_set_label_col,
        34,
    ),
    (test_set_human_col, test_set_VACV_col, test_set_label_col, 35),
):
    assert (
        len(human_col) == len(VACV_col) == len(label_col) == expected_n_ppis
    )

In [25]:
# Determine the VACV WR UniProt accessions for each set (training,
# validation and test)
# The per-set protein lists stem from the connected components of the
# PPI graph and thus contain human as well as VACV WR proteins; only the
# VACV WR proteins are retained here


def _select_VACV_WR_prots(candidate_prots, VACV_WR_accessions):
    """
    Return the VACV WR proteins among the candidate proteins.

    Parameters
    ----------
    candidate_prots: iterable of str
        Proteins assigned to one set (human and VACV WR proteins mixed).
    VACV_WR_accessions: set of str
        All accessions occurring as VACV WR interaction partner.

    Returns
    -------
    list of str
        The candidates that are VACV WR proteins, sorted alphabetically.
        Sorting matters: the candidate lists are built by iterating over
        sets of strings, and that iteration order can differ between
        interpreter sessions (string hash randomization). As the
        returned lists are later passed to `random.choice`, an unsorted
        list would yield different negative PPIs in each session in
        spite of the fixed random seed.
    """
    return sorted(set(candidate_prots) & VACV_WR_accessions)


# A set provides constant-time membership tests, as opposed to the list
all_VACV_WR_accessions = set(VACV_WR_int_partners)

# Training set
VACV_WR_prots_assigned_to_training_set = _select_VACV_WR_prots(
    VACV_WR_prots_training_set, all_VACV_WR_accessions
)

# Validation set
VACV_WR_prots_assigned_to_validation_set = _select_VACV_WR_prots(
    VACV_WR_prots_validation_set, all_VACV_WR_accessions
)

# Test set
VACV_WR_prots_assigned_to_test_set = _select_VACV_WR_prots(
    VACV_WR_prots_test_set, all_VACV_WR_accessions
)

In [26]:
# Report how many VACV WR proteins ended up in each of the three sets
n_VACV_WR_prots_training = len(VACV_WR_prots_assigned_to_training_set)
n_VACV_WR_prots_validation = len(VACV_WR_prots_assigned_to_validation_set)
n_VACV_WR_prots_test = len(VACV_WR_prots_assigned_to_test_set)

print(
    "Amount of VACV WR proteins assigned to the training set: "
    f"{n_VACV_WR_prots_training:,}"
)
print(
    "Amount of VACV WR proteins assigned to the validation set: "
    f"{n_VACV_WR_prots_validation:,}"
)
print(
    "Amount of VACV WR proteins assigned to the test set: "
    f"{n_VACV_WR_prots_test:,}"
)

Amount of VACV WR proteins assigned to the training set: 20
Amount of VACV WR proteins assigned to the validation set: 11
Amount of VACV WR proteins assigned to the test set: 12


In [31]:
import itertools
import random

random.seed(0)
# Address the construction of negative PPI instances
# The two classes (positive and negative PPIs) are supposed to be
# balanced, i.e. the ratio between positive and negative PPIs is
# supposed to be 1:1
# For each set, as many negative PPIs as positive PPIs are generated
# The human partners of the negative PPIs are the nucleolus proteins
# assigned to the set; if more negatives than nucleolus proteins are
# needed, the list is started over from its beginning (e.g. for the
# training set, 273 nucleolus proteins plus the first 70 of them yield
# 343 negatives)
# Each of them is randomly paired with a VACV WR protein assigned to the
# same set


def _add_negative_ppis(
    human_col, VACV_col, label_col, nucleolus_prots, VACV_WR_prots
):
    """
    Return the columns of one set with balanced negative PPIs appended.

    The function does not modify its arguments. It also accepts columns
    that already contain the negatives from a previous execution of this
    cell: only the leading positive rows are kept before the negatives
    are generated again, which makes the cell safe to re-run. (The
    previous implementation called `.tolist()` on the columns and
    extended the label list in place, so a second execution failed.)

    Parameters
    ----------
    human_col, VACV_col: array-like of str
        Human and VACV WR partners; the positive PPIs come first.
    label_col: list of int
        Labels (1 = positive, 0 = negative); the positives come first.
    nucleolus_prots: array-like of str
        Human nucleolus proteins assigned to the set.
    VACV_WR_prots: sequence of str
        VACV WR proteins assigned to the set.

    Returns
    -------
    tuple of three lists
        Human column, VACV WR column and label column, each holding the
        positive PPIs followed by equally many negative PPIs.

    Raises
    ------
    ValueError
        If negatives are required but no nucleolus proteins or no VACV
        WR proteins are available.
    """
    n_positive_ppis = list(label_col).count(1)
    nucleolus_prot_list = np.asarray(nucleolus_prots).tolist()

    if n_positive_ppis > 0 and (
        len(nucleolus_prot_list) == 0 or len(VACV_WR_prots) == 0
    ):
        raise ValueError(
            "Negative PPIs cannot be generated without nucleolus "
            "proteins and VACV WR proteins."
        )

    # Keep only the positive rows (relevant when the cell is re-run)
    human_positives = np.asarray(human_col).tolist()[:n_positive_ppis]
    VACV_positives = np.asarray(VACV_col).tolist()[:n_positive_ppis]

    # Cycle through the nucleolus proteins until enough human partners
    # have been collected
    human_negatives = list(
        itertools.islice(
            itertools.cycle(nucleolus_prot_list), n_positive_ppis
        )
    )
    # NOTE(review): the random pairing does not rule out that the same
    # (human, VACV WR) pair is drawn more than once -- confirm whether
    # duplicated negative instances are acceptable
    VACV_negatives = [
        random.choice(VACV_WR_prots) for _ in range(n_positive_ppis)
    ]

    full_human_col = human_positives + human_negatives
    full_VACV_col = VACV_positives + VACV_negatives
    full_label_col = [1] * n_positive_ppis + [0] * n_positive_ppis

    assert (
        len(full_human_col)
        ==
        len(full_VACV_col)
        ==
        len(full_label_col)
        ==
        n_positive_ppis * 2
    )

    return full_human_col, full_VACV_col, full_label_col


# The sets are processed in the order training, validation, test so that
# the random draws are consumed in a fixed order
# Begin with the training set
(
    train_set_human_col,
    train_set_VACV_col,
    train_set_label_col
) = _add_negative_ppis(
    train_set_human_col,
    train_set_VACV_col,
    train_set_label_col,
    nucleolus_prots_training_set,
    VACV_WR_prots_assigned_to_training_set
)

# Turn to the validation set
(
    validation_set_human_col,
    validation_set_VACV_col,
    validation_set_label_col
) = _add_negative_ppis(
    validation_set_human_col,
    validation_set_VACV_col,
    validation_set_label_col,
    nucleolus_prots_validation_set,
    VACV_WR_prots_assigned_to_validation_set
)

# Finally, deal with the test set
(
    test_set_human_col,
    test_set_VACV_col,
    test_set_label_col
) = _add_negative_ppis(
    test_set_human_col,
    test_set_VACV_col,
    test_set_label_col,
    nucleolus_prots_test_set,
    VACV_WR_prots_assigned_to_test_set
)

In [35]:
# Final sanity check: no protein may occur in more than one of the three
# sets (training, validation and test)
# Collect all UniProt accessions (human and VACV WR) of each set
uniprot_accs_in_train_set = set(
    train_set_human_col + train_set_VACV_col
)
uniprot_accs_in_validation_set = set(
    validation_set_human_col + validation_set_VACV_col
)
uniprot_accs_in_test_set = set(
    test_set_human_col + test_set_VACV_col
)

# The sets have to be pairwise disjoint
assert uniprot_accs_in_train_set.isdisjoint(uniprot_accs_in_validation_set)
assert uniprot_accs_in_train_set.isdisjoint(uniprot_accs_in_test_set)
assert uniprot_accs_in_validation_set.isdisjoint(uniprot_accs_in_test_set)

In [37]:
# Write the PPIs of each set to a TSV file of its own
import os

# Create the output directory unless it exists already
data_set_dir = "data_set_files"
if not os.path.exists(data_set_dir):
    os.makedirs(data_set_dir)


def _write_ppi_set(human_col, VACV_col, label_col, tsv_file_name):
    """
    Build the DataFrame of one set, save it as TSV file and return it.

    The DataFrame has the columns "Human_prot", "VACV_prot" and "label";
    it is written to `tsv_file_name` within `data_set_dir`, tab-separated
    and without the index.
    """
    ppi_set_df = pd.DataFrame(
        data={
            "Human_prot": human_col,
            "VACV_prot": VACV_col,
            "label": label_col
        }
    )
    ppi_set_df.to_csv(
        os.path.join(data_set_dir, tsv_file_name),
        sep="\t",
        index=False
    )
    return ppi_set_df


# Training set
train_set_df = _write_ppi_set(
    train_set_human_col,
    train_set_VACV_col,
    train_set_label_col,
    "bullet-proof_training_set.tsv"
)

# Validation set
validation_set_df = _write_ppi_set(
    validation_set_human_col,
    validation_set_VACV_col,
    validation_set_label_col,
    "bullet-proof_validation_set.tsv"
)

# Test set
test_set_df = _write_ppi_set(
    test_set_human_col,
    test_set_VACV_col,
    test_set_label_col,
    "bullet-proof_test_set.tsv"
)

In [38]:
# Concatenate the three sets (training, validation, test) and save the
# combined data set as a TSV file as well
ppi_set_dfs = [train_set_df, validation_set_df, test_set_df]
entire_ppi_set_df = pd.concat(ppi_set_dfs)

# No rows may get lost or be added by the concatenation
assert len(entire_ppi_set_df) == sum(
    len(ppi_set_df) for ppi_set_df in ppi_set_dfs
)

combined_tsv_path = os.path.join(
    data_set_dir, "entire_bullet-proof_ppi_data_set.tsv"
)
entire_ppi_set_df.to_csv(combined_tsv_path, sep="\t", index=False)