In [1]:
"""
The purpose of this Jupyter notebook is to split the data set combining
both positive and negative PPI instances into 10 equally sized chunks so
as to prepare k-fold cross-validation with k being equal to 10 here.

10 is chosen as value for k since the test set is supposed to encompass
10% of the whole data.
"""

'\nThe purpose of this Jupyter notebook is to split the data set combining\nboth positive and negative PPI instances into 10 equally sized chunks so\nas to prepare k-fold cross-validation with k being equal to 10 here.\n\n10 is chosen as value for k since the test set is supposed to encompass\n10% of the whole data.\n'

In [4]:
import math

import pandas as pd
from sklearn.model_selection import KFold
from biotite.sequence.io import fasta

In [3]:
# Read the tab-separated file holding the positive as well as the
# negative PPI instances into one Pandas DataFrame
pos_and_neg_VACV_WR_PPIs_df = pd.read_csv(
    filepath_or_buffer="VACV_WR_pos_and_nucleolus_prots_neg_PPI_instances.tsv",
    sep="\t",
)

In [15]:
# Carry out the 10-fold split of the combined data set
# `KFold.split()` does not hand back the rows themselves but, for every
# fold, a pair of index arrays (train/validation rows and test rows)
# The DataFrame is therefore indexed positionally via `iloc` in order to
# retrieve the actual data of each subset
# Both subsets of every fold are written as tab-separated files without
# header row and without index column
kf = KFold(n_splits=10, shuffle=True, random_state=0)
fold_index_pairs = kf.split(pos_and_neg_VACV_WR_PPIs_df)

for i, (train_val_rows, test_rows) in enumerate(fold_index_pairs):
    # Pair each subset's row positions with its output file; the
    # train/validation subset is written first, then the test subset
    output_targets = (
        (
            train_val_rows,
            f"data_set_splits/VACV_WR_pos_and_neg_PPIs_train_val_split_{i}.tsv",
        ),
        (
            test_rows,
            f"data_set_splits/VACV_WR_pos_and_neg_PPIs_test_split_{i}.tsv",
        ),
    )

    for row_positions, tsv_path in output_targets:
        pos_and_neg_VACV_WR_PPIs_df.iloc[row_positions].to_csv(
            tsv_path, sep="\t", index=False, header=False
        )

In [7]:
# Unfortunately, SENSE-PPI entails the major drawback of reading in the
# embeddings of all protein sequences listed in the given FASTA file at
# once, thereby potentially causing an OutOfMemory (OOM) error
# In an attempt to obviate such an OOM error, the test data set of each
# and every split is subdivided into `N_CHUNKS` chunks
# For each chunk, a separate FASTA file is generated encompassing
# exclusively the sequences of the proteins for that chunk
# NOTE: `to_csv()` does not create missing directories, i.e. the
# directories `data_set_splits/data_set_split_{i}/` must already exist
N_CHUNKS = 3

# The upper bound has to match the number of splits generated by the
# k-fold split, i.e. 10
for i in range(10):
    # Load the test data set of the current split into a Pandas
    # DataFrame
    # The test split files have been saved without header, which is why
    # they must also be loaded with `header=None`
    # Otherwise, the first PPI of each test set would be interpreted as
    # header and would silently be lost when the chunks are written
    # without header below
    current_test_set = pd.read_csv(
        f"data_set_splits/VACV_WR_pos_and_neg_PPIs_test_split_{i}.tsv",
        sep="\t",
        header=None
    )

    n_PPIs = len(current_test_set)
    # Integer division; the remainder is absorbed by the last chunk
    chunk_size = n_PPIs // N_CHUNKS

    for j in range(N_CHUNKS):
        chunk_start = j * chunk_size
        # All chunks but the last one comprise exactly `chunk_size`
        # rows; the last chunk extends to the end of the test set so
        # that no PPI is dropped when `n_PPIs` is not a multiple of
        # `N_CHUNKS`
        chunk_stop = (j + 1) * chunk_size if j < N_CHUNKS - 1 else None
        current_chunk = current_test_set.iloc[chunk_start:chunk_stop]

        # Save the current chunk to a TSV file
        current_chunk.to_csv(
            (
                f"data_set_splits/data_set_split_{i}/VACV_WR_pos_and_"
                f"neg_PPIs_test_set_split_{i}_chunk_{j}.tsv"
            ),
            sep="\t",
            index=False,
            header=False
        )

In [12]:
# With the test set of every split subdivided into three chunks, a
# dedicated FASTA file is now generated for each chunk

# Read the FASTA file containing the sequences of all VACV WR and human
# proteins
all_VACV_WR_and_human_prots_fasta = fasta.FastaFile.read(
    "human_nucleolus_and_VACV_WR_prot_seqs.fasta"
)

# Outer loop: splits (i.e. the individual test sets)
# Inner loop: chunks of one individual test set
for i in range(10):
    for j in range(3):
        chunk_tsv_path = (
            f"data_set_splits/data_set_split_{i}/VACV_WR_pos_and_"
            f"neg_PPIs_test_set_split_{i}_chunk_{j}.tsv"
        )

        # The chunk files have been written without a header row, so
        # they are read with `header=None`; otherwise, the first line
        # would be taken as header
        # With this option, the columns are labelled with integers in
        # ascending order, i.e. 0, 1 and 2 in this case
        chunk_PPIs_df = pd.read_csv(
            chunk_tsv_path,
            sep="\t",
            header=None
        )

        # Determine the unique UniProt IDs of the human and the VACV WR
        # proteins in one go
        # `pandas.melt()` converts the DataFrame from wide to long
        # format, yielding the two columns "variable" (the original
        # column label) and "value" (the cell content)
        # The third column (label 2) harbours the interaction
        # information and is not of interest, which is why only rows
        # stemming from the columns 0 and 1 are retained before the
        # values are de-duplicated
        melted_PPIs_df = pd.melt(chunk_PPIs_df)
        is_interactor_column = melted_PPIs_df["variable"].isin([0, 1])
        unique_uniprot_IDs = melted_PPIs_df.loc[
            is_interactor_column, "value"
        ].unique()

        # Copy the sequence of every protein occurring in the current
        # chunk from the comprehensive FASTA file into a new one
        chunk_fasta = fasta.FastaFile()

        for uniprot_ID in unique_uniprot_IDs:
            chunk_fasta[uniprot_ID] = all_VACV_WR_and_human_prots_fasta[
                uniprot_ID
            ]

        chunk_fasta_path = (
            f"data_set_splits/data_set_split_{i}/VACV_WR_pos_and_neg_"
            f"PPIs_test_set_prot_seqs_split_{i}_chunk_{j}.fasta"
        )
        chunk_fasta.write(chunk_fasta_path)