In [1]:
"""
The purpose of this Jupyter notebook is to prepare or rather to adjust
the VACV WR data set from HVIDB to the PPI prediction model KSGPPI. The
necessity to do so stems from the fact that the implementation provided
on GitHub only allows the prediction of one PPI pair at once.

In contrast to many other PPI prediction models taking both FASTA and
TSV files as input, KSGPPI only takes FASTA files as input. To be more
precise, two FASTA files have to be provided each containing one protein
sequence; the two protein sequences form the PPI pair to investigate.
Accordingly, the results file harbours the predicted probability for
only that one PPI pair. As a consequence, post-processing of the results
files is necessary as well.
"""

'\nThe purpose of this Jupyter notebook is to prepare or rather to adjust\nthe VACV WR data set from HVIDB to the PPI prediction model KSGPPI. The\nnecessity to do so stems from the fact that the implementation provided\non GitHub only allows the prediction of one PPI pair at once.\n\nIn contrast to many other PPI prediction models taking both FASTA and\nTSV files as input, KSGPPI only takes FASTA files as input. To be more\nprecise, two FASTA files have to be provided each containing one protein\nsequence; the two protein sequences form the PPI pair to investigate.\nAccordingly, the results file harbours the predicted probability for\nonly that one PPI pair. As a consequence, post-processing of the results\nfiles is necessary as well.\n'

In [2]:
import os

import pandas as pd
from biotite.sequence.io import fasta

In [3]:
# Absolute location of the FASTA file that (judging by its name) bundles
# the human nucleolus and VACV WR protein sequences; the path is spelled
# out one directory level per line for readability
path_to_all_seqs_fasta_file = "/".join([
    "/Users/jacobanter/Documents/Code/VACV_screen",
    "HVIDB_pos_instances_with_nucleolus_neg_instances",
    "human_nucleolus_and_VACV_WR_prot_seqs.fasta",
])

# Parse the file once; the cells below index the resulting object by
# protein identifier in order to retrieve individual sequences
all_seqs_fasta = fasta.FastaFile.read(path_to_all_seqs_fasta_file)

In [12]:
# Directory holding the TSV files of the individual test splits
path_to_data_set_splits = (
    "/Users/jacobanter/Documents/Code/VACV_screen/HVIDB_pos_instances_"
    "with_nucleolus_neg_instances/data_set_splits"
)

# File name template; `{i}` is replaced with the split index
split_file_name = "VACV_WR_pos_and_neg_PPIs_test_split_{i}.tsv"

# 10 different splits are available
for i in range(10):
    # Create a directory for the current split
    dir_path = f"HVIDB_VACV_WR_split_{i}"
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    else:
        print(f"Directory \"{dir_path}\" already exists.")

    # Load the PPI pairs of the current split; the TSV file is read
    # without a header row, and each row is unpacked below into two
    # protein identifiers plus a third value that is not needed here
    current_split_path = os.path.join(
        path_to_data_set_splits, split_file_name
    )
    current_split_df = pd.read_csv(
        current_split_path.format(i=i),
        sep="\t",
        header=None
    )

    # Iterate over the PPI pairs of the current split and create a
    # subdirectory for each
    for j, (prot_1, prot_2, _) in current_split_df.iterrows():
        subdir_path = f"PPI_pair_{j}"

        # The subdirectory lives inside the split directory, so its
        # existence must be evaluated relative to `dir_path` and not
        # relative to the current working directory; `exist_ok=True`
        # additionally makes re-running this cell safe
        os.makedirs(os.path.join(dir_path, subdir_path), exist_ok=True)

        # Create for each protein of the current PPI pair a separate
        # FASTA file within the current subdirectory
        prot_1_fasta = fasta.FastaFile()
        prot_2_fasta = fasta.FastaFile()

        prot_1_fasta[prot_1] = all_seqs_fasta[prot_1]
        prot_2_fasta[prot_2] = all_seqs_fasta[prot_2]

        prot_1_fasta.write(os.path.join(
            dir_path, subdir_path, f"{prot_1}.fasta"
        ))
        prot_2_fasta.write(os.path.join(
            dir_path, subdir_path, f"{prot_2}.fasta"
        ))

Directory "HVIDB_VACV_WR_split_0" already exists.
Directory "HVIDB_VACV_WR_split_1" already exists.
Directory "HVIDB_VACV_WR_split_2" already exists.
Directory "HVIDB_VACV_WR_split_3" already exists.
Directory "HVIDB_VACV_WR_split_4" already exists.
Directory "HVIDB_VACV_WR_split_5" already exists.
Directory "HVIDB_VACV_WR_split_6" already exists.
Directory "HVIDB_VACV_WR_split_7" already exists.
Directory "HVIDB_VACV_WR_split_8" already exists.
Directory "HVIDB_VACV_WR_split_9" already exists.


In [21]:
# Perform a couple of sanity checks
# The first sanity check verifies whether the amount of subdirectories
# in each data set split equals the amount of PPI pairs of that split
for i in range(10):
    dir_path = f"HVIDB_VACV_WR_split_{i}"

    # Re-read the split file in order to obtain the expected amount of
    # PPI pairs (one row per pair, no header row)
    current_split_path = os.path.join(
        path_to_data_set_splits, split_file_name
    )
    current_split_df = pd.read_csv(
        current_split_path.format(i=i),
        sep="\t",
        header=None
    )

    # Only directories are counted; regular files located in the split
    # directory are ignored
    n_subdirs = len([
        subdir for subdir in os.listdir(dir_path)
        if os.path.isdir(os.path.join(dir_path, subdir))
    ])
    n_PPI_pairs = len(current_split_df)

    assert n_subdirs == n_PPI_pairs, (
        f"For data set split {i}, the amount of subdirectories does "
        "not equal the amount of PPI pairs in the respective split!"
    )

# The second sanity check verifies whether each subdirectory comprises
# two FASTA files
for i in range(10):
    dir_path = f"HVIDB_VACV_WR_split_{i}"
    subdir_list = [
        subdir for subdir in os.listdir(dir_path)
        if "PPI" in subdir
    ]

    for subdir in subdir_list:
        current_subdir_path = os.path.join(dir_path, subdir)

        # `os.listdir` returns its entries in arbitrary order, which is
        # why the position within `subdir_list` does not identify a PPI
        # pair; the offending subdirectory is therefore reported by name
        assert len(os.listdir(current_subdir_path)) == 2, (
            f"FASTA file creation for \"{subdir}\" of data set "
            f"split {i} was not successful!"
        )

In [7]:
# To speed up code execution, the source code has been altered so that a
# whole split is processed in one go instead of a single PPI pair
# This requires two FASTA files per split: one harbouring the first
# interaction partners and one harbouring the second interaction
# partners
path_to_data_set_splits = "/".join([
    "/Users/jacobanter/Documents/Code/VACV_screen",
    "HVIDB_pos_instances_with_nucleolus_neg_instances",
    "data_set_splits",
])

# `{i}` is a placeholder for the split index and is filled in later via
# `str.format`
split_file_name = "VACV_WR_pos_and_neg_PPIs_test_split_{i}.tsv"

# Path template of the split files; it still contains the `{i}`
# placeholder
split_file_path = os.path.join(path_to_data_set_splits, split_file_name)

In [5]:
# Parent directory for the per-split FASTA files; it is only created
# when missing, otherwise a short notice is printed
if os.path.exists("whole_splits_FASTA_files"):
    print('Directory "whole_splits_FASTA_files" already exists.')
else:
    os.mkdir("whole_splits_FASTA_files")

In [11]:
# Build, for every split, one FASTA file with all first interaction
# partners and one FASTA file with all second interaction partners
for i in range(10):
    subdir_name = f"whole_splits_FASTA_files/split_{i}"
    if os.path.exists(subdir_name):
        print(f"Directory \"{subdir_name}\" already exists.")
    else:
        os.mkdir(subdir_name)

    # Fill the split index into the path template and load the split
    # (tab-separated, no header row)
    current_split_path = split_file_path.format(i=i)
    current_split_df = pd.read_csv(
        current_split_path, sep="\t", header=None
    )

    # Column 0 holds the first, column 1 the second interaction partner
    first_int_partners = current_split_df.iloc[:, 0]
    second_int_partners = current_split_df.iloc[:, 1]

    # NOTE(review): `FastaFile` appears to behave like a mapping keyed
    # by header, so a protein occurring in several PPI pairs would
    # presumably end up as a single record; in that case, the two files
    # would no longer be aligned row by row with the split file --
    # confirm how the downstream code pairs the records
    for partners, fasta_file_name in (
        (first_int_partners, "first_interaction_partners.fasta"),
        (second_int_partners, "second_interaction_partners.fasta"),
    ):
        partner_fasta = fasta.FastaFile()

        for prot in partners:
            partner_fasta[prot] = all_seqs_fasta[prot]

        partner_fasta.write(os.path.join(subdir_name, fasta_file_name))

Directory "whole_splits_FASTA_files/split_0" already exists.


In [12]:
# Perform a sanity check
# The sanity check consists of verifying that each subdirectory contains
# 2 FASTA files
for i in range(10):
    subdir_name = f"whole_splits_FASTA_files/split_{i}"

    files_in_subdir = os.listdir(subdir_name)

    assert len(files_in_subdir) == 2, (
        f"The subdirectory for split {i} does not contain two files!"
    )

    # Compare the actual file extension: splitting on "." and taking
    # the second element would raise an IndexError for names without a
    # dot and would misjudge names containing more than one dot
    assert all(
        os.path.splitext(filename)[1] == ".fasta"
        for filename in files_in_subdir
    ), (
        f"The subdirectory for split {i} does not (exclusively) "
        "contain FASTA files!"
    )