In [1]:
"""
The purpose of this Jupyter notebook is to generate an interaction
matrix based on PPI data obtained from the STRING database. The
interaction matrix is a symmetric matrix with its row and column
positions corresponding to one protein each. It is a binary matrix, i.e.
exclusively populated with the values 0 and 1, indicating the absence or
the presence of a PPI, respectively.
"""

'\nThe purpose of this Jupyter notebook is to generate an interaction\nmatrix based on PPI data obtained from the STRING database. The\ninteraction matrix is a symmetric matrix with its row and column\npositions corresponding to one protein each. It is a binary matrix, i.e.\nexclusively populated with the values 0 and 1, indicating the absence or\nthe presence of a PPI, respectively.\n'

In [2]:
# The interaction matrix is supposed to be generated for the Qiagen
# subset of the VACV screen
# Therefore, as a first step, the VACV screen is loaded into a Pandas
# DataFrame and the unique gene names are determined
import numpy as np
import pandas as pd

path_to_VACV_screen_report = (
    "../VACV_Report_only_valid_single_pooled_siRNA_and_esiRNA_single_"
    "entries_only_without_Qiagen_mismatches.tsv"
)

# Load the tab-separated screen report into a Pandas DataFrame
# `low_memory=False` makes Pandas infer each column's data type from the
# entire file rather than chunk by chunk, which prevents columns from
# ending up with mixed types
# NOTE(review): the output recorded for this cell points at this
# `read_csv` call, presumably a `DtypeWarning` about mixed-type columns
# -- confirm that the warning is gone after re-running
VACV_screen_df = pd.read_csv(
    path_to_VACV_screen_report,
    sep="\t",
    low_memory=False
)

# Retain exclusively the rows belonging to the Qiagen subset
Qiagen_subset_VACV_screen_df = VACV_screen_df.loc[
    VACV_screen_df["Manufacturer"] == "Qiagen"
]

  VACV_screen_df = pd.read_csv(


In [3]:
# Some genes carry the placeholder "Not available" instead of a STRING
# ID for reasons elaborated on elsewhere (e.g. the fact of encoding
# merely ncRNA or a pseudogene)
# Rows with that placeholder are dropped, i.e. only rows with an
# associated STRING ID are retained from the Qiagen subset
has_string_id = (
    Qiagen_subset_VACV_screen_df["ID_String"] != "Not available"
)
Qiagen_subset_with_string_id_df = Qiagen_subset_VACV_screen_df.loc[
    has_string_id
]

# Count the unique gene names before and after dropping the unmapped
# rows; the difference is the number of gene names without a STRING ID
total_n_Qiagen_genes = np.unique(
    Qiagen_subset_VACV_screen_df["Name"]
).size
n_Qiagen_genes_mapped = np.unique(
    Qiagen_subset_with_string_id_df["Name"]
).size
n_Qiagen_genes_not_mapped = total_n_Qiagen_genes - n_Qiagen_genes_mapped

print(
    f"{n_Qiagen_genes_not_mapped:,} out of {total_n_Qiagen_genes:,} "
    "gene names could not be mapped to a STRING ID in the case of the "
    "Qiagen subset of the VACV screen."
)

2,219 out of 20,213 gene names could not be mapped to a STRING ID in the case of the Qiagen subset of the VACV screen.


In [4]:
# `np.unique` inherently performs sorting of the unique values
# With `return_index=True`, it additionally returns the positions of the
# first occurrence of each unique gene name in the input
Qiagen_subset_gene_names, indices = np.unique(
    Qiagen_subset_with_string_id_df["Name"],
    return_index=True
)

# Pick the STRING ID located at the first occurrence of each gene name,
# which yields an array aligned with the sorted gene names
Qiagen_subset_string_ids = Qiagen_subset_with_string_id_df[
    "ID_String"
].to_numpy()[indices]

# Verify that the unique gene names as well as the STRING IDs have the
# correct ordering
# The unique STRING IDs of each gene name are collected once up front
# rather than scanning the entire DataFrame anew for every single gene
string_ids_per_gene = Qiagen_subset_with_string_id_df.groupby("Name")[
    "ID_String"
].unique()

correct_alignment_list = []

for gene_name, string_id in zip(
    Qiagen_subset_gene_names, Qiagen_subset_string_ids
):
    current_gene_string_ids = string_ids_per_gene.loc[gene_name]

    # Each gene name is supposed to be associated with only one STRING
    # ID
    assert len(current_gene_string_ids) == 1, (
        f"More than one STRING ID has been assigned to gene {gene_name}!"
    )

    aligned_string_id = current_gene_string_ids[0]

    # Record the outcome of the comparison in either case; appending
    # `True` only for matches would leave the list free of `False`
    # values and thereby render the assertion below incapable of failing
    correct_alignment_list.append(bool(aligned_string_id == string_id))

assert all(correct_alignment_list), (
    "Not all gene names are aligned with their correct STRING ID!"
)

In [5]:
# With the unique gene names and their aligned STRING IDs at hand, the
# next step towards the interaction matrix is loading the interaction
# data deposited in STRING
path_to_string_interaction_data = "9606.protein.links.v12.0.txt"

# Although it carries a `.txt` extension, the file is tabular with a
# single space separating the columns, which is why it can be parsed
# directly into a Pandas DataFrame
string_interaction_data_df = pd.read_csv(
    path_to_string_interaction_data, delimiter=" "
)

In [16]:
n_Qiagen_genes_with_string_id = len(Qiagen_subset_gene_names)

interaction_matrix = np.zeros(
    shape=(n_Qiagen_genes_with_string_id, n_Qiagen_genes_with_string_id)
)

# Map each STRING ID to its position in `Qiagen_subset_string_ids`
# Bear in mind that this array is aligned with the sorted gene names,
# i.e. it is not sorted by STRING ID itself
# `setdefault` keeps the first position in case a STRING ID occurs more
# than once, which corresponds to taking the first hit of a linear
# search through the array
string_id_to_position = {}
for position, string_id in enumerate(Qiagen_subset_string_ids):
    string_id_to_position.setdefault(string_id, position)

# Bear in mind that STRING uses its STRING IDs to list PPI pairs, not
# the official gene symbols!
# Translate both interaction partners of every PPI pair into matrix
# positions in one go; partners not present in the Qiagen subset are
# mapped to NaN
# This replaces a row-wise iteration over the DataFrame, which performed
# a linear search through the STRING ID array for every single row
positions_partner_1 = string_interaction_data_df["protein1"].map(
    string_id_to_position
)
positions_partner_2 = string_interaction_data_df["protein2"].map(
    string_id_to_position
)

# Only PPI pairs with both partners belonging to the Qiagen subset are
# entered into the interaction matrix
both_partners_in_subset = (
    positions_partner_1.notna() & positions_partner_2.notna()
)

row_positions = positions_partner_1[both_partners_in_subset].to_numpy(
    dtype=int
)
column_positions = positions_partner_2[
    both_partners_in_subset
].to_numpy(dtype=int)

# Populate the interaction matrix symmetrically
interaction_matrix[row_positions, column_positions] = 1
interaction_matrix[column_positions, row_positions] = 1

In [28]:
# Populating the interaction matrix from the STRING PPI file took more
# than two hours, which is why the result is pickled, i.e. saved to a
# file, so that it does not have to be computed again
import pickle

# The interaction matrix is pickled along with the gene names and STRING
# IDs its rows and columns correspond to
pickled_objects = (
    Qiagen_subset_gene_names,
    Qiagen_subset_string_ids,
    interaction_matrix
)

# The `with` context manager takes care of closing the file
# automatically, even in case of errors/exceptions
with open("VACV_screen_Qiagen_subset_interaction_matrix.pkl", "wb") as f:
    pickle.dump(pickled_objects, f)

In [None]:
# Restore the gene names, the STRING IDs and the interaction matrix from
# the pickle file
# Bear in mind that unpickling executes arbitrary code, so only files
# from a trusted source must be loaded this way
import pickle

path_to_interaction_matrix = (
    "VACV_screen_Qiagen_subset_interaction_"
    "matrix.pkl"
)

with open(path_to_interaction_matrix, "rb") as f:
    loaded_objects = pickle.load(f)

# The pickled tuple harbours the three objects in the order they are
# unpacked in here
gene_names, string_ids, interaction_matrix = loaded_objects