# Create Datasets

This notebook takes the input networks and returns both validation and training datasets.

Inputs:
- gene-drug network
- gene-gene network
- gene-disease network
- full gene-drug network (to recover removed edges)
- output embeddings from BIONIC/BERTwalk

Outputs:
- `final_df`: dataset for training and testing downstream models (built from the gene-drug interactions that were kept in the input network)
- `validation_df`: dataset for validating downstream models (built from the gene-drug interactions that were removed from the input network)

Import libraries and set paths for inputs and outputs

In [None]:
import random
import sys

import numpy as np
import pandas as pd

# ---- Reproducibility ----
# Seed both the stdlib and the NumPy global generators with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# ---- Sample sizes (rows per class: positive / negative triplets) ----
n_samples = 10_000  # final_df
n_val = 5_000       # validation_df

# ---- Inputs for final_df: networks ----
path_to_gene_drug = ""
path_to_gene_gene = ""
path_to_gene_disease = ""

# ---- Inputs for final_df: embeddings ----
path_to_gene_drug_embs = ""
path_to_gene_gene_embs = ""
path_to_gene_disease_embs = ""
path_to_embs = ""

# ---- Inputs for validation_df ----
path_to_gene_drug_full = ""

# ---- Entity lists ----
# The drug and disease lists are read as CSVs with a column named "0";
# the train-gene list is read without a header row.
path_to_drug_list = ""
path_to_disease_list = ""
path_to_train_genes = ""

# ---- Outputs ----
path_save_final_df = ""       # where to save final_df
path_save_validation_df = ""  # where to save validation_df

## Create final_df

### Load Data

Load the networks and the embeddings that match what you used for BIONIC (e.g. if BIONIC produced its embeddings on the five_edge networks, load those).

In [70]:
# Load networks: space-separated edge lists without a header row.
edge_columns = {
    "gene_drug": ["Gene", "Drug", "Ass"],
    "gene_disease": ["Gene", "Disease", "Ass"],
    "gene_gene": ["Gene1", "Gene2", "Ass"],
}
gene_drug = pd.read_csv(path_to_gene_drug, sep=" ", header=None, names=edge_columns["gene_drug"])
gene_disease = pd.read_csv(path_to_gene_disease, sep=" ", header=None, names=edge_columns["gene_disease"])
gene_gene = pd.read_csv(path_to_gene_gene, sep=" ", header=None, names=edge_columns["gene_gene"])

# Load embeddings: tab-separated tables indexed by their first column.
# Previously the per-network tables were always read and merged, and the result
# was then unconditionally replaced by the table at `path_to_embs`. The two
# sources are now alternatives: `path_to_embs` wins when it is set (same result
# as before), otherwise the three per-network tables are merged.
if path_to_embs:
    embs = pd.read_csv(path_to_embs, sep="\t", index_col=0)
else:
    embs_drug = pd.read_csv(path_to_gene_drug_embs, sep="\t", index_col=0)
    embs_gene = pd.read_csv(path_to_gene_gene_embs, sep="\t", index_col=0)
    embs_disease = pd.read_csv(path_to_gene_disease_embs, sep="\t", index_col=0)

    # Drop from the drug and disease tables every ID that is also present in
    # the gene-gene table, so those IDs keep their gene-gene embedding.
    embs_drug = embs_drug[~embs_drug.index.isin(embs_gene.index)]
    embs_disease = embs_disease[~embs_disease.index.isin(embs_gene.index)]

    embs = pd.concat([embs_drug, embs_gene, embs_disease])
    # Keep the first occurrence of any ID that is still duplicated.
    embs = embs[~embs.index.duplicated()]

# Collapse every row into one NumPy vector stored in a single "Embedding" column.
embs = embs.apply(np.array, axis=1).to_frame(name="Embedding")

### Create triplets

Create __positive__ triplets starting from associations coming from the input networks.
This is done with a join between gene-drug network and gene-disease network on column 'Gene'.

In [71]:
# Build (gene, drug, disease) ID triplets with an inner join on "Gene":
# every drug linked to a gene is paired with every disease linked to that gene.
merged_df = pd.merge(gene_drug, gene_disease, on='Gene')

# Replace each ID with its embedding vector. IDs that are not in `embs` map to NaN.
cols_to_map = ["Gene", "Drug", "Disease"]
for col in cols_to_map:
    merged_df[col] = merged_df[col].map(embs["Embedding"])

# Keep only the embedding columns.
merged_df = merged_df[cols_to_map]

# A NaN left by the mapping would only fail later, when the three vectors are
# concatenated, so remove incomplete triplets here and report how many were lost.
incomplete = merged_df.isna().any(axis=1)
if incomplete.any():
    print(f"Dropping {int(incomplete.sum())} positive triplets with an ID missing from the embeddings")
    merged_df = merged_df[~incomplete]

# Label positives; `assign` returns a new frame instead of writing into a slice.
merged_df = merged_df.assign(Label=1)

Create __negative__ triplets by shuffling embeddings of genes, drugs and diseases at random

In [72]:
# Names of the drug and disease entities (CSV files with a column named "0").
drug_list = pd.read_csv(path_to_drug_list)["0"].tolist()
disease_list = pd.read_csv(path_to_disease_list)["0"].tolist()

# Tag every embedding with its entity type. Anything that is neither a listed
# drug nor a listed disease is treated as a gene; drugs take precedence.
# Sets make each membership test constant-time instead of a list scan.
drug_ids = set(drug_list)
disease_ids = set(disease_list)
embs["Entity_type"] = embs.index.map(
    lambda x: "Drug" if x in drug_ids else "Disease" if x in disease_ids else "Gene"
)

# Draw, with replacement, as many embeddings per entity type as there are
# positive triplets. The draw order (drug, gene, disease) is kept so that the
# seeded random stream is consumed exactly as before.
n_negative = len(merged_df)
random_drug_embeddings = embs[embs['Entity_type'] == 'Drug'].sample(n=n_negative, replace=True)
random_gene_embeddings = embs[embs['Entity_type'] == 'Gene'].sample(n=n_negative, replace=True)
random_disease_embeddings = embs[embs['Entity_type'] == 'Disease'].sample(n=n_negative, replace=True)

# Combine the independent draws position by position into triplets.
# NOTE: the random combinations are not checked against the known
# associations, so a negative may coincide with a true triplet.
random_triplets_df = pd.DataFrame({
    'Gene': random_gene_embeddings['Embedding'].values,
    'Drug': random_drug_embeddings['Embedding'].values,
    'Disease': random_disease_embeddings['Embedding'].values
})

# Label 0 marks randomly generated (negative) triplets.
random_triplets_df['Label'] = 0

### Save dataframe

Filter by number of rows (taking `n_samples` rows per group) and save the dataframe

In [73]:
# Cap both classes at n_samples rows; smaller datasets are used in full.
if len(merged_df) > n_samples:
    merged_df = merged_df.sample(n=n_samples)
    random_triplets_df = random_triplets_df.sample(n=n_samples)

# Stack positives on top of negatives.
final_df = pd.concat([merged_df, random_triplets_df], axis=0, ignore_index=True)

# Join the gene, drug and disease vectors of every row into one vector.
# Zipping the columns avoids building a Series per row as apply(axis=1) does.
final_df["Concat_emb"] = [
    np.concatenate(parts)
    for parts in zip(*(final_df[col] for col in cols_to_map))
]

# Drop the per-entity columns, keeping only Label and Concat_emb.
final_df = final_df.drop(columns=cols_to_map)

# Save final_df. Each array is written through its string form, and NumPy
# abbreviates arrays longer than 1000 elements with "..." by default, which
# would silently truncate long embeddings. Raising the threshold while writing
# keeps the output format unchanged and guarantees that every value is stored.
with np.printoptions(threshold=sys.maxsize):
    final_df.to_csv(path_save_final_df, sep="\t", index=False)

## Create validation_df

### Load Data

Loading full gene-drug network to recover removed edges

In [74]:
# Load full gene-drug network (space-separated edge list, no header row).
gene_drug_full = pd.read_csv(path_to_gene_drug_full, sep=" ", header=None, names=["Gene", "Drug", "Ass"])

# Removed edges = edges of the full network whose (Gene, Drug) pair is absent
# from the reduced network. An anti-join on the pair is used instead of
# concat + drop_duplicates(keep=False): that idiom is a symmetric difference
# over all three columns, so it would also report edges whose "Ass" value
# differs between the files or that exist only in the reduced network, and it
# would discard removed edges that are listed twice in the full network.
kept_pairs = gene_drug[["Gene", "Drug"]].drop_duplicates()
flagged = gene_drug_full.drop_duplicates().merge(
    kept_pairs, on=["Gene", "Drug"], how="left", indicator=True
)
gene_drug_val = flagged.loc[
    flagged["_merge"] == "left_only", ["Gene", "Drug", "Ass"]
].reset_index(drop=True)

### Create triplets

Create __positive__ triplets starting from removed gene-drug associations.
This follows the same procedure used for `final_df`.

In [75]:
# Pair every removed gene-drug edge with the diseases of the same gene
# (inner join on "Gene") and keep only the three ID columns, in their final order.
triplet_cols = ["Gene", "Drug", "Disease"]
removed_true_ass = pd.merge(gene_drug_val, gene_disease, on="Gene")[triplet_cols]

# Retain only test genes, i.e. discard every gene listed in the train-gene file.
# `.copy()` makes the filtered frame independent before columns are overwritten.
only_train_genes = pd.read_csv(path_to_train_genes, header=None)[0].tolist()
removed_true_ass = removed_true_ass[~removed_true_ass["Gene"].isin(only_train_genes)].copy()

# Replace each ID with its embedding vector. IDs that are not in `embs` map to NaN.
for col in triplet_cols:
    removed_true_ass[col] = removed_true_ass[col].map(embs["Embedding"])

# A NaN left by the mapping would only fail later, when the vectors are
# concatenated, so remove incomplete triplets before sampling.
incomplete = removed_true_ass[triplet_cols].isna().any(axis=1)
if incomplete.any():
    print(f"Dropping {int(incomplete.sum())} validation triplets with an ID missing from the embeddings")
    removed_true_ass = removed_true_ass[~incomplete]

# Label positives.
removed_true_ass = removed_true_ass.assign(Label=1)

# Sample at most n_val rows. Asking for more rows than are available raises
# in DataFrame.sample, so the request is capped at the number of rows left.
n_positive = min(n_val, len(removed_true_ass))
if n_positive < n_val:
    print(f"Only {n_positive} positive validation triplets available (n_val = {n_val})")
removed_true_ass = removed_true_ass.sample(n=n_positive)

Create __negative__ triplets by shuffling embeddings of genes, drugs and diseases at random

In [76]:
# Draw one negative per positive validation triplet, so that the two classes
# stay balanced whatever the number of positives is (the final_df negatives
# are sized from their positives in the same way).
n_negative = len(removed_true_ass)

# Sample embeddings of each entity type independently, with replacement.
# The draw order (drug, gene, disease) is kept so that the seeded random
# stream is consumed exactly as before.
random_drug_embeddings = embs[embs['Entity_type'] == 'Drug'].sample(n=n_negative, replace=True)
random_gene_embeddings = embs[embs['Entity_type'] == 'Gene'].sample(n=n_negative, replace=True)
random_disease_embeddings = embs[embs['Entity_type'] == 'Disease'].sample(n=n_negative, replace=True)

# Combine the independent draws position by position into triplets.
# NOTE: the random combinations are not checked against the known
# associations, so a negative may coincide with a true triplet.
random_triplets_val = pd.DataFrame({
    'Gene': random_gene_embeddings['Embedding'].values,
    'Drug': random_drug_embeddings['Embedding'].values,
    'Disease': random_disease_embeddings['Embedding'].values
})

# Label 0 marks randomly generated (negative) triplets.
random_triplets_val['Label'] = 0

### Save Dataframe

Create `validation_df` by concatenating positive and negative triplets

In [77]:
# Stack the positive and the negative validation triplets into one dataframe.
validation_embs = pd.concat([removed_true_ass, random_triplets_val], axis=0, ignore_index=True)

# Join the gene, drug and disease vectors of every row into one vector.
# Zipping the columns avoids building a Series per row as apply(axis=1) does.
validation_embs["Concat_emb"] = [
    np.concatenate(parts)
    for parts in zip(validation_embs["Gene"], validation_embs["Drug"], validation_embs["Disease"])
]

# Drop the per-entity columns, keeping only Label and Concat_emb.
validation_embs = validation_embs.drop(columns=["Gene", "Drug", "Disease"])

# Save df. Each array is written through its string form, and NumPy
# abbreviates arrays longer than 1000 elements with "..." by default, which
# would silently truncate long embeddings. Raising the threshold while writing
# keeps the output format unchanged and guarantees that every value is stored.
with np.printoptions(threshold=sys.maxsize):
    validation_embs.to_csv(path_save_validation_df, sep="\t", index=False)