In [108]:
import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Dataset, Data
import numpy as np 
import os
from torch_geometric.transforms import NormalizeFeatures
from sklearn.model_selection import train_test_split
from tqdm import tqdm

<h1>Target matrix</h1>

In [109]:
class IdMapper():
    sorted_diseases = []
    sorted_genes = []

    def __init__(self, gene_file, disease_file):
        genes = pd.read_csv(gene_file, sep="\t")
        self.genes = genes["genes"].sort_values().unique()

        disieses = pd.read_csv(disease_file, sep="\t")
        diseases_filtered = disieses.groupby("diseaseId").filter(lambda x: len(x) > 0)
        self.diseases = diseases_filtered["diseaseId"].sort_values().unique()

    def diseases_idx_to_id_map(self):
        return { idx: item  for idx, item in enumerate(self.diseases)}
    
    def diseases_id_to_idx_map(self):
        return { item: idx  for idx, item in enumerate(self.diseases)}
    
    def genes_idx_to_id_map(self):
        return { idx: item  for idx, item in enumerate(self.genes)}
   
    def genes_id_to_idx_map(self):
        return { item: idx  for idx, item in enumerate(self.genes)}

In [110]:
mapper = IdMapper("./data/raw/gtex_genes_test.csv", "./data/raw/disgenet_test.csv")

In [111]:
disiese_gene_matrix = pd.read_csv("./data/raw/disgenet_test.csv", sep="\t")
genes_features = pd.read_csv("./data/raw/gtex_genes_test.csv", sep="\t")

In [112]:

disgenet_filtered = disiese_gene_matrix.groupby("diseaseId").filter(lambda x: len(x) > 0)
genes = genes_features["genes"].sort_values().unique()
diseases = disgenet_filtered["diseaseId"].sort_values().unique()

matrix = pd.DataFrame(np.zeros((len(genes), len(diseases)),))

gene_id_to_idx = mapper.genes_id_to_idx_map()
disease_id_to_idx = mapper.diseases_id_to_idx_map()
disgenet_filtered["geneId"] = disgenet_filtered["geneId"].map(gene_id_to_idx) 
disgenet_filtered["diseaseId"] = disgenet_filtered["diseaseId"].map(disease_id_to_idx)
disgenet_filtered


Unnamed: 0,geneId,diseaseId
0,5,7
1,5,8
2,2,8
3,2,0
4,2,1
5,2,2
6,2,3
7,2,4
8,2,5
9,2,6


In [113]:
tuples_array = [row for row in disgenet_filtered.itertuples(index=False, name=None)]

In [114]:
row, col = tuples_array[0]
col, row, matrix.loc[row, col]

(7, 5, np.float64(0.0))

In [115]:
matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
len(tuples_array)

11

In [117]:
for row, col in tqdm(tuples_array):
    matrix.loc[row, col] = 1

matrix.to_csv("./data/raw/disease_gene_matrix.csv")

100%|██████████| 11/11 [00:00<?, ?it/s]


<h1>Test, Validation and Test masks</h1>

In [118]:
disgenet_grouped = disgenet_filtered.groupby(by="diseaseId", group_keys=False)
disgenet_grouped.size()

diseaseId
0    1
1    1
2    1
3    1
4    1
5    1
6    2
7    1
8    2
dtype: int64

In [119]:
test_validation = disgenet_grouped.apply(lambda x: x.sample(frac=0.5, random_state=1))
train = disgenet_filtered.drop(test_validation.index)

train

  test_validation = disgenet_grouped.apply(lambda x: x.sample(frac=0.5, random_state=1))


Unnamed: 0,geneId,diseaseId
0,5,7
2,2,8
3,2,0
4,2,1
5,2,2
6,2,3
7,2,4
8,2,5
10,1,6


In [120]:
test_validation

Unnamed: 0,geneId,diseaseId
9,2,6
1,5,8


In [121]:
test_validation = test_validation.groupby(by="diseaseId", group_keys=False)
test_validation.size()

diseaseId
6    1
8    1
dtype: int64

In [122]:
#Group by is needed before sample function call!!!
test = test_validation.apply(lambda x: x.sample(frac=0.50, random_state=1))
drop_indices = pd.concat([train, test]).index
validation = disgenet_filtered.drop(drop_indices)
validation.groupby(by="diseaseId", group_keys=False).size()

  test = test_validation.apply(lambda x: x.sample(frac=0.50, random_state=1))


diseaseId
6    1
8    1
dtype: int64

In [123]:
test.groupby(by="diseaseId", group_keys=False).size()

Series([], dtype: int64)

<h3>Get the negativ gene-disease conections </h3>

In [124]:
genes_frame = pd.DataFrame(disgenet_filtered["geneId"].unique(), columns=["geneId"])
diseases_frame = pd.DataFrame(disgenet_filtered["diseaseId"].unique(), columns=["diseaseId"])

In [125]:
gene_disease_descartes_product = genes_frame.merge(diseases_frame, how="cross")


In [126]:
gene_disease_descartes_product

Unnamed: 0,geneId,diseaseId
0,5,7
1,5,8
2,5,0
3,5,1
4,5,2
5,5,3
6,5,4
7,5,5
8,5,6
9,2,7


In [None]:
disgenet_inverse = gene_disease_descartes_product.merge(disgenet_filtered, on=['geneId', 'diseaseId'], how='left', indicator=True)
disgenet_inverse = disgenet_inverse[disgenet_inverse['_merge'] == 'left_only'].drop(columns='_merge')
disgenet_inverse

Unnamed: 0,geneId,diseaseId
2,5,0
3,5,1
4,5,2
5,5,3
6,5,4
7,5,5
8,5,6
9,2,7
18,1,7
19,1,8


In [133]:
disgenet_inverse_grouped = disgenet_inverse.groupby(by="diseaseId", group_keys=False)
disgenet_inverse_grouped.size()

diseaseId
0    2
1    2
2    2
3    2
4    2
5    2
6    1
7    2
8    1
dtype: int64

In [135]:
test_validation_n = disgenet_inverse_grouped.apply(lambda x: x.sample(frac=0.5, random_state=1))
train_n = disgenet_inverse.drop(test_validation_n.index)

train_n

  test_validation_n = disgenet_inverse_grouped.apply(lambda x: x.sample(frac=0.5, random_state=1))


Unnamed: 0,geneId,diseaseId
8,5,6
18,1,7
19,1,8
20,1,0
21,1,1
22,1,2
23,1,3
24,1,4
25,1,5


In [136]:
test_validation_n_grouped = test_validation_n.groupby(by="diseaseId", group_keys=False)
test_validation_n_grouped.size()

diseaseId
0    1
1    1
2    1
3    1
4    1
5    1
7    1
dtype: int64

In [137]:
test_n = test_validation_n_grouped.apply(lambda x: x.sample(frac=0.5, random_state=1))
drop_indices_n = pd.concat([train_n, test_n]).index
validation_n = disgenet_inverse.drop(drop_indices_n)
validation_n.groupby(by="diseaseId", group_keys=False).size()

  test_n = test_validation_n_grouped.apply(lambda x: x.sample(frac=0.5, random_state=1))


diseaseId
0    1
1    1
2    1
3    1
4    1
5    1
7    1
dtype: int64

In [139]:
test_n.groupby(by="diseaseId").size()

Series([], dtype: int64)