In [2]:
import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Dataset, Data
import numpy as np 
import os
from torch_geometric.transforms import NormalizeFeatures
from sklearn.model_selection import train_test_split
from tqdm import tqdm

<h1>Target matrix</h1>

In [28]:
class IdMapper():
    sorted_diseases = []
    sorted_genes = []

    def __init__(self, gene_file, disease_file):
        genes = pd.read_csv(gene_file, sep="\t")
        self.genes = genes["genes"].sort_values().unique()

        disieses = pd.read_csv(disease_file, sep="\t")
        diseases_filtered = disieses.groupby("diseaseId").filter(lambda x: len(x) > 7)
        self.diseases = diseases_filtered["diseaseId"].sort_values().unique()

    def diseases_idx_to_id_map(self):
        return { idx: item  for idx, item in enumerate(self.diseases)}
    
    def diseases_id_to_idx_map(self):
        return { item: idx  for idx, item in enumerate(self.diseases)}
    
    def genes_idx_to_id_map(self):
        return { idx: item  for idx, item in enumerate(self.genes)}
   
    def genes_id_to_idx_map(self):
        return { item: idx  for idx, item in enumerate(self.genes)}

In [29]:
mapper = IdMapper("./data/raw/gtex_genes.csv", "./data/raw/disgenet_with_gene_id.csv")

In [30]:
disiese_gene_matrix = pd.read_csv("./data/raw/disgenet_with_gene_id.csv", sep="\t")
genes_features = pd.read_csv("./data/raw/gtex_genes.csv", sep="\t")
disgenet_filtered = disiese_gene_matrix.groupby("diseaseId").filter(lambda x: len(x) > 7)
genes = genes_features["genes"].sort_values().unique()
diseases = disgenet_filtered["diseaseId"].sort_values().unique()

matrix = pd.DataFrame(np.zeros((len(genes), len(diseases)),))

gene_id_to_idx = mapper.genes_id_to_idx_map()
disease_id_to_idx = mapper.diseases_id_to_idx_map()
disgenet_filtered["geneId"] = disgenet_filtered["geneId"].map(gene_id_to_idx) 
disgenet_filtered["diseaseId"] = disgenet_filtered["diseaseId"].map(disease_id_to_idx)
disgenet_filtered


Unnamed: 0,geneId,diseaseId
0,4843,274
1,4843,542
2,12461,18
3,12461,93
4,12461,129
...,...,...
92954,16963,1811
92955,16016,1637
92956,16016,1811
92957,15538,1637


In [31]:
tuples_array = [row for row in disgenet_filtered.itertuples(index=False, name=None)]

In [33]:
row, col = tuples_array[0]
col, row, matrix.loc[row, col]

(274, 4843, np.float64(0.0))

In [34]:
len(tuples_array)

76130

In [35]:
for row, col in tqdm(tuples_array):
    matrix.loc[row, col] = 1

matrix.to_csv("./data/raw/disease_gene_matrix.csv")

100%|██████████| 76130/76130 [00:02<00:00, 32448.68it/s]


In [36]:
arr = []

for g, d in tuples_array:
    arr.append(matrix.loc[g, d])

arr = np.array(arr)
len(arr), arr.sum()

(76130, np.float64(76130.0))

<h1>Test, Validation and Test masks</h1>

In [7]:
disgenet_grouped = disgenet_filtered.groupby(by="diseaseId", group_keys=False)
disgenet_grouped.size()

diseaseId
C0000768     11
C0000772     16
C0000786    114
C0000822    114
C0001197     11
           ... 
C4721845      9
C4721952     20
C4722327     67
C4746851      8
C4747850      8
Length: 1814, dtype: int64

In [8]:
test_validation = disgenet_grouped.apply(lambda x: x.sample(frac=0.2, random_state=1))
train = disgenet_filtered.drop(test_validation.index)

train.groupby(by="diseaseId", group_keys=False).size()

  test_validation = disgenet_grouped.apply(lambda x: x.sample(frac=0.2, random_state=1))


diseaseId
C0000768     9
C0000772    13
C0000786    91
C0000822    91
C0001197     9
            ..
C4721845     7
C4721952    16
C4722327    54
C4746851     6
C4747850     6
Length: 1814, dtype: int64

In [9]:
test_validation = test_validation.groupby(by="diseaseId", group_keys=False)
test_validation.size()

diseaseId
C0000768     2
C0000772     3
C0000786    23
C0000822    23
C0001197     2
            ..
C4721845     2
C4721952     4
C4722327    13
C4746851     2
C4747850     2
Length: 1814, dtype: int64

In [10]:
#Group by is needed before sample function call!!!
test = test_validation.apply(lambda x: x.sample(frac=0.50, random_state=1))
drop_indices = pd.concat([train, test]).index
validation = disgenet_filtered.drop(drop_indices)
validation.groupby(by="diseaseId", group_keys=False).size()

  test = test_validation.apply(lambda x: x.sample(frac=0.50, random_state=1))


diseaseId
C0000768     1
C0000772     1
C0000786    11
C0000822    11
C0001197     1
            ..
C4721845     1
C4721952     2
C4722327     7
C4746851     1
C4747850     1
Length: 1814, dtype: int64

In [11]:
test.groupby(by="diseaseId", group_keys=False).size()

diseaseId
C0000768     1
C0000772     2
C0000786    12
C0000822    12
C0001197     1
            ..
C4721845     1
C4721952     2
C4722327     6
C4746851     1
C4747850     1
Length: 1814, dtype: int64

<h3>Get the negativ gene-disease conections </h3>

In [12]:
genes_frame = pd.DataFrame(disgenet_filtered["geneId"].unique(), columns=["geneId"])
diseases_frame = pd.DataFrame(disgenet_filtered["diseaseId"].unique(), columns=["diseaseId"])
gene_disease_descartes_product = genes_frame.merge(diseases_frame, how="cross")

disgenet_inverse = gene_disease_descartes_product.merge(disgenet_filtered, on=['geneId', 'diseaseId'], how='left', indicator=True)
disgenet_inverse = disgenet_inverse[disgenet_inverse['_merge'] == 'left_only'].drop(columns='_merge')
disgenet_inverse.shape, genes_frame.shape, diseases_frame.shape

((17289292, 2), (9573, 1), (1814, 1))

In [13]:
disgenet_inverse_grouped = disgenet_inverse.groupby(by="diseaseId", group_keys=False)
disgenet_inverse_grouped.size()

diseaseId
C0000768    9562
C0000772    9557
C0000786    9459
C0000822    9459
C0001197    9562
            ... 
C4721845    9564
C4721952    9553
C4722327    9506
C4746851    9565
C4747850    9565
Length: 1814, dtype: int64

In [14]:
test_validation_n = disgenet_inverse_grouped.apply(lambda x: x.sample(frac=0.2, random_state=1))
train_n = disgenet_inverse.drop(test_validation_n.index)

train_n.groupby(by="diseaseId").size()

  test_validation_n = disgenet_inverse_grouped.apply(lambda x: x.sample(frac=0.2, random_state=1))


diseaseId
C0000768    7650
C0000772    7646
C0000786    7567
C0000822    7567
C0001197    7650
            ... 
C4721845    7651
C4721952    7642
C4722327    7605
C4746851    7652
C4747850    7652
Length: 1814, dtype: int64

In [20]:
test_validation_n_grouped = test_validation_n.groupby(by="diseaseId", group_keys=False)
test_validation_n_grouped.size()

diseaseId
C0000768    1912
C0000772    1911
C0000786    1892
C0000822    1892
C0001197    1912
            ... 
C4721845    1913
C4721952    1911
C4722327    1901
C4746851    1913
C4747850    1913
Length: 1814, dtype: int64

In [22]:
test_n = test_validation_n_grouped.apply(lambda x: x.sample(frac=0.5, random_state=1))
drop_indices_n = pd.concat([train_n, test_n]).index
validation_n = disgenet_inverse.drop(drop_indices_n)
validation_n.groupby(by="diseaseId", group_keys=False).size()

  test_n = test_validation_n_grouped.apply(lambda x: x.sample(frac=0.5, random_state=1))


diseaseId
C0000768    956
C0000772    955
C0000786    946
C0000822    946
C0001197    956
           ... 
C4721845    957
C4721952    955
C4722327    951
C4746851    957
C4747850    957
Length: 1814, dtype: int64

In [23]:
test_n.groupby(by="diseaseId")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001C62B2BF6E0>