In [1]:
import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Dataset, Data
#from torch_geometric.data.lightning import Dataset
import numpy as np 
import os
from torch_geometric.transforms import NormalizeFeatures
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
class IdMapper():
    sorted_diseases = []
    sorted_genes = []

    def __init__(self, gene_file, disease_file):
        genes = pd.read_csv(gene_file, sep="\t")
        self.genes = genes["genes"].sort_values().unique()

        disieses = pd.read_csv(disease_file, sep="\t")
        diseases_filtered = disieses.groupby("diseaseId").filter(lambda x: len(x) > 7)
        self.diseases = diseases_filtered["diseaseId"].sort_values().unique()

    def diseases_idx_to_id_map(self):
        return { idx: item  for idx, item in enumerate(self.diseases)}
    
    def diseases_id_to_idx_map(self):
        return { item: idx  for idx, item in enumerate(self.diseases)}
    
    def genes_idx_to_id_map(self):
        return { idx: item  for idx, item in enumerate(self.genes)}
   
    def genes_id_to_idx_map(self):
        return { item: idx  for idx, item in enumerate(self.genes)}

In [3]:
class GeneDataset(Dataset):
    def __init__(self, root, filenames, test_size, val_size, test=False, transform=None, pre_transform=None):
        """
        root = Where the dataset should be stored. This folder is split
        into raw_dir (downloaded dataset) and processed_dir (processed data). 
        """
        self.test = test
        self.test_size = test_size
        self.val_size = val_size
        self.filenames = filenames
        self.mapper = IdMapper("./data/raw/"+filenames[0], "./data/raw/"+filenames[2])
        super(GeneDataset, self).__init__(root, transform, pre_transform)
        
    @property
    def raw_file_names(self):
        """ If this file exists in raw_dir, the download is not triggered.
            (The download func. is not implemented here)  
        """
        return self.filenames

    @property
    def processed_file_names(self):
        """ If these files are found in raw_dir, processing is skipped"""
        if self.test:
            return [F'{file_name}_test' for file_name in self.raw_paths]
        else:
            return self.raw_paths

    def download(self):
        pass

    def process(self):
        self.genes_features = pd.read_csv(self.raw_paths[0], sep="\t")
        self.edges_features = pd.read_csv(self.raw_paths[1], sep="\t")
        self.disiese_gene_matrix = pd.read_csv(self.raw_paths[2], sep="\t")

        self.genes = self.genes_features["genes"].sort_values().unique()
        self.disgenet_filtered = self.disiese_gene_matrix.groupby("diseaseId").filter(lambda x: len(x) > 7)
        self.diseases = self.disgenet_filtered["diseaseId"].sort_values().unique()

        node_feats = self._get_node_features(self.genes_features)
        edge_feats = self._get_edge_features(self.edges_features)
        edge_index = self._get_adjacency_info(self.edges_features)

        y = self._create_mask_matrix(self.disgenet_filtered.copy())
        train_mask, validation_mask, test_mask = self._get_train_val_test_mask(self.disgenet_filtered.copy()) 

        data = Data(x=node_feats,
                    edge_index=edge_index,
                    edge_weight=edge_feats,
                    test_mask=test_mask, val_mask=validation_mask, train_mask=train_mask, y=y)
         
        if self.test:
            torch.save(data, os.path.join(self.processed_dir, 'graph_test.pt'))
        else:
            torch.save(data, os.path.join(self.processed_dir, 'graph.pt'))


    def _get_train_val_test_mask(self, disgenet_filtered):
        """ 
        i need too create matrices shape like disgenet
        and in this matrix i pick random points which are gonna be the train mask, validation mask and test mask
        
        in the train dataset i need to pick 80% from disgenet, equaly 0s and 1s in a column
        in the validation dataset i need to pick 10% from disgenet, equaly 0s and 1s in a column
        """

        train, validation, test = self._split_labels_to_train_val_test(disgenet_filtered)
        disgenet_inverse = self._get_disgenet_inverse(disgenet_filtered)
        train_n, validation_n, test_n = self._split_labels_to_train_val_test(disgenet_inverse)
        train_r = pd.concat([train, train_n], ignore_index=True)
        validation_r = pd.concat([validation, validation_n], ignore_index=True)
        test_r = pd.concat([test, test_n], ignore_index=True)

        train_mask = self._create_mask_matrix(train_r)
        validation_mask = self._create_mask_matrix(validation_r)
        test_mask = self._create_mask_matrix(test_r)

        return train_mask, validation_mask, test_mask
    
    def _split_labels_to_train_val_test(self, disgenet):
        #Split the positive targets to equal partitions by disease
        disgenet_grouped = disgenet.groupby(by="diseaseId", group_keys=False)
        test_validation = disgenet_grouped.apply(lambda x: x.sample(frac=0.2, random_state=1))
        train = disgenet.drop(test_validation.index)
        test_validation_grouped = test_validation.groupby(by="diseaseId", group_keys=False)

        #Group by is needed before sample function call!!!
        test = test_validation_grouped.apply(lambda x: x.sample(frac=0.50, random_state=1))
        drop_indices = pd.concat([train, test]).index
        validation = disgenet.drop(drop_indices)
        return train, validation, test
    
    
    def _get_disgenet_inverse(self, disgenet):
        genes_frame = pd.DataFrame(list(self.genes), columns=["geneId"])
        diseases_frame = pd.DataFrame(self.diseases, columns=["diseaseId"])
        gene_disease_descartes_product = genes_frame.merge(diseases_frame, how="cross")
        disgenet_inverse = gene_disease_descartes_product.merge(disgenet, on=['geneId', 'diseaseId'], how='left', indicator=True)
        return disgenet_inverse[disgenet_inverse['_merge'] == 'left_only'].drop(columns='_merge')


    def _create_mask_matrix(self, dataframe):
        matrix = pd.DataFrame(np.zeros((len(self.genes), len(self.diseases)),))
        gene_id_to_idx = self.mapper.genes_id_to_idx_map()
        disease_id_to_idx = self.mapper.diseases_id_to_idx_map()
        
        dataframe["geneId"] = dataframe["geneId"].map(gene_id_to_idx) 
        dataframe["diseaseId"] = dataframe["diseaseId"].map(disease_id_to_idx) 
        tuples_array = [row for row in dataframe.itertuples(index=False, name=None)]
        for row, col in tqdm(tuples_array):
            matrix.loc[row, col] = 1

        return matrix

    def _get_node_features(self, genes):
        gene_id_to_idx = self.mapper.genes_id_to_idx_map()
        genes["genes"] = self.genes_features["genes"].map(gene_id_to_idx)
        all_node_feats = genes.values.tolist()
        all_node_feats = np.asarray(all_node_feats)
        
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, edges):
        """ 
        This will return a matrix / 2d array of the shape
        [Number of edges, Edge Feature size]
        """
        duplicated_edges = edges.loc[edges.index.repeat(2)].reset_index(drop=True)
        all_edge_feats = duplicated_edges["combined_score"].tolist()
        return torch.tensor(all_edge_feats, dtype=torch.float)


    def _get_adjacency_info(self, edges):
        """
        We want to be sure that the order of the indices
        matches the order of the edge features
        """
        gene_id_to_idx = self.mapper.genes_id_to_idx_map()

        edge_indices = []
        gene_1 = edges["gene1"].map(gene_id_to_idx)
        gene_2 = edges["gene2"].map(gene_id_to_idx)
        edges = pd.concat([gene_1, gene_2], axis=1).values.tolist()

        #iterate over the edges end duplicate it because for one edge we need: n1,n2 and n2,n1
        double_edges = []
        for edge in edges:
            double_edges += [ edge, [edge[1], edge[0]]]

        edge_indices = torch.tensor(double_edges)
        edge_indices = edge_indices.t().to(torch.int32).view(2, -1)
        return edge_indices

    def len(self):
        return self.genes.shape[0]

    def get(self, idx):
        """ - Equivalent to __getitem__ in pytorch
            - Is not needed for PyG's InMemoryDataset
        """
        if self.test:
            graph = torch.load(os.path.join(self.processed_dir, 'graph_test.pt'), weights_only=False)
        else:
            graph = torch.load(os.path.join(self.processed_dir, 'graph.pt'), weights_only=False)

        return graph
    
    def __getitem__(self, idx):
        return self.get(0)

In [4]:
dataset = GeneDataset(
    root="./data", 
    filenames=["gtex_genes.csv", "gene_graph.csv", "disgenet_with_gene_id.csv"],
    test_size=0.2,
    val_size=0.0,
    transform=NormalizeFeatures())

Processing...
100%|██████████| 60446/60446 [00:02<00:00, 21038.57it/s]
  test_validation = disgenet_grouped.apply(lambda x: x.sample(frac=0.2, random_state=1))
  test = test_validation_grouped.apply(lambda x: x.sample(frac=0.50, random_state=1))
  test_validation = disgenet_grouped.apply(lambda x: x.sample(frac=0.2, random_state=1))
  test = test_validation_grouped.apply(lambda x: x.sample(frac=0.50, random_state=1))
100%|██████████| 19621221/19621221 [13:42<00:00, 23844.66it/s]
100%|██████████| 2452697/2452697 [01:31<00:00, 26941.71it/s]
100%|██████████| 2452611/2452611 [01:10<00:00, 34989.55it/s]
Done!


A gráf kirajzoltatása
(nagyon lassan fut le!!!!!!!!! --> 1 óra volt a colab-ban)

In [5]:
# import networkx as nx
# from torch_geometric.utils import to_networkx
# import matplotlib.pyplot as plt

# G = to_networkx(dataset[0], to_undirected=True)
# plt.figure(figsize=(100, 100))
# nx.draw(G, with_labels=False, node_color='lightblue', font_weight='bold')
# plt.savefig("graph.svg", format="svg")

disgenetet úgy tovább szűrni, hogy az egyes betegséghez legalább x gén tartozzon --> végén majd kiprobálni, hogy nem szürök rajtuk

GCN --> a veszteség függvény legyen jó, sima bináris osztályozás

keresztvalidáció

mátrixokkal dolgozzak

ha kiegyensulyozatlan akkor --> f1 score, avg precision, precision-recall görbe, (olyan metrikákat használjak)
                                    dúsitást NEEEE


In [6]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch.nn import Linear

In [7]:
# The simple GCN modell
class GCN(torch.nn.Module):
    def __init__(self, dataset, hidden_channels):
        super().__init__()
        torch.manual_seed(42)

        self.conv1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.out = Linear(hidden_channels, dataset.x.shape[1]) # dimension of disies


    def forward(self, x, edge_index, edge_weight):
        x = self.conv1(x, edge_index, edge_weight)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        
        x = self.conv2(x, edge_index, edge_weight)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        
        x = F.sigmoid(self.out(x))
        return x
    
    #ahány betegség annyi osztály
    # minden egyes betegség esetén oda tartozik vagy nem és ezt edja vissza
    #x az osszes csomopont most 

In [8]:
data = dataset[0]

model = GCN(data, hidden_channels=16).to("cpu")

learning_rate = 0.01
decay = 5e-4
optimizer = torch.optim.Adam(model.parameters(), 
                             lr=learning_rate, 
                             weight_decay=decay)

criterion = torch.nn.CrossEntropyLoss()

def train():
      model.train()
      optimizer.zero_grad() 

      # Use all data as input, because all nodes have node features
      out = model(data.x, data.edge_index, data.edge_weight)  
      # Only use nodes with labels available for loss calculation --> mask
      loss = criterion(out[data.train_mask], data.y[data.train_mask])  
      loss.backward() 
      optimizer.step()
      return loss

def test():
      model.eval()
      out = model(data.x, data.edge_index, data.edge_weight)  

      # Check against ground-truth labels.
      #The masks are matrices
      test_correct = out[data.test_mask] == data.y[data.test_mask]  
      # Derive ratio of correct predictions.
      test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  
      return test_acc

I need to create a map for idx - geneid, idx - diseaseid

In [9]:
mapper = IdMapper("./data/raw/gtex_genes.csv", "./data/raw/disgenet_with_gene_id.csv")

In [10]:
gene_mapps = mapper.genes_idx_to_id_map()
data.x[0]

tensor([0.0000e+00, 2.7620e+01, 2.4007e+01, 1.4491e+01, 9.3333e+00, 9.2084e+00,
        7.3940e+00, 1.3787e+01, 6.1896e+00, 4.7797e+00, 5.8841e+00, 1.7619e+00,
        2.3907e+00, 4.0218e+00, 4.2509e+00, 4.8172e+00, 7.1494e+00, 5.8944e+00,
        3.9228e+00, 5.3545e+00, 5.5959e+00, 3.0495e+01, 1.5130e+01, 6.7167e-02,
        3.3303e+01, 3.6979e+01, 9.7219e+00, 2.4643e+01, 7.1042e+00, 3.2595e+01,
        6.1625e+00, 2.0533e+01, 4.3448e+00, 1.9549e+00, 1.0834e+01, 2.5802e+01,
        1.9942e+01, 1.2093e+01, 2.9400e+01, 1.8344e+00, 2.9841e+01, 7.3266e+01,
        6.7440e+00, 4.5929e+01, 1.9061e+01, 8.1349e+00, 8.8486e+00, 1.4194e+01,
        7.7756e+00, 9.9295e+00, 6.5609e+01, 1.9163e+01, 2.8292e+01, 2.6099e+01,
        7.1083e-02])

In [11]:
losses = []
for epoch in range(0, 1001):
    loss = train()
    losses.append(loss)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

KeyError: 1553