<a href="https://colab.research.google.com/github/NagypalMarton/DeepLearning_Assignment-Disgenet/blob/main/M%C3%A9lytanul%C3%A1s_Beadand%C3%B3_Csibi_Alexandra%2C_Nagyp%C3%A1l_M%C3%A1rton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Disease-gene interaction prediction with graph neural networks

The goal of this project is to create a graph neural network for predicting disease-gene associations. Working with DisGeNET, a comprehensive database of these associations, you'll apply deep learning to an important challenge of bioinformatics. By choosing this project, you'll gain experience in the intersection of deep learning and bioinformatics while extracting valuable insights from real-world data.

Dataset:
https://www.disgenet.org/

Related GitHub repository:
https://github.com/pyg-team/pytorch_geometric

Related papers:
https://arxiv.org/abs/1607.00653
https://arxiv.org/abs/1611.07308

# Konténerizáció

A Dockerfile és a requirements.txt fájl a GitHubon érhető el.

# Adatgyűjtés
*(Data acquisition)*



Szükséges csomagok

In [None]:
import csv
import json
import os
import time

import pandas as pd
import requests

Szükséges változók deklarálása

In [None]:
# The DisGeNET API key is read from the environment instead of being committed to the
# notebook: a key stored in source control has to be treated as leaked and rotated.
# Set it before starting the kernel, e.g. `export DISGENET_API_KEY=...`.
API_KEY = os.environ.get("DISGENET_API_KEY", "")
if not API_KEY:
    print("Warning: DISGENET_API_KEY is not set; requests will be sent without a valid API key.")

# Base query parameters shared by the request helpers
# (the disease IDs themselves are collected at runtime by get_disease_ids)
params = {
    "page_number": 0,
    "type": "disease"
}

# HTTP headers sent with every request
headers = {
    'Authorization': API_KEY,
    'accept': 'application/json'
}

# API endpoints
url_gda = "https://api.disgenet.com/api/v1/gda/summary"
url_disease = "https://api.disgenet.com/api/v1/entity/disease"

Kérések küldésének függvényei

In [None]:
# Function to handle API requests with rate-limiting handling
def make_request(url, params, headers, max_retries=5):
    """Send a GET request, retrying on HTTP 429 and on transport errors.

    Args:
        url: Endpoint to query.
        params: Query-string parameters passed to ``requests.get``.
        headers: HTTP headers passed to ``requests.get``.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The response of the first attempt that was not rate-limited (it may
        still carry an error status other than 429), or ``None`` when every
        attempt was rate-limited or raised a ``requests`` exception.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(url, params=params, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
            time.sleep(2)  # Wait before retrying
            continue

        if response.status_code != 429:
            return response  # Success, or an error other than rate limiting

        # Rate-limited: honour the server's hint, falling back to 60 s when the
        # header is missing or not an integer.
        raw_wait = response.headers.get('x-rate-limit-retry-after-seconds', 60)
        try:
            wait_time = int(raw_wait)
        except (TypeError, ValueError):
            wait_time = 60
        # A hint of 0 would re-send the request immediately and waste an attempt
        wait_time = max(wait_time, 1)
        print(f"Rate limit exceeded. Waiting {wait_time} seconds...")
        time.sleep(wait_time)

    return None  # Retries are exhausted

In [None]:
def get_max_pages(url, params=params, headers=headers):
    """Return how many result pages `url` offers for the query, capped at 100.

    Args:
        url: Endpoint to query.
        params: Query parameters (defaults to the module-level `params`).
        headers: HTTP headers (defaults to the module-level `headers`).

    Returns:
        The page count derived from the response's "paging" section, at most
        100; 100 when the request fails; 0 when the first page holds no
        elements.
    """
    response = make_request(url, params=params, headers=headers)
    # make_request returns None once its retries are exhausted
    if response is None or not response.ok:
        print("Request failed, returned max_pages=100")
        return 100

    paging = response.json().get("paging", {})
    total_results = paging.get("totalElements", 0)
    results_in_page = paging.get("totalElementsInPage", 0)
    if not results_in_page:
        # An empty first page would otherwise cause a division by zero
        return 0
    # Ceiling division; 100 is the maximum number of pages the API allows
    return min((total_results + results_in_page - 1) // results_in_page, 100)

In [None]:
def get_disease_ids(disease_type):
    """Collect the MONDO IDs of all diseases matching a free-text search.

    Up to 100 pages of the disease endpoint are requested; paging stops at the
    first failed request or the first empty page.

    Args:
        disease_type: Free-text search string, e.g. "cancer".

    Returns:
        List of IDs formatted as "MONDO_<code>" in the order received
        (duplicates are possible).
    """
    disease_ids = []
    # Work on a copy: writing the search string into the shared module-level
    # `params` would silently add it to every later request built from it.
    query = dict(params, disease_free_text_search_string=disease_type)

    for page in range(100):
        query['page_number'] = str(page)
        response_disease = make_request(url_disease, query, headers)
        if response_disease is None or not response_disease.ok:
            # make_request returns None when its retries are exhausted
            status = "no response" if response_disease is None else response_disease.status_code
            print(f"Failed to fetch data for page {page}. Status code: {status}")
            break

        data = response_disease.json().get("payload", []) or []
        if not data:
            break  # No more results, skip the remaining pages

        for item in data:
            for code_info in item.get("diseaseCodes", []) or []:
                if code_info.get("vocabulary") == "MONDO":
                    disease_ids.append(f'MONDO_{code_info.get("code")}')
    return disease_ids

In [None]:
def download_gda(disease_ids):
    """Download the gene-disease association summaries for the given diseases.

    Up to 100 pages of the GDA endpoint are requested; paging stops at the
    first failed request or the first empty page.

    Args:
        disease_ids: Value sent as the `disease` query parameter.

    Returns:
        List with the payload records of all pages fetched successfully.
    """
    gda_data = []
    # Work on a copy so the shared module-level `params` is not modified
    query = dict(params, disease=disease_ids)

    for page in range(100):
        query['page_number'] = str(page)  # Request the next page
        response_gda = make_request(url_gda, query, headers)
        if response_gda is None or not response_gda.ok:
            # make_request returns None when its retries are exhausted
            status = "no response" if response_gda is None else response_gda.status_code
            print(f"Failed to fetch data for page {page}. Status code: {status}")
            break  # Leave the loop on an error or when there are no more pages

        data = response_gda.json().get("payload", []) or []
        if not data:
            break  # No more results, skip the remaining pages
        gda_data.extend(data)

    return gda_data


In [None]:
def download_all_gda(ids, chunk_size=100):
    """Download the associations for `ids` in batches and save them as CSV.

    Each batch of at most `chunk_size` IDs is joined with ", ", wrapped in
    double quotes and handed to `download_gda`. The combined records are
    written to 'disgenet-GDA.csv' in the working directory.

    Args:
        ids: Sequence of disease ID strings.
        chunk_size: Number of IDs per request batch.
    """
    records = []
    for start in range(0, len(ids), chunk_size):
        batch = ids[start:start + chunk_size]
        quoted_batch = '"{}"'.format(', '.join(batch))
        records += download_gda(quoted_batch)

    pd.DataFrame(records).to_csv('disgenet-GDA.csv', index=False)
    print("All data saved to disgenet-GDA.csv")

In [None]:
# Collect the MONDO IDs of every disease matching the free-text search "cancer"
ids = get_disease_ids("cancer")
print(len(ids))

Rate limit exceeded. Waiting 23 seconds...
556


In [None]:
# De-duplicate the IDs. Sorting keeps the composition of the request chunks identical
# between runs (the iteration order of a set of strings changes per interpreter start).
unique_ids = sorted(set(ids))
download_all_gda(unique_ids)

Rate limit exceeded. Waiting 19 seconds...
Rate limit exceeded. Waiting 0 seconds...
Rate limit exceeded. Waiting 2 seconds...
Rate limit exceeded. Waiting 0 seconds...
Rate limit exceeded. Waiting 13 seconds...
Rate limit exceeded. Waiting 0 seconds...
Rate limit exceeded. Waiting 4 seconds...
Rate limit exceeded. Waiting 0 seconds...
Rate limit exceeded. Waiting 13 seconds...
Rate limit exceeded. Waiting 3 seconds...
Rate limit exceeded. Waiting 0 seconds...
Rate limit exceeded. Waiting 14 seconds...
Rate limit exceeded. Waiting 13 seconds...
Rate limit exceeded. Waiting 0 seconds...
Rate limit exceeded. Waiting 15 seconds...
All data saved to disgenet-GDA.csv


# Adat feldolgozás

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import re
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the gene-disease associations from the CSV written by download_all_gda
GDA_df=pd.read_csv('disgenet-GDA.csv', sep=',')
GDA_df.head()

Unnamed: 0,assocID,symbolOfGene,geneNcbiID,geneEnsemblIDs,geneNcbiType,geneDSI,geneDPI,genepLI,geneProteinStrIDs,geneProteinClassIDs,...,diseaseClasses_DO,diseaseClasses_HPO,numCTsupportingAssociation,chemicalsIncludedInEvidence,numberPmidsWithChemsIncludedInEvidenceBySource,score,yearInitial,yearFinal,el,ei
0,10167200,MSH2,4436,['ENSG00000095002'],protein-coding,0.39,0.957,4e-06,['P43246'],['DTO_05007557'],...,"['genetic disease (630)', 'syndrome (225)']",[],3,,"[{'source': 'ALL', 'numPmids': 15}, {'source':...",1.0,1975.0,2023.0,,0.970979
1,15586201,AKT1,207,['ENSG00000142208'],protein-coding,0.283,0.957,0.99533,"['B0LPE5', 'P31749', 'B3KVH4']",['DTO_03300101'],...,"['disease of cellular proliferation (14566)', ...",['Abnormality of the respiratory system (02086...,3,,"[{'source': 'ALL', 'numPmids': 81}, {'source':...",1.0,2003.0,2012.0,,0.965812
2,20723234,CTNNB1,1499,['ENSG00000168036'],protein-coding,0.278,0.913,1.0,"['B4DGU4', 'P35222']",[],...,"['disease of cellular proliferation (14566)', ...",['Abnormality of the digestive system (25031)'...,2,,"[{'source': 'ALL', 'numPmids': 24}, {'source':...",1.0,2014.0,2014.0,,0.959596
3,20294252,TP53,7157,['ENSG00000141510'],protein-coding,0.256,0.957,0.99795,"['A0A087WXZ1', 'A0A087X1Q1', 'P04637', 'A0A087...",['DTO_05007542'],...,"['disease of cellular proliferation (14566)', ...",['Abnormality of the digestive system (25031)'...,0,,"[{'source': 'ALL', 'numPmids': 13}, {'source':...",1.0,2014.0,2015.0,,0.905
4,15608474,STK11,6794,['ENSG00000118046'],protein-coding,0.401,0.913,0.99991,"['A0A0S2Z4D1', 'Q15831']",['DTO_03300101'],...,"['disease of cellular proliferation (14566)', ...",['Abnormality of the respiratory system (02086...,5,,"[{'source': 'ALL', 'numPmids': 6}, {'source': ...",1.0,2007.0,2007.0,,0.940678


In [None]:
# List-valued fields are stored as strings in the CSV, so an empty list appears as the
# literal '[]'; turn those cells into NaN so they are counted as missing values
GDA_df = GDA_df.map(lambda x: np.nan if x == '[]' else x)
GDA_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18089 entries, 0 to 18088
Data columns (total 27 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   assocID                                         18089 non-null  int64  
 1   symbolOfGene                                    18089 non-null  object 
 2   geneNcbiID                                      18089 non-null  int64  
 3   geneEnsemblIDs                                  17742 non-null  object 
 4   geneNcbiType                                    18089 non-null  object 
 5   geneDSI                                         18089 non-null  float64
 6   geneDPI                                         18089 non-null  float64
 7   genepLI                                         16434 non-null  float64
 8   geneProteinStrIDs                               17342 non-null  object 
 9   geneProteinClassIDs                    

In [None]:
# Print the number of distinct values of every column
for column in GDA_df.columns:
  print(f"{column}: {GDA_df[column].nunique()}")

assocID: 13961
symbolOfGene: 4669
geneNcbiID: 4669
geneEnsemblIDs: 4553
geneNcbiType: 7
geneDSI: 375
geneDPI: 23
genepLI: 3399
geneProteinStrIDs: 5755
geneProteinClassIDs: 21
geneProteinClassNames: 21
diseaseVocabularies: 349
diseaseName: 349
diseaseType: 1
diseaseUMLSCUI: 349
diseaseClasses_MSH: 70
diseaseClasses_UMLS_ST: 2
diseaseClasses_DO: 12
diseaseClasses_HPO: 25
numCTsupportingAssociation: 60
chemicalsIncludedInEvidence: 0
numberPmidsWithChemsIncludedInEvidenceBySource: 605
score: 19
yearInitial: 57
yearFinal: 40
el: 6
ei: 853


In [None]:
# Encode the disease UMLS CUI strings as integer labels so that they can be used as
# numeric disease IDs in the GNN
label_encoder = LabelEncoder()
GDA_df['diseaseUMLSCUI_encoded'] = label_encoder.fit_transform(GDA_df['diseaseUMLSCUI'])

In [None]:
# Keep a single row per association ID and renumber the index from 0
GDA_df = GDA_df.drop_duplicates(subset=['assocID']).reset_index(drop=True)

In [None]:
# Gene and disease mappings, used to translate IDs back to readable labels:
#   gene_symbol_mapping:     gene NCBI ID       -> gene symbol
#   disease_encoded_mapping: encoded disease ID -> disease UMLS CUI
#   disease_name_mapping:    disease UMLS CUI   -> disease name
gene_symbol_mapping = GDA_df[['geneNcbiID', 'symbolOfGene']].drop_duplicates().set_index('geneNcbiID').to_dict()['symbolOfGene']
disease_encoded_mapping = GDA_df[['diseaseUMLSCUI_encoded', 'diseaseUMLSCUI']].drop_duplicates().set_index('diseaseUMLSCUI_encoded').to_dict()['diseaseUMLSCUI']
disease_name_mapping = GDA_df[['diseaseUMLSCUI', 'diseaseName']].drop_duplicates().set_index('diseaseUMLSCUI').to_dict()['diseaseName']

In [None]:
# Keep only the columns used from here on: the gene ID with its DSI/DPI/type features,
# the encoded disease ID with its class annotations, and the association ID and score
GDA_df = GDA_df[[
    'geneNcbiID',
    'geneDSI',
    'geneDPI',
    'geneNcbiType',
    'diseaseUMLSCUI_encoded',
    'diseaseClasses_MSH',
    'diseaseClasses_UMLS_ST',
    'assocID',
    'score'
]]

In [None]:
# One-hot encoding geneNcbiType
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_geneNcbiType = enc.fit_transform(GDA_df[['geneNcbiType']])
# Name the indicator columns 'geneType_<x>', where <x> is the text after the last
# underscore of the encoder's feature name
columns = ['geneType_' + col.split('_')[-1] for col in enc.get_feature_names_out(['geneNcbiType'])]
encoded_df = pd.DataFrame(encoded_geneNcbiType, columns=columns)
# Append the indicator columns and drop the original categorical column
GDA_df = pd.concat([GDA_df.reset_index(drop=True), encoded_df], axis=1).drop('geneNcbiType', axis=1)

In [None]:
# Process diseaseClasses_UMLS_ST and diseaseClasses_MSH
# Extracting IDs and names into a mapping
def extract_mapping(col):
    """Build an ``{ID: name}`` lookup from stringified class lists.

    Every non-null entry of `col` is searched for quoted items of the form
    ``'<name> (<ID>)'``. Surrounding whitespace is stripped from both parts;
    when an ID occurs more than once, the name seen last is kept.

    Args:
        col: Iterable of strings (null entries are skipped).

    Returns:
        Dictionary mapping each ID to its name.
    """
    item_pattern = re.compile(r"'(.+?)\s+\((.+?)\)'")
    lookup = {}
    for raw in col:
        if pd.isnull(raw):
            continue
        lookup.update(
            (code.strip(), label.strip()) for label, code in item_pattern.findall(raw)
        )
    return lookup

In [None]:
# Build one ID -> name lookup covering both the UMLS semantic types and the MSH classes
diseaseClass_mapping = extract_mapping(GDA_df['diseaseClasses_UMLS_ST'])
diseaseClass_mapping.update(extract_mapping(GDA_df['diseaseClasses_MSH']))
diseaseClass_mapping

{'T191': 'Neoplastic Process',
 'T047': 'Disease or Syndrome',
 'C06': 'Digestive System Diseases',
 'C04': 'Neoplasms',
 'C16': 'Congenital, Hereditary, and Neonatal Diseases and Abnormalities',
 'C18': 'Nutritional and Metabolic Diseases',
 'C08': 'Respiratory Tract Diseases',
 'C19': 'Endocrine System Diseases',
 'C12': 'Urogenital Diseases',
 'C17': 'Skin and Connective Tissue Diseases',
 'C10': 'Nervous System Diseases',
 'C09': 'Otorhinolaryngologic Diseases',
 'C07': 'Stomatognathic Diseases',
 'C20': 'Immune System Diseases',
 'C15': 'Hemic and Lymphatic Diseases',
 'C14': 'Cardiovascular Diseases',
 'C23': 'Pathological Conditions, Signs and Symptoms',
 'C11': 'Eye Diseases',
 'C01': 'Infections',
 'C05': 'Musculoskeletal Diseases'}

In [None]:
# Keep only IDs for simplicity
def clean_classes(entry):
    """Extract the class IDs, i.e. the texts inside parentheses, from an entry.

    Args:
        entry: Stringified class list such as "['Neoplasms (C04)']", a list
            that has already been cleaned, or a missing value.

    Returns:
        List of the parenthesised IDs with surrounding whitespace stripped;
        a copy of `entry` when it is already a list; an empty list for any
        other non-string value (e.g. NaN).
    """
    if isinstance(entry, list):
        # Already cleaned: keeping the IDs makes re-running the cell harmless
        # instead of replacing every list with an empty one.
        return list(entry)
    if isinstance(entry, bytes):
        # The str regex below cannot be applied to bytes
        entry = entry.decode('utf-8', errors='replace')
    if isinstance(entry, str):
        return [match.strip() for match in re.findall(r'\((.*?)\)', entry)]
    return []

# Replace the raw class strings of both columns by lists of class IDs
GDA_df['diseaseClasses_UMLS_ST'] = GDA_df['diseaseClasses_UMLS_ST'].apply(clean_classes)
GDA_df['diseaseClasses_MSH'] = GDA_df['diseaseClasses_MSH'].apply(clean_classes)

In [None]:
# Merge the two ID lists of each row into one de-duplicated list, so that rows without
# MSH classes still carry the IDs from diseaseClasses_UMLS_ST.
# The order inside each list is arbitrary because it comes from a set.
GDA_df['diseaseClass'] = GDA_df.apply(
    lambda row: list(set(row['diseaseClasses_UMLS_ST'] + row['diseaseClasses_MSH'])),
    axis=1
)

In [None]:
# Each row holds a list of disease class IDs, so MultiLabelBinarizer is used to create
# one 0/1 indicator column ('diseaseClass_<ID>') per class
mlb = MultiLabelBinarizer()
encoded_diseaseClass = mlb.fit_transform(GDA_df['diseaseClass'])
enc_df = pd.DataFrame(encoded_diseaseClass, columns=['diseaseClass_' + cols for cols in mlb.classes_])
# Append the indicator columns next to the existing ones
GDA_df = pd.concat([GDA_df.reset_index(drop=True), enc_df], axis=1)

In [None]:
# Show every column whose name starts with 'diseaseClass': the list-valued columns as
# well as the new indicator columns
disease_class_cols = [col for col in GDA_df.columns if col.startswith('diseaseClass')]
GDA_df[disease_class_cols].head()

Unnamed: 0,diseaseClasses_MSH,diseaseClasses_UMLS_ST,diseaseClass,diseaseClass_C01,diseaseClass_C04,diseaseClass_C05,diseaseClass_C06,diseaseClass_C07,diseaseClass_C08,diseaseClass_C09,...,diseaseClass_C14,diseaseClass_C15,diseaseClass_C16,diseaseClass_C17,diseaseClass_C18,diseaseClass_C19,diseaseClass_C20,diseaseClass_C23,diseaseClass_T047,diseaseClass_T191
0,"[C06, C04, C16, C18]",[T191],"[C04, C16, T191, C18, C06]",0,1,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1
1,"[C04, C08]",[T191],"[C04, C08, T191]",0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,"[C06, C04]",[T191],"[C06, C04, T191]",0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,"[C19, C06, C04]",[T191],"[C06, C04, C19, T191]",0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,"[C04, C08]",[T191],"[C04, C08, T191]",0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Drop the list-valued class columns; their content is now in the indicator columns
GDA_df = GDA_df.drop(['diseaseClasses_UMLS_ST', 'diseaseClasses_MSH', 'diseaseClass'], axis=1)

In [None]:
# Check the remaining columns, their dtypes and non-null counts
GDA_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13961 entries, 0 to 13960
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   geneNcbiID                  13961 non-null  int64  
 1   geneDSI                     13961 non-null  float64
 2   geneDPI                     13961 non-null  float64
 3   diseaseUMLSCUI_encoded      13961 non-null  int64  
 4   assocID                     13961 non-null  int64  
 5   score                       13961 non-null  float64
 6   geneType_biological-region  13961 non-null  float64
 7   geneType_ncRNA              13961 non-null  float64
 8   geneType_other              13961 non-null  float64
 9   geneType_protein-coding     13961 non-null  float64
 10  geneType_pseudo             13961 non-null  float64
 11  geneType_snoRNA             13961 non-null  float64
 12  geneType_tRNA               13961 non-null  float64
 13  diseaseClass_C01            139

In [None]:
# Use short ID column names from here on: geneNcbiID -> geneID,
# diseaseUMLSCUI_encoded -> diseaseID
id_column_names = {'geneNcbiID': 'geneID', 'diseaseUMLSCUI_encoded': 'diseaseID'}
GDA_df = GDA_df.rename(columns=id_column_names)
GDA_df.head()

Unnamed: 0,geneID,geneDSI,geneDPI,diseaseID,assocID,score,geneType_biological-region,geneType_ncRNA,geneType_other,geneType_protein-coding,...,diseaseClass_C14,diseaseClass_C15,diseaseClass_C16,diseaseClass_C17,diseaseClass_C18,diseaseClass_C19,diseaseClass_C20,diseaseClass_C23,diseaseClass_T047,diseaseClass_T191
0,4436,0.39,0.957,31,10167200,1.0,0.0,0.0,0.0,1.0,...,0,0,1,0,1,0,0,0,0,1
1,207,0.283,0.957,48,15586201,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,1
2,1499,0.278,0.913,77,20723234,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,1
3,7157,0.256,0.957,61,20294252,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,1,0,0,0,1
4,6794,0.401,0.913,48,15608474,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Save the preprocessed table (without the index column)
GDA_df.to_csv('preprocessed_GDA_df_cancer.csv', index=False)

# Adathalmaz előkészítés a modellhez


In [None]:
import networkx as nx

# Create a directed graph; every edge points from a gene to an associated disease
G = nx.DiGraph()

# NOTE(review): gene IDs (NCBI numbers such as 207) and disease IDs (label-encoded,
# starting at 0) are both plain integers and share the node key space. A gene and a
# disease with the same number end up as ONE node, and the second add_nodes_from call
# overwrites its node_type. Confirm this is acceptable or namespace the keys, e.g.
# ('gene', id) / ('disease', id), together with every cell that indexes G.

# Adding gene nodes
gene_ids = GDA_df['geneID'].unique()
G.add_nodes_from(gene_ids, node_type='gene')

# Adding disease nodes
disease_ids = GDA_df['diseaseID'].unique()
G.add_nodes_from(disease_ids, node_type='disease')

# Adding edges (gene to disease), carrying the association ID and score as attributes.
# The columns are iterated directly: iterrows() would build one Series per row and, as
# all columns are numeric, upcast the integer IDs and assocID to float64.
for gene_id, disease_id, assoc_id, score in zip(
    GDA_df['geneID'], GDA_df['diseaseID'], GDA_df['assocID'], GDA_df['score']
):
    G.add_edge(gene_id, disease_id, assocID=assoc_id, score=score)

In [None]:
# Create gene features DataFrame: one row per gene holding the mean of geneDSI, geneDPI
# and of every 'geneType_' indicator column over all rows of that gene
gene_features = GDA_df.groupby('geneID').agg({
    'geneDSI': 'mean',
    'geneDPI': 'mean',
    **{col: 'mean' for col in GDA_df.columns if 'geneType_' in col}
}).reset_index()

# Convert to dictionary of the form {geneID: {feature name: value}}
gene_features_dict = gene_features.set_index('geneID').to_dict(orient='index')


In [None]:
# Create disease features DataFrame (can be adjusted based on relevant disease features):
# one row per disease holding the mean of every 'diseaseClass_' indicator column.
# The grouping key is 'diseaseID': 'diseaseUMLSCUI_encoded' was renamed to it earlier
# and no longer exists in GDA_df.
disease_features = GDA_df.groupby('diseaseID').agg({
    col: 'mean' for col in GDA_df.columns if 'diseaseClass_' in col
}).reset_index()

# Convert to dictionary of the form {diseaseID: {feature name: value}}
disease_features_dict = disease_features.set_index('diseaseID').to_dict(orient='index')


SyntaxError: unterminated string literal (detected at line 7) (<ipython-input-44-8d06b282594b>, line 7)

In [None]:
# Adding edge features (association ID and score) to the gene -> disease edges.
# The ID columns are called 'geneID' and 'diseaseID' since the earlier rename; the old
# names 'geneNcbiID' / 'diseaseUMLSCUI_encoded' would raise a KeyError here.
for gene_id, disease_id, assoc_id, score in zip(
    GDA_df['geneID'], GDA_df['diseaseID'], GDA_df['assocID'], GDA_df['score']
):
    G.edges[gene_id, disease_id]['assocID'] = assoc_id
    G.edges[gene_id, disease_id]['score'] = score


In [None]:
from sklearn.model_selection import train_test_split

# Get all edges as (source, target, attribute dict) triples
edges = list(G.edges(data=True))

# Split edges into train (80 %) and test (20 %) sets; the fixed random_state makes the
# split reproducible
train_edges, test_edges = train_test_split(edges, test_size=0.2, random_state=42)

# Creating DataFrames for train and test edges (optional); the 'data' column holds the
# attribute dict of each edge
train_edges_df = pd.DataFrame(train_edges, columns=['source', 'target', 'data'])
test_edges_df = pd.DataFrame(test_edges, columns=['source', 'target', 'data'])
