<a href="https://colab.research.google.com/github/IshitaSinghFaujdar/gene-disease-association/blob/main/bda_cia_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
pip install --upgrade networkx




In [6]:
pip install node2vec


Collecting node2vec
  Downloading node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Downloading node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Installing collected packages: node2vec
Successfully installed node2vec-0.5.0


In [1]:
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import networkx as nx
import requests



In [2]:

# Load data
df = pd.read_csv("CTD_curated_genes_diseases.tsv.gz", sep="\t", compression="gzip", skiprows=27)
print(df.columns)  # Check available columns
df.columns = df.columns.str.strip("#")
df.columns = df.columns.str.strip()  # Remove spaces
df.columns = df.columns.str.lower()  # Convert to lowercase

print(df.columns)  # Check if 'genesymbol' exists




Index(['# GeneSymbol', 'GeneID', 'DiseaseName', 'DiseaseID', 'DirectEvidence',
       'OmimIDs', 'PubMedIDs'],
      dtype='object')
Index(['genesymbol', 'geneid', 'diseasename', 'diseaseid', 'directevidence',
       'omimids', 'pubmedids'],
      dtype='object')


# Step 2: Apply Node Embeddings (Node2Vec)
Now, we'll apply Node2Vec to learn embeddings for each node.

1.  Train a Node2Vec model to learn numerical embeddings for each disease/gene.

2.  These embeddings will help in predicting missing links.

In [10]:

# Remove NaN values **before processing**
df.dropna(subset=["diseasename", "genesymbol"], inplace=True)

# Initialize the graph
G = nx.Graph()

# Extract unique diseases and genes
diseases = df["diseasename"].unique()
genes = df["genesymbol"].unique()

# Add nodes
G.add_nodes_from(diseases, bipartite=0)  # Diseases
G.add_nodes_from(genes, bipartite=1)     # Genes

# Add edges (disease-gene associations)
edges = list(df[["diseasename", "genesymbol"]].itertuples(index=False, name=None))
G.add_edges_from(edges)

# Check for NaN nodes in the graph
nan_nodes = [node for node in G.nodes if pd.isna(node)]
if nan_nodes:
    print(f"Removing NaN nodes: {nan_nodes}")
    G.remove_nodes_from(nan_nodes)

# Graph info
print('Number of nodes:', len(G.nodes))
print('Number of edges:', len(G.edges))
print('Average degree:', sum(dict(G.degree).values()) / len(G.nodes))




Number of nodes: 14964
Number of edges: 34128
Average degree: 4.561347233360064


In [14]:
from node2vec import Node2Vec

# Train Node2Vec
node2vec = Node2Vec(G, dimensions=32, walk_length=10, num_walks=50, workers=3)
model = node2vec.fit(window=5, min_count=1, batch_words=4)


Computing transition probabilities:   0%|          | 0/14964 [00:00<?, ?it/s]

# Step 3: Train a Link Prediction Model
We’ll use a binary classification model to predict new disease-gene links.


1.   Generate positive samples (existing edges).
2.  Generate negative samples (random unconnected disease-gene pairs).
3.  Extract feature vectors using Node2Vec embeddings.
4.  Train a Random Forest Classifier to predict missing edges.



In [15]:
# Generate positive edges (existing disease-gene links)
positive_edges = list(G.edges())

# Ensure negative edges are not in the graph
all_possible_edges = set((d, g) for d in diseases for g in genes)
negative_edges = list(all_possible_edges - set(positive_edges))
negative_edges = random.sample(negative_edges, len(positive_edges))



# Extract feature vectors from embeddings
def get_edge_embedding(edge):
    return np.concatenate((model.wv[edge[0]], model.wv[edge[1]]))

# Prepare dataset
X = np.array([get_edge_embedding(e) for e in positive_edges + negative_edges])
y = np.array([1] * len(positive_edges) + [0] * len(negative_edges))  # 1 = real link, 0 = fake link

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# Evaluate model
score = clf.score(X_test, y_test)
print(f"Model Accuracy: {score:.4f}")


Model Accuracy: 0.9540


# Step 4: Identify New Predictions

Once trained, we can predict the probability of new disease-gene links. Sort results by highest probability.

In [17]:
print("Available Diseases:\n")
for i, disease in enumerate(diseases, 1):
    print(f"{i}. {disease}")

Available Diseases:

1. Dermatitis
2. Diabetes Mellitus
3. Diabetes Mellitus, Type 2
4. Diabetic Nephropathies
5. Edema
6. Failure to Thrive
7. Fibrosis
8. Furunculosis
9. Hepatic Veno-Occlusive Disease
10. Hyperglycemia
11. Hyperplasia
12. Keratosis
13. Liver Neoplasms
14. Neoplasms
15. Obesity
16. Pigmentation Disorders
17. Prenatal Exposure Delayed Effects
18. Weight Gain
19. Hepatomegaly
20. Schizophrenia
21. Acute Kidney Injury
22. Adenoma, Liver Cell
23. Alzheimer Disease
24. Carcinoma, Hepatocellular
25. Colonic Neoplasms
26. Hepatolenticular Degeneration
27. Liver Cirrhosis
28. Liver Cirrhosis, Experimental
29. Lung Diseases
30. Lung Neoplasms
31. Nephrotic Syndrome
32. Otitis Media
33. Burkitt Lymphoma
34. Achalasia Addisonianism Alacrimia syndrome
35. Prostatic Neoplasms
36. Keratoderma, Palmoplantar
37. Keratosis palmoplantaris papulosa
38. Sleep Disorders, Circadian Rhythm
39. Charcot-Marie-Tooth Disease, Axonal, Type 2n
40. DEVELOPMENTAL AND EPILEPTIC ENCEPHALOPATHY 29
41.

In [20]:
# User Input Handling
disease = input("Enter a disease name: ").strip()

if disease not in set(diseases):
    print("Disease not found in dataset!")
else:
    existing_genes = set(df[df["diseasename"] == disease]["genesymbol"])
    potential_genes = list(set(genes) - existing_genes)

    # Predict Associations
    predictions = [
        (disease, gene, clf.predict_proba([get_edge_embedding((disease, gene))])[0][1])
        for gene in potential_genes
    ]

    predictions.sort(key=lambda x: x[2], reverse=True)

    print("\nTop 10 Predicted Gene Associations:")
    for disease, gene, prob in predictions[:10]:
        print(f"{disease} ↔ {gene} (Probability: {prob:.4f})")

Enter a disease name: CATARACT 41

Top 10 Predicted Gene Associations:
CATARACT 41 ↔ SLC39A5 (Probability: 0.4300)
CATARACT 41 ↔ SPG11 (Probability: 0.4100)
CATARACT 41 ↔ LRP5 (Probability: 0.4000)
CATARACT 41 ↔ PRPH2 (Probability: 0.3900)
CATARACT 41 ↔ C4A (Probability: 0.3700)
CATARACT 41 ↔ GPD2 (Probability: 0.3700)
CATARACT 41 ↔ SLC6A17 (Probability: 0.3700)
CATARACT 41 ↔ TMC6 (Probability: 0.3600)
CATARACT 41 ↔ IMPG2 (Probability: 0.3600)
CATARACT 41 ↔ RPGR (Probability: 0.3600)
