In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data, DataLoader
import numpy as np
from graphdatascience import GraphDataScience
from py2neo import Graph
from neo4j import GraphDatabase
import networkx as nx
import matplotlib.pyplot as plt
import torch.nn.functional as F
import pandas as pd
import neo4jupyter
from torch_geometric.nn import GCNConv
from torch_geometric.utils import negative_sampling, remove_self_loops, add_self_loops, to_dense_adj, dense_to_sparse
from torch_geometric.nn import VGAE,GAE
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from torch_geometric.utils import to_dense_adj
#neo4jupyter.init_notebook_mode()
import tqdm as notebook_tqdm

In [3]:
import pickle
# Load the node-embedding object from the .pkl file.
# NOTE(review): later cells look entries up by node name (`key in node_embeddings`,
# `node_embeddings[key]`), so this is presumably a name -> vector mapping rather
# than a plain list of arrays — confirm against the code that wrote the file.
# SECURITY: pickle.load can execute arbitrary code; only open files from a trusted source.
with open('node_embeddings.pkl', 'rb') as pkl_file:
    extracted_data = pickle.load(pkl_file)

In [4]:
# Alias the unpickled object under the name the rest of the notebook uses.
node_embeddings = extracted_data

In [5]:
# Sanity check: number of entries in the loaded embedding object.
len(node_embeddings)

33277

In [6]:
# Node tables exported as CSV; the `fname` column of each is read in the next cell.
diseases = pd.read_csv("Disease.csv")
gene = pd.read_csv("Gene.csv")

In [7]:
# Plain Python lists of the node names held in each table's `fname` column.
diseases_nodes = diseases["fname"].tolist()
gene_nodes = gene["fname"].tolist()

In [8]:
# Restrict the embedding lookup to disease nodes and to gene nodes respectively.
# Names that have no entry in `node_embeddings` are silently left out.
diseases_embeddings = {
    name: node_embeddings[name]
    for name in diseases_nodes
    if name in node_embeddings
}
gene_embeddings = {
    name: node_embeddings[name]
    for name in gene_nodes
    if name in node_embeddings
}

### Gene-disease association prediction: heart failure (HF)

In [9]:
# Heart-failure node table, plus the full gene table used to enumerate candidate genes.
# Note: 'Gene.csv' is the same file that was already loaded as `gene` earlier.
df_HF = pd.read_csv("HF_all_nodes.csv")
All_Gene_nodes = pd.read_csv("Gene.csv")

In [10]:
# Unique gene names, kept in first-seen (CSV) order.
# `list(set(...))` yields the same members, but string hashing is randomised per
# interpreter run, so its order — and therefore the order of every list derived
# from it below — would differ between kernel restarts. `dict.fromkeys`
# de-duplicates with the same hash/equality rules while preserving a stable order.
list_all_gene = list(dict.fromkeys(All_Gene_nodes.fname))

In [4086]:
# Gene symbols used as the positive class for heart failure: the cells below
# build their feature vectors from this list and assign them Label = 1.
# NOTE(review): the provenance of this list is not recorded in the notebook — TODO document the source.
HF_gene_pos = ['PIK3IP1', 'SERPINE1', 'IL6', 'FIBIN', 'EDNRA', 'KAT8', 'ADRA1D', 'PTH', 'ALOX15', 'SLC9A1', 'PTGS1', 'IFNG', 'CIDEA', 'UCP1',
 'ATP2A1', 'NPPA', 'MYH6', 'TNFRSF1A', 'CYRIB', 'WDR45', 'APLN', 'NRG1', 'MS4A6A', 'NOS2', 'MSTN', 'SIRT1', 'PTPN3', 'VEGFA',
 'ALB', 'PTP4A2', 'CXCL2', 'BAMBI', 'INS', 'APCS', 'EPN3', 'EDNRB', 'ATP2A2', 'RAC1', 'HSPB1', 'ACACA', 'CFD', 'POMC', 'PTGS2',
 'ROCK2', 'NR3C2', 'GHRL', 'SOD1', 'PON1','AGTR1', 'PDPK1', 'PIK3CG', 'ADIPOQ', 'PLAT', 'EPHX2', 'ADRB3', 'HMOX1', 'PCK1', 'TNF', 'TRDN', 'FASN', 'CXCL8', 'NPPB',
 'AVP', 'NUPR1', 'SFRP1', 'CSF2', 'HAND2', 'AGT', 'PDGFRA', 'TLR2', 'APOC1', 'FIP1L1', 'FXYD3', 'ITGB1', 'MYH7', 'HAMP', 'REN',
 'HTR2B', 'PPP1R1A', 'PEBP1', 'CRP', 'NPR1', 'GCG', 'SOD3', 'CSF3', 'CCN2', 'COL4A1', 'RETN', 'PLXND1', 'PPARG', 'NRIP1', 'FBLN5',
 'NOX4', 'COL6A1', 'SOX4', 'CREG1', 'ADRB1', 'ACADS', 'CAT', 'ELOVL6', 'AKIP1', 'CYBB', 'NFE2L2', 'PRL', 'ADRA2C', 'NOS3', 'XDH',
 'CCL2', 'ANKRD23', 'EDN1', 'COL8A1', 'GRK2', 'TNNT2', 'ATP1A3', 'GSK3B', 'PPARGC1A', 'ACLY', 'AVPR2', 'ACE', 'KANK2', 'CLIC2',
 'VWF', 'PRKAR2B', 'CDC25B', 'FHL1', 'TBX20', 'CS', 'GPX4', 'SOD2', 'RBP4', 'IL1B', 'OLR1', 'DSTN', 'SIPA1L1', 'MAP2K7', 'KLF9', 
 'GDF15', 'CXCL3', 'UCN2','NOX1', 'ELK3', 'GATM', 'HIF1A']

In [4087]:
# Gene symbols used as the negative class for heart failure: the cells below
# build their feature vectors from this list and assign them Label = 0.
# NOTE(review): how these negatives were selected is not recorded in the notebook — TODO document.
HF_gene_neg = [
    'HABP2', 'CRACR2B', 'RBMY2BP', 'HINT1', 'PSG2', 'GPT', 'ADAMDEC1', 'IGKV1D-39', 'NBAS', 'PRDM12',
    'AAGAB', 'POC1B-GALNT4', 'GID4', 'OPA3', 'TEP1', 'UPP1', 'GPHA2', 'HINT3', 'BTF3L4', 'DNAL4',
    'NSMCE1', 'EP400P1', 'LRRC8E', 'FAM43A', 'PEX13', 'OR2H1', 'PAGR1', 'CORT', 'NSA2', 'ADAMTS4',
    'VWA8', 'FGF23', 'OPN1LW', 'MGST1', 'S1PR2', 'TAS2R13', 'WFDC9', 'SPTLC3', 'MAMDC2', 'PAPOLG',
    'SWAP70', 'NR2F1', 'CHTF8', 'KLHL10', 'KCNMB2', 'BCLAF3', 'UNC79', 'DLGAP3', 'RNF115', 'EMSY',
    'SLC15A3', 'CNPY1', 'WASH6P', 'CATSPERD', 'ESCO2', 'METTL27', 'ITFG2', 'SETD5', 'ARMC12', 'ST14',
    'APCDD1L', 'SCARNA27', 'ANKRD53', 'ZNF350', 'SLC25A31', 'GALNT6', 'WFDC2', 'IGHV4-31', 'SMIM2',
    'ETHE1', 'QRICH2', 'PPFIBP2', 'PCF11', 'ALDH5A1', 'SACS', 'HRC', 'C12orf29', 'C9orf153', 'SIKE1',
    'COX6A2', 'KLHL21', 'FAM131A', 'TAS2R7', 'HOATZ', 'CRYGS', 'RFLNA', 'GMEB1', 'CYP46A1', 'MIR425',
    'SMIM19', 'TPTEP1', 'METTL17', 'MED12L', 'SERF1A', 'AGPAT5', 'TMEM54', 'PATE1', 'KLHL3', 'C3orf35',
    'ABTB1', 'RPS6KC1', 'ABCB7', 'C16orf46', 'SPATA13', 'ZNF347', 'STEAP4', 'PCDH9', 'FREM1', 'AHSA1',
    'SPATA1', 'SOWAHC', 'IGLV3-19', 'IL17RD', 'GRSF1', 'CABCOCO1', 'ABR', 'ROBO1', 'TLR7', 'SH3BGR',
    'CYP27C1', 'KIF3B', 'H2AZ2', 'ESRRB', 'LAD1', 'PSORS1C3', 'ASCL4', 'TMIGD3', 'OSBPL7', 'CLP1',
    'HES5', 'SELENOM', 'IGLV3-12', 'VAX2', 'ARMC2', 'FECH', 'GSTT2', 'TCN1', 'PDZD11', 'ALDOB',
    'FAM72A', 'GAGE5', 'ENAH', 'IGKV4-1'
]

In [4094]:
# Every gene that already carries a label: positives first, then negatives.
list_of_pos_neg = [*HF_gene_pos, *HF_gene_neg]

In [4095]:
# Genes that are in neither the positive nor the negative list; these are the
# candidates the trained classifier is later asked to score.
# Membership is tested against a set (O(1) per lookup) instead of scanning the
# labelled list once per gene; the resulting list and its order are unchanged.
_labelled_genes = set(list_of_pos_neg)
list_uknown_genes = [item for item in list_all_gene if item not in _labelled_genes]

In [4096]:
# Sanity check: number of genes without a positive/negative label.
len(list_uknown_genes)

18320

In [4098]:
# Look the disease vector up once, before it is needed, and fail with a clear
# message if it is missing (otherwise np.concatenate would receive None and
# raise an error that does not mention the missing key).
HF_embeddings = diseases_embeddings.get('heart failure')
if HF_embeddings is None:
    raise KeyError("'heart failure' has no entry in diseases_embeddings")

# Feature vector for each gene = [gene embedding | heart-failure embedding].
# The disease half is the same vector for every row. Genes without an entry in
# `gene_embeddings` are skipped, so these lists may be shorter than the name lists.
gene_embeddigns_pos = [
    np.concatenate((gene_embeddings[key], HF_embeddings))
    for key in HF_gene_pos
    if key in gene_embeddings
]
gene_embeddigns_neg = [
    np.concatenate((gene_embeddings[key], HF_embeddings))
    for key in HF_gene_neg
    if key in gene_embeddings
]
uknown_gene_embeddigns = [
    np.concatenate((gene_embeddings[key], HF_embeddings))
    for key in list_uknown_genes
    if key in gene_embeddings
]

In [4101]:
# Display the 'heart failure' disease embedding (the vector appended to every gene embedding above).
diseases_embeddings.get('heart failure')

array([ 0.06355859,  1.5193712 ,  0.8581839 ,  0.13833424,  1.3158343 ,
       -3.0906117 ,  1.5077014 ,  0.0091086 ,  2.8876116 ,  5.1902757 ],
      dtype=float32)

In [4103]:
# One row per gene, one column per feature, plus a trailing 'Label' column:
# 1 for the positive genes, 0 for the negative genes.
gene_disease_pos = pd.DataFrame(np.array(gene_embeddigns_pos)).assign(Label=1)
gene_disease_neg = pd.DataFrame(np.array(gene_embeddigns_neg)).assign(Label=0)

In [4104]:
# Stack positives on top of negatives into a single labelled table.
# ignore_index=True gives the result a fresh 0..n-1 row index; without it the
# row labels of the two inputs (both starting at 0) would be duplicated, which
# makes later label-based selection (.loc) ambiguous. Row order and values are unchanged.
gene_disease_pos_neg = pd.concat([gene_disease_pos, gene_disease_neg], ignore_index=True)

In [4105]:
# Names of the unlabelled genes that have an embedding. The filter is the same
# one used when `uknown_gene_embeddigns` was built, so row i of the frame below
# belongs to uknown_genes_names[i].
uknown_genes_names = [
    name for name in list_uknown_genes if name in gene_embeddings
]
uknown_gene_embeddigns_df = pd.DataFrame(np.array(uknown_gene_embeddigns))

In [4106]:
import random

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

# Seed both global RNGs (NumPy's and Python's).
np.random.seed(123456)
random.seed(123456)

# Features are every column except the last; the last column ('Label') is the target.
X = gene_disease_pos_neg.iloc[:, :-1]
y = gene_disease_pos_neg.iloc[:, -1]

# Hold out 20 % of the labelled genes for testing; the split itself is pinned
# by its own random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=20
)

In [4113]:
# Hyper-parameter search for the SVM gene/heart-failure classifier, followed by
# an evaluation of the best model on the held-out test split.
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score
# Define the parameter grid
# NOTE(review): per scikit-learn's SVC documentation `gamma` applies to the
# 'rbf', 'poly' and 'sigmoid' kernels only, so the 'linear' candidates are
# presumably repeated once per gamma value — consider a list of per-kernel grids.
param_grid = {
    'C': [0.1, 1,5, 10, 50, 100, 200, 300, 500],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}

# Initialize the classifier
# NOTE(review): random.seed seeds Python's `random` module; the estimator is
# given its own random_state, so this call presumably does not influence the fit — confirm.
random.seed(123)
svm = SVC(probability=True, random_state=42)

# Initialize GridSearchCV
random.seed(123)
# 5-fold cross-validation over every grid combination, ranked by accuracy, using all CPU cores.
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy', verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_svm = grid_search.best_estimator_

print("Best parameters found: ", best_params)

# Evaluate the best estimator
y_pred_svm = best_svm.predict(X_test)
# Column 1 of predict_proba is taken as the probability of the positive class (Label = 1).
y_pred_proba_svm = best_svm.predict_proba(X_test)[:, 1]
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("ROC AUC:", roc_auc_score(y_test, y_pred_proba_svm))


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters found:  {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Accuracy: 0.7241379310344828
ROC AUC: 0.8002378121284186


In [4114]:
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

random.seed(123)

# Hyper-parameters fixed to the combination reported by the grid search
# (C = regularisation strength, gamma = kernel coefficient, kernel = kernel type).
tuned_params = dict(C=10, gamma=0.01, kernel='rbf')

# Build the final classifier with those settings and fit it on the training split.
svm = SVC(probability=True, random_state=42, **tuned_params)
svm.fit(X_train, y_train)

# Hard labels and positive-class probabilities for the held-out split.
y_pred_svm = svm.predict(X_test)
y_pred_proba_svm = svm.predict_proba(X_test)[:, 1]

# Report test-set performance.
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("ROC AUC:", roc_auc_score(y_test, y_pred_proba_svm))


Accuracy: 0.7241379310344828
ROC AUC: 0.8002378121284186


In [9]:
# Score every unlabelled gene with the fitted SVM.
# Both calls run once on the whole frame instead of once per row: each row is
# scored independently, so the values are the same, but ~18k pairs of tiny
# predict calls are avoided.
y_pred_svm_p = svm.predict(uknown_gene_embeddigns_df)
y_pred_proba_svm_p = svm.predict_proba(uknown_gene_embeddigns_df)[:, 1]

# Row i of the frame must belong to uknown_genes_names[i]; fail loudly if the
# two were built from different filters.
assert len(uknown_genes_names) == len(y_pred_svm_p), "gene names and predictions are misaligned"

# Keep only genes whose predicted label is positive, mapped to their
# positive-class probability.
svm_my_dict = {
    name: proba
    for name, label, proba in zip(uknown_genes_names, y_pred_svm_p, y_pred_proba_svm_p)
    if label > 0
}

# Same mapping, ordered from most to least probable.
pos_prd_prob_svm = dict(sorted(svm_my_dict.items(), key=lambda item: item[1], reverse=True))

# One summary line instead of one printed line per gene.
print(f"{len(svm_my_dict)} of {len(uknown_genes_names)} unlabelled genes predicted as heart-failure associated")