In [1]:
import pandas as pd

In [2]:
from neo4j import GraphDatabase

# Connect to the Neo4j database
uri = "bolt://localhost:7687"
username = "neo4j"
password = "12345678"
driver = GraphDatabase.driver(uri, auth=(username, password))


In [3]:
def _execute_query(query):
        with driver.session(database='hetio') as session:
            result = session.run(query)
            return [record for record in result.data()]

In [4]:
#CENTRALITY

In [5]:
#Degree Centrality

In [7]:
query = '''CALL gds.degree.stream('mygraph1')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS node, score
ORDER BY score DESC
LIMIT 10
;''' 
r = _execute_query(query)
pd.DataFrame(r)

Unnamed: 0,node,score
0,MAPT,45.0
1,CASP1,1.0
2,SCRT2,1.0
3,MARK4,1.0
4,RPS6KA1,1.0
5,HDAC6,1.0
6,SYK,1.0
7,ABL1,1.0
8,CAMK2A,1.0
9,STXBP1,1.0


In [8]:
#Betweeness Centrality

In [9]:
query = '''CALL gds.betweenness.stream('mygraph1')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS node, score
ORDER BY score DESC
LIMIT 10;

''' 

r = _execute_query(query)
pd.DataFrame(r)

Unnamed: 0,node,score
0,MAPT,1980.0
1,PPP1R13L,0.0
2,SNW1,0.0
3,ASGR1,0.0
4,DNAJC24,0.0
5,HSFX1,0.0
6,OR2AP1,0.0
7,MYOG,0.0
8,VEGFC,0.0
9,PEX16,0.0


In [10]:
#EigenVector Centrality

In [11]:
query = '''CALL gds.eigenvector.stream('mygraph1')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS node, score
ORDER BY score DESC
LIMIT 10;
''' 

r = _execute_query(query)
pd.DataFrame(r)

Unnamed: 0,node,score
0,MAPT,0.45036
1,CASP1,0.133098
2,SCRT2,0.133098
3,MARK4,0.133098
4,RPS6KA1,0.133098
5,HDAC6,0.133098
6,SYK,0.133098
7,ABL1,0.133098
8,CAMK2A,0.133098
9,STXBP1,0.133098


In [12]:
#PageRank Centrality

In [13]:
query = '''CALL gds.pageRank.stream('mygraph1')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS node, score
ORDER BY score DESC
LIMIT 10;

''' 

r = _execute_query(query)
pd.DataFrame(r)

Unnamed: 0,node,score
0,MAPT,20.393886
1,CASP1,0.529404
2,SCRT2,0.529404
3,MARK4,0.529404
4,RPS6KA1,0.529404
5,HDAC6,0.529404
6,SYK,0.529404
7,ABL1,0.529404
8,CAMK2A,0.529404
9,STXBP1,0.529404


In [14]:
#COMMUNITY DETECTION

In [15]:
#Louvain Community Detection

In [17]:
query = '''CALL gds.louvain.stream('mygraph')
YIELD nodeId, communityId
WITH communityId, collect(gds.util.asNode(nodeId).name) AS members
WITH communityId, members, size(members) AS memberCount
WHERE memberCount >= 2
RETURN communityId, apoc.text.join(members, ', ') AS members
ORDER BY communityId;

''' 

r = _execute_query(query)
pd.DataFrame(r)



Unnamed: 0,communityId,members
0,2545,"Hyperpyrexia, ZWINT, TBC1D31, Connective tissu..."
1,3055,"PPP1R13L, SNW1, RPS27A, ZBTB17, PROS1, DLD, ty..."
2,5318,"response to nutrient, regulation of cellular c..."
3,8698,"C2, DHX40, EPHA1, MAP2, SOD2, ACOX2, PTGS1, CF..."
4,21982,"BMPR2, SEMA6C, SEMA3D, TNR, VAV3, CDKL5, KCNN4..."
5,34552,"NUP205, NCOR2, PHF5A, JAK2, NUP153, LMNB2, MYB..."
6,37761,"DAG1, KIF2C, RAB11FIP5, TUBGCP4, KIF17, PARVG,..."


In [18]:
#Weakly Connected Components

In [19]:
query = '''CALL gds.wcc.stream('mygraph')
YIELD nodeId, componentId
WITH componentId, collect(gds.util.asNode(nodeId).name) AS members
WITH componentId, members, size(members) AS memberCount
WHERE memberCount >=4
RETURN componentId, apoc.text.join(members, ', ') AS members

''' 

r = _execute_query(query)
pd.DataFrame(r)



Unnamed: 0,componentId,members
0,25,"PPP1R13L, SNW1, RPS27A, C2, ZBTB17, DAG1, KIF2..."


In [20]:
query = '''CALL gds.scc.stream('mygraph')
YIELD nodeId, componentId
WITH componentId, collect(gds.util.asNode(nodeId).name) AS members
WITH componentId, members, size(members) AS memberCount
WHERE memberCount >=4
RETURN componentId, apoc.text.join(members, ', ') AS members

''' 

r = _execute_query(query)
pd.DataFrame(r)

Unnamed: 0,componentId,members
0,25,"PPP1R13L, SNW1, RPS27A, C2, ZBTB17, DAG1, KIF2..."


In [21]:
#Triangle Count

In [58]:
query = '''CALL gds.triangleCount.stream('mygraph6')
YIELD nodeId, triangleCount
RETURN gds.util.asNode(nodeId).name AS node, triangleCount
ORDER BY triangleCount Desc
LIMIT 10;
''' 

r = _execute_query(query)
pd.DataFrame(r)

Unnamed: 0,node,triangleCount
0,MAPT,25
1,retina,14
2,Alzheimer's disease,7
3,AKT1,3
4,S100B,3
5,CASP7,2
6,ABL1,2
7,Caspase-mediated cleavage of cytoskeletal prot...,2
8,USP9Y,1
9,NEFM,1


In [25]:
#SIMILARITY

In [26]:
#Jaccard Similarity

In [27]:
query = '''CALL gds.nodeSimilarity.stream( "mygraph",{similarityMetric: 'JACCARD'
}) 
YIELD node1, node2,similarity
RETURN DISTINCT gds.util.asNode(node1).name as G1, gds.util.asNode(node2).name AS G2, similarity
ORDER BY similarity ASC, G1, G2
LIMIT 10
''' 

r = _execute_query(query)
pd.DataFrame(r)

Unnamed: 0,G1,G2,similarity
0,retina,excitatory synapse,0.000584
1,retina,postsynaptic density,0.000584
2,retina,uterine cervix,0.000584
3,retina,Caspase-mediated cleavage of cytoskeletal prot...,0.001163
4,retina,nuclear periphery,0.002843
5,cytoskeletal protein binding,SYK,0.002882
6,cytoskeletal protein binding,excitatory synapse,0.002882
7,cytoskeletal protein binding,positive regulation of axon extension,0.002882
8,cytoskeletal protein binding,postsynaptic density,0.002882
9,cytoskeletal protein binding,uterine cervix,0.002882


In [28]:
#Overlap Similarity

In [29]:
query = '''CALL gds.nodeSimilarity.stream( "mygraph",{similarityMetric: 'OVERLAP'
}) 
YIELD node1, node2,similarity
RETURN DISTINCT gds.util.asNode(node1).name as G1, gds.util.asNode(node2).name AS G2, similarity
ORDER BY similarity Asc, G1, G2
LIMIT 10
''' 

r = _execute_query(query)
pd.DataFrame(r)

Unnamed: 0,G1,G2,similarity
0,A2M,C2,1.0
1,A2M,DHX40,1.0
2,A2M,DLD,1.0
3,A2M,EPHA1,1.0
4,A2M,PPP1R13L,1.0
5,A2M,PROS1,1.0
6,A2M,RPS27A,1.0
7,A2M,SNW1,1.0
8,A2M,ZBTB17,1.0
9,A2M,type 2 diabetes mellitus,1.0


In [30]:
#Cosine Similarity

In [31]:
query = '''CALL gds.nodeSimilarity.stream( "mygraph",{similarityMetric: 'COSINE'
}) 
YIELD node1, node2,similarity
RETURN DISTINCT gds.util.asNode(node1).name as G1, gds.util.asNode(node2).name AS G2, similarity
ORDER BY similarity ASC, G1, G2
LIMIT 10
''' 

r = _execute_query(query)
pd.DataFrame(r)

Unnamed: 0,G1,G2,similarity
0,retina,SYK,0.024161
1,retina,excitatory synapse,0.024161
2,retina,positive regulation of axon extension,0.024161
3,retina,postsynaptic density,0.024161
4,retina,synapse,0.024161
5,retina,uterine cervix,0.024161
6,retina,MAPT,0.027805
7,retina,regulation of cell size,0.037129
8,cytoskeletal protein binding,SCRT2,0.053683
9,cytoskeletal protein binding,SYK,0.053683


In [32]:
#PATH FINDING

In [33]:
#Finding all paths maxlength 2 between Alzheimer's Disease and Tau protein

In [34]:
query = '''MATCH p = (d:Disease{identifier:"DOID:10652"})-[*1..2]-(g:Gene{identifier:"4137"})
RETURN DISTINCT p
''' 
r = _execute_query(query)
pd.DataFrame(r)


Unnamed: 0,p
0,"[{'license': 'CC BY 3.0', 'identifier': 'DOID:..."
1,"[{'license': 'CC BY 3.0', 'identifier': 'DOID:..."
2,"[{'license': 'CC BY 3.0', 'identifier': 'DOID:..."
3,"[{'license': 'CC BY 3.0', 'identifier': 'DOID:..."
4,"[{'license': 'CC BY 3.0', 'identifier': 'DOID:..."
5,"[{'license': 'CC BY 3.0', 'identifier': 'DOID:..."
6,"[{'license': 'CC BY 3.0', 'identifier': 'DOID:..."
7,"[{'license': 'CC BY 3.0', 'identifier': 'DOID:..."
8,"[{'license': 'CC BY 3.0', 'identifier': 'DOID:..."
9,"[{'license': 'CC BY 3.0', 'identifier': 'DOID:..."


In [35]:
#Finding all paths of length 2 between Alzheimer's Disease and Tau protein

In [36]:
query = '''MATCH p = (d:Disease {identifier: "DOID:10652"})-[r1]-(i)-[r2]-(j)-[r3]-(g:Gene {identifier: "4137"})
WHERE NOT i.identifier = "DOID:10652" AND NOT i.identifier = "4137" AND NOT j.identifier = "DOID:10652" 
AND NOT j.identifier = "4137" 
RETURN DISTINCT
  d.name AS start_node,r1.type AS first_relation,i.name AS intermediate_node1,
  r2.type AS second_relation,
  j.name AS intermediate_node2,
  r3.type AS third_relation,
  g.name AS end_node
''' 
r = _execute_query(query)
pd.DataFrame(r)


Unnamed: 0,start_node,first_relation,intermediate_node1,second_relation,intermediate_node2,third_relation,end_node
0,Alzheimer's disease,downregulates,DYNC1H1,expresses,retina,expresses,MAPT
1,Alzheimer's disease,associates,APBB1,expresses,retina,expresses,MAPT
2,Alzheimer's disease,downregulates,DET1,expresses,retina,expresses,MAPT
3,Alzheimer's disease,downregulates,AP3M2,expresses,retina,expresses,MAPT
4,Alzheimer's disease,associates,CLU,expresses,retina,expresses,MAPT
...,...,...,...,...,...,...,...
56,Alzheimer's disease,upregulates,CHD1,expresses,retina,expresses,MAPT
57,Alzheimer's disease,upregulates,CYTH1,expresses,retina,expresses,MAPT
58,Alzheimer's disease,associates,CST3,expresses,retina,expresses,MAPT
59,Alzheimer's disease,downregulates,B3GAT1,expresses,retina,expresses,MAPT


In [37]:
#Finding all paths of length 3 between Alzheimer's Disease and all Genes

In [38]:
query = '''MATCH p = (d:Disease {identifier: "DOID:10652"})-[r1]-(i)-[r2]-(j)-[r3]-(g:Gene)
WHERE NOT i.identifier = "DOID:10652"  AND NOT j.identifier = "DOID:10652" 

RETURN DISTINCT
  d.name AS start_node,r1.type AS first_relation,i.name AS intermediate_node1,
  r2.type AS second_relation,
  j.name AS intermediate_node2,
  r3.type AS third_relation,
  g.name AS end_node
''' 
r = _execute_query(query)
pd.DataFrame(r)

Unnamed: 0,start_node,first_relation,intermediate_node1,second_relation,intermediate_node2,third_relation,end_node
0,Alzheimer's disease,associates,MAPT,expresses,pituitary gland,upregulates,MAPT
1,Alzheimer's disease,associates,MAPT,expresses,pituitary gland,expresses,MAPT
2,Alzheimer's disease,associates,MAPT,expresses,retina,expresses,FAM174B
3,Alzheimer's disease,associates,MAPT,expresses,retina,expresses,CDIP1
4,Alzheimer's disease,associates,MAPT,expresses,retina,expresses,ADD1
...,...,...,...,...,...,...,...
84457,Alzheimer's disease,upregulates,AGBL5,expresses,retina,expresses,CIRBP
84458,Alzheimer's disease,upregulates,AGBL5,expresses,retina,expresses,IGBP1
84459,Alzheimer's disease,upregulates,AGBL5,expresses,retina,expresses,STAT3
84460,Alzheimer's disease,upregulates,AGBL5,expresses,retina,expresses,ARMC8


In [39]:
#Finding the shortest paths between Alzheier's disease and all the genes

In [40]:
query = '''MATCH (start:Disease
 {identifier:"DOID:10652"}), (end:Gene )
MATCH path = shortestPath((start)-[*]-(end))
UNWIND nodes(path) AS n
UNWIND relationships(path) AS r
RETURN 
    start.name AS Start_Node,
    collect(distinct n.name) AS Intermediate_Nodes,
    collect( r.type) AS Relationships,
    end.name AS End_Node    
''' 
r = _execute_query(query)
pd.DataFrame(r)


Unnamed: 0,Start_Node,Intermediate_Nodes,Relationships,End_Node
0,Alzheimer's disease,"[Alzheimer's disease, CDIP1, retina, PPP1R13L]","[upregulates, expresses, expresses, upregulate...",PPP1R13L
1,Alzheimer's disease,"[Alzheimer's disease, CDIP1, retina, SNW1]","[upregulates, expresses, expresses, upregulate...",SNW1
2,Alzheimer's disease,"[Alzheimer's disease, CDIP1, retina, RPS27A]","[upregulates, expresses, expresses, upregulate...",RPS27A
3,Alzheimer's disease,"[Alzheimer's disease, C2]","[upregulates, upregulates]",C2
4,Alzheimer's disease,"[Alzheimer's disease, CDIP1, retina, ZBTB17]","[upregulates, expresses, expresses, upregulate...",ZBTB17
...,...,...,...,...
2450,Alzheimer's disease,"[Alzheimer's disease, CDIP1, retina, PPFIA1]","[upregulates, expresses, expresses, upregulate...",PPFIA1
2451,Alzheimer's disease,"[Alzheimer's disease, CDIP1, retina, MTFR1L]","[upregulates, expresses, expresses, upregulate...",MTFR1L
2452,Alzheimer's disease,"[Alzheimer's disease, TNF]","[associates, associates]",TNF
2453,Alzheimer's disease,"[Alzheimer's disease, CDIP1, retina, USP3]","[upregulates, expresses, expresses, upregulate...",USP3


In [41]:
#Depth First Search

In [42]:
query = '''MATCH (startNode:Disease {identifier: 'DOID:10652'})
CALL gds.dfs.stream("mygraph",{
  sourceNode: startNode,
  relationshipTypes: ['RELATIONSHIP_TYPE'],
  maxDepth: 3
})
YIELD path
//RETURN [node in nodes(path) | node.name] AS nodesInPath, length(path) AS pathLength
Return path  
''' 
r = _execute_query(query)
for record in r:
    print (record)

{'path': [{'name': "Alzheimer's disease", 'license': 'CC BY 3.0', 'identifier': 'DOID:10652', 'source': 'Disease Ontology', 'url': 'http://purl.obolibrary.org/obo/DOID_10652'}, 'NEXT', {'license': 'CC0 1.0', 'identifier': '1020', 'chromosome': '7', 'name': 'CDK5', 'description': 'cyclin-dependent kinase 5', 'source': 'Entrez Gene', 'url': 'http://identifiers.org/ncbigene/1020'}, 'NEXT', {'license': 'CC0 1.0', 'identifier': '7124', 'chromosome': '6', 'name': 'TNF', 'description': 'tumor necrosis factor', 'source': 'Entrez Gene', 'url': 'http://identifiers.org/ncbigene/7124'}, 'NEXT', {'identifier': '1054', 'license': 'CC0 1.0', 'chromosome': '19', 'name': 'CEBPG', 'description': 'CCAAT/enhancer binding protein (C/EBP), gamma', 'source': 'Entrez Gene', 'url': 'http://identifiers.org/ncbigene/1054'}, 'NEXT', {'name': 'Agnosia', 'license': 'CC0 1.0', 'identifier': 'D000377', 'source': 'MeSH', 'url': 'http://identifiers.org/mesh/D000377'}, 'NEXT', {'identifier': '7461', 'license': 'CC0 1.0'

In [43]:
#Breadth First Search

In [44]:
query = '''MATCH (startNode:Disease {identifier: 'DOID:10652'})
CALL gds.bfs.stream("mygraph",{
  sourceNode: startNode,
  relationshipTypes: ['RELATIONSHIP_TYPE'],
  maxDepth: 3
})
YIELD path
//RETURN [node in nodes(path) | node.name] AS nodesInPath, length(path) AS pathLength
Return path  
''' 
r = _execute_query(query)
for record in r:
    print (record)

{'path': [{'name': "Alzheimer's disease", 'license': 'CC BY 3.0', 'identifier': 'DOID:10652', 'source': 'Disease Ontology', 'url': 'http://purl.obolibrary.org/obo/DOID_10652'}, 'NEXT', {'license': 'CC0 1.0', 'identifier': '717', 'chromosome': '6', 'name': 'C2', 'description': 'complement component 2', 'source': 'Entrez Gene', 'url': 'http://identifiers.org/ncbigene/717'}, 'NEXT', {'identifier': '79665', 'license': 'CC0 1.0', 'chromosome': '17', 'name': 'DHX40', 'description': 'DEAH (Asp-Glu-Ala-His) box polypeptide 40', 'source': 'Entrez Gene', 'url': 'http://identifiers.org/ncbigene/79665'}, 'NEXT', {'license': 'CC0 1.0', 'identifier': '2041', 'chromosome': '7', 'name': 'EPHA1', 'description': 'EPH receptor A1', 'source': 'Entrez Gene', 'url': 'http://identifiers.org/ncbigene/2041'}, 'NEXT', {'license': 'CC0 1.0', 'identifier': '4133', 'chromosome': '2', 'name': 'MAP2', 'description': 'microtubule-associated protein 2', 'source': 'Entrez Gene', 'url': 'http://identifiers.org/ncbigene/

In [45]:
#Scoring Metapaths (A naive approach for understanding feature extraction)

In [46]:
query123 = '''MATCH p = (d:Disease {identifier: "DOID:10652"})-[r1]-(i)-[r2]-(g:Gene)
WHERE NOT i.identifier = "DOID:10652" 
RETURN distinct
d.name AS start_node,
r1.type AS first_relation,
i.name AS intermediate_node1,
r2.type AS second_relation,
g.name AS end_node''' 

r = _execute_query(query123)

for rec in r:
    print(rec)






{'start_node': "Alzheimer's disease", 'first_relation': 'associates', 'intermediate_node1': 'MAPT', 'second_relation': 'interacts', 'end_node': 'USP9Y'}
{'start_node': "Alzheimer's disease", 'first_relation': 'associates', 'intermediate_node1': 'MAPT', 'second_relation': 'interacts', 'end_node': 'MARK4'}
{'start_node': "Alzheimer's disease", 'first_relation': 'associates', 'intermediate_node1': 'MAPT', 'second_relation': 'interacts', 'end_node': 'APP'}
{'start_node': "Alzheimer's disease", 'first_relation': 'associates', 'intermediate_node1': 'MAPT', 'second_relation': 'interacts', 'end_node': 'SYK'}
{'start_node': "Alzheimer's disease", 'first_relation': 'associates', 'intermediate_node1': 'MAPT', 'second_relation': 'interacts', 'end_node': 'CASP6'}
{'start_node': "Alzheimer's disease", 'first_relation': 'associates', 'intermediate_node1': 'MAPT', 'second_relation': 'interacts', 'end_node': 'CDK5'}
{'start_node': "Alzheimer's disease", 'first_relation': 'associates', 'intermediate_nod

In [47]:
#Extracing all nodes in the form of list

In [48]:
query123 = '''MATCH p = (d:Disease {identifier: "DOID:10652"})-[r1]-(i)-[r2]-(g:Gene)
UNWIND nodes(p) AS node
RETURN DISTINCT node.name
''' 

n = _execute_query(query123)

In [49]:
file_path = r'C:/Users/Muskaan Jain/Downloads/Book1.xlsx'
sheet_name = 'Sheet3'  

# Read the Excel file into a DataFrame
df = pd.read_excel(file_path, sheet_name=sheet_name)

# Convert a specific column to a list
column_name = 'Nodes'  
Nodes = df[column_name].tolist()

# Print the list 
print(Nodes)

['"Alzheimer\'s disease"', " 'MAPT'", " 'USP9Y'", " 'MARK4'", " 'APP'", " 'SYK'", " 'CASP6'", " 'CDK5'", " 'SLC1A2'", " 'MAPK1'", " 'TUBA4A'", " 'PPP5C'", " 'CASP7'", " 'GSK3A'", " 'PPP2R5A'", " 'HDAC6'", " 'RPS6KA5'", " 'SCRT2'", " 'BAG1'", " 'PTK2B'", " 'EP300'", " 'AKT1'", " 'CASP1'", " 'CSNK2A1'", " 'PIN1'", " 'PHKG1'", " 'RPS6KA1'", " 'STAU1'", " 'HSPA1B'", " 'SGK1'", " 'S100B'", " 'NEFM'", " 'STXBP1'", " 'UBE2D2'", " 'MAPK12'", " 'CAMK2A'", " 'ABL1'", " 'UBB'", " 'ACTB'", " 'YWHAB'", " 'PRKCD'", " 'TRAF6'", " 'PARK2'", " 'HSPA8'", " 'SPTB'", " 'RPS6KB1'", " 'NEFH'"]


In [50]:
#Extracting all relations in the form of list

In [51]:
import pandas as pd

# Load the Excel file using a raw string
file_path = r'C:/Users/Muskaan Jain/Downloads/Book1.xlsx'
sheet_name = 'Sheet1'  # Change this to your sheet name if needed

# Read the Excel file into a DataFrame
df = pd.read_excel(file_path, sheet_name=sheet_name)

# Convert rows to tuples
column1 = 'Node1'  # Change this to the name of the first column
column2 = 'Node2'  # Change this to the name of the second column
R_as_tuples = list(df[[column1, column2]].itertuples(index=False, name=None))

In [52]:
Relations=list(R_as_tuples)
print(Relations)

[('"MAPT"', '"USP9Y"'), ('"MAPT"', '"USP9Y"'), ('"MAPT"', '"MARK4"'), ('"MAPT"', '"MARK4"'), ('"MAPT"', '"APP"'), ('"MAPT"', '"APP"'), ('"MAPT"', '"SYK"'), ('"MAPT"', '"SYK"'), ('"MAPT"', '"CASP6"'), ('"MAPT"', '"CASP6"'), ('"MAPT"', '"CDK5"'), ('"MAPT"', '"CDK5"'), ('"MAPT"', '"SLC1A2"'), ('"MAPT"', '"SLC1A2"'), ('"MAPT"', '"MAPK1"'), ('"MAPT"', '"MAPK1"'), ('"MAPT"', '"TUBA4A"'), ('"MAPT"', '"TUBA4A"'), ('"MAPT"', '"PPP5C"'), ('"MAPT"', '"PPP5C"'), ('"MAPT"', '"CASP7"'), ('"MAPT"', '"CASP7"'), ('"MAPT"', '"GSK3A"'), ('"MAPT"', '"GSK3A"'), ('"MAPT"', '"PPP2R5A"'), ('"MAPT"', '"PPP2R5A"'), ('"MAPT"', '"HDAC6"'), ('"MAPT"', '"HDAC6"'), ('"MAPT"', '"RPS6KA5"'), ('"MAPT"', '"RPS6KA5"'), ('"MAPT"', '"SCRT2"'), ('"MAPT"', '"SCRT2"'), ('"MAPT"', '"BAG1"'), ('"MAPT"', '"BAG1"'), ('"MAPT"', '"PTK2B"'), ('"MAPT"', '"PTK2B"'), ('"MAPT"', '"EP300"'), ('"MAPT"', '"EP300"'), ('"MAPT"', '"AKT1"'), ('"MAPT"', '"AKT1"'), ('"MAPT"', '"CASP1"'), ('"MAPT"', '"CASP1"'), ('"MAPT"', '"CSNK2A1"'), ('"MAPT"',

In [53]:
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

metapaths = r  

# Example utility functions to extract the additional features
def get_node_degree(graph, node):
    return graph.degree[node] if node in graph else 0

def get_centrality_measure(graph, node):
    centrality = nx.betweenness_centrality(graph)
    return centrality.get(node, 0)


def get_path_length(metapath):
    return 3  

def get_path_redundancy(graph, metapath):
    start_node = metapath['start_node']
    end_node = metapath['end_node']
    try:
        all_paths = list(nx.all_simple_paths(graph, source=start_node, target=end_node))
        return len(all_paths)
    except nx.NetworkXNoPath:
        return 0



# Create a graph (replace with your actual graph data)
graph = nx.Graph()

# Add nodes and edges to the graph (replace with your actual graph data)

for node in Nodes:
    graph.add_node(node)

for edge in Relations:
    graph.add_edge(edge[0], edge[1])


# Enhanced feature extraction function
def enhanced_feature_extraction(graph, metapath):
    features = [
        len(metapath['start_node']),
        len(metapath['first_relation']),
        len(metapath['intermediate_node1']),
        len(metapath['second_relation']),
        len(metapath['end_node']),
        get_node_degree(graph, metapath['start_node']),
        get_node_degree(graph, metapath['intermediate_node1']),
        get_node_degree(graph, metapath['end_node']),
        get_centrality_measure(graph, metapath['start_node']),
        get_centrality_measure(graph, metapath['intermediate_node1']),
        get_centrality_measure(graph, metapath['end_node']),
        
        get_path_length(metapath),
        #get_path_redundancy(graph, metapath),
        
    ]
    
    # Ensure all features are scalar numeric values
    features = [float(feature) for feature in features]
    
    return features

# Extract features for each metapath
X = [enhanced_feature_extraction(graph, mp) for mp in metapaths]

# Generate an array of 0s and 1s of length 52 

y=np.random.rand(52)
# Ensure X and y have the same number of samples
assert len(X) == len(y), "Number of samples in X and y must match"

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize model (example using RandomForestRegressor)
model = RandomForestRegressor()

# Train the model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
#print(f'Mean Squared Error: {mse}')

# Combine predictions with metapaths for sorting
metapaths_with_scores = list(zip(metapaths, model.predict(X)))

# Sort metapaths based on predicted scores (descending order)
sorted_metapaths = sorted(metapaths_with_scores, key=lambda x: x[1], reverse=True)

# Print or use sorted_metapaths as needed
for metapath, score in sorted_metapaths:
    print(f"Metapath: {metapath} | Score: {score}")


Metapath: {'start_node': "Alzheimer's disease", 'first_relation': 'associates', 'intermediate_node1': 'MAPT', 'second_relation': 'interacts', 'end_node': 'APP'} | Score: 0.5905200540594593
Metapath: {'start_node': "Alzheimer's disease", 'first_relation': 'associates', 'intermediate_node1': 'MAPT', 'second_relation': 'interacts', 'end_node': 'SYK'} | Score: 0.5905200540594593
Metapath: {'start_node': "Alzheimer's disease", 'first_relation': 'associates', 'intermediate_node1': 'MAPT', 'second_relation': 'interacts', 'end_node': 'UBB'} | Score: 0.5905200540594593
Metapath: {'start_node': "Alzheimer's disease", 'first_relation': 'associates', 'intermediate_node1': 'APP', 'second_relation': 'interacts', 'end_node': 'MAPT'} | Score: 0.5545907782520656
Metapath: {'start_node': "Alzheimer's disease", 'first_relation': 'associates', 'intermediate_node1': 'PTK2B', 'second_relation': 'interacts', 'end_node': 'MAPT'} | Score: 0.5264000223377256
Metapath: {'start_node': "Alzheimer's disease", 'firs