### Usando Neo4j para base de datos de citas

In [1]:
import os

import neo4j
from neo4j import GraphDatabase
import pandas as pd


In [17]:
### Connection to the database
# Connection settings come from the environment so real credentials do not
# have to be stored in the notebook; the fallbacks are the values of the
# local instance this notebook was written against.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "12345678"),
)

driver = GraphDatabase.driver(URI, auth=AUTH)
try:
    # verify_connectivity() raises if the server cannot be reached with these
    # settings, so no throwaway session/query is needed for the check.
    driver.verify_connectivity()
    print("Connection to Neo4j established successfully!")
except Exception as e:
    # Report instead of raising so the reader sees the cause directly in the
    # cell output; later cells will still fail if the connection is unusable.
    print(f"Failed to connect to Neo4j: {e}")



Connection to Neo4j established successfully!


### Datos desde CSV a pandas

In [2]:
#### Papers that cite another paper
# Tab-separated file without a header row. The two columns are named
# `target` and `source`; the loader further down creates the relationship
# (source)-[:CITES]->(target).
cites = pd.read_csv(
    'cora/cora.cites',
    sep='\t',
    header=None,
    names=['target', 'source'],
)
cites

Unnamed: 0,target,source
0,35,1033
1,35,103482
2,35,103515
3,35,1050679
4,35,1103960
...,...,...
5424,853116,19621
5425,853116,853155
5426,853118,1140289
5427,853155,853118


In [4]:
# For every `target` paper, count how many distinct `source` papers point at it.
sources_by_target = cites.groupby('target')['source']
citas_por_paper = sources_by_target.nunique()
citas_por_paper

target
35        166
40          3
114        42
117        17
128         4
         ... 
853115      4
853116      2
853118      1
853155      1
954315      1
Name: source, Length: 1565, dtype: int64

In [None]:
#### Papers and the subject of each paper

# Column layout used for the file: the paper id, 1433 `word_*` columns,
# and finally the subject label.
word_columns = [f"word_{idx}" for idx in range(1433)]
column_names = ["paper_id", *word_columns, "subject"]

papers = pd.read_csv('cora/cora.content', sep="\t", names=column_names)

# Only the id and the subject are needed for the graph nodes.
subjects = papers.loc[:, ["paper_id", "subject"]]
subjects

### Metiendo datos en Neo4j

In [66]:
### estas consultas procesan los diccionarios de input y lo van metiendo a neo4j

def load_papers(session, subjects):
    """Create or update one ``Paper`` node per input row.

    Args:
        session: An open Neo4j session (anything exposing ``run(query, **params)``
            whose return value has ``consume()``).
        subjects: List of dicts, each with the keys ``paper_id`` and ``subject``.
            It is sent to the server as the ``$nodes`` query parameter.

    The query uses ``MERGE`` on ``id``, so running it again with the same rows
    does not duplicate nodes; ``subject`` is overwritten with the latest value.
    """
    query = """
    UNWIND $nodes AS row
    MERGE (p:Paper {id: row.paper_id})
    SET p.subject = row.subject;
    """
    # Consume the result so the write has fully completed, and any server-side
    # error is raised here, before this function returns.
    session.run(query, nodes=subjects).consume()

def load_relationships(session, cites):
    """Create a ``CITES`` relationship for every input row.

    Args:
        session: An open Neo4j session (anything exposing ``run(query, **params)``
            whose return value has ``consume()``).
        cites: List of dicts, each with the keys ``source`` and ``target``
            holding paper ids. It is sent to the server as the ``$edges``
            query parameter.

    Both endpoints are looked up with ``MATCH``, so the ``Paper`` nodes must
    already exist: a row whose ``source`` or ``target`` id has no node produces
    no relationship and no error. ``MERGE`` keeps repeated runs from creating
    duplicate relationships.
    """
    query = """
    UNWIND $edges AS row
    MATCH (p1:Paper {id: row.source})
    MATCH (p2:Paper {id: row.target})
    MERGE (p1)-[:CITES]->(p2);
    """
    # Consume the result so the write has fully completed, and any server-side
    # error is raised here, before this function returns.
    session.run(query, edges=cites).consume()

In [67]:
### Add the data to Neo4j, then query a few nodes back

# Both loaders take a list of row dicts rather than a DataFrame.
paper_rows = subjects.to_dict('records')
cite_rows = cites.to_dict('records')

with driver.session() as session:
    # Papers go in first: load_relationships only MATCHes existing Paper nodes.
    load_papers(session, paper_rows)
    load_relationships(session, cite_rows)

    # Quick sanity check: print up to five nodes of any label.
    query = "MATCH (n) RETURN n LIMIT 5"
    result = session.run(query)
    for record in result:
        print(record)


  with driver.session() as session:


<Record n=<Node element_id='4:394a99ac-1fbd-4afd-844d-38d3690cbb47:0' labels=frozenset({'nodes'}) properties={'subject': 'Neural_Networks', 'id': 31336}>>
<Record n=<Node element_id='4:394a99ac-1fbd-4afd-844d-38d3690cbb47:1' labels=frozenset({'nodes'}) properties={'subject': 'Rule_Learning', 'id': 1061127}>>
<Record n=<Node element_id='4:394a99ac-1fbd-4afd-844d-38d3690cbb47:2' labels=frozenset({'nodes'}) properties={'subject': 'Reinforcement_Learning', 'id': 1106406}>>
<Record n=<Node element_id='4:394a99ac-1fbd-4afd-844d-38d3690cbb47:3' labels=frozenset({'nodes'}) properties={'subject': 'Reinforcement_Learning', 'id': 13195}>>
<Record n=<Node element_id='4:394a99ac-1fbd-4afd-844d-38d3690cbb47:4' labels=frozenset({'nodes'}) properties={'subject': 'Probabilistic_Methods', 'id': 37879}>>


### Algunas consultas

In [74]:
# For each subject, report the paper with the most incoming CITES relationships.
#   1. MATCH every CITES relationship and count the matches per cited paper `p`
#      (papers that are never cited produce no row and are therefore absent).
#   2. ORDER the rows by subject, then by citation count, highest first.
#   3. collect(...)[0] keeps the first entry per subject, i.e. the most cited one.
# NOTE(review): step 3 relies on collect() keeping the order produced by the
# preceding ORDER BY, and ties are not broken explicitly — confirm if exact
# reproducibility matters.
query = """
MATCH (:Paper)-[:CITES]->(p:Paper)
WITH p.subject AS subject, p, count(*) AS citations
ORDER BY subject, citations DESC
WITH subject, collect({paper: p, citations: citations})[0] AS max_paper
RETURN subject, max_paper.paper AS paper, max_paper.citations AS citations
"""
with driver.session() as session:
    result = session.run(query)
    # One record per subject: the subject name, the Paper node and its count.
    for record in result:
        print(record)

<Record subject='Case_Based' paper=<Node element_id='4:394a99ac-1fbd-4afd-844d-38d3690cbb47:4062' labels=frozenset({'Paper'}) properties={'subject': 'Case_Based', 'id': 20193}> citations=22>
<Record subject='Genetic_Algorithms' paper=<Node element_id='4:394a99ac-1fbd-4afd-844d-38d3690cbb47:2871' labels=frozenset({'Paper'}) properties={'subject': 'Genetic_Algorithms', 'id': 35}> citations=166>
<Record subject='Neural_Networks' paper=<Node element_id='4:394a99ac-1fbd-4afd-844d-38d3690cbb47:3455' labels=frozenset({'Paper'}) properties={'subject': 'Neural_Networks', 'id': 1365}> citations=74>
<Record subject='Probabilistic_Methods' paper=<Node element_id='4:394a99ac-1fbd-4afd-844d-38d3690cbb47:2782' labels=frozenset({'Paper'}) properties={'subject': 'Probabilistic_Methods', 'id': 4330}> citations=38>
<Record subject='Reinforcement_Learning' paper=<Node element_id='4:394a99ac-1fbd-4afd-844d-38d3690cbb47:3273' labels=frozenset({'Paper'}) properties={'subject': 'Reinforcement_Learning', 'id':

  with driver.session() as session:


### Consultas para escribir (¡usa GPT si necesitas ayuda!):
- ¿Cuántos papers hay por subject?
- Encontrar triángulos: papers A, B, C tales que A cita a B, B cita a C y A cita a C.
- Encontrar cuadrados: papers A, B, C, D

In [None]:
# Close the driver once all queries are done. Keep this as the last cell:
# cells that use `driver` need the connection cell to be run again afterwards.
# NOTE(review): the truncated warnings in the outputs above may come from
# reusing the driver after this cell was executed — confirm.
driver.close()