In [None]:
# Type hints
from typing import Any, Dict, List, Tuple, Optional

# Standard library
import ast
import logging
import re
import warnings

# Third-party packages - Data manipulation
import pandas as pd
from tqdm import tqdm

# Third-party packages - Environment & Database
from dotenv import load_dotenv
from neo4j import GraphDatabase

# Third-party packages - Error handling & Retry logic
from tenacity import retry, stop_after_attempt, wait_exponential

# Langchain - Core
from langchain.chains import GraphCypherQAChain
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document

# Langchain - Models & Connectors
from langchain_ollama.llms import OllamaLLM


# Langchain - Graph & Experimental
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer


# Suppress warnings
warnings.filterwarnings('ignore')

# Load environment variables
load_dotenv()

False

In [128]:
dataset = pd.read_csv('../data/dataset3.csv')
dataset.head()

Unnamed: 0,contributor,2docontributor,date,abstract,description,identifier,identifier.uri,publisher,source,subject,title,renati.advisor.dni,renati.advisor.orcid,thesis.degree.discipline,thesis.degree.grantor,thesis.degree.level,thesis.degree.name,thesis.degree.program
0,"Valencia Rivera, Felipe Eladio","Nuñez Yanyachy, Renzo Gabriel||Zegarra Ramos, ...",2022,El presente trabajo titulado “Propuesta de mej...,Tesis,1076221,https://hdl.handle.net/20.500.12590/17420,Universidad Católica San Pablo,Universidad Católica San Pablo||Repositorio In...,Mantenimiento correctivo||Organigrama||Capacit...,Propuesta de mejora aplicada al mantenimiento ...,722026.0,https://orcid.org/0000-0001-7037-1016,Ingeniería Industrial,Universidad Católica San Pablo. Departamento d...,Título Profesional,Ingeniero Industrial,Escuela Profesional de Ingeniería Industrial
1,"Huamán Mamani, Fredy Alberto","Tejada Fernández, María Alejandra||Loayza Pala...",2017,El presente proyecto tiene como propósito dete...,Tesis,1058260,https://hdl.handle.net/20.500.12590/15551,Universidad Católica San Pablo,Universidad Católica San Pablo||Repositorio In...,Producción||Comercialización de adoquines||Inv...,Proyecto de inversión para la producción y com...,,,Ingeniería Industrial,Universidad Católica San Pablo. Departamento d...,Título Profesional,Ingeniero Industrial,Escuela Profesional de Ingeniería Industrial
2,"Quintanilla Montoya, Zultner Zenon Julio","Bravo Oviedo, Daniel Alberto||Jara Chirinos, M...",2020,"Actualmente en la provincia de Arequipa, se ti...",Trabajo de investigación,1073010,https://hdl.handle.net/20.500.12590/16600,Universidad Católica San Pablo,Universidad Católica San Pablo||Repositorio In...,COVID-19||Impacto económico||Producción indust...,"Impacto del estado de emergencia, debido al CO...",21520217.0,https://orcid.org/0000-0003-3269-5612,Ingeniería Industrial,Universidad Católica San Pablo. Departamento d...,Bachiller,Bachiller en Ingeniería Industrial,Escuela Profesional de Ingeniería Industrial
3,"Nuñez Ramirez, Mario","Argüelles Bendezú, Irene||Gómez Valdivia, Parr...",2013,Ante lo ya mencionado se plantea el actual pro...,Tesis,1054375,https://hdl.handle.net/20.500.12590/15243,Universidad Católica San Pablo,Universidad Católica San Pablo||Repositorio In...,Ingeniería de proyecto||Exportación||Exportaci...,Estudio de pre-factibilidad para la creación d...,29562348.0,,Ingeniería Industrial,Universidad Católica San Pablo. Departamento d...,Título Profesional,Ingeniero Industrial,Escuela Profesional de Ingeniería Industrial
4,"Chirinos Urday, Harold Renzo","Suarez Hernani, Sasha Kasandra",2022,El año 2020 será recordado por haber sido esce...,Tesis,1075979,https://hdl.handle.net/20.500.12590/17351,Universidad Católica San Pablo,Universidad Católica San Pablo||Repositorio In...,Balanced scorecard||Proceso de planificación||...,Propuesta de mejora en el proceso de planifica...,40663459.0,https://orcid.org/0000-0002-9542-4695,Ingeniería Industrial,Universidad Católica San Pablo. Departamento d...,Título Profesional,Ingeniero Industrial,Escuela Profesional de Ingeniería Industrial


In [130]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and preprocess thesis DataFrame.
    
    Args:
        df: Input DataFrame
    
    Returns:
        Cleaned DataFrame
    """
    # Columnas a eliminar según lo solicitado
    columns_to_drop = [
        "publisher", 
        "source", 
        "renati.advisor.dni", 
        "renati.advisor.orcid", 
        "thesis.degree.discipline", 
        "thesis.degree.grantor", 
        "thesis.degree.level"
    ]
    df.drop(columns=columns_to_drop, axis=1, inplace=True, errors='ignore')

    # Quitar duplicados según 'title'
    df = df.drop_duplicates(subset='title', keep='first')

    # Limpiar columnas de texto (object)
    col_obj = df.select_dtypes(include=["object"]).columns

    for col in col_obj:
        # Strip espacios en blanco
        df[col] = df[col].str.strip()

        # Reemplazar vacíos o unknown por None (sin capitalizar abstracts ni títulos)
        if col in ['abstract', 'description', 'title', 'subject']:
            df[col] = df[col].apply(lambda x: None if pd.isna(x) or x.lower() in ["", "unknown"] else x)
        else:
            df[col] = df[col].apply(lambda x: None if pd.isna(x) or x.lower() in ["", "unknown"] else x.capitalize())

    # Eliminar filas con cualquier null
    df = df.dropna(how="any", axis=0)

    return df
dataset = clean_data(dataset)


In [131]:
dataset.head()

Unnamed: 0,contributor,2docontributor,date,abstract,description,identifier,identifier.uri,subject,title,thesis.degree.name,thesis.degree.program
0,"Valencia rivera, felipe eladio","Nuñez yanyachy, renzo gabriel||zegarra ramos, ...",2022,El presente trabajo titulado “Propuesta de mej...,Tesis,1076221,Https://hdl.handle.net/20.500.12590/17420,Mantenimiento correctivo||Organigrama||Capacit...,Propuesta de mejora aplicada al mantenimiento ...,Ingeniero industrial,Escuela profesional de ingeniería industrial
1,"Huamán mamani, fredy alberto","Tejada fernández, maría alejandra||loayza pala...",2017,El presente proyecto tiene como propósito dete...,Tesis,1058260,Https://hdl.handle.net/20.500.12590/15551,Producción||Comercialización de adoquines||Inv...,Proyecto de inversión para la producción y com...,Ingeniero industrial,Escuela profesional de ingeniería industrial
2,"Quintanilla montoya, zultner zenon julio","Bravo oviedo, daniel alberto||jara chirinos, m...",2020,"Actualmente en la provincia de Arequipa, se ti...",Trabajo de investigación,1073010,Https://hdl.handle.net/20.500.12590/16600,COVID-19||Impacto económico||Producción indust...,"Impacto del estado de emergencia, debido al CO...",Bachiller en ingeniería industrial,Escuela profesional de ingeniería industrial
3,"Nuñez ramirez, mario","Argüelles bendezú, irene||gómez valdivia, parr...",2013,Ante lo ya mencionado se plantea el actual pro...,Tesis,1054375,Https://hdl.handle.net/20.500.12590/15243,Ingeniería de proyecto||Exportación||Exportaci...,Estudio de pre-factibilidad para la creación d...,Ingeniero industrial,Escuela profesional de ingeniería industrial
4,"Chirinos urday, harold renzo","Suarez hernani, sasha kasandra",2022,El año 2020 será recordado por haber sido esce...,Tesis,1075979,Https://hdl.handle.net/20.500.12590/17351,Balanced scorecard||Proceso de planificación||...,Propuesta de mejora en el proceso de planifica...,Ingeniero industrial,Escuela profesional de ingeniería industrial


In [119]:
class Neo4jConnection:
    def __init__(self, uri, user, password):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self.driver.close()
        print("Connection closed")

    def reset_database(self):
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
            print("Database resetted successfully!")

    def execute_query(self, query, parameters=None):
        with self.driver.session() as session:
            result = session.run(query, parameters or {})
            return [record for record in result]

In [120]:
uri = "bolt://localhost:7687"
user = "neo4j"
password = "ucsp_test"
conn = Neo4jConnection(uri, user, password)
conn.reset_database()

Database resetted successfully!


In [132]:
def parse_number(value: Any, target_type: type) -> Optional[float]:
    """Parse string to number with proper error handling."""
    if pd.isna(value):
        return None
    try:
        cleaned = str(value).strip().replace(',', '')
        return target_type(cleaned)
    except (ValueError, TypeError):
        return None

def clean_text(text: str) -> str:
    """Clean and normalize text fields."""
    if pd.isna(text):
        return ""
    return str(text).strip().title()

In [123]:
# Add this import at the top of your cell or with your other imports
from langchain_google_genai import GoogleGenerativeAI

# Then fix your LLM initialization
import google.generativeai as genai

# Store your API key in a variable
api_key = "AIzaSyD-r-kvXNJ_DFaVW2iG275cWgekh14o0iU"  
genai.configure(api_key=api_key)

# Initialize the LLM
llm = GoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=api_key)
#llm = OllamaLLM(model="qwen2.5-coder:latest")

df = dataset.copy()

# Step 1: Define Node Labels and Properties
node_structure = "\n".join(
    [f"{col}: {', '.join(map(str, df[col].unique()[:3]))}..." for col in df.columns]
)

print(node_structure)

contributor: Valencia rivera, felipe eladio, Huamán mamani, fredy alberto, Quintanilla montoya, zultner zenon julio...
2docontributor: Nuñez yanyachy, renzo gabriel||zegarra ramos, manuel arturo, Tejada fernández, maría alejandra||loayza palazuelos, erick gianfranco, Bravo oviedo, daniel alberto||jara chirinos, marco antonio...
date: 2022, 2017, 2020...
abstract: El presente trabajo titulado “Propuesta de mejora aplicada al mantenimiento de los grifos contra incendios públicos, ubicados en la provincia de Arequipa”, tiene como objetivo plantear una propuesta de mejora que permita mejorar la disponibilidad de los grifos contra incendios ubicados en la provincia de Arequipa, establecer indicadores que permitan realizar un adecuado seguimiento y control de la propuesta y finalmente se realizó el análisis económico de la propuesta de mejora. La metodología de la presente propuesta de mejora tuvo un diseño no experimental, de tipo transversal, cualitativa, cuantitativa e inductiva. Las técn

In [15]:
# Prueba simple para el LLM configurado usando predict

try:
    # Pregunta de prueba
    pregunta = "¿Cuál es la capital de Francia?"
    
    # Enviar la pregunta al modelo como un string
    respuesta = llm.predict(pregunta)
    
    # Mostrar la respuesta
    print("Respuesta del LLM:", respuesta)

except Exception as e:
    print("Error al probar el LLM:", e)

Respuesta del LLM: La capital de Francia es París.


In [124]:
# Setup logging
from langchain.chains import LLMChain

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def extract_dict_from_llm_response(response: str) -> str:
    """Extract a dictionary from an LLM response that might contain additional text or formatting."""
    # Remove any markdown code block markers
    response = re.sub(r'```(?:python|json)?|```', '', response)
    
    # Try to find content that looks like a dictionary ({...})
    dict_match = re.search(r'\{.*\}', response, re.DOTALL)
    if dict_match:
        return dict_match.group(0)
    
    return response.strip()

def validate_node_definition(node_def: Dict) -> bool:
    """Validate node definition structure"""
    if not isinstance(node_def, dict):
        return False
    return all(
        isinstance(v, dict) and all(isinstance(k, str) for k in v.keys())
        for v in node_def.values()
    )

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def get_node_definitions(chain, structure: str, example: Dict) -> Dict[str, Dict[str, str]]:
    try:
        # Obtener la respuesta del LLM utilizando predict con argumentos nombrados
        response = chain.predict(structure=structure, example=example)
        
        # Preprocessing - clean up the response
        cleaned_response = extract_dict_from_llm_response(response)
        
        # Log the cleaned response for debugging
        logger.info(f"Cleaned response: {cleaned_response}")
        
        # Parsear la respuesta
        node_defs = ast.literal_eval(cleaned_response)
        
        # Validar la estructura
        if not validate_node_definition(node_defs):
            raise ValueError("Invalid node definition structure")
        return node_defs
    except (ValueError, SyntaxError) as e:
        logger.error(f"Error parsing node definitions: {e}")
        logger.error(f"Raw response: {response}")
        raise

# Updated node definition template
node_example = {
    "NodeLabel1": {"property1": "row['property1']", "property2": "row['property2'], ..."},
    "NodeLabel2": {"property1": "row['property1']", "property2": "row['property2'], ..."},
    "NodeLabel3": {"property1": "row['property1']", "property2": "row['property2'], ..."},
}

define_nodes_prompt = PromptTemplate(
    input_variables=["example", "structure"],
    template=("""
        Analyze the dataset structure below and extract the entity labels for nodes and their properties.
        The node properties should be based on the dataset columns and their values.
        
        Return ONLY a valid Python dictionary like this format, with NO additional text:
        {example}
        
        Dataset Structure:
        {structure}
        
        Do not include any explanation, markdown formatting, or code block indicators.
        Your response must be ONLY the Python dictionary that can be parsed with ast.literal_eval().
        """)
)

# Execute with error handling
try:
    node_chain = LLMChain(llm=llm, prompt=define_nodes_prompt)

    node_definitions = get_node_definitions(node_chain, structure=node_structure, example=node_example)
    logger.info(f"Node Definitions: {node_definitions}")
except Exception as e:
    logger.error(f"Failed to get node definitions: {e}")
    raise

INFO:__main__:Cleaned response: {'Contributor': {'name': "row['contributor']"}, 'DocumentContributor': {'name': "row['2docontributor']"}, 'Publication': {'date': "row['date']", 'abstract': "row['abstract']", 'description': "row['description']", 'identifier': "row['identifier']", 'identifier_uri': "row['identifier.uri']", 'subject': "row['subject']", 'title': "row['title']"}, 'Thesis': {'degree_name': "row['thesis.degree.name']", 'degree_program': "row['thesis.degree.program']"}}
INFO:__main__:Node Definitions: {'Contributor': {'name': "row['contributor']"}, 'DocumentContributor': {'name': "row['2docontributor']"}, 'Publication': {'date': "row['date']", 'abstract': "row['abstract']", 'description': "row['description']", 'identifier': "row['identifier']", 'identifier_uri': "row['identifier.uri']", 'subject': "row['subject']", 'title': "row['title']"}, 'Thesis': {'degree_name': "row['thesis.degree.name']", 'degree_program': "row['thesis.degree.program']"}}


In [125]:
class RelationshipIdentifier:
    """Identifies relationships between nodes in a graph database."""
    
    RELATIONSHIP_EXAMPLE = [
        ("NodeLabel1", "RelationshipLabel", "NodeLabel2"),
        ("NodeLabel1", "RelationshipLabel", "NodeLabel3"),
        ("NodeLabel2", "RelationshipLabel", "NodeLabel3"),
    ]


    PROMPT_TEMPLATE = PromptTemplate(
    input_variables=["structure", "node_definitions", "example"],
    template="""
        Consider the following Dataset Structure:\n{structure}\n\n

        Consider the following Node Definitions:\n{node_definitions}\n\n

        Based on the dataset structure and node definitions, identify relationships (edges) between nodes.\n
        Return the relationships as a list of triples where each triple contains the start node label, relationship label, and end node label, and each triple is a tuple.\n
        Please return only the list of tuples. Please do not report triple backticks to identify a code block, just return the list of tuples.\n\n

        Example:\n{example}
        """
)

    def __init__(self, llm: Any, logger: logging.Logger = None):
        self.llm = llm
        self.logger = logger or logging.getLogger(__name__)
        self.chain = self.PROMPT_TEMPLATE | self.llm

    def validate_relationships(self, relationships: List[Tuple]) -> bool:
        """Validate relationship structure."""
        return all(
            isinstance(rel, tuple) and 
            len(rel) == 3 and 
            all(isinstance(x, str) for x in rel)
            for rel in relationships
        )

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    def identify_relationships(self, structure: str, node_definitions: Dict) -> List[Tuple]:
        """Identify relationships with retry logic."""
        try:
            # response = self.chain.run(
            #     structure=structure,
            #     node_definitions=str(node_definitions),
            #     example=str(self.RELATIONSHIP_EXAMPLE)
            # )
            response = self.chain.invoke({
                "structure": structure, 
                "node_definitions": str(node_definitions), 
                "example": str(self.RELATIONSHIP_EXAMPLE)
            })
            
            relationships = ast.literal_eval(response)
            
            if not self.validate_relationships(relationships):
                raise ValueError("Invalid relationship structure")
                
            self.logger.info(f"Identified {len(relationships)} relationships")
            return relationships
            
        except Exception as e:
            self.logger.error(f"Error identifying relationships: {e}")
            raise

    def get_relationship_types(self) -> List[str]:
        """Extract unique relationship types."""
        return list(set(rel[1] for rel in self.identify_relationships()))

# Usage
identifier = RelationshipIdentifier(llm=llm)
relationships = identifier.identify_relationships(node_structure, node_definitions)
print("Relationships:", relationships)

INFO:__main__:Identified 3 relationships


Relationships: [('Publication', 'HAS_CONTRIBUTOR', 'Contributor'), ('Publication', 'HAS_DOCUMENT_CONTRIBUTOR', 'DocumentContributor'), ('Thesis', 'IS_A', 'Publication')]


In [126]:
class CypherQueryBuilder:
    """Builds Cypher queries for Neo4j graph database."""

    INPUT_EXAMPLE = """
    NodeLabel1: value1, value2
    NodeLabel2: value1, value2
    """
    
    EXAMPLE_CYPHER = example_cypher = """
    CREATE (n1:NodeLabel1 {property1: "row['property1']", property2: "row['property2']"})
    CREATE (n2:NodeLabel2 {property1: "row['property1']", property2: "row['property2']"})
    CREATE (n1)-[:RelationshipLabel]->(n2);
    """

    PROMPT_TEMPLATE = PromptTemplate(
    input_variables=["structure", "node_definitions", "relationships", "example"],
    template="""
        Consider the following Node Definitions:\n{node_definitions}\n\n
        Consider the following Relationships:\n{relationships}\n\n
        Generate Cypher queries to create nodes and relationships using the node definitions and relationships below. Remember to replace the placeholder values with actual data from the dataset.\n
        Include all the properties in the Node Definitions for each node as defined and create relationships.\n
        Return a single string with each query separated by a semicolon.\n
        Don't include any other text or quotation marks in the response.\n
        Please return only the string containing Cypher queries. Please do not report triple backticks to identify a code block.\n\n

        Example Input:\n{input}\n\n

        Example Output Cypher query:\n{cypher}
    """
)

    def __init__(self, llm: Any, logger: logging.Logger = None):
        self.llm = llm
        self.logger = logger or logging.getLogger(__name__)
        # self.chain = LLMChain(llm=llm, prompt=self.PROMPT_TEMPLATE)
        self.chain = self.PROMPT_TEMPLATE | self.llm

    def validate_cypher_query(self, query: str) -> bool:
        """Validate Cypher query syntax using LLM and regex patterns."""
        
        VALIDATION_PROMPT = PromptTemplate(
            input_variables=["query"],
            template="""
            Validate this Cypher query and return TRUE or FALSE:
            
            Query: {query}
            
            Rules to check:
            1. Valid CREATE statements
            2. Proper property formatting
            3. Valid relationship syntax
            4. No missing parentheses
            5. Valid property names
            6. Valid relationship types
            
            Return only TRUE if query is valid, FALSE if invalid.
            """
        )
        
        try:
            # Basic pattern validation
            basic_valid = all(re.search(pattern, query) for pattern in [
                r'CREATE \(',  
                r'\{.*?\}',    
                r'\)-\[:.*?\]->'
            ])
            
            if not basic_valid:
                return False
                
            # LLM validation
            validation_chain = VALIDATION_PROMPT | self.llm
            result = validation_chain.invoke({"query": query})
            
            # Parse result
            is_valid = "TRUE" in result.upper()
            
            if not is_valid:
                self.logger.warning(f"LLM validation failed for query: {query}")
                
            return is_valid
            
        except Exception as e:
            self.logger.error(f"Validation error: {e}")
            return False

    def sanitize_query(self, query: str) -> str:
        """Sanitize and format Cypher query."""
        return (query
                .strip()
                .replace('\n', ' ')
                .replace('  ', ' ')
                .replace("'row[", "row['")
                .replace("]'", "']"))

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    def build_queries(self, node_definitions: Dict, relationships: List) -> str:
        """Build Cypher queries with retry logic."""
        try:
            response = self.chain.invoke({
                "node_definitions": str(node_definitions),
                "relationships": str(relationships),
                "input": self.INPUT_EXAMPLE,
                "cypher": self.EXAMPLE_CYPHER
            })

            # Get response inside triple backticks
            if '```' in response:
                response = response.split('```')[1]

            
            # Sanitize response
            queries = self.sanitize_query(response)
            
            # Validate queries
            if not self.validate_cypher_query(queries):
                raise ValueError("Invalid Cypher query syntax")
                
            self.logger.info("Successfully generated Cypher queries")
            return queries
            
        except Exception as e:
            self.logger.error(f"Error building Cypher queries: {e}")
            raise

    def split_queries(self, queries: str) -> List[str]:
        """Split combined queries into individual statements."""
        return [q.strip() for q in queries.split(';') if q.strip()]

# Usage
builder = CypherQueryBuilder(llm=llm)
cypher_queries = builder.build_queries(node_definitions, relationships)
print("Cypher Queries:", cypher_queries)

INFO:__main__:Successfully generated Cypher queries


Cypher Queries: CREATE (c:Contributor {name: row['contributor']}); CREATE (dc:DocumentContributor {name: row['2docontributor']}); CREATE (p:Publication {date: row['date'], abstract: row['abstract'], description: row['description'], identifier: row['identifier'], identifier_uri: row['identifier.uri'], subject: row['subject'], title: row['title']}); CREATE (t:Thesis {degree_name: row['thesis.degree.name'], degree_program: row['thesis.degree.program']}); CREATE (p)-[:HAS_CONTRIBUTOR]->(c); CREATE (p)-[:HAS_DOCUMENT_CONTRIBUTOR]->(dc); CREATE (t)-[:IS_A]->(p);


In [133]:
# Iterate over dataframe with progress bar
def sanitize_value(value):
    if isinstance(value, str):
        # Neo4j escapes quotes by doubling them
        escaped_value = value.replace('"', '""')
        return f'"{escaped_value}"'
    elif value is None:
        return "null"
    else:
        return str(value)

# Iterate over dataframe with progress bar
logs = ""
total_rows = len(df)

for index, row in tqdm(df.iterrows(), 
                     total=total_rows,
                     desc="Loading data to Neo4j",
                     position=0,
                     leave=True):
    
    # Replace placeholders with actual values from each row
    cypher_query = cypher_queries
    for column in df.columns:
        placeholder = f"row['{column}']"
        if placeholder in cypher_query:
            cypher_query = cypher_query.replace(
                placeholder, 
                sanitize_value(row[column])
            )
    
    # Instead of splitting by semicolons within the query,
    # use the builder's split_queries method only on the template
    individual_queries = builder.split_queries(cypher_query)
    
    # Execute each query statement
    for q in individual_queries:
        try:
            conn.execute_query(q)
        except Exception as e:
            logs += f"Error on row {index+1}: {str(e)}\n"


# Display logs
print(logs) # Uncomment to display logs

Loading data to Neo4j: 100%|██████████| 729/729 [00:24<00:00, 29.85it/s]

Error on row 52: {code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input '"Cereales Waru Perú"': expected an expression, ',' or '}' (line 1, column 568 (offset: 567))
"CREATE (p:Publication {date: 2023, abstract: "En el tercer capítulo, se realiza la descripción de la empresa, su misión, visión y valores, definiendo sus procesos y productos principales y realizando una breve descripción de sus instalaciones.", description: "Tesis de pregrado", identifier: 1079981, identifier_uri: "Https://hdl.handle.net/20.500.12590/17756", subject: "Inocuidad||HACCP||Puntos críticos de Control (PCC)||Codex alimentario||Árbol de decisiones||Calidad", title: "Propuesta de implementación del plan HACCP para la mejora del proceso productivo de ""Cereales Waru Perú"""})"
                                                                                                                                                                                                                                 




In [43]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("jrobischon/wikipedia-movie-plots")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/jrobischon/wikipedia-movie-plots?dataset_version_number=1...


100%|██████████| 29.9M/29.9M [00:02<00:00, 13.5MB/s]

Extracting files...





Path to dataset files: C:\Users\itsma\.cache\kagglehub\datasets\jrobischon\wikipedia-movie-plots\versions\1


In [145]:
query = """
MATCH (n)-[r]->(m)
RETURN labels(n) AS from_node, type(r) AS relationship, labels(m) AS to_node
LIMIT 25;
"""
conn.execute_query(query)


[<Record from_node=[] relationship='HAS_CONTRIBUTOR' to_node=[]>,
 <Record from_node=[] relationship='HAS_DOCUMENT_CONTRIBUTOR' to_node=[]>,
 <Record from_node=[] relationship='IS_A' to_node=[]>,
 <Record from_node=[] relationship='HAS_CONTRIBUTOR' to_node=[]>,
 <Record from_node=[] relationship='HAS_DOCUMENT_CONTRIBUTOR' to_node=[]>,
 <Record from_node=[] relationship='IS_A' to_node=[]>,
 <Record from_node=[] relationship='HAS_CONTRIBUTOR' to_node=[]>,
 <Record from_node=[] relationship='HAS_DOCUMENT_CONTRIBUTOR' to_node=[]>,
 <Record from_node=[] relationship='IS_A' to_node=[]>,
 <Record from_node=[] relationship='HAS_CONTRIBUTOR' to_node=[]>,
 <Record from_node=[] relationship='HAS_DOCUMENT_CONTRIBUTOR' to_node=[]>,
 <Record from_node=[] relationship='IS_A' to_node=[]>,
 <Record from_node=[] relationship='HAS_CONTRIBUTOR' to_node=[]>,
 <Record from_node=[] relationship='HAS_DOCUMENT_CONTRIBUTOR' to_node=[]>,
 <Record from_node=[] relationship='IS_A' to_node=[]>,
 <Record from_node=[

In [144]:
query = """
CALL db.labels()
"""
conn.execute_query(query)


[<Record label='Publication'>,
 <Record label='Thesis'>,
 <Record label='Contributor'>,
 <Record label='DocumentContributor'>]

In [146]:
conn.reset_database()


Database resetted successfully!


In [147]:
llm_transformer = LLMGraphTransformer(
    llm=llm,
)

df_sample = df # Reduce sample size for faster processing

documents = []
for _, row in tqdm(df_sample.iterrows(), 
                   total=len(df_sample), 
                   desc="Creating documents",
                   position=0, 
                   leave=True):
    try:
        # Format text with proper line breaks
        text = ""

        for col in df.columns:
            text += f"{col}: {row[col]}\n"
        
        documents.append(Document(page_content=text))
        
    except KeyError as e:
        tqdm.write(f"Missing column: {e}")
    except Exception as e:
        tqdm.write(f"Error processing row: {e}")

Creating documents: 100%|██████████| 729/729 [00:00<00:00, 12979.89it/s]


In [148]:
graph_documents = await llm_transformer.aconvert_to_graph_documents(documents)
print(f"Nodes:{graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")

Nodes:[Node(id='renzo gabriel', type='Person', properties={}), Node(id='Ingeniero industrial', type='Degree', properties={}), Node(id='Tesis', type='Work', properties={}), Node(id='Escuela profesional de ingeniería industrial', type='Program', properties={}), Node(id='Valencia rivera', type='Person', properties={}), Node(id='Disponibilidad', type='Subject', properties={}), Node(id='establecer indicadores que permitan realizar un adecuado seguimiento y control de la propuesta', type='Objective', properties={}), Node(id='Análisis de criticidad', type='Subject', properties={}), Node(id='mejorar la disponibilidad de los grifos contra incendios ubicados en la provincia de Arequipa', type='Objective', properties={}), Node(id='Organigrama', type='Subject', properties={}), Node(id='zegarra ramos', type='Person', properties={}), Node(id='felipe eladio', type='Person', properties={}), Node(id='Propuesta de mejora aplicada al mantenimiento de los grifos contra incendios públicos, ubicados en la p

In [150]:
graph = Neo4jGraph(url=uri, username=user, password=password, enhanced_schema=True)
graph.add_graph_documents(graph_documents)

ValueError: Could not connect to Neo4j database. Please ensure that the url is correct

In [None]:
graph.refresh_schema()

# llm_chat = ChatGoogleGenerativeAI(
#     model="gemini-1.5-pro",
#     temperature=0,
#     max_tokens=None,
#     timeout=None,
#     max_retries=2,
#     api_key=api_key
# )

CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Identify the main node, and return all the relationships and nodes connected to it.
If no properties are provided, assume the nodes have only a property id.
Please don't filter on relationships or connected nodes.

Format the query as follows:
MATCH p=(n:NodeLabel)-[r]-(m)
WHERE n.id = 'value1'
RETURN p

The question is:
{question}"""

CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)

chain = GraphCypherQAChain.from_llm(
    llm, 
    graph=graph, 
    verbose=True, 
    allow_dangerous_requests=True, 
    return_intermediate_steps=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT
)

chain.run("Give me an overview of the movie titled David Copperfield.")