In [1]:
from langchain.graphs import Neo4jGraph

import getpass
import os

# Connection settings are taken from the environment so that no credential is
# stored in the notebook (or in its saved outputs / version control history).
# The URL and user name fall back to the local defaults used previously; the
# password has no default and is prompted for interactively when it is unset.
url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")

# Shared graph handle used by the ingestion, query and clean-up cells below.
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)

In [2]:
from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

# One extracted property expressed as a key/value pair; both fields are
# declared as required strings.
# NOTE(review): the docstring and Field descriptions are presumably emitted as
# part of the structured-output schema, so treat them as runtime text.
class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

# Extends the base graph Node so that properties are carried as an optional
# list of Property entries (None when nothing was supplied).
class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Extends the base graph Relationship so that properties are carried as an
# optional list of Property entries (None when nothing was supplied).
class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

# Top-level container for one extraction result: the required list of nodes
# plus the required list of relationships between them.
# NOTE(review): this class is passed to create_structured_output_chain in
# get_extraction_chain, so the docstring and Field descriptions are presumably
# part of the schema shown to the model - edit them with care.
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [4]:
def format_property_key(s: str) -> str:
    """Collapse a whitespace-separated phrase into a single camelCase-style key.

    The first word is lower-cased in full and every following word is passed
    through ``str.capitalize`` (first letter upper, remainder lower) before the
    pieces are concatenated. Input that contains no words at all (empty or
    whitespace-only) is handed back untouched.
    """
    tokens = s.split()
    if len(tokens) == 0:
        return s
    head, *rest = tokens
    return head.lower() + "".join(map(str.capitalize, rest))

def props_to_dict(props) -> dict:
    """Turn an iterable of key/value property objects into a plain dict.

    Each item's ``key`` is normalised with ``format_property_key`` and mapped
    to the item's ``value``; when two items normalise to the same key the later
    one wins. A missing or empty ``props`` produces a new empty dict.
    """
    if not props:
        return {}
    return {format_property_key(item.key): item.value for item in props}

def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted KnowledgeGraph Node into a base graph Node.

    The node id is title-cased and the type is capitalised. The property list
    (if any) is flattened with ``props_to_dict`` and a ``name`` entry holding
    the title-cased id is always written on top of it.
    """
    attrs = {}
    if node.properties:
        attrs = props_to_dict(node.properties)
    display_id = node.id.title()
    # A "name" property gives Cypher generation a readable attribute to match on
    attrs["name"] = display_id
    return BaseNode(id=display_id, type=node.type.capitalize(), properties=attrs)


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted KnowledgeGraph Relationship into a base Relationship.

    Both endpoints go through ``map_to_base_node``; the relationship type is
    passed on unchanged and the property list (if any) is flattened with
    ``props_to_dict``.
    """
    start_node = map_to_base_node(rel.source)
    end_node = map_to_base_node(rel.target)
    rel_props = {}
    if rel.properties:
        rel_props = props_to_dict(rel.properties)
    return BaseRelationship(
        source=start_node,
        target=end_node,
        type=rel.type,
        properties=rel_props,
    )


In [5]:
# System prompt used by get_extraction_chain for academic papers.
# Deliberately a plain string (not an f-string): it has no placeholders, and it
# is later handed to ChatPromptTemplate, so it must stay free of curly braces.
prompt_for_academic_paper = """
## Creating a Knowledge Graph from Academic Papers

### Objective
Design a sophisticated algorithm to read academic papers and extract structured information for constructing a knowledge graph. This graph aims to elucidate the relationships between various methods, tasks prioritized within the research, their comparative superiority, and potential replacements for methods.

### Instructions

#### Node Identification and Labeling
- **Entities and Concepts**: Identify primary entities such as methods, tasks, and research findings as nodes. Label these nodes with elementary types for consistency.
- **Method** nodes represent algorithms or techniques introduced or discussed.
- **Task** nodes represent problems or objectives the methods aim to solve or achieve.
- **Finding** nodes encapsulate specific outcomes, comparisons, or advancements highlighted in the paper.

### Allowed Node Labels:
- **Method**: Represents algorithms or techniques introduced or discussed within the paper.
- **Task**: Represents problems or objectives that methods aim to solve or achieve.
- **Finding**: Encapsulates specific outcomes, comparisons, or advancements highlighted.
- **Person**: Represents authors or researchers mentioned in the paper.
- **Institution**: Represents universities, research centers, or organizations affiliated with the authors.
- **Concept**: Represents fundamental concepts or theories underlying methods or tasks.
- **Dataset**: Represents datasets used or introduced in the paper for experimentation or evaluation.

#### Relationships
- **Defining Relationships**: Establish clear relationships between nodes to reflect the paper's content accurately.
- Use relationships like **"addressesTask"**, **"outperformsMethod"**, and **"canBeReplacedWith"** to demonstrate the interaction between methods and tasks, comparative superiority, and methodological replacements.
- Ensure relationships are directly extracted from the text, clearly indicating the nature of connections between entities.

### Allowed Relationship Types:
- **addressesTask**: Indicates that a method is applied to address a specific task.
- **outperformsMethod**: Indicates that one method outperforms another in terms of effectiveness, efficiency, or other metrics.
- **canBeReplacedWith**: Suggests that one method can be replaced with another, offering similar or improved outcomes.
- **authoredBy**: Links a paper or finding to its authors.
- **affiliatedWith**: Associates authors or researchers with their respective institutions.
- **usesDataset**: Indicates that a method utilizes a specific dataset for training, testing, or validation.
- **buildsUpon**: Signifies that a method or concept builds upon previous work, indicating progression or enhancement.
- **comparesWith**: Indicates a direct comparison between two or more methods or tasks within the paper.


#### Handling Numerical Data and Dates
- **Attributes**: Incorporate numerical data and dates as attributes of nodes where relevant, such as the publication year of the method or performance metrics of findings.
- Follow the key-value format with camelCase keys, avoiding quotes within property values.

#### Coreference Resolution
- **Entity Consistency**: Maintain consistent identifiers for entities mentioned multiple times, using the most complete and unambiguous identifier found in the paper.
- This ensures clarity and coherence in the knowledge graph, aiding in the accurate representation of relationships and findings.

#### Graph Construction Guidelines
- **Simplicity and Clarity**: Aim for a knowledge graph that is easily navigable, making the research's key points, methodologies, and conclusions accessible to a broad audience.
- **Data Source**: Utilize academic papers as the primary source of information, focusing on sections that discuss methods, results, and conclusions for direct extraction of relevant data.

### Example
- **Nodes**:
- Method: "Convolutional Neural Network (CNN)"
- Task: "Image Classification"
- Finding: "Improved accuracy over previous methods by 5%"
- **Relationships**:
- "CNN" **addressesTask** "Image Classification"
- "CNN" **outperformsMethod** "Traditional Neural Networks"
- "CNN" **canBeReplacedWith** "ResNet for complex image tasks"

This structured approach to creating a knowledge graph from academic papers facilitates the extraction of insightful information regarding methodological innovations, task relevance, and the comparative effectiveness of various approaches within the research community.
"""

def build_wiki_prompt(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None,
) -> str:
    """Render the generic (Wikipedia-style) extraction prompt.

    Args:
        allowed_nodes: Node labels the model may use. When empty or None the
            corresponding rule line is left blank.
        allowed_rels: Relationship types the model may use. When empty or None
            the corresponding rule line is left blank.

    Returns:
        The prompt text with the two optional rule lines filled in.
    """
    # These two rules used to sit inside the string as unevaluated
    # "{...allowed_nodes...}" expressions; they are now computed up front.
    node_rule = (
        "- **Allowed Node Labels:**" + ", ".join(allowed_nodes) if allowed_nodes else ""
    )
    rel_rule = (
        "- **Allowed Relationship Types**:" + ", ".join(allowed_rels) if allowed_rels else ""
    )
    return f"""
# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{node_rule}
{rel_rule}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
"""


# Unrestricted variant of the generic prompt (no label / relationship limits).
wiki_prompt = build_wiki_prompt()

In [13]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

import getpass

# The API key must come from the environment. Re-assigning
# os.environ.get(...) to itself did nothing when the key was set and raised a
# TypeError (None is not a str) when it was missing, so prompt for it instead.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Deterministic (temperature 0) model shared by the extraction chain.
llm = ChatOpenAI(model="gpt-4-0125-preview", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the structured-output chain that extracts a KnowledgeGraph.

    Args:
        allowed_nodes: Optional node labels to restrict the extraction to.
            Previously accepted but ignored; now appended to the system prompt.
        allowed_rels: Optional relationship types to restrict the extraction
            to. Previously accepted but ignored; now appended to the prompt.

    Returns:
        The chain created by ``create_structured_output_chain`` for the
        ``KnowledgeGraph`` schema, the module-level ``llm`` and the prompt.
        With both arguments left as None the prompt is exactly
        ``prompt_for_academic_paper``, as before.
    """
    system_prompt = prompt_for_academic_paper

    restrictions = []
    if allowed_nodes:
        restrictions.append("- **Allowed Node Labels**: " + ", ".join(allowed_nodes))
    if allowed_rels:
        restrictions.append("- **Allowed Relationship Types**: " + ", ".join(allowed_rels))
    if restrictions:
        extra = (
            "\n### Caller-Supplied Restrictions\n"
            "Use only the labels and relationship types listed here; "
            "they take precedence over the lists above.\n"
            + "\n".join(restrictions)
            + "\n"
        )
        # Double any braces so label text is never parsed as a template variable
        system_prompt += extra.replace("{", "{{").replace("}", "}}")

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [14]:
def extract_and_store_graph(
    document: Document,
    nodes:Optional[List[str]] = None,
    rels:Optional[List[str]]=None) -> None:
    """Extract a knowledge graph from one document chunk and persist it.

    The chunk's ``page_content`` is run through the extraction chain, the
    returned nodes and relationships are mapped to their base graph types, and
    the resulting GraphDocument (with ``document`` as its source) is written
    through the module-level ``graph`` handle.
    """
    # Run the structured-output chain over the chunk text
    extracted = get_extraction_chain(nodes, rels).run(document.page_content)

    # Map the extracted objects onto the base graph types
    base_nodes = [map_to_base_node(item) for item in extracted.nodes]
    base_rels = [map_to_base_relationship(item) for item in extracted.rels]

    # Persist everything as a single graph document
    graph.add_graph_documents(
        [GraphDocument(nodes=base_nodes, relationships=base_rels, source=document)]
    )

#### TeXの場合

In [59]:
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import TextLoader
# Load the TeX source as text documents.
loader = TextLoader("./data/MAML/senstive.tex")
raw_documents = loader.load()
# Token-based splitter: chunks of 2048 tokens with a 24-token overlap.
text_splitter = TokenTextSplitter(chunk_size=2048, chunk_overlap=24)

# Split every loaded document into chunks (no subset of raw_documents is taken here).
documents = text_splitter.split_documents(raw_documents)

#### 普通にPDF=>文字だけを入れる

In [10]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import TextLoader

def convert_pdf_to_text(pdf_path, txt_path):
    """Extract the text of a PDF and write it to a plain-text file.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the text file to create or overwrite. Missing parent
            directories are created first.

    The ``page_content`` of every chunk returned by
    ``PyPDFLoader.load_and_split()`` is concatenated with no separator.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split()
    # A single join avoids rebuilding the string on every iteration
    content = "".join(page.page_content for page in pages)

    # Without this, open() fails with FileNotFoundError on a fresh checkout
    # where the output folder has not been created yet.
    Path(txt_path).parent.mkdir(parents=True, exist_ok=True)
    # NOTE(review): written with the platform default encoding, as before;
    # consider an explicit encoding if files must round-trip across machines.
    with open(txt_path, "w") as file:
        file.write(content)

#convert_pdf_to_text("./DB/PDF/meta-ticket.pdf", "./DB/TXT/meta-ticket.txt")

### PDFをテキストに変換する

In [None]:
import os
import glob
from tqdm import tqdm
from langchain.document_loaders import PyPDFLoader

# Dataset root and the names of its sub-folders for source PDFs and extracted text.
DB_PATH = "miniF2F/"
PDF_PATH = "PDF/"
TXT_PATH = "TXT/"

# Input and output directories: DB_PATH joined with each sub-folder name.
# txt_directory is also read by the later cell that globs for *.txt files.
pdf_directory = os.path.join(DB_PATH, PDF_PATH)
txt_directory = os.path.join(DB_PATH, TXT_PATH)

def convert_pdf_to_text(pdf_path, txt_path):
    """Extract the text of a PDF and write it to a plain-text file.

    NOTE(review): this notebook defines ``convert_pdf_to_text`` in an earlier
    cell as well; whichever cell ran last wins. Keep only one definition.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the text file to create or overwrite. Missing parent
            directories are created first.

    The ``page_content`` of every chunk returned by
    ``PyPDFLoader.load_and_split()`` is concatenated with no separator.
    """
    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split()
    # A single join avoids rebuilding the string on every iteration
    content = "".join(page.page_content for page in pages)

    # Create the output folder on demand so a fresh checkout does not fail
    # with FileNotFoundError when TXT/ has not been created yet.
    out_dir = os.path.dirname(txt_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # NOTE(review): written with the platform default encoding, as before;
    # consider an explicit encoding if files must round-trip across machines.
    with open(txt_path, "w") as file:
        file.write(content)

# Collect every PDF located directly inside pdf_directory
pdf_files = glob.glob(os.path.join(pdf_directory, "*.pdf"))

# Write one .txt per PDF into txt_directory, reusing the PDF's base name
for pdf_file in tqdm(pdf_files):
    file_name = os.path.basename(pdf_file)
    txt_file = os.path.join(
        txt_directory,
        "{}.txt".format(os.path.splitext(file_name)[0]),
    )
    convert_pdf_to_text(pdf_file, txt_file)

### テキストをKGに変換する

In [27]:
import os
import glob
from tqdm import tqdm
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import TextLoader


def convert_txt_to_documents(txt_path, chunk_size=4096, chunk_overlap=48):
    """Load a text file and split it into token-bounded document chunks.

    Args:
        txt_path: Path of the text file to load.
        chunk_size: Maximum chunk size passed to ``TokenTextSplitter``
            (default 4096, the value that used to be hard-coded).
        chunk_overlap: Overlap between consecutive chunks passed to
            ``TokenTextSplitter`` (default 48, the previous hard-coded value).

    Returns:
        The chunks produced by ``TokenTextSplitter.split_documents``.
    """
    raw_documents = TextLoader(txt_path).load()
    text_splitter = TokenTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(raw_documents)

In [28]:
# Find all TXT files in the directory. glob returns paths in arbitrary,
# OS-dependent order, so sort them: the code below picks txt_files[0] and must
# select the same file on every run.
txt_files = sorted(glob.glob(os.path.join(txt_directory, "*.txt")))

# To ingest every file instead of only the first one:
# for txt_file in tqdm(txt_files):
#     documents = convert_txt_to_documents(txt_file)
#     for i, d in tqdm(enumerate(documents), total=len(documents)):
#         extract_and_store_graph(d)

# One by one
if not txt_files:
    # Fail with an actionable message rather than a bare IndexError
    raise FileNotFoundError(
        f"No .txt files found in {txt_directory!r}; run the PDF-to-text cell first."
    )
documents = convert_txt_to_documents(txt_files[0])
# Each chunk triggers one LLM extraction call followed by a graph write
for i, d in tqdm(enumerate(documents), total=len(documents)):
    extract_and_store_graph(d)

100%|██████████| 7/7 [04:29<00:00, 38.52s/it]


In [24]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain

# Re-read the schema so the chain sees the labels and relationship types
# written by the ingestion cells.
graph.refresh_schema()

cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    # Query generation should be deterministic: at temperature 0.5 a recorded
    # run produced Cypher for an entity other than the one asked about.
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-4-0125-preview"),
    # The answer-writing model keeps its original temperature.
    qa_llm=ChatOpenAI(temperature=0.5, model="gpt-4-0125-preview"),
    validate_cypher=True, # Validate relationship directions
    verbose=True
)
cypher_chain.run("What do you know about Llemma?")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (n {name: "Gptf"})-[:COMPARESWITH|OUTPERFORMSMETHOD|USESDATASET|AFFILIATEDWITH|ADDRESSESTASK|USESMETHOD|BUILDSUPON|HASFINDING|IMPACTEDBY|CONTRIBUTEDTO*]-(relatedNodes)
RETURN n, relatedNodes
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


"I don't know the answer."

### グラフを削除したい場合

In [25]:
# Delete the graph
# WARNING: destructive and irreversible - this matches every node in the
# connected database and DETACH DELETEs it, removing its relationships too.
# Skip this cell when running the whole notebook unless a wipe is intended.
graph.query("MATCH (n) DETACH DELETE n")

[]