# Graph Extraction from Text

This tutorial utilizes huggingface LLMs to preprocess and generate knowledge graphs from input raw text.

# Install Dependencies

In [1]:
%pip install llama-index graspologic numpy==1.24.4 scipy==1.12.0
%pip install llama-index llama-index-llms-huggingface transformers accelerate

Collecting llama-index
  Downloading llama_index-0.12.37-py3-none-any.whl.metadata (12 kB)
Collecting graspologic
  Downloading graspologic-3.4.1-py3-none-any.whl.metadata (5.8 kB)
Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting scipy==1.12.0
  Downloading scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-index-agent-openai<0.5,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.8-py3-none-any.whl.metadata (438 bytes)
Collecting llama-index-cli<0.5,>=0.4.1 (from llama-index)
  Downloading llama_index_cli-0.4.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13,>=0.12.36 (from llama-index)
  Downloading llama_index_core-0.12.37-py3-none-any.whl.metadata (2.4 kB)
Collecting llama-inde

In [2]:
%pip install llama-index-embeddings-huggingface
%pip install llama-index-embeddings-instructor

Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.5.4-py3-none-any.whl.metadata (458 bytes)
Downloading llama_index_embeddings_huggingface-0.5.4-py3-none-any.whl (8.9 kB)
Installing collected packages: llama-index-embeddings-huggingface
Successfully installed llama-index-embeddings-huggingface-0.5.4
Note: you may need to restart the kernel to use updated packages.
Collecting llama-index-embeddings-instructor
  Downloading llama_index_embeddings_instructor-0.3.0-py3-none-any.whl.metadata (808 bytes)
Collecting instructorembedding<2.0.0,>=1.0.1 (from llama-index-embeddings-instructor)
  Downloading InstructorEmbedding-1.0.1-py2.py3-none-any.whl.metadata (20 kB)
Collecting sentence-transformers<3.0.0,>=2.2.2 (from llama-index-embeddings-instructor)
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Downloading llama_index_embeddings_instructor-0.3.0-py3-none-any.whl (3.6 kB)
Downloading InstructorEmbedding-1.0.1-py2.p

# Load and Preprocess Data

In [3]:
import pandas as pd
news = pd.read_csv("https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv")[:5]
news

Unnamed: 0,title,date,text
0,Chevron: Best Of Breed,2031-04-06T01:36:32.000000000+00:00,JHVEPhoto Like many companies in the O&G secto...
1,FirstEnergy (NYSE:FE) Posts Earnings Results,2030-04-29T06:55:28.000000000+00:00,FirstEnergy (NYSE:FE – Get Rating) posted its ...
2,Dáil almost suspended after Sinn Féin TD put p...,2023-06-15T14:32:11.000000000+00:00,The Dáil was almost suspended on Thursday afte...
3,Epic’s latest tool can animate hyperrealistic ...,2023-06-15T14:00:00.000000000+00:00,"Today, Epic is releasing a new tool designed t..."
4,"EU to Ban Huawei, ZTE from Internal Commission...",2023-06-15T13:50:00.000000000+00:00,The European Commission is planning to ban equ...


In [4]:
import pandas as pd
from llama_index.core import Document

# Convert data into LlamaIndex Document objects
documents = [
    Document(text=f"{row['title']}: {row['text']}")
    for _, row in news.iterrows()
]

In [5]:
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=20,
)
nodes = splitter.get_nodes_from_documents(documents)

# Insert hugging huggingface token to access LLMs and embeddings

In [6]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
import pandas as pd
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.huggingface import HuggingFaceLLM
import re
from llama_index.core import PropertyGraphIndex
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import torch

llm = HuggingFaceLLM(
    model_name="mistralai/Mistral-7B-Instruct-v0.3",  # or any supported model
    tokenizer_name="mistralai/Mistral-7B-Instruct-v0.3",
    context_window=4096,
    max_new_tokens=2048,
    device_map="cuda",
    model_kwargs={
        "torch_dtype": torch.bfloat16,  # Even better if your hardware supports it
        "offload_folder": "offload",  # For very large models
    },
   
    generate_kwargs={"temperature": 0.7, "do_sample": True},
    tokenizer_kwargs={"padding_side": "left", "truncation_side": "left"},
)

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

2025-05-22 12:14:56.588249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747916096.778316      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747916096.833791      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [8]:
import re

entity_pattern = r"entity_name:\s*(.*?)\s*entity_type:\s*(.*?)\s*entity_description:\s*(.*?)(?=\n|$)"
relationship_pattern = r"source_entity:\s*(.*?)\s*target_entity:\s*(.*?)\s*relation:\s*(.*?)\s*relationship_description:\s*(.*?)(?=\n|$)"

def parse_fn(response_str: str):
    entities = re.findall(entity_pattern, response_str, flags=re.DOTALL)
    relationships = re.findall(relationship_pattern, response_str, flags=re.DOTALL)
    return entities, relationships

In [10]:
import asyncio
import nest_asyncio

nest_asyncio.apply()

from typing import Any, List, Callable, Optional, Union, Dict
from IPython.display import Markdown, display

from llama_index.core.async_utils import run_jobs
from llama_index.core.indices.property_graph.utils import (
    default_parse_triplets_fn,
)
from llama_index.core.graph_stores.types import (
    EntityNode,
    KG_NODES_KEY,
    KG_RELATIONS_KEY,
    Relation,
)
from llama_index.core.llms.llm import LLM
from llama_index.core.prompts import PromptTemplate
from llama_index.core.prompts.default_prompts import (
    DEFAULT_KG_TRIPLET_EXTRACT_PROMPT,
)
from llama_index.core.schema import TransformComponent, BaseNode
from llama_index.core.bridge.pydantic import BaseModel, Field

_entities=[]
_rels=[]

class GraphRAGExtractor(TransformComponent):
    """Extract triples from a graph.

    Uses an LLM and a simple prompt + output parsing to extract paths (i.e. triples) and entity, relation descriptions from text.

    Args:
        llm (LLM):
            The language model to use.
        extract_prompt (Union[str, PromptTemplate]):
            The prompt to use for extracting triples.
        parse_fn (callable):
            A function to parse the output of the language model.
        num_workers (int):
            The number of workers to use for parallel processing.
        max_paths_per_chunk (int):
            The maximum number of paths to extract per chunk.
    """

    llm: LLM
    extract_prompt: PromptTemplate
    parse_fn: Callable
    num_workers: int
    max_paths_per_chunk: int

    def __init__(
        self,
        llm: Optional[LLM] = None,
        extract_prompt: Optional[Union[str, PromptTemplate]] = None,
        parse_fn: Callable = default_parse_triplets_fn,
        max_paths_per_chunk: int = 10,
        num_workers: int = 4,
    ) -> None:
        """Init params."""
        from llama_index.core import Settings

        if isinstance(extract_prompt, str):
            extract_prompt = PromptTemplate(extract_prompt)

        super().__init__(
            llm=llm or Settings.llm,
            extract_prompt=extract_prompt or DEFAULT_KG_TRIPLET_EXTRACT_PROMPT,
            parse_fn=parse_fn,
            num_workers=num_workers,
            max_paths_per_chunk=max_paths_per_chunk,
        )

    @classmethod
    def class_name(cls) -> str:
        return "GraphExtractor"

    def __call__(
        self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """Extract triples from nodes."""
        return asyncio.run(
            self.acall(nodes, show_progress=show_progress, **kwargs)
        )

    async def _aextract(self, node: BaseNode) -> BaseNode:
        """Extract triples from a node."""
        assert hasattr(node, "text")

        text = node.get_content(metadata_mode="llm")
        try:
            llm_response = await self.llm.apredict(
                self.extract_prompt,
                text=text,
                max_knowledge_triplets=self.max_paths_per_chunk,
            )
            entities, entities_relationship = self.parse_fn(llm_response)
            print(entities)
            _entities.append(entities)
            print(entities_relationship)
            _rels.append(entities_relationship)
        except ValueError:
            entities = []
            entities_relationship = []

        existing_nodes = node.metadata.pop(KG_NODES_KEY, [])
        existing_relations = node.metadata.pop(KG_RELATIONS_KEY, [])
        metadata = node.metadata.copy()
        for entity, entity_type, description in entities:
            metadata[
                "entity_description"
            ] = description  # Not used in the current implementation. But will be useful in future work.
            entity_node = EntityNode(
                name=entity, label=entity_type, properties=metadata
            )
            existing_nodes.append(entity_node)

        metadata = node.metadata.copy()
        for triple in entities_relationship:
            subj, rel, obj, description = triple
            subj_node = EntityNode(name=subj, properties=metadata)
            obj_node = EntityNode(name=obj, properties=metadata)
            metadata["relationship_description"] = description
            rel_node = Relation(
                label=rel,
                source_id=subj_node.id,
                target_id=obj_node.id,
                properties=metadata,
            )

            existing_nodes.extend([subj_node, obj_node])
            existing_relations.append(rel_node)


        node.metadata[KG_NODES_KEY] = existing_nodes
        node.metadata[KG_RELATIONS_KEY] = existing_relations
        return node

    async def acall(
        self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """Extract triples from nodes async."""
        jobs = []
        for node in nodes:
            jobs.append(self._aextract(node))

        return await run_jobs(
            jobs,
            workers=self.num_workers,
            show_progress=show_progress,
            desc="Extracting paths from text",
        )

In [11]:
KG_TRIPLET_EXTRACT_TMPL = """
Extract entities and their relationships from the text below.

For each entity, provide:
- entity_name: (capitalize it)
- entity_type: (one or two words)
- entity_description: (short summary)

For each relationship between two entities, provide:
- source_entity: (name of the source entity)
- target_entity: (name of the target entity)
- relation: (short label)
- relationship_description: (brief reason for the relation)

TEXT:
####################
{text}
####################

OUTPUT FORMAT:
First list all entities, then list all relationships.

Example:

entity_name: Apple
entity_type: Company
entity_description: A technology company known for iPhones.

entity_name: iPhone
entity_type: Product
entity_description: A smartphone designed and sold by Apple.

source_entity: Apple
target_entity: iPhone
relation: Produces
relationship_description: Apple designs and sells the iPhone.

(Your output starts here:)
"""

In [12]:
kg_extractor = GraphRAGExtractor(
    llm=llm,
    extract_prompt=KG_TRIPLET_EXTRACT_TMPL,
    max_paths_per_chunk=2,
    parse_fn=parse_fn,
)

## Graph Index

In [14]:
import re
from llama_index.core.graph_stores import SimplePropertyGraphStore
import networkx as nx
from graspologic.partition import hierarchical_leiden

# self.max_paths_per_chunk
_entities=[]
_rels=[]

from llama_index.core.llms import ChatMessage
class GraphRAGStore(SimplePropertyGraphStore):
    community_summary = {}
    max_cluster_size = 5
    model = llm
    extract_prompt = PromptTemplate(
            "Given relationships from a knowledge graph in the form: "
            "entity1 -> entity2 -> relation -> description, write a concise summary. "
            "Include the entity names and key points from the descriptions to explain the nature and importance of each relationship clearly and coherently. "
            "Examples:\n"
            "1.\n"
            "Input: Einstein -> Theory of Relativity -> developed -> Einstein formulated the theory to explain how space and time are linked for objects moving at a constant speed.\n"
            "Output: Einstein developed the Theory of Relativity to explain the connection between space and time for objects in uniform motion.\n\n"
            "2.\n"
            "Input: Apple Inc. -> iPhone -> manufactures -> Apple designs and produces the iPhone, a widely used smartphone that revolutionized mobile technology.\n"
            "Output: Apple Inc. manufactures the iPhone, a groundbreaking smartphone that transformed mobile technology."
    )
    
    def generate_community_summary(self, text):
        """Generate summary for a given text using an LLM."""
        clean_response = self.model.predict(
            self.extract_prompt,
            text=text,
            max_knowledge_triplets=2,
            )
        
        
        clean_response = re.sub(r"^assistant:\s*", "", str(clean_response)).strip()
        return clean_response

    def build_communities(self):
        """Builds communities from the graph and summarizes them."""
        nx_graph = self._create_nx_graph()
        community_hierarchical_clusters = hierarchical_leiden(
            nx_graph, max_cluster_size=self.max_cluster_size
        )
        community_info = self._collect_community_info(
            nx_graph, community_hierarchical_clusters
        )
        self._summarize_communities(community_info)

    def _create_nx_graph(self):
        """Converts internal graph representation to NetworkX graph."""
        nx_graph = nx.Graph()
        for node in self.graph.nodes.values():
            nx_graph.add_node(str(node))
        for relation in self.graph.relations.values():
            nx_graph.add_edge(
                relation.source_id,
                relation.target_id,
                relationship=relation.label,
                description=relation.properties["relationship_description"],
            )
        return nx_graph

    def _collect_community_info(self, nx_graph, clusters):
        """Collect detailed information for each node based on their community."""
        community_mapping = {item.node: item.cluster for item in clusters}
        community_info = {}
        for item in clusters:
            cluster_id = item.cluster
            node = item.node
            if cluster_id not in community_info:
                community_info[cluster_id] = []

            for neighbor in nx_graph.neighbors(node):
                if community_mapping[neighbor] == cluster_id:
                    edge_data = nx_graph.get_edge_data(node, neighbor)
                    if edge_data:
                        detail = f"{node} -> {neighbor} -> {edge_data['relationship']} -> {edge_data['description']}"
                        community_info[cluster_id].append(detail)
        return community_info

    def _summarize_communities(self, community_info):
        """Generate and store summaries for each community."""
        for community_id, details in community_info.items():
            details_text = (
                "\n".join(details) + "."
            )  # Ensure it ends with a period
            self.community_summary[
                community_id
            ] = self.generate_community_summary(details_text)

    def get_community_summaries(self):
        """Returns the community summaries, building them if not already done."""
        if not self.community_summary:
            self.build_communities()
        return self.community_summary

In [15]:
from llama_index.core import PropertyGraphIndex
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"  # or any other HF embedding model
)
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

index = PropertyGraphIndex(
    nodes=nodes,
    property_graph_store=GraphRAGStore(),
    kg_extractors=[kg_extractor],
    embed_model=embed_model,      
    show_progress=True,
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Extracting paths from text:   0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[('European Commission', 'Governmental body', 'The executive branch of the European Union.'), ('Huawei Technologies Co.', 'Company', 'A Chinese multinational technology company.'), ('ZTE Corp.', 'Company', 'A Chinese multinational telecommunications equipment company.'), ('5G mobile networks', 'Network', 'The fifth-generation mobile network technology.'), ('TikTok Inc.', 'Company', 'A Chinese-based social media company.')]
[('European Commission', 'Huawei Technologies Co.', 'Bans', 'The European Commission plans to ban Huawei equipment from its own internal telecommunications networks.'), ('European Commission', 'ZTE Corp.', 'Bans', 'The European Commission plans to ban ZTE equipment from its own internal telecommunications networks.'), ('European Commission', '5G mobile networks', 'Anticipates update guidance', 'The European Commission anticipates updating its guidance on 5G mobile networks within the bloc.'), ('European Commission', 'Countries', 'Encourages', 'The European Commission

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[('Chevron', 'Company', 'An oil and gas company based in the United States.'), ('Q2', 'Quarter', 'The second quarter of the year.'), ('CVX', 'Stock Symbol', 'The stock ticker symbol for Chevron.'), ('O&G sector', 'Industry', 'The oil and gas industry.'), ('NYSE', 'Stock Exchange', 'New York Stock Exchange, a major stock exchange in the United States.')]
[('Chevron', 'CVX', 'Is associated with', 'Chevron is represented on the NYSE by the stock symbol CVX.'), ('Chevron', 'O&G sector', 'Operates within', 'Chevron is an oil and gas company and operates within the O&G sector.'), ('Chevron', 'Q2', 'Has financial performance in', "Chevron's Q2 earnings estimates have risen sharply."), ('Chevron', 'Q2', 'Has stock performance in', "Chevron's stock has declined about 10% over the past 90-days despite the fact that Q2 consensus earnings estimates have risen sharply."), ('Chevron', 'JHVEPhoto', 'Is compared to (in terms of)', "JHVEPhoto is used as a point of comparison for Chevron's stock perform

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[('Dáil', 'Legislative Body', 'The lower house of the Oireachtas (Parliament) of Ireland.'), ('Sinn Féin TD', 'Politician', 'A Teachta Dála (TD, member of parliament) representing Sinn Féin.'), ('John Brady', 'Politician', 'A Sinn Féin TD who walked across the Dáil and placed an on-call pager in front of the Minister for Housing.'), ("Darragh O'Brien", 'Politician', 'The Minister for Housing, Urban Development and Local Government in Ireland.'), ('Minister for Housing', 'Government Position', 'A cabinet position in the government of Ireland responsible for housing policy.'), ('Pager', 'Device', 'A one-way communication device used to send short messages.'), ('Order of Business', 'Parliamentary Procedure', 'A schedule of business to be considered during a meeting of the Dáil.'), ('Ceann Comhairle', 'Position', 'The presiding officer of the Dáil.'), ('Retained Firefighters', 'Workers', 'Part-time firefighters who keep the services going in smaller communities around Ireland.'), ('Staged 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[('FirstEnergy', 'Company', 'A publicly traded utilities provider.'), ('NYSE:FE', 'Stock Ticker', 'The stock ticker symbol for FirstEnergy on the New York Stock Exchange.'), ('Earnings per share', 'Financial Metric', "A measure of a company's profitability, calculated as net income divided by the number of shares outstanding."), ('RTT News', 'News Agency', 'A financial news agency.'), ('Net margin', 'Financial Metric', 'A measure of profitability that represents the percentage of sales remaining after all operating expenses have been deducted.'), ('Return on equity', 'Financial Metric', "A ratio that measures a corporation's profitability by revealing how much profit a company generates with the money shareholders have invested.")]
[('FirstEnergy', 'Earnings per share', 'Reports', 'FirstEnergy reported earnings per share during the quarter mentioned.'), ('FirstEnergy', 'Consensus estimate', 'Exceeds', "FirstEnergy's earnings per share was greater than the consensus estimate."), ('RTT N

Extracting paths from text: 100%|██████████| 5/5 [03:13<00:00, 38.77s/it] 


[('Epic Games', 'Company', 'A company known for the Unreal Engine.'), ('Unreal Engine', 'Software', 'A popular game engine developed by Epic Games.'), ('MetaHuman Animator', 'Tool', 'A tool for animating hyperrealistic MetaHumans using performance capture.'), ('iPhone 12', 'Product', 'A smartphone designed and sold by Apple.'), ('MetaHuman', 'Character', 'A hyperrealistic digital character created by Epic Games.'), ('Game Developers Conference', 'Event', 'An annual conference for game developers.'), ('3Lateral', 'Team', 'A team within Epic Games.'), ('Radivoje Bukvić', 'Person', 'An actor who provided the facial performance for the MetaHuman in the short film.'), ('Mika Antić', 'Person', 'The author of the poem used in the short film.')]
[('Epic Games', 'MetaHuman Animator', 'Develops', 'Epic Games develops the MetaHuman Animator tool.'), ('Epic Games', '3Lateral', 'Owns', 'Epic Games owns the 3Lateral team.'), ('Epic Games', 'Unreal Engine', 'Develops', 'Epic Games develops the Unreal

Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  5.46it/s]
Generating embeddings: 100%|██████████| 11/11 [00:01<00:00,  7.01it/s]


# Creating community summaries

In [26]:
# Now you can safely build communities
index.property_graph_store.build_communities()


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [27]:
index.property_graph_store.community_summary[0]

'Summary:\n\n1. The relationship between John Smith and the University of Oxford is that of alumnus, as John Smith graduated from the institution with a degree in Computer Science.\n\n2. John Smith is also associated with Google Inc., where he currently holds the position of Senior Software Engineer.\n\n3. John Smith has authored several research papers, with one of his notable collaborations being with Dr. Jane Doe at the Massachusetts Institute of Technology (MIT). Their joint paper, "Deep Learning in Computer Vision," was published in the prestigious Journal of Artificial Intelligence Research.\n\n4. John Smith is also a mentor for a non-profit organization, Code for Good, where he shares his expertise in programming and computer science with underprivileged youth.\n\n5. John Smith has been recognized for his contributions to the field of artificial intelligence with the ACM A.M. Turing Award in 2020.\n\n6. John Smith is married to Emily Johnson, a well-known journalist for The New 

# Extracting entities and relationships

In [22]:
rels = []
for r in _rels:
    for t in r:
        rels.append(t)
        
entities = []
for r in _entities:
    for t in r:
        entities.append(t)

## Extracting data as simple .csvs

In [25]:
rels_df = pd.DataFrame(rels, columns =['source','target','relationship','originalText'])
entities_df = pd.DataFrame(entities, columns =['entity','type','description'])

entities_df[:5]

Unnamed: 0,entity,type,description
0,European Commission,Governmental body,The executive branch of the European Union.
1,Huawei Technologies Co.,Company,A Chinese multinational technology company.
2,ZTE Corp.,Company,A Chinese multinational telecommunications equ...
3,5G mobile networks,Network,The fifth-generation mobile network technology.
4,TikTok Inc.,Company,A Chinese-based social media company.


## Extracting data as rdf triples

In [28]:
import urllib.parse

entity_base_uri="http://graphrag.example.com/resource/" 
ontology_base_uri="http://graphrag.example.com/ontology/"

rels_df = pd.DataFrame(rels, columns =['source','target','relationship','originalText'])
entities_df = pd.DataFrame(entities, columns =['entity','type','description'])

triples = []


def sanitize_uri_component(component):
    return urllib.parse.quote(component.replace(" ", "_"), safe="")

# Create triples for entities
for _, row in entities_df.iterrows():
    entity_uri = f"<{entity_base_uri}{sanitize_uri_component(row['entity'])}>"
    type_uri = f"<{ontology_base_uri}{sanitize_uri_component(row['type'])}>"
    desc_literal = f'"{row["description"]}"'
    desc_literal = desc_literal.replace("\"", "'")

    # rdf:type triple
    triples.append(f"{entity_uri} <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> {type_uri} .")

    # custom description triple
    triples.append(f"{entity_uri} <{ontology_base_uri}description> {desc_literal} .")

# Create triples for relationships
for _, row in rels_df.iterrows():
    source_uri = f"<{entity_base_uri}{sanitize_uri_component(row['source'])}>"
    target_uri = f"<{entity_base_uri}{sanitize_uri_component(row['target'])}>"
    relation_uri = f"<{ontology_base_uri}{sanitize_uri_component(row['relationship'])}>"

    triples.append(f"{source_uri} {relation_uri} {target_uri} .")

triples[:5]

['<http://graphrag.example.com/resource/European_Commission> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://graphrag.example.com/ontology/Governmental_body> .',
 "<http://graphrag.example.com/resource/European_Commission> <http://graphrag.example.com/ontology/description> 'The executive branch of the European Union.' .",
 '<http://graphrag.example.com/resource/Huawei_Technologies_Co.> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://graphrag.example.com/ontology/Company> .',
 "<http://graphrag.example.com/resource/Huawei_Technologies_Co.> <http://graphrag.example.com/ontology/description> 'A Chinese multinational technology company.' .",
 '<http://graphrag.example.com/resource/ZTE_Corp.> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://graphrag.example.com/ontology/Company> .']

## Export as neo4j ready .csv files

In [31]:
rels_df = pd.DataFrame(rels, columns =['source','target','relationship','originalText'])
entities_df = pd.DataFrame(entities, columns =['entity','type','description'])
nodes_df = entities_df.rename(columns={
    'entity': 'id:ID',
    'type': ':LABEL',
    'description': 'description'
})

# Relationships CSV
rels_df_clean = rels_df.rename(columns={
    'source': ':START_ID',
    'target': ':END_ID',
    'relationship': ':TYPE'
})
rels_df_clean = rels_df_clean[[':START_ID', ':END_ID', ':TYPE']]


rels_df_clean[:5]

Unnamed: 0,:START_ID,:END_ID,:TYPE
0,European Commission,Huawei Technologies Co.,Bans
1,European Commission,ZTE Corp.,Bans
2,European Commission,5G mobile networks,Anticipates update guidance
3,European Commission,Countries,Encourages
4,European Commission,Huawei Technologies Co.,Considers high risk
