In [1]:
from langchain_text_splitters import TokenTextSplitter

def read_text(path: str) -> str:
    for enc in ("utf-8", "utf-8-sig", "cp1252", "latin-1"):
        try:
            with open(path, "r", encoding=enc) as f:
                return f.read()
        except UnicodeDecodeError:
            pass
    # last resort
    with open(path, "r", errors="replace") as f:
        return f.read()

content = read_text("ft_guide.txt")

text_splitter = TokenTextSplitter(chunk_size=1200, chunk_overlap=100)
texts = text_splitter.split_text(content)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
len(texts[4])

2752

In [3]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0.0, model="gpt-4o-mini")


In [4]:
prompt_template = """
-Goal-
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.

-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, capitalized
- entity_type: One of the following types: [large language model, differential privacy, federated learning, healthcare, adversarial training, security measures, open-source tool, dataset, learning rate, AdaGrad, RMSprop, adapter architecture, LoRA, API, model support, evaluation metrics, deployment, Python library, hardware accelerators, hyperparameters, data preprocessing, data imbalance, GPU-based deployment, distributed inference]
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity"{{tuple_delimiter}}<entity_name>{{tuple_delimiter}}<entity_type>{{tuple_delimiter}}<entity_description>)

2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
- relationship_strength: an integer score between 1 to 10, indicating strength of the relationship between the source entity and target entity
Format each relationship as ("relationship"{{tuple_delimiter}}<source_entity>{{tuple_delimiter}}<target_entity>{{tuple_delimiter}}<relationship_description>{{tuple_delimiter}}<relationship_strength>)

3. Return output in The primary language of the provided text is "English." as a single list of all the entities and relationships identified in steps 1 and 2. Use **{{record_delimiter}}** as the list delimiter.

4. If you have to translate into The primary language of the provided text is "English.", just translate the descriptions, nothing else!

5. When finished, output {{completion_delimiter}}.

-Examples-
######################

Example 1:

entity_types: [large language model, differential privacy, federated learning, healthcare, adversarial training, security measures, open-source tool, dataset, learning rate, AdaGrad, RMSprop, adapter architecture, LoRA, API, model support, evaluation metrics, deployment, Python library, hardware accelerators, hyperparameters, data preprocessing, data imbalance, GPU-based deployment, distributed inference]
text:
 LLMs to create synthetic samples that mimic clients’ private data distribution using
differential privacy. This approach significantly boosts SLMs’ performance by approximately 5% while
maintaining data privacy with a minimal privacy budget, outperforming traditional methods relying
solely on local private data.
In healthcare, federated fine-tuning can allow hospitals to collaboratively train models on patient data
without transferring sensitive information. This approach ensures data privacy while enabling the de-
velopment of robust, generalisable AI systems.
8https://ai.meta.com/responsible-ai/
9https://huggingface.co/docs/hub/en/model-cards
10https://www.tensorflow.org/responsible_ai/privacy/guide
101 Frameworks for Enhancing Security
Adversarial training and robust security measures[111] are essential for protecting fine-tuned models
against attacks. The adversarial training approach involves training models with adversarial examples
to improve their resilience against malicious inputs. Microsoft Azure’s
------------------------
output:
("entity"{{tuple_delimiter}}DIFFERENTIAL PRIVACY{{tuple_delimiter}}differential privacy{{tuple_delimiter}}Differential privacy is a technique used to create synthetic samples that mimic clients' private data distribution while maintaining data privacy with a minimal privacy budget{{record_delimiter}}
("entity"{{tuple_delimiter}}HEALTHCARE{{tuple_delimiter}}healthcare{{tuple_delimiter}}In healthcare, federated fine-tuning allows hospitals to collaboratively train models on patient data without transferring sensitive information, ensuring data privacy{{record_delimiter}}
("entity"{{tuple_delimiter}}FEDERATED LEARNING{{tuple_delimiter}}federated learning{{tuple_delimiter}}Federated learning is a method that enables collaborative model training on decentralized data sources, such as hospitals, without sharing sensitive information{{record_delimiter}}
("entity"{{tuple_delimiter}}ADVERSARIAL TRAINING{{tuple_delimiter}}adversarial training{{tuple_delimiter}}Adversarial training involves training models with adversarial examples to improve their resilience against malicious inputs{{record_delimiter}}
("entity"{{tuple_delimiter}}SECURITY MEASURES{{tuple_delimiter}}security measures{{tuple_delimiter}}Robust security measures are essential for protecting fine-tuned models against attacks{{record_delimiter}}
("relationship"{{tuple_delimiter}}DIFFERENTIAL PRIVACY{{tuple_delimiter}}FEDERATED LEARNING{{tuple_delimiter}}Differential privacy is used in federated learning to maintain data privacy while training models collaboratively{{tuple_delimiter}}8{{record_delimiter}}
("relationship"{{tuple_delimiter}}HEALTHCARE{{tuple_delimiter}}FEDERATED LEARNING{{tuple_delimiter}}Federated learning is applied in healthcare to train models on patient data without transferring sensitive information{{tuple_delimiter}}9{{record_delimiter}}
("relationship"{{tuple_delimiter}}ADVERSARIAL TRAINING{{tuple_delimiter}}SECURITY MEASURES{{tuple_delimiter}}Adversarial training is a security measure used to protect models against attacks by improving their resilience{{tuple_delimiter}}8{{completion_delimiter}}
#############################


Example 2:

entity_types: [large language model, differential privacy, federated learning, healthcare, adversarial training, security measures, open-source tool, dataset, learning rate, AdaGrad, RMSprop, adapter architecture, LoRA, API, model support, evaluation metrics, deployment, Python library, hardware accelerators, hyperparameters, data preprocessing, data imbalance, GPU-based deployment, distributed inference]
text:
ARD [82] is an innovative open-source tool developed to enhance the safety of interactions
with large language models (LLMs). This tool addresses three critical moderation tasks: detecting
2https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoModelForCausalLM
63 harmful intent in user prompts, identifying safety risks in model responses, and determining when a
model appropriately refuses unsafe requests. Central to its development is WILDGUARD MIX3, a
meticulously curated dataset comprising 92,000 labelled examples that include both benign prompts and
adversarial attempts to bypass safety measures. The dataset is divided into WILDGUARD TRAIN, used
for training the model, and WILDGUARD TEST, consisting of high-quality human-annotated examples
for evaluation.
The WILDGUARD model itself is fine-tuned on the Mistral-7B language model using the WILDGUARD
TRAIN dataset, enabling it to perform all
------------------------
output:
```plaintext
("entity"{{tuple_delimiter}}ARD{{tuple_delimiter}}open-source tool{{tuple_delimiter}}ARD is an innovative open-source tool developed to enhance the safety of interactions with large language models by addressing moderation tasks such as detecting harmful intent, identifying safety risks, and determining appropriate refusals of unsafe requests)
{{record_delimiter}}
("entity"{{tuple_delimiter}}LARGE LANGUAGE MODELS{{tuple_delimiter}}large language model{{tuple_delimiter}}Large language models (LLMs) are advanced AI models designed to understand and generate human-like text, which ARD aims to interact with safely)
{{record_delimiter}}
("entity"{{tuple_delimiter}}WILDGUARD MIX3{{tuple_delimiter}}dataset{{tuple_delimiter}}WILDGUARD MIX3 is a meticulously curated dataset comprising 92,000 labeled examples, including benign prompts and adversarial attempts, used for training and evaluating safety measures in language models)
{{record_delimiter}}
("entity"{{tuple_delimiter}}WILDGUARD TRAIN{{tuple_delimiter}}dataset{{tuple_delimiter}}WILDGUARD TRAIN is a subset of the WILDGUARD MIX3 dataset used specifically for training the model on safety measures)
{{record_delimiter}}
("entity"{{tuple_delimiter}}WILDGUARD TEST{{tuple_delimiter}}dataset{{tuple_delimiter}}WILDGUARD TEST is a subset of the WILDGUARD MIX3 dataset consisting of high-quality human-annotated examples used for evaluating the model's performance)
{{record_delimiter}}
("entity"{{tuple_delimiter}}MISTRAL-7B{{tuple_delimiter}}large language model{{tuple_delimiter}}Mistral-7B is a language model that the WILDGUARD model is fine-tuned on using the WILDGUARD TRAIN dataset to enhance its safety performance)
{{record_delimiter}}
("entity"{{tuple_delimiter}}ADVERSARIAL ATTEMPTS{{tuple_delimiter}}adversarial training{{tuple_delimiter}}Adversarial attempts are part of the WILDGUARD MIX3 dataset, used to test and improve the model's ability to handle unsafe or harmful inputs)
{{record_delimiter}}
("entity"{{tuple_delimiter}}SAFETY MEASURES{{tuple_delimiter}}security measures{{tuple_delimiter}}Safety measures are protocols and techniques implemented to ensure that large language models interact safely with users, which ARD and the WILDGUARD dataset aim to enhance)
{{record_delimiter}}
("relationship"{{tuple_delimiter}}ARD{{tuple_delimiter}}LARGE LANGUAGE MODELS{{tuple_delimiter}}ARD is designed to enhance the safety of interactions with large language models by addressing critical moderation tasks{{tuple_delimiter}}8)
{{record_delimiter}}
("relationship"{{tuple_delimiter}}ARD{{tuple_delimiter}}WILDGUARD MIX3{{tuple_delimiter}}ARD uses the WILDGUARD MIX3 dataset to train and evaluate its moderation capabilities{{tuple_delimiter}}7)
{{record_delimiter}}
("relationship"{{tuple_delimiter}}WILDGUARD MIX3{{tuple_delimiter}}WILDGUARD TRAIN{{tuple_delimiter}}WILDGUARD TRAIN is a subset of the WILDGUARD MIX3 dataset used for training{{tuple_delimiter}}9)
{{record_delimiter}}
("relationship"{{tuple_delimiter}}WILDGUARD MIX3{{tuple_delimiter}}WILDGUARD TEST{{tuple_delimiter}}WILDGUARD TEST is a subset of the WILDGUARD MIX3 dataset used for evaluation{{tuple_delimiter}}9)
{{record_delimiter}}
("relationship"{{tuple_delimiter}}WILDGUARD TRAIN{{tuple_delimiter}}MISTRAL-7B{{tuple_delimiter}}The WILDGUARD TRAIN dataset is used to fine-tune the Mistral-7B language model{{tuple_delimiter}}8)
{{record_delimiter}}
("relationship"{{tuple_delimiter}}ADVERSARIAL ATTEMPTS{{tuple_delimiter}}SAFETY MEASURES{{tuple_delimiter}}Adversarial attempts are used to test and improve safety measures in language models{{tuple_delimiter}}7)
{{completion_delimiter}}
```
#############################



-Real Data-
######################
entity_types: [large language model, differential privacy, federated learning, healthcare, adversarial training, security measures, open-source tool, dataset, learning rate, AdaGrad, RMSprop, adapter architecture, LoRA, API, model support, evaluation metrics, deployment, Python library, hardware accelerators, hyperparameters, data preprocessing, data imbalance, GPU-based deployment, distributed inference]
text: {input_text}
######################
output:
"""

prompt = ChatPromptTemplate.from_template(prompt_template)

In [5]:
chain = prompt | llm | StrOutputParser()

In [6]:
response = chain.invoke({"input_text": texts[25]})

In [7]:
print(response)

```plaintext
("entity"{tuple_delimiter}LORA{tuple_delimiter}adapter architecture{tuple_delimiter}LoRA is a fine-tuning technique that introduces two low-rank matrices to approximate weight updates, significantly reducing the number of trainable parameters and improving efficiency in memory and computation for large models{record_delimiter}
("entity"{tuple_delimiter}QLORA{tuple_delimiter}adapter architecture{tuple_delimiter}QLoRA is an extended version of LoRA that quantises weight parameters to 4-bit precision, allowing for greater memory efficiency and enabling fine-tuning on less powerful hardware while maintaining performance levels comparable to traditional methods{record_delimiter}
("entity"{tuple_delimiter}DORA{tuple_delimiter}adapter architecture{tuple_delimiter}Weight-Decomposed Low-Rank Adaptation (DoRA) is a fine-tuning methodology that decomposes model weights into magnitude and directional components, leveraging LoRA's efficiency for substantial updates without altering the

In [29]:
import pandas as pd

entities_raw = r"D:\Narwal\knowledge_graphs_using_networkX\ragtest\output\entities.parquet"
entities_raw = pd.read_parquet(entities_raw)  
entities = entities_raw[
    [
        "id",
        "human_readable_id",
        "title",
        "type",
        "description",
        "text_unit_ids",
    ]
].copy()
entities.head(5)

Unnamed: 0,id,human_readable_id,title,type,description,text_unit_ids
0,fa72024a-fa11-4c0c-9b32-070d074cde4d,0,VENKATESH BALAVADHANI PARTHASARATHY,PERSON,Venkatesh Balavadhani Parthasarathy is one of ...,[30c05f4301f700b0f85dd15af3f3ba0f50fdd82b8cefb...
1,df0e3246-70fe-40e3-baca-8a3c01140011,1,AHTSHAM ZAFAR,PERSON,Ahtsham Zafar is an accomplished author recogn...,[30c05f4301f700b0f85dd15af3f3ba0f50fdd82b8cefb...
2,dffb68e8-6683-4f2c-931c-5d2f14c09d11,2,AAFAQ KHAN,PERSON,Aafaq Khan is one of the authors of the techni...,[30c05f4301f700b0f85dd15af3f3ba0f50fdd82b8cefb...
3,0804f87b-0d37-47e1-9e6d-fe2224c55170,3,ARSALAN SHAHID,PERSON,Arsalan Shahid is one of the authors of the te...,[30c05f4301f700b0f85dd15af3f3ba0f50fdd82b8cefb...
4,23a79c80-7299-4726-9b8f-db184bf58bee,4,CEADAR,ORGANIZATION,"CeADAR is Ireland’s Centre for AI, located at ...",[30c05f4301f700b0f85dd15af3f3ba0f50fdd82b8cefb...


In [30]:
path = r"D:\Narwal\knowledge_graphs_using_networkX\ragtest\output\relationships.parquet"
relationships = pd.read_parquet(path)
relationships = relationships[
    [
        "id",
        "human_readable_id",
        "source",
        "target",
        "description",
        "weight",
        "combined_degree",
        "text_unit_ids",
    ]
].copy()
relationships.head(5)


Unnamed: 0,id,human_readable_id,source,target,description,weight,combined_degree,text_unit_ids
0,1a7787b6-777f-48b7-934c-7bfc7d70ce62,0,VENKATESH BALAVADHANI PARTHASARATHY,CEADAR,Venkatesh Balavadhani Parthasarathy is affilia...,8.0,7,[30c05f4301f700b0f85dd15af3f3ba0f50fdd82b8cefb...
1,4b5d550f-a809-4e79-955a-c36ca0848a20,1,AHTSHAM ZAFAR,CEADAR,Ahtsham Zafar is affiliated with CeADAR as an ...,8.0,8,[30c05f4301f700b0f85dd15af3f3ba0f50fdd82b8cefb...
2,0cade9e1-afd7-480d-a2d7-3b771362d339,2,AAFAQ KHAN,CEADAR,Aafaq Khan is affiliated with CeADAR as an aut...,8.0,7,[30c05f4301f700b0f85dd15af3f3ba0f50fdd82b8cefb...
3,e5738970-ce0e-4db8-9b6c-8a00745bd71d,3,ARSALAN SHAHID,CEADAR,Arsalan Shahid is affiliated with CeADAR as an...,8.0,7,[30c05f4301f700b0f85dd15af3f3ba0f50fdd82b8cefb...
4,188cd76f-4b80-42bd-8aa4-5dfb3a768d86,4,CEADAR,UNIVERSITY COLLEGE DUBLIN,CeADAR is a research group within University C...,9.0,8,[30c05f4301f700b0f85dd15af3f3ba0f50fdd82b8cefb...


In [38]:
import pandas as pd

entities_raw = pd.read_parquet(r"D:\Narwal\knowledge_graphs_using_networkX\ragtest\output\entities.parquet")
communities = pd.read_parquet(r"D:\Narwal\knowledge_graphs_using_networkX\ragtest\output\communities.parquet")

nodes = (
    communities[["community", "level", "entity_ids"]]
    .explode("entity_ids")
    .rename(columns={"entity_ids": "id"})
    .merge(
        entities_raw[["id", "human_readable_id", "title", "degree", "x", "y"]],
        on="id",
        how="inner"
    )
    [["id", "human_readable_id", "title", "community", "level", "degree", "x", "y"]]
)
nodes

Unnamed: 0,id,human_readable_id,title,community,level,degree,x,y
0,43264a05-af44-47d5-8707-63c53792334e,807,ADAM LERER,0,0,1,0.0,0.0
1,aebd3291-ad0e-49f3-8687-bddcba7756e0,763,ARXIV,0,0,65,0.0,0.0
2,91fdee27-db4c-442c-87e4-add5d87a4bfb,804,ADAM PASZKE,0,0,1,0.0,0.0
3,316b2889-c1e9-405f-aaf2-87f938641440,757,ALEXANDER RATNER,0,0,1,0.0,0.0
4,7df79d37-fb66-432c-bdf1-ff3b5a6c388f,765,ANA MARASOVIĆ,0,0,1,0.0,0.0
...,...,...,...,...,...,...,...,...
1050,3be4137f-98b7-43d4-8292-d8a7bce3b86d,533,DATA CURATION,89,2,1,0.0,0.0
1051,8cef081b-2ee5-4bce-841b-f39d16e425b3,499,DISTRIBUTED TRAINING,89,2,1,0.0,0.0
1052,80b05f67-9c3f-4749-83c0-ced4b6eaf748,535,INFERENCE,89,2,1,0.0,0.0
1053,e35449d5-eae0-46f8-b62d-7a585cd16df0,534,REINFORCEMENT LEARNING WITH HUMAN FEEDBACK,89,2,1,0.0,0.0


In [35]:
community_reports = pd.read_parquet(
    r"D:\Narwal\knowledge_graphs_using_networkX\ragtest\output\community_reports.parquet"
)

community_reports.head()


Unnamed: 0,id,human_readable_id,community,level,parent,children,title,summary,full_content,rank,rating_explanation,findings,full_content_json,period,size
0,4d06c18b850a48cea2440cd242f09886,86,86,2,24,[],Fine-Tuning in Machine Learning,The community focuses on the process of fine-t...,# Fine-Tuning in Machine Learning\n\nThe commu...,8.0,The impact severity rating is high due to the ...,[{'explanation': 'Fine-tuning is a vital proce...,"{\n ""title"": ""Fine-Tuning in Machine Learni...",2025-12-31,11
1,4b0df43931ee4b958f3037bb17e34c23,87,87,2,24,[],Pre-Training and Comparative Overview of LLMs,The community focuses on the pre-training phas...,# Pre-Training and Comparative Overview of LLM...,4.0,The impact severity rating is moderate due to ...,[{'explanation': 'The pre-training phase is a ...,"{\n ""title"": ""Pre-Training and Comparative ...",2025-12-31,2
2,de9ed0f44c0a496b9a739efbb3073a22,88,88,2,30,[],NVIDIA and AI Technologies Community,"The community centers around NVIDIA, a leading...",# NVIDIA and AI Technologies Community\n\nThe ...,8.5,The impact severity rating is high due to NVID...,[{'explanation': 'NVIDIA is a prominent player...,"{\n ""title"": ""NVIDIA and AI Technologies Co...",2025-12-31,8
3,d308efe6878b490ea1815ecb3756e10c,89,89,2,30,[],NVIDIA NeMo and AI Model Development,"The community centers around NVIDIA NeMo, a fr...",# NVIDIA NeMo and AI Model Development\n\nThe ...,8.0,The impact severity rating is high due to the ...,[{'explanation': 'NVIDIA NeMo serves as the fo...,"{\n ""title"": ""NVIDIA NeMo and AI Model Deve...",2025-12-31,6
4,19fb04ead5044b6aaec4f9e76a4d4468,21,21,1,0,[],arXiv Research Community,The arXiv research community is centered aroun...,# arXiv Research Community\n\nThe arXiv resear...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'arXiv is a leading repositor...,"{\n ""title"": ""arXiv Research Community"",\n ...",2025-12-31,57


In [40]:
display(entities.columns,relationships.columns,community_reports.columns,nodes.columns)


Index(['id', 'human_readable_id', 'title', 'type', 'description',
       'text_unit_ids'],
      dtype='object')

Index(['id', 'human_readable_id', 'source', 'target', 'description', 'weight',
       'combined_degree', 'text_unit_ids'],
      dtype='object')

Index(['id', 'human_readable_id', 'community', 'level', 'parent', 'children',
       'title', 'summary', 'full_content', 'rank', 'rating_explanation',
       'findings', 'full_content_json', 'period', 'size'],
      dtype='object')

Index(['id', 'human_readable_id', 'title', 'community', 'level', 'degree', 'x',
       'y'],
      dtype='object')

In [63]:
entities.iloc[[900],:].values

array([['99aeb224-a1cc-45b6-80a4-a2a33a6ee8ee', 900, 'KEVIN CLARK',
        'PERSON',
        'Kevin Clark is a researcher focused on large-scale machine learning.',
        array(['582cc63e645b76407bfddcc88baf3064807b8d6557ddf9ce2b5dc1f8f0bd0977986f4875d2309c3648ec4b1a283a7fb8924cd32e0918ebde112169d28bbd71c0'],
              dtype=object)                                                                                                                        ]],
      dtype=object)

In [58]:
import subprocess
from typing import Optional

def query_graphrag(
    query: str,
    method: str = "global",
    root_path: str = "./ragtest",
    timeout: Optional[int] = None,
    community_level: int = 2,
    dynamic_community_selection: bool = False
) -> str:

    if community_level < 0:
        raise ValueError("Community level must be non-negative")

    command = [
        "graphrag", "query",
        "--root", root_path,
        "--method", method,
        "--query", query,
        "--community-level", str(community_level),
    ]

    if dynamic_community_selection:
        command.append("--dynamic-community-selection")

    result = subprocess.run(
        command,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
        timeout=timeout
    )

    # if GraphRAG fails, show stderr clearly
    if result.returncode != 0:
        raise RuntimeError(
            f"GraphRAG failed (exit {result.returncode})\n\nSTDERR:\n{result.stderr}\n\nSTDOUT:\n{result.stdout}"
        )

    return result.stdout.strip()


In [60]:
result = query_graphrag(
    query="How does a company choose between RAG, fine-tuning, and different PEFT approaches?",
    method="local",
    root_path=r"D:\Narwal\knowledge_graphs_using_networkX\ragtest",
    community_level=2
)
print(result)


## Choosing Between RAG, Fine-Tuning, and PEFT Approaches

When a company is faced with the decision of selecting between Retrieval-Augmented Generation (RAG), fine-tuning, and various Parameter-Efficient Fine-Tuning (PEFT) approaches, several factors must be considered. Each method has its unique advantages and challenges, and the choice largely depends on the specific needs of the project, available resources, and desired outcomes.

### Understanding RAG and Its Benefits

RAG combines retrieval and generation processes to enhance the quality of generated outputs by leveraging external information. This method is particularly beneficial when the goal is to improve accuracy and relevance in responses, as it allows models to access a broader range of data beyond their training set [Data: Reports (1); Entities (29)]. However, implementing RAG can introduce complexities related to data retrieval and integration, which must be carefully managed to ensure optimal performance [Data: Relation