In [None]:
%pip install pypdf
%pip install llama_index
%pip install llama-index-llms-ollama
%pip install llama-index-embeddings-langchain
%pip install langchain-huggingface
%pip install ragas

In [8]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core import Settings
from llama_index.core.evaluation import RetrieverEvaluator, get_retrieval_results_df
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.prompts.prompts import SimpleInputPrompt

from langchain_huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.ollama import Ollama

In [9]:
# Global LlamaIndex defaults: every LLM call goes to the "llama3" model served by Ollama
# (with a generous request_timeout), and embeddings come from the MiniLM
# sentence-transformers model wrapped through LangchainEmbedding.
Settings.llm = Ollama(model="llama3", request_timeout=500.0)
Settings.embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"))

In [10]:
# Load the corpus from ../silver/data (a relative path, so it resolves against the notebook's working directory).
documents=SimpleDirectoryReader("../silver/data").load_data()

# Basic Retriever

In [11]:
# Split the loaded documents with chunk_size=1024; the resulting `nodes` feed the baseline index.
node_parser = SentenceSplitter(chunk_size=1024)
nodes = node_parser.get_nodes_from_documents(documents)

In [15]:
nodes[563]

TextNode(id_='node_563', embedding=None, metadata={'file_path': 'c:\\Users\\NailFerroukhi\\Desktop\\dauphine\\thesis\\basic_rag\\notebooks\\..\\silver\\data\\output.txt', 'file_name': 'output.txt', 'file_type': 'text/plain', 'file_size': 2336905, 'creation_date': '2024-07-25', 'last_modified_date': '2024-08-01'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e4ec6066-24e1-475f-8210-0d038a584a75', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'c:\\Users\\NailFerroukhi\\Desktop\\dauphine\\thesis\\basic_rag\\notebooks\\..\\silver\\data\\output.txt', 'file_name': 'output.txt', 'file_type': 'text/plain', 'file_size': 2336905, 'creation_date': '2024-07-25', 'last_modified_date': '2024-08-01'}, ha

In [14]:
# Node ids default to random uuids; replace them with position-based ids
# ("node_0", "node_1", ...) so that they are identical from one run to the next.
for idx in range(len(nodes)):
    nodes[idx].id_ = f"node_{idx}"

In [6]:
# Baseline index built directly over the chunks in `nodes`.
vector_index = VectorStoreIndex(nodes)

In [None]:
# Baseline retriever over `vector_index`, configured with similarity_top_k=5.
retriever = vector_index.as_retriever(similarity_top_k=5)

In [None]:
# Smoke-test the baseline retriever with a short Arabic query ("agriculture").
retrieved_nodes = retriever.retrieve("زراعة")

In [None]:
from llama_index.core.response.notebook_utils import display_source_node

# Render every hit returned by the baseline retriever (source_length=2000).
for hit in retrieved_nodes:
    display_source_node(hit, source_length=2000)

# Chunk References: Smaller Child Chunks Referring to Bigger Parent Chunk

In [7]:
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import IndexNode

In [8]:
# Parser for the parent ("base") chunks: chunk_size=1024 with chunk_overlap=20.
# NOTE: this rebinds `node_parser`, which previously held the SentenceSplitter used to build `nodes`.
node_parser = SimpleNodeParser.from_defaults(chunk_size=1024, chunk_overlap=20)

In [9]:
node_parser

SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x00000179E12A9360>, id_func=<function default_id_func at 0x00000179C48CE320>, chunk_size=1024, chunk_overlap=20, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')

In [10]:
# Parent chunks; the smaller sub-chunks built further down point back to these.
base_nodes = node_parser.get_nodes_from_documents(documents)

In [None]:
base_nodes

In [12]:
# Every parent chunk is re-split at three finer granularities. Each sub-chunk is
# stored as an IndexNode whose reference is the parent's node_id.
sub_chunk_sizes = [128, 256, 512]
sub_node_parsers = []
for chunk_size in sub_chunk_sizes:
    sub_node_parsers.append(
        SimpleNodeParser.from_defaults(chunk_size=chunk_size, chunk_overlap=20)
    )

all_nodes = []
for base_node in base_nodes:
    parent_id = base_node.node_id

    for sub_parser in sub_node_parsers:
        all_nodes.extend(
            IndexNode.from_text_node(sub_node, parent_id)
            for sub_node in sub_parser.get_nodes_from_documents([base_node])
        )

    # The parent chunk itself is indexed too, as an IndexNode that references its own id.
    all_nodes.append(IndexNode.from_text_node(base_node, parent_id))

In [13]:
# node_id -> IndexNode lookup table over everything in `all_nodes`
# (a later entry with the same node_id replaces an earlier one).
all_nodes_dict = {}
for index_node in all_nodes:
    all_nodes_dict[index_node.node_id] = index_node

In [None]:
all_nodes_dict

In [15]:
len(all_nodes_dict)

19383

In [16]:
# Index over every IndexNode in `all_nodes` (the sub-chunks plus the parent chunks themselves).
vector_index_chunk = VectorStoreIndex(all_nodes)

In [None]:
# Vector retriever over the chunk-reference index (similarity_top_k=5); it is wrapped by the RecursiveRetriever.
vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=5)

In [None]:
# Recursive retriever whose root is the retriever registered under the id "vector";
# `node_dict` hands it the node_id -> IndexNode table built from `all_nodes`.
retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_chunk},
    node_dict=all_nodes_dict,
    verbose=True,
)

In [None]:
# Query the recursive retriever and render what came back.
# The hits get their own name: `nodes` holds the parsed base chunks that the
# Evaluation section indexes into (`nodes[563]`), so it must not be rebound to
# this short result list.
retrieved_chunk_nodes = retriever_chunk.retrieve(
    "Eric Von Hippe"
)
for node in retrieved_chunk_nodes:
    display_source_node(node, source_length=2000)

# Evaluation

In [16]:
from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
import nest_asyncio

nest_asyncio.apply()

In [None]:
base_nodes

In [18]:
# Trial run: generate 2 questions from a single chunk (nodes[563]) with the configured LLM.
# NOTE(review): relies on `nodes` still being the parsed chunk list — confirm no earlier cell rebinds it.
eval_dataset = generate_question_context_pairs([nodes[563]], Settings.llm, num_questions_per_chunk=2)

100%|██████████| 1/1 [03:08<00:00, 188.63s/it]


In [19]:
# Persist the trial dataset as eval_dataset.json (relative to the working directory).
eval_dataset.save_json("eval_dataset.json")

In [None]:
import os
import tqdm

# Generate one question/context dataset per parent chunk, one JSON file each, so an
# interrupted run can be resumed without repeating the slow LLM calls.

# Create the output folder up front so that the first save_json call does not
# fail on a missing directory after the LLM work has already been done.
os.makedirs("eval_dataset", exist_ok=True)

for node in tqdm.tqdm(base_nodes):
    file_path = os.path.join("eval_dataset", f"llama2_eval_dataset_{node.id_}.json")
    # Already generated on a previous pass: skip it.
    # NOTE(review): this check keys on node.id_; only `nodes` gets pinned ids in this
    # notebook, so `base_nodes` ids are presumably regenerated on every run — confirm.
    if os.path.exists(file_path):
        continue
    eval_dataset = generate_question_context_pairs([node], Settings.llm)
    eval_dataset.save_json(file_path)


In [49]:
import os
import json

def merge_json_files(input_folder, output_file):
    """Collect every ``*.json`` file in ``input_folder`` into one JSON array.

    Each input file is parsed and appended, as-is, to a list; the list is then
    written to ``output_file`` with a 4-space indent.

    Args:
        input_folder: Directory whose ``.json`` files are merged (not recursive).
        output_file: Path of the merged file. It may live inside ``input_folder``;
            in that case it is excluded from the inputs, so running the merge
            again does not fold the previous result into itself.

    Raises:
        OSError: If ``input_folder`` cannot be listed or a file cannot be opened.
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    output_path = os.path.normcase(os.path.abspath(output_file))
    merged_data = []

    # Sorted so the order of the merged entries does not depend on directory listing order.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.json'):
            continue

        filepath = os.path.join(input_folder, filename)

        # A merged file left over from a previous run is not an input.
        if os.path.normcase(os.path.abspath(filepath)) == output_path:
            continue

        with open(filepath, 'r', encoding='utf-8') as file:
            merged_data.append(json.load(file))

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(merged_data, outfile, indent=4)

# Usage: collapse the per-chunk dataset files into a single JSON file.
input_folder = 'eval_dataset'  # folder that holds the per-chunk files
output_file = 'eval_dataset/llama2_eval_dataset.json'  # merged file, written into that same folder

merge_json_files(input_folder=input_folder, output_file=output_file)


In [47]:
# Saves whatever `eval_dataset` currently holds (the most recent
# generate_question_context_pairs result), not the merged file written by merge_json_files.
eval_dataset.save_json("llama2_eval_dataset.json")
# Alternative: reload a previously saved dataset instead of regenerating it.
# eval_dataset = EmbeddingQAFinetuneDataset.from_json("data/llama2_eval_dataset.json")

In [None]:
# base
base_retriever = vector_index.as_retriever(similarity_top_k=top_k)
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=base_retriever
)

results_base = await retriever_evaluator.evaluate_dataset(
    eval_dataset, show_progress=True
)

In [None]:
# chunk
vector_retriever_chunk = vector_index_chunk.as_retriever(
    similarity_top_k=top_k
)
retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_chunk},
    node_dict=all_nodes_dict,
    verbose=True,
)
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever_chunk
)

results_chunk = await retriever_evaluator.aevaluate_dataset(
    eval_dataset, show_progress=True
)

In [None]:
# sentence
# sentence
# NOTE(review): `sentence_index` and `MetadataReplacementPostProcessor` are only created in the
# "Sentence Window Retrieval" section further down, so that section has to run before this cell.
sentence_retriever = sentence_index.as_retriever(
    similarity_top_k=top_k,
    # NOTE(review): presumably has no effect on a retriever (postprocessors are applied by query engines) — confirm.
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ]
)

retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=sentence_retriever
)

results_sentence = await retriever_evaluator.aevaluate_dataset(
    eval_dataset, show_progress=True
)

In [None]:
# Side-by-side summary of the three retrievers. One label per result set, in the
# same order: without the comma after "Retriever (Chunk References)" Python glues
# the two adjacent string literals into one label, leaving 2 names for 3 results.
full_results_df = get_retrieval_results_df(
    [
        "Base Retriever",
        "Retriever (Chunk References)",
        "Retriever (window)",
    ],
    [results_base, results_chunk, results_sentence],
)
display(full_results_df)

# Sentence Window Retrieval

In [65]:
from llama_index.core.node_parser import SentenceWindowNodeParser

In [66]:
# Sentence-window node parser with window_size=3: the window is stored under the
# "window" metadata key and the original text under "original_text".
# NOTE: this rebinds `node_parser` once more.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

In [67]:
node_parser

SentenceWindowNodeParser(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x00000179D2D7F9D0>, id_func=<function default_id_func at 0x00000179C48CE320>, sentence_splitter=<function split_by_sentence_tokenizer.<locals>.<lambda> at 0x0000017A0553F400>, window_size=3, window_metadata_key='window', original_text_metadata_key='original_text')

In [68]:
# Parse the documents with the sentence-window parser currently bound to `node_parser`.
sentence_nodes = node_parser.get_nodes_from_documents(documents)

In [69]:
# Index over the sentence-window nodes; used by the query engine below and by the "sentence" evaluation cell.
sentence_index = VectorStoreIndex(sentence_nodes)

In [24]:
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor

# Query engine over the sentence-window index (similarity_top_k=5) with a
# MetadataReplacementPostProcessor keyed on the "window" metadata entry.
query_engine = sentence_index.as_query_engine(
    similarity_top_k=5,
    # the target key defaults to `window` to match the node_parser's default
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)

In [None]:
# Ask the sentence-window query engine an English question and print its answer.
window_response = query_engine.query(
    "Can you tell me about the key concepts the prospective study around food security"
)
print(window_response)

In [16]:
# For the top-ranked source node, show the window of sentences stored in its metadata
# next to the single original sentence that was retrieved.
top_metadata = window_response.source_nodes[0].node.metadata
window = top_metadata["window"]
sentence = top_metadata["original_text"]

print(f"Window: {window}", "------------------", f"Original Sentence: {sentence}", sep="\n")

Window: Intern ational Assessmen t of Agricult ural 
Knowledge, Sci ence & Technology fo r Development.  Gl obal Report.    
 - IFPRI  (2005 ).  New Risks and Opportunities for Food Security: sc enarios analyses for 2015 and 
2050.  
 6.1.2.2.   Les études pro spective s ciblan t les re ssources  marine s  
- Plan b leue (2017 ).  Vers un nouvel exerc ice de prospective sur l’environnemen t et le 
développement en  Méditer ranée : Rapport de bench mark des é tudes exist antes . 

------------------
Original Sentence: New Risks and Opportunities for Food Security: sc enarios analyses for 2015 and 
2050.  



In [25]:
# Same query engine, this time with a French question; rebinds `window_response` and prints the answer.
window_response = query_engine.query(
    "Donnes des exemples mentionnés dans les documents de politiques de soutien aux systèmes productifs agricoles et halieutiques pour l'analyse stratégique prospective en 2035"
)
print(window_response)

Based on the provided context information, some examples of orientations strategic mentioned in the documents related to agricultural and fisheries systems that can be used for the analysis of a prospective strategy in 2035 are:

1. Orientations stratégiques MAI-Tend et MDM-Mondia (page 599)
2. Préservation des ressources productives (terres, eau, sols…), résilience aux changements climatiques et utilisation des ressources non conventionnelles
3. Développement des institutions professionnelles et participation
4. Substitution aux importations (filières céréales, laits…. ) et contribution au réquilibrage de la balance alimentaire par l'export

These examples can be used as a starting point for the analysis of a prospective strategy in 2035 for the agricultural and fisheries systems.

Please note that these answers are generated based on the provided context information, without prior knowledge or external research.
