In [1]:
import json
import statistics
from openai import OpenAI
import dotenv 
import os
dotenv.load_dotenv()
import pandas as pd

In [2]:
# OpenAI client used by the LLM judge. The key is read from the OPENAI_API_KEY
# environment variable (dotenv.load_dotenv() is called in the import cell, so a
# local .env file can provide it); the key itself is never written in the notebook.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [3]:
# Load the evaluation records (questions plus the answers of every system)
# from the UTF-8 encoded JSON file produced upstream.
with open("reponses.json", mode="r", encoding="utf-8") as answers_file:
    repenses = json.loads(answers_file.read())

In [4]:
# Show the loaded records as a sanity check (last expression => rich display).
# NOTE(review): this prints the whole list; a slice such as repenses[:2] would
# keep the saved notebook output small.
repenses

[{'question': 'Quel est le nombre de livres qui composent le CGI 2025 ?',
  'level': 'facile',
  'réponse_graphrag2': 'I am sorry but I am unable to answer this question given the provided data.\n',
  'réponse_rag': "## Définition\nLe Code Général des Impôts (CGI) est un ensemble structuré de dispositions légales régissant la fiscalité au Maroc.\n\n## Principe\nLe CGI est organisé en plusieurs livres, chacun traitant d'un domaine spécifique de la fiscalité.\n\n## Points clés\nLe contexte fourni ne mentionne pas explicitement le nombre de livres qui composent le CGI 2025. Les extraits disponibles portent principalement sur des articles spécifiques et leurs dates d'effet, ainsi que sur des dispositions particulières, mais ne donnent pas d'information sur la structure globale du CGI ni sur le nombre de ses livres.\n\n## Références\nAucune information sur le nombre de livres du CGI 2025 n'est disponible dans les extraits fournis [Data: Sources (897, 881, 195)].\n",
  'réponse_graphrag': "J

In [5]:
# Evaluation measures handed to the LLM judge.
# Each key is a valid `criterion_name` for `evaluate_pair`; the associated text
# is substituted into the `{criterion_def}` slot of the judge prompt.
CRITERIA = {
# How thoroughly the answer covers the question.
"comprehensiveness": """How much detail does the answer provide to cover all the aspects and details of the
question? A comprehensive answer should be thorough and complete, without being redundant or irrelevant.
For example, if the question is ’What are the benefits and drawbacks of nuclear energy?’, a comprehensive
answer would provide both the positive and negative aspects of nuclear energy, such as its efficiency,
environmental impact, safety, cost, etc. A comprehensive answer should not leave out any important points
or provide irrelevant information. For example, an incomplete answer would only provide the benefits of
nuclear energy without describing the drawbacks, or a redundant answer would repeat the same information
multiple times.""",
# How many distinct perspectives and sources the answer brings.
"diversity": """How varied and rich is the answer in providing different perspectives and insights
on the question? A diverse answer should be multi-faceted and multi-dimensional, offering different
viewpoints and angles on the question. For example, if the question is ’What are the causes and effects
of climate change?’, a diverse answer would provide different causes and effects of climate change, such
as greenhouse gas emissions, deforestation, natural disasters, biodiversity loss, etc. A diverse answer
should also provide different sources and evidence to support the answer. For example, a single-source
answer would only cite one source or evidence, or a biased answer would only provide one perspective or
opinion.""",
# How concisely and specifically the answer addresses the question.
"directness": """How specifically and clearly does the answer address the question? A direct answer should
provide a clear and concise answer to the question. For example, if the question is ’What is the capital
of France?’, a direct answer would be ’Paris’. A direct answer should not provide any irrelevant or
unnecessary information that does not answer the question. For example, an indirect answer would be ’The
capital of France is located on the river Seine’."""
}

In [6]:
# System message that sets the judge's role for every comparison.
JUDGE_SYSTEM_PROMPT = "You are a helpful assistant responsible for grading two answers to a question that are provided by two different people."

# User message template, filled with str.format(). Placeholders:
#   {criterion_def}  - definition text of the measure (value from CRITERIA)
#   {question}       - the question being answered
#   {answer_1}       - answer shown to the judge as "Answer 1"
#   {answer_2}       - answer shown to the judge as "Answer 2"
#   {criterion_name} - name of the measure, repeated at the end of the prompt
# Doubled braces ({{ }}) are literal braces of the JSON example.
# NOTE(review): the JSON example always starts the reasoning with
# "Answer 1 is better because", whatever the winner; this may prime the judge
# toward Answer 1 - confirm the wording is intentional before changing it.
JUDGE_USER_TEMPLATE = """
---Goal---
Given a question and two answers (Answer 1 and Answer 2), assess which answer is better according to the following measure:
{criterion_def}

Your assessment should include two parts:
- Winner: either 1 (if Answer 1 is better) and 2 (if Answer 2 is better) or 0 if they are fundamentally similar and the differences are immaterial.
- Reasoning: a short explanation of why you chose the winner with respect to the measure described above.

Format your response as a JSON object with the following structure:
{{
    "winner": <1, 2, or 0>,
    "reasoning": "Answer 1 is better because <your reasoning>."
}}

---Question---
{question}

---Answer 1---
{answer_1}

---Answer 2---
{answer_2}

Assess which answer is better according to the following measure:
{criterion_name}
"""

In [7]:
def evaluate_pair(question, ans1, ans2, criterion_name):
    """Ask the LLM judge which of two answers is better for one criterion.

    Args:
        question: The question both answers respond to.
        ans1: Text shown to the judge as "Answer 1".
        ans2: Text shown to the judge as "Answer 2".
        criterion_name: Key of ``CRITERIA``; its definition and its name are
            both inserted into the judge prompt.

    Returns:
        A ``(winner, reasoning)`` tuple. ``winner`` is the int ``1``
        (Answer 1 wins), ``2`` (Answer 2 wins) or ``0`` (tie). It is ``None``
        when no usable verdict was obtained: the API call or JSON parsing
        failed (``reasoning`` is then ``"Error"``), or the judge returned a
        value other than 0, 1 or 2. A failure is therefore never reported as
        a tie.

    Raises:
        KeyError: If ``criterion_name`` is not a key of ``CRITERIA``.
    """
    prompt = JUDGE_USER_TEMPLATE.format(
        criterion_def=CRITERIA[criterion_name],
        question=question,
        answer_1=ans1,
        answer_2=ans2,
        criterion_name=criterion_name
    )

    try:
        response = client.chat.completions.create(
            # NOTE(review): the original comment said the paper recommends a
            # powerful judge model, yet "gpt-4o-mini" is used - confirm.
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"},  # force JSON to simplify parsing
            temperature=0  # favour consistent verdicts across repeated calls
        )
        verdict = json.loads(response.choices[0].message.content)
    except Exception as e:
        # Best effort: report the problem and signal "no verdict" with None so
        # the caller can tell a failure apart from a genuine tie (0).
        print(f"Erreur lors de l'appel API: {e}")
        return None, "Error"

    if not isinstance(verdict, dict):
        # Valid JSON but not the expected object.
        return None, None

    winner = verdict.get("winner")
    if isinstance(winner, str):
        # The judge sometimes quotes the number ("1" instead of 1).
        stripped = winner.strip()
        winner = int(stripped) if stripped in ("0", "1", "2") else None
    elif isinstance(winner, bool) or not isinstance(winner, (int, float)):
        winner = None
    elif winner in (0, 1, 2):
        winner = int(winner)
    else:
        winner = None

    return winner, verdict.get("reasoning")

In [8]:
def _winner_to_score(winner):
    """Map a judge verdict to A's score: 1 -> 100, 2 -> 0, 0 -> 50, anything else -> None."""
    if isinstance(winner, str):
        stripped = winner.strip()
        winner = int(stripped) if stripped in ("0", "1", "2") else None
    if isinstance(winner, bool) or not isinstance(winner, (int, float)):
        return None
    return {1: 100, 2: 0, 0: 50}.get(winner)


def run_comparison(test_set,A,B, criterion="comprehensiveness", repetitions=5):
    """Compare two answer columns of a test set with the LLM judge.

    For every item, ``item[A]`` is shown to the judge as Answer 1 and
    ``item[B]`` as Answer 2, ``repetitions`` times. Each verdict is scored
    from A's point of view (A wins = 100, B wins = 0, tie = 50), averaged per
    question, then averaged over questions to give A's win rate. Progress and
    the final conclusion are printed.

    Verdicts that are neither 0, 1 nor 2 (for example a failed judge call) are
    reported and left out of the averages instead of being counted as ties;
    a question with no usable verdict is left out of the final average.

    Args:
        test_set: Sequence of dict-like records with a ``'question'`` key and
            the answer keys ``A`` and ``B``.
        A: Key of the answer presented as Answer 1.
        B: Key of the answer presented as Answer 2.
        criterion: Name of the measure, forwarded to ``evaluate_pair``.
        repetitions: Number of judge calls per question (must be >= 1).

    Returns:
        A's win rate on a 0-100 scale, or ``None`` when no question could be
        scored (empty ``test_set`` or only unusable verdicts).

    Raises:
        ValueError: If ``repetitions`` is smaller than 1.
    """
    if repetitions < 1:
        raise ValueError("repetitions must be >= 1")

    question_scores = []

    print(f"--- Début de l'évaluation sur le critère : {criterion.upper()} ---")

    for i, item in enumerate(test_set):
        print(f"\nQuestion {i+1}: {item['question'][:50]}...")

        scores_for_this_question = []

        # Repeat the judgement to smooth out the judge's stochasticity.
        for r in range(repetitions):
            winner, reasoning = evaluate_pair(
                item['question'],
                item[A],
                item[B],
                criterion
            )

            score = _winner_to_score(winner)
            if score is None:
                # Do not let a failed or malformed verdict count as a tie.
                print(f"  Rep {r+1}: Gagnant={winner} (ignoré: verdict invalide) | Raison: {reasoning}")
                continue

            scores_for_this_question.append(score)
            print(f"  Rep {r+1}: Gagnant={winner} (Score={score}) | Raison: {reasoning}")

        if not scores_for_this_question:
            print(f"  >> Question {i+1} ignorée: aucun verdict valide")
            continue

        # Average for THIS question.
        avg_question_score = statistics.mean(scores_for_this_question)
        question_scores.append(avg_question_score)
        print(f"  >> Moyenne pour Question {i+1}: {avg_question_score}")

    if not question_scores:
        # Nothing to aggregate: avoid using an undefined win rate below.
        print("\nAucune question n'a pu être évaluée.")
        return None

    final_win_rate = statistics.mean(question_scores)

    print("\n" + "="*50)
    print(f"RÉSULTAT FINAL ({criterion})")
    print(f"{A} Win Rate: {final_win_rate:.2f}%")
    print("="*50)

    if final_win_rate > 50:
        print(f"Conclusion: {A} est globalement meilleur.")
    elif final_win_rate < 50:
        print(f"Conclusion: {B} est globalement meilleur.")
    else:
        print("Conclusion: Égalité parfaite.")

    return final_win_rate

In [9]:

# Record keys holding each system's answer; the comparison cells select the
# two systems to compare by index (S[0] is the classic RAG answer).
# NOTE(review): the "_C1" / "_C2" keys are assumed to exist in every record - confirm.
S=["réponse_rag" ,"réponse_graphrag","réponse_graphrag2","réponse_graphrag2_C1","réponse_graphrag2_C2"]
S[0],S[1]

('réponse_rag', 'réponse_graphrag')

### Comprehensiveness criterion

In [51]:
# Comparison: RAG (Answer 1) vs GraphRAG (Answer 2); the printed win rate is RAG's.
run_comparison(repenses,S[0],S[1], criterion="comprehensiveness", repetitions=5)


--- Début de l'évaluation sur le critère : COMPREHENSIVENESS ---

Question 1: Quel est le nombre de livres qui composent le CGI ...
  Rep 1: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI), explaining its definition, organization, and the context of the question. It acknowledges the lack of specific information about the number of books while also discussing the general structure of the CGI, which adds depth to the response. In contrast, Answer 2 is very brief and only states the lack of information without providing any additional context or details.
  Rep 2: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI), explaining its definition, organization, and the context of the question. It acknowledges the lack of specific information about the number of books while also providing relevant details about the CGI's structure, which 

In [53]:
# RAG (Answer 1) vs GraphRAG2_C0 (Answer 2, key "réponse_graphrag2").
run_comparison(repenses,S[0],S[2], criterion="comprehensiveness", repetitions=5)


--- Début de l'évaluation sur le critère : COMPREHENSIVENESS ---

Question 1: Quel est le nombre de livres qui composent le CGI ...
  Rep 1: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI), discusses its organization into books, and acknowledges the lack of specific information regarding the number of books in CGI 2025. In contrast, Answer 2 simply states an inability to answer the question without providing any relevant context or information.
  Rep 2: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI) and discusses its organization into books, even though it does not specify the exact number of books in CGI 2025. It offers context and relevant details about the CGI, while Answer 2 simply states an inability to answer the question without providing any additional information.
  Rep 3: Gagnant=1 (Score=100) | Raison: Answer 1 i

In [54]:
# RAG (Answer 1) vs GraphRAG2_C1 (Answer 2, key "réponse_graphrag2_C1").

run_comparison(repenses,S[0],S[3], criterion="comprehensiveness", repetitions=5)


--- Début de l'évaluation sur le critère : COMPREHENSIVENESS ---

Question 1: Quel est le nombre de livres qui composent le CGI ...
  Rep 1: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured response that explains the context of the Code Général des Impôts (CGI) and discusses its organization into books, even though it does not provide the exact number of books. It acknowledges the lack of specific information regarding the number of books in CGI 2025, which shows an attempt to address the question comprehensively. In contrast, Answer 2 simply states an inability to answer without providing any relevant information or context.
  Rep 2: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI) and discusses its organization into books, even though it does not specify the exact number of books in CGI 2025. It offers context and relevant information about the CGI, while Answer 2 simpl

In [55]:
# RAG (Answer 1) vs GraphRAG2_C2 (Answer 2, key "réponse_graphrag2_C2").
run_comparison(repenses,S[0],S[4], criterion="comprehensiveness", repetitions=5)

--- Début de l'évaluation sur le critère : COMPREHENSIVENESS ---

Question 1: Quel est le nombre de livres qui composent le CGI ...
  Rep 1: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI), explaining its definition, organization, and the context regarding the number of books. Although it does not provide a specific number, it offers relevant information about the CGI's structure and acknowledges the lack of explicit data on the number of books. In contrast, Answer 2 simply states an inability to answer the question without providing any context or information.
  Rep 2: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI) and discusses its organization into books, even though it does not specify the exact number of books in CGI 2025. It offers context and relevant information about the CGI, while Answer 2 simply states an inabili

### Diversity criterion

In [56]:
# RAG (Answer 1) vs GraphRAG (Answer 2); the printed win rate is RAG's.
# NOTE(review): the diversity runs use repetitions=4 while the comprehensiveness
# and directness runs use 5 - confirm this difference is intentional.
run_comparison(repenses,S[0],S[1], criterion="diversity", repetitions=4)


--- Début de l'évaluation sur le critère : DIVERSITY ---

Question 1: Quel est le nombre de livres qui composent le CGI ...
  Rep 1: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI), explaining its definition, organization, and the context of the question. It discusses the lack of explicit information about the number of books in CGI 2025 while also referencing specific articles and their effects. This multi-faceted approach offers more insights and perspectives compared to Answer 2, which simply states a lack of information without any additional context or detail.
  Rep 2: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI), explaining its definition, organization, and the context of the question. It discusses the lack of explicit information about the number of books in CGI 2025 while also referencing specific articles and the

In [57]:
# RAG (Answer 1) vs GraphRAG2_C0 (Answer 2, key "réponse_graphrag2").
run_comparison(repenses,S[0],S[2], criterion="diversity", repetitions=4)

--- Début de l'évaluation sur le critère : DIVERSITY ---

Question 1: Quel est le nombre de livres qui composent le CGI ...
  Rep 1: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI), explaining its definition, organization, and the context of the question. It acknowledges the lack of specific information about the number of books while still offering insights into the CGI's structure and purpose. In contrast, Answer 2 simply states an inability to answer without providing any additional context or information.
  Rep 2: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI), explaining its definition, organization, and the context of the question. It acknowledges the lack of specific information about the number of books in CGI 2025 while still offering insights into the structure of the CGI. In contrast, Answer 2 simply states an in

In [58]:
# RAG (Answer 1) vs GraphRAG2_C1 (Answer 2, key "réponse_graphrag2_C1").
run_comparison(repenses,S[0],S[3], criterion="diversity", repetitions=4)

--- Début de l'évaluation sur le critère : DIVERSITY ---

Question 1: Quel est le nombre de livres qui composent le CGI ...
  Rep 1: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI), explaining its definition, organization, and the context of the question. It acknowledges the lack of specific information about the number of books in CGI 2025 while still offering insights into the structure and purpose of the CGI. In contrast, Answer 2 simply states an inability to answer without providing any additional context or information.
  Rep 2: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI), explaining its definition, organization, and the context of the question. It attempts to address the question by discussing the lack of explicit information about the number of books in CGI 2025, which shows an effort to engage with the topic. In

In [59]:
# RAG (Answer 1) vs GraphRAG2_C2 (Answer 2, key "réponse_graphrag2_C2").
run_comparison(repenses,S[0],S[4], criterion="diversity", repetitions=4)

--- Début de l'évaluation sur le critère : DIVERSITY ---

Question 1: Quel est le nombre de livres qui composent le CGI ...
  Rep 1: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI), explaining its definition, organization, and the context of the question. It acknowledges the lack of specific information about the number of books in CGI 2025 while still offering relevant insights into the topic. In contrast, Answer 2 simply states an inability to answer without providing any additional context or information.
  Rep 2: Gagnant=1 (Score=100) | Raison: Answer 1 is better because it provides a structured overview of the Code Général des Impôts (CGI), explaining its definition, organization, and the context of the question. It acknowledges the lack of specific information about the number of books in CGI 2025 while still offering relevant insights into the topic. In contrast, Answer 2 simply states an inability

### Directness criterion

In [60]:
# RAG (Answer 1) vs GraphRAG (Answer 2); the printed win rate is RAG's.
run_comparison(repenses,S[0],S[1], criterion="directness", repetitions=5)

--- Début de l'évaluation sur le critère : DIRECTNESS ---

Question 1: Quel est le nombre de livres qui composent le CGI ...
  Rep 1: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly states the lack of information regarding the number of books in the CGI 2025 without providing unnecessary context or details. In contrast, Answer 1 includes extraneous information about the CGI and its organization, which does not directly answer the question.
  Rep 2: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly states the lack of information regarding the number of books in the CGI 2025 without providing unnecessary context or details. In contrast, Answer 1 includes extraneous information about the CGI and its organization, which does not directly answer the question.
  Rep 3: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly states the lack of information regarding the number of books in the CGI 2025 without providing unnecessary context or

In [61]:
# RAG (Answer 1) vs GraphRAG2_C0 (Answer 2, key "réponse_graphrag2").
run_comparison(repenses,S[0],S[2], criterion="directness", repetitions=5)

--- Début de l'évaluation sur le critère : DIRECTNESS ---

Question 1: Quel est le nombre de livres qui composent le CGI ...
  Rep 1: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly states the inability to answer the question, while Answer 1 provides unnecessary background information about the CGI without addressing the specific question about the number of books.
  Rep 2: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly states the inability to answer the question, while Answer 1 provides unnecessary background information about the CGI without addressing the specific question about the number of books.
  Rep 3: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly states the inability to answer the question, while Answer 1 provides unnecessary background information about the CGI without addressing the specific question about the number of books.
  Rep 4: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly state

In [62]:
# RAG (Answer 1) vs GraphRAG2_C1 (Answer 2, key "réponse_graphrag2_C1").
run_comparison(repenses,S[0],S[3], criterion="directness", repetitions=5)

--- Début de l'évaluation sur le critère : DIRECTNESS ---

Question 1: Quel est le nombre de livres qui composent le CGI ...
  Rep 1: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly states the inability to answer the question, while Answer 1 provides unnecessary background information about the CGI without addressing the specific question about the number of books.
  Rep 2: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly states the inability to answer the question, while Answer 1 provides unnecessary background information about the CGI without addressing the specific question about the number of books.
  Rep 3: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly states the inability to answer the question, while Answer 1 provides unnecessary background information about the CGI without addressing the specific question about the number of books.
  Rep 4: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly state

In [10]:
# RAG (Answer 1) vs GraphRAG2_C2 (Answer 2, key "réponse_graphrag2_C2").
run_comparison(repenses,S[0],S[4], criterion="directness", repetitions=5)

--- Début de l'évaluation sur le critère : DIRECTNESS ---

Question 1: Quel est le nombre de livres qui composent le CGI ...
  Rep 1: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly states the inability to answer the question, while Answer 1 provides unnecessary background information about the CGI without addressing the specific question about the number of books.
  Rep 2: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly states the inability to answer the question, while Answer 1 provides unnecessary background information about the CGI without addressing the specific question about the number of books.
  Rep 3: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly states the inability to answer the question, while Answer 1 provides unnecessary background information about the CGI without addressing the specific question about the number of books.
  Rep 4: Gagnant=2 (Score=0) | Raison: Answer 2 is better because it directly state