In [None]:
#import libraries needed
import os
import torch
import numpy as np
import pandas as pd
import base64
from PIL import Image
from neo4j import GraphDatabase
from groq import Groq
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json
from tqdm import tqdm
from IPython.display import Image as IPyImage, display
from ragas.metrics import Faithfulness, ContextRelevance, ResponseRelevancy
from ragas.dataset_schema import SingleTurnSample 
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load variables from a local .env file (if one exists) into the process
# environment so the os.getenv(...) lookups in the following cells can
# find the Neo4j, Groq and Google credentials.
load_dotenv()

In [None]:
# Neo4j connection details, read from the environment.
URI = os.getenv("NEO4J_URI")
_neo4j_password = os.getenv("NEO4J_PASSWORD")

# Fail fast with an actionable message: without this check a missing variable
# reaches the driver as None and surfaces as an opaque connection/auth error.
_missing_neo4j_vars = [
    name
    for name, value in (("NEO4J_URI", URI), ("NEO4J_PASSWORD", _neo4j_password))
    if not value
]
if _missing_neo4j_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_neo4j_vars)
        + ". Set them in the environment or in the .env file before running this cell."
    )

# (username, password) tuple used for basic authentication.
AUTH = ("neo4j", _neo4j_password)

# Create the shared driver used by every query helper in this notebook and
# verify the server is reachable before continuing.
driver = GraphDatabase.driver(URI, auth=AUTH)
driver.verify_connectivity()
print("Connection established.")

Connection established.


In [None]:
# Read the Groq API key used to call the LLM through the Groq API.
# The original `os.environ[...] = os.getenv(...)` was a no-op when the key was
# set and raised "TypeError: str expected, not NoneType" when it was not, so
# check for the key explicitly and report what is missing.
_groq_api_key = os.getenv("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to the environment or to the .env file "
        "before running this cell."
    )

# Initialize the Groq client
groq_client = Groq(api_key=_groq_api_key)


In [None]:
# Pick the compute device (CUDA when torch reports a usable GPU, otherwise CPU)
# and load the BAAI/bge-m3 sentence-embedding model onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

model = SentenceTransformer("BAAI/bge-m3")
model = model.to(device)

Using device: cpu


In [None]:
# Load the Gemini 2.0 Flash model (Google AI Studio API) as the LLM judge for RAGAS evaluation.
# The original `os.environ[...] = os.getenv(...)` was a no-op when the key was
# set and raised "TypeError: str expected, not NoneType" when it was not, so
# check for the key explicitly and report what is missing.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to the environment or to the .env file "
        "before running this cell."
    )

# Generation settings for the evaluator model.
config = {
    "model": "gemini-2.0-flash",
    "temperature": 0.0,   # 0.0 to keep the judge's scoring as repeatable as possible
    "max_tokens": None,   # None: no explicit output-token limit is passed
    "top_p": 0.8,
}

# Initialize with Google AI Studio and wrap the LangChain chat model for RAGAS.
# The config keys are exactly the keyword arguments the constructor receives.
evaluator_llm = LangchainLLMWrapper(ChatGoogleGenerativeAI(**config))

In [None]:
# Embedding model for RAGAS evaluation: Google's "models/embedding-001" text
# embedding model, wrapped so RAGAS metrics can consume it.
_google_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
evaluator_embeddings = LangchainEmbeddingsWrapper(_google_embeddings)

In [None]:
# Text embedding helper
def generate_text_embedding(text):
    """Encode ``text`` with the notebook-level ``model`` and return the vector as a list.

    The model is asked for a NumPy result (``convert_to_numpy=True``) and the
    result is converted with ``.tolist()`` so it can be sent as a query parameter.
    """
    return model.encode(text, convert_to_numpy=True).tolist()

In [None]:
# Retrieve the top-k nodes from Neo4j across all vector indexes for a text query
def retrieve_top_k_nodes(query, top_k=5):
    """Return the ``top_k`` best vector-index hits for ``query`` across all node labels.

    The query text is embedded with ``generate_text_embedding`` and each of the
    four vector indexes (Place, Content, Type, State) is asked for its ``top_k``
    nearest neighbours. The per-index hits are merged, sorted by descending
    score and truncated to ``top_k``.

    Args:
        query: Free-text user question.
        top_k: Maximum number of hits to return. Values <= 0 yield ``[]``
            without touching the database.

    Returns:
        A list of dicts with keys ``"source"`` (label of the index that produced
        the hit), ``"score"`` and ``"node"`` (the node as returned by
        ``record.data()``, with its element id added under ``"__id"``).
    """
    if top_k <= 0:
        return []

    embedding = generate_text_embedding(query)

    # Node label -> name of the vector index that covers it.
    index_map = {
        "Place": "place_index",
        "Content": "content_index",
        "Type": "type_index",
        "State": "state_index",
    }

    results = []
    with driver.session() as session:
        for label, index_name in index_map.items():
            # Ask every index for top_k neighbours. The previous hard-coded 5
            # made it impossible to return the true top-k when top_k > 5 and
            # the best hits all came from the same index.
            records = session.run("""
                CALL db.index.vector.queryNodes($index_name, $top_k, $embedding)
                YIELD node, score
                RETURN $label AS source, node, score, elementId(node) AS node_id
            """, label=label, index_name=index_name, embedding=embedding, top_k=top_k)

            for record in records:
                data = record.data()
                node = data["node"]
                # Keep the element id with the node so later expansion steps
                # can look the node up again.
                node["__id"] = data["node_id"]
                results.append({
                    "source": data["source"],
                    "score": data["score"],
                    "node": node,
                })

    # Highest similarity first; the sort is stable, so ties keep index order.
    results.sort(key=lambda hit: hit["score"], reverse=True)
    return results[:top_k]

In [None]:
# Expand a place node to include its details
def expand_place_node(place_id):
    """Fetch the details of one Place node together with its optional neighbours.

    Args:
        place_id: Element id of the Place node (matched with ``elementId(p) = $id``).

    Returns:
        A list with one dict per row returned by the query, each with the keys
        ``title``, ``type``, ``state``, ``en_content``, ``ms_content`` and
        ``image_url``. Every key except ``title`` is normalised to ``""`` when
        the database returns null; ``title`` is passed through unchanged.
        The list is empty when no Place has the given id.
        NOTE(review): because the query chains several OPTIONAL MATCH clauses,
        a place with more than one type/state/content node presumably yields
        more than one row - confirm against the graph schema.
    """
    # Columns that come from OPTIONAL MATCH clauses or optional properties.
    nullable_fields = ("type", "state", "en_content", "ms_content", "image_url")

    with driver.session() as session:
        result = session.run("""
            MATCH (p:Place)
            WHERE elementId(p) = $id
            OPTIONAL MATCH (p)-[:HAS_TYPE]->(t:Type)
            OPTIONAL MATCH (p)-[:IN_STATE]->(s:State)
            OPTIONAL MATCH (p)-[:HAS_EN_CONTENT]->(c_en:Content)
            OPTIONAL MATCH (p)-[:HAS_MS_CONTENT]->(c_ms:Content)
            RETURN 
                p.title AS title,
                t.name AS type,
                s.name AS state,
                c_en.text AS en_content,
                c_ms.text AS ms_content,
                p.image_url AS image_url
        """, id=place_id)

        expanded = []
        for r in result:
            place = {"title": r["title"]}
            for field in nullable_fields:
                # Every column is always present in the record, so a
                # `.get(field, "")` default never applies; nulls arrive as None
                # and would be rendered as the text "None" in the LLM context.
                place[field] = r[field] or ""
            expanded.append(place)

    return expanded


In [None]:
# Cypher lookups that walk from a matched non-Place node to its related Place
# nodes, keyed by the label of the matched node.
_RELATED_PLACE_QUERIES = {
    # Places located in the matched state
    "State": """
        MATCH (s:State)<-[:IN_STATE]-(p:Place)
        WHERE elementId(s) = $id
        RETURN elementId(p) AS pid
        LIMIT $limit
    """,
    # Places of the matched type
    "Type": """
        MATCH (t:Type)<-[:HAS_TYPE]-(p:Place)
        WHERE elementId(t) = $id
        RETURN elementId(p) AS pid
        LIMIT $limit
    """,
    # Places described by the matched content (English or Malay)
    "Content": """
        MATCH (c:Content)<-[:HAS_EN_CONTENT|HAS_MS_CONTENT]-(p:Place)
        WHERE elementId(c) = $id
        RETURN elementId(p) AS pid
        LIMIT $limit
    """,
}


# Function to expand a matched node by traversing to related Place nodes
def expand_related_nodes(match, max_places=5):
    """Turn one retrieval hit into a list of expanded place records.

    Args:
        match: A hit dict with ``"source"`` (node label) and ``"node"`` (a dict
            carrying the node's element id under ``"__id"``).
        max_places: Upper bound on the number of records returned. Values <= 0
            yield ``[]``.

    Returns:
        Place dicts as produced by ``expand_place_node``. A ``Place`` hit is
        expanded directly; ``State``, ``Type`` and ``Content`` hits are first
        resolved to related Place ids. Any other label yields ``[]``.
    """
    label = match["source"]
    node_id = match["node"]["__id"]

    if max_places <= 0:
        return []

    # A Place hit needs no traversal, and therefore no session of its own.
    if label == "Place":
        return expand_place_node(node_id)[:max_places]

    query = _RELATED_PLACE_QUERIES.get(label)
    if query is None:
        return []

    # Read all related place ids first so this session is closed before
    # expand_place_node opens its own session for each id.
    with driver.session() as session:
        place_ids = [
            record["pid"]
            for record in session.run(query, id=node_id, limit=max_places)
        ]

    places = []
    for pid in place_ids:
        places.extend(expand_place_node(pid))
        # One place can expand to several records; stop once the budget is met.
        if len(places) >= max_places:
            break

    return places[:max_places]


In [None]:
def _format_place_context(item):
    """Render one expanded place dict as a labelled context block for the prompt."""
    return (
        f"--- Context ---\n"
        f"Title   : {item['title']}\n"
        f"Type    : {item['type']}\n"
        f"State   : {item['state']}\n"
        f"Content (Malay)  : {item['ms_content']}\n\n"
        f"Content (English): {item['en_content']}"
    )


def _collect_place_context(matches, max_related):
    """Expand matched nodes into at most ``max_related`` distinct context blocks."""
    context_strings = []
    context_places = []
    seen_blocks = set()

    for m in matches:
        remaining = max_related - len(context_places)
        if remaining <= 0:
            break

        for item in expand_related_nodes(m, max_places=remaining):
            if len(context_places) >= max_related:
                break
            block = _format_place_context(item)
            # The same place is often reached through several matched nodes
            # (e.g. the Place itself and its Content); an identical block adds
            # nothing for the LLM and would use up the context budget.
            if block in seen_blocks:
                continue
            seen_blocks.add(block)
            context_strings.append(block)
            context_places.append(item)

    return context_strings, context_places


def _build_prompt(context_block, query):
    """Build the instruction prompt that wraps the retrieved context and the question."""
    return f"""
You are a knowledgeable and friendly assistant specializing in Malaysian tourism.
Your goal is to answer user questions based on the context provided below. Please follow these guidelines carefully:

1. If the user asks in *Malay, respond fully in **Malay*.
2. If the user asks in *English, respond fully in **English*.
3. If the query contains a mix of Malay and English, determine which language dominates:  
   - If *60% or more of the words or sentence structure* are in *English, respond in **English*.  
   - If *60% or more* are in *Malay, respond in **Malay*.
4. If the question is *not related to Malaysian tourism*, politely inform the user that you can only assist with Malaysian tourism topics.
5. Do **NOT** answer any questions that are:
   - harmful
   - sexual
   - offensive
   - unrelated to Malaysian tourism
   Politely decline such questions.
6. If the user expresses negative emotions (e.g., sad, depressed, hopeless), respond with gentle encouragement and recommend positive travel destinations to uplift them.

Context:
{context_block}

Question: {query}

Answer:"""


def _find_mentioned_places(answer, places):
    """Return the places whose title occurs (case-insensitively) in ``answer``, once per title."""
    answer_lower = answer.lower()
    mentioned = []
    seen_titles = set()
    for place in places:
        title = place["title"]
        # A missing title cannot be matched; an empty one would match every answer.
        if not title:
            continue
        key = title.lower()
        if key in answer_lower and key not in seen_titles:
            mentioned.append(place)
            seen_titles.add(key)
    return mentioned


def _collect_image_refs(places):
    """Build one image reference line per unique, non-empty image URL."""
    image_refs = []
    seen_image_urls = set()
    for place in places:
        image_url = place.get("image_url", "")
        if image_url and image_url not in seen_image_urls:
            image_refs.append(f"\nüì∑ {place['title']} image: {image_url}")
            seen_image_urls.add(image_url)
    return image_refs


def generate_graphrag_answer(query, top_k=5, max_related=5):
    """Answer ``query`` with graph-retrieved context and an LLM called through Groq.

    Steps: retrieve the ``top_k`` matching nodes, expand them into at most
    ``max_related`` place records, build the prompt, call the chat model, then
    append image links for the places the answer mentions by title.

    Args:
        query: The user's question.
        top_k: Number of graph nodes to retrieve.
        max_related: Maximum number of place records placed in the context.

    Returns:
        A 4-tuple ``(final_answer, matches, context_block, mentioned_places)``:
        the model's answer followed by a newline and any image reference
        lines, the raw retrieval hits, the context text sent to the model, and
        the context places whose title appears in the answer.
    """
    # Step 1: Retrieve top-k relevant nodes from the graph based on semantic similarity
    matches = retrieve_top_k_nodes(query, top_k=top_k)

    # Step 2: Expand the matched nodes into place records and render them
    context_strings, context_places = _collect_place_context(matches, max_related)

    # Step 3: Combine all contexts into one block for the LLM prompt
    context_block = "\n\n".join(context_strings)

    # Step 4: Construct the prompt with the rules for answering
    prompt = _build_prompt(context_block, query)

    # Step 5: Call the LLM (via Groq API) with the prompt
    completion = groq_client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        max_completion_tokens=512,
    )

    # The message content can be None; treat that as an empty answer
    # instead of failing on `.strip()`.
    raw_answer = (completion.choices[0].message.content or "").strip()

    # Step 6: Track which place titles were mentioned in the LLM's response
    mentioned_places = _find_mentioned_places(raw_answer, context_places)

    # Step 7: Collect image references (only unique ones)
    image_refs = _collect_image_refs(mentioned_places)

    # Step 8: Return final answer with image links, original matches, context, and referenced places
    final_answer = raw_answer + "\n" + "\n".join(image_refs)

    return final_answer, matches, context_block, mentioned_places


In [None]:
# The 50 evaluation questions (in Malay), in the order they are evaluated.
evaluation_questions = [
    "Apakah pantai yang sesuai untuk bercuti di Terengganu?",
    "Cadangan tempat pelancongan di Melaka?",
    "Cadangan tempat pelancongan di Johor?",
    "Cadangan tempat pelancongan di Sabah?",
    "Cadangan tempat pelancongan di Sarawak?",
    "Cadangan tempat pelancongan di Selangor?",
    "Cadangan tempat pelancongan di Labuan?",
    "Cadangan tempat pelancongan di Kuala Lumpur?",
    "Cadangan tempat pelancongan di Putrajaya?",
    "Cadangan tempat pelancongan di Penang?",
    "Cadangan tempat pelancongan di Terengganu?",
    "Cadangan tempat pelancongan di Kelantan?",
    "Cadangan tempat pelancongan di Kedah?",
    "Cadangan tempat pelancongan di Perlis?",
    "Cadangan tempat pelancongan di Perak?",
    "Cadangan tempat pelancongan di Negeri Sembilan?",
    "Cadangan tempat pelancongan di Ipoh?",
    "Cadangan tempat pelancongan di Kuantan?",
    "Cadangan tempat pelancongan di Kota Kinabalu?",
    "Gunung tertinggi di Sabah?",
    "Senarai Nama Pulau-pulau yang boleh snorkeling",
    "Tempat menarik untuk bawa anak-anak?",
    "Tempat bersejarah di George Town?",
    "Aktiviti alam semula jadi di Pahang?",
    "Apa yang boleh dibuat di Genting Highlands?",
    "Apa tempat menarik di Johor Bahru?",
    "Cadangan lokasi camping selamat?",
    "Festival budaya di Malaysia?",
    "Tempat menarik di Cameron Highlands?",
    "Ada aktiviti eco-tourism di Sarawak?",
    "Tempat percutian keluarga di Malaysia?",
    "Apa pulau terbaik untuk diving?",
    "Tempat heritage UNESCO di Malaysia?",
    "Apakah zoo terbaik di Malaysia?",
    "Ada taman tema air yang besar?",
    "Tempat melihat matahari terbenam?",
    "Apa yang menarik tentang Taman Negara?",
    "Boleh ceritakan tentang Gunung Kinabalu?",
    "Apakah aktiviti yang boleh dibuat di Langkawi Sky Bridge?",
    "Apa yang boleh dilihat di Penang Hill?",
    "Apa keistimewaan Mount Mulu?",
    "Apa yang ada di Kuala Gandah Elephant Sanctuary?",
    "Apakah jenis tempat Gua Tempurung?",
    "Boleh jelaskan tentang Bako National Park?",
    "Apakah Cameron Highlands sesuai untuk bercuti?",
    "Tempat apa itu Sipadan Island?",
    "Aktiviti menarik di Sunway Lagoon?",
    "The Habitat Penang Hill ini tempat apa?",
    "Nak tahu tentang Colmar Tropicale di mana?",
    "Apa tarikan utama di Legoland Malaysia?",
]

# Single-column frame ("Question") consumed by the evaluation loop.
evaluation_df = pd.DataFrame(evaluation_questions, columns=["Question"])

In [None]:
output_path = "MalaysiaTourGraphRAG_eval.json"
results = []

# Accumulators to compute average scores across all evaluated questions
total_context_relevance = 0.0
total_faithfulness = 0.0
total_response_relevance = 0.0
count = 0

print("\nüîç Starting GraphRAG evaluation on 50 questions ...")

questions = evaluation_df["Question"].tolist()

# Loop through each evaluation question
for q in tqdm(questions, desc="Evaluating"):
    try:
        answer, _, context, _ = generate_graphrag_answer(q)

        # Prepare the input sample for RAGAS evaluation
        sample = SingleTurnSample(
            user_input=q,
            response=answer,
            retrieved_contexts=[context],
        )

        # Run each RAGAS metric 
        context_relevance_score = await ContextRelevance(llm=evaluator_llm).single_turn_ascore(sample)
        faithfulness_score = await Faithfulness(llm=evaluator_llm).single_turn_ascore(sample)
        response_relevancy_score = await ResponseRelevancy(llm=evaluator_llm, embeddings=evaluator_embeddings).single_turn_ascore(sample)

        # Update score accumulators
        total_context_relevance += context_relevance_score
        total_faithfulness += faithfulness_score
        total_response_relevance += response_relevancy_score
        count += 1

        result_entry = {
            "question": q,
            "generated_answer": answer,
            "retrieved_context": context,
            "context_relevance_score": context_relevance_score,
            "faithfulness_score": faithfulness_score,
            "answer_relevance_score": response_relevancy_score
        }

        results.append(result_entry)

         # Save intermediate results to file
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

    except Exception as e:
        print(f"‚ö†Ô∏è Error for question: {q} ‚Üí {e}")

# Final average metrics
if count > 0:
    avg_context_relevance = total_context_relevance / count
    avg_faithfulness = total_faithfulness / count
    avg_response_relevance = total_response_relevance / count

    print(f"\nüìä Evaluation complete. Averages over {count} questions:")
    print(f" - Average Context Relevance  : {avg_context_relevance:.4f}")
    print(f" - Average Faithfulness       : {avg_faithfulness:.4f}")
    print(f" - Average Answer Relevance   : {avg_response_relevance:.4f}")
else:
    print("‚ö†Ô∏è No successful evaluations to compute averages.")

print(f"\n‚úÖ Results saved to: {output_path}")



üîç Starting GraphRAG evaluation on 50 questions ...


Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [10:54<00:00, 13.10s/it]


üìä Evaluation complete. Averages over 50 questions:
 - Average Context Relevance  : 0.8150
 - Average Faithfulness       : 0.7208
 - Average Answer Relevance   : 0.7666

‚úÖ Results saved to: MalaysiaTourGraphRAG_eval.json



