In [None]:
import pandas as pd
import os
import time
from tqdm import tqdm
from operator import itemgetter
from neo4j import GraphDatabase
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_community.vectorstores import Neo4jVector
from dotenv import load_dotenv
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate


load_dotenv()

# –ù–∞—Å—Ç—Ä–æ–π–∫–∏
NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")
BENCHMARK_FILE = "benchmark_qa.csv"  # –§–∞–π–ª –∏–∑ –ø—Ä–µ–¥—ã–¥—É—â–µ–≥–æ —à–∞–≥–∞
RESULTS_FILE = "benchmark_results.csv"


driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
llm = ChatOllama(model="qwen3:8b", temperature=0)
evaluator_llm = ChatOllama(model="qwen3:8b", temperature=0) 
embeddings = OllamaEmbeddings(
    model="qwen3-embedding:0.6b"
)

# 1. –ù–∞—Å—Ç—Ä–æ–π–∫–∞ –º–æ–¥–µ–ª–∏ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤ (–¥–æ–ª–∂–Ω–∞ —Å–æ–≤–ø–∞–¥–∞—Ç—å —Å —Ç–æ–π, —á—Ç–æ –±—É–¥–µ—Ç –ø—Ä–∏ –ø–æ–∏—Å–∫–µ)
print("‚è≥ –ù–∞—á–∏–Ω–∞—é —Å–æ–∑–¥–∞–Ω–∏–µ –≤–µ–∫—Ç–æ—Ä–Ω–æ–≥–æ –∏–Ω–¥–µ–∫—Å–∞ –∏ –≤—ã—á–∏—Å–ª–µ–Ω–∏–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤...")
print("–≠—Ç–æ –º–æ–∂–µ—Ç –∑–∞–Ω—è—Ç—å –≤—Ä–µ–º—è, –µ—Å–ª–∏ –∫–æ–º–ø–∞–Ω–∏–π –º–Ω–æ–≥–æ...")

vector_store = Neo4jVector.from_existing_index(
    embedding=embeddings,
    index_name="global_knowledge_index",
    url=NEO4J_URI,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD,
    node_label="Searchable",
    text_node_property="description",
    embedding_node_property="embedding"
)

retriever = vector_store.as_retriever(search_kwargs={"k": 3})

‚è≥ –ù–∞—á–∏–Ω–∞—é —Å–æ–∑–¥–∞–Ω–∏–µ –≤–µ–∫—Ç–æ—Ä–Ω–æ–≥–æ –∏–Ω–¥–µ–∫—Å–∞ –∏ –≤—ã—á–∏—Å–ª–µ–Ω–∏–µ —ç–º–±–µ–¥–¥–∏–Ω–≥–æ–≤...
–≠—Ç–æ –º–æ–∂–µ—Ç –∑–∞–Ω—è—Ç—å –≤—Ä–µ–º—è, –µ—Å–ª–∏ –∫–æ–º–ø–∞–Ω–∏–π –º–Ω–æ–≥–æ...


In [16]:
rag_prompt = ChatPromptTemplate.from_template("""
–¢—ã —Ñ–∏–Ω–∞–Ω—Å–æ–≤—ã–π –∞—Å—Å–∏—Å—Ç–µ–Ω—Ç. –û—Ç–≤–µ—Ç—å –Ω–∞ –≤–æ–ø—Ä–æ—Å, –æ—Å–Ω–æ–≤—ã–≤–∞—è—Å—å –¢–û–õ–¨–ö–û –Ω–∞ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω–æ–º –∫–æ–Ω—Ç–µ–∫—Å—Ç–µ.
–ï—Å–ª–∏ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏ –≤ –∫–æ–Ω—Ç–µ–∫—Å—Ç–µ –Ω–µ—Ç, –æ—Ç–≤–µ—Ç—å "I don't know based on the context".

<context>
{context}
</context>

Question: {input}
""")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# –≠—Ç–∞ –º–∞–≥–∏—è LCEL –¥–µ–ª–∞–µ—Ç —Å–ª–µ–¥—É—é—â–µ–µ:
# 1. –ü—Ä–∏–Ω–∏–º–∞–µ—Ç {"input": "–≤–æ–ø—Ä–æ—Å"}
# 2. –ü–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ:
#    - –ò—â–µ—Ç –¥–æ–∫—É–º–µ–Ω—Ç—ã —á–µ—Ä–µ–∑ retriever -> –∫–ª–∞–¥–µ—Ç –≤ –∫–ª—é—á 'context'
#    - –ü—Ä–æ–∫–∏–¥—ã–≤–∞–µ—Ç –≤–æ–ø—Ä–æ—Å -> –∫–ª–∞–¥–µ—Ç –≤ –∫–ª—é—á 'input'
# 3. .assign –≤—ã—á–∏—Å–ª—è–µ—Ç –∫–ª—é—á 'answer', –∏—Å–ø–æ–ª—å–∑—É—è –∫–æ–Ω—Ç–µ–∫—Å—Ç –∏ –≤–æ–ø—Ä–æ—Å
rag_chain = RunnableParallel(
    {"context":  itemgetter("input") | retriever, "input": itemgetter("input")}
).assign(
    answer=(
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | rag_prompt
        | llm
        | StrOutputParser()
    )
)

print("‚úÖ RAG-—Ü–µ–ø–æ—á–∫–∞ —Å–æ–±—Ä–∞–Ω–∞ –Ω–∞ LCEL.")

‚úÖ RAG-—Ü–µ–ø–æ—á–∫–∞ —Å–æ–±—Ä–∞–Ω–∞ –Ω–∞ LCEL.


In [17]:
# --- 3. –õ–æ–≥–∏–∫–∞ –û—Ü–µ–Ω—â–∏–∫–∞ (LLM-as-a-Judge) ---

def evaluate_answer(question, ground_truth, prediction):
    """
    –ü—Ä–æ—Å–∏—Ç LLM —Å—Ä–∞–≤–Ω–∏—Ç—å –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–Ω—ã–π –æ—Ç–≤–µ—Ç —Å –ø—Ä–∞–≤–∏–ª—å–Ω—ã–º.
    –í–æ–∑–≤—Ä–∞—â–∞–µ—Ç: TRUE (–µ—Å–ª–∏ —Å–º—ã—Å–ª —Å–æ–≤–ø–∞–¥–∞–µ—Ç) –∏–ª–∏ FALSE.
    """
    # –ü—Ä–æ–º–ø—Ç –¥–ª—è —Å—É–¥—å–∏
    eval_prompt = f"""
    You are a strict evaluator. Compare the predicted answer with the ground truth.
    
    Question: {question}
    Ground Truth: {ground_truth}
    Prediction: {prediction}
    
    Task: Does the Prediction match the factual meaning of the Ground Truth? 
    Ignore minor phrasing differences.
    If the prediction says "I don't know", mark it as FALSE.
    
    Respond with exactly one word: TRUE or FALSE.
    """
    
    try:
        response = evaluator_llm.invoke(eval_prompt)
        content = response.content.strip().upper()
        return "TRUE" in content
    except:
        return False

# --- 4. –ó–∞–ø—É—Å–∫ –ë–µ–Ω—á–º–∞—Ä–∫–∞ ---

def run_benchmark():
    if not os.path.exists(BENCHMARK_FILE):
        print(f"–§–∞–π–ª {BENCHMARK_FILE} –Ω–µ –Ω–∞–π–¥–µ–Ω! –°–Ω–∞—á–∞–ª–∞ —Å–≥–µ–Ω–µ—Ä–∏—Ä—É–π—Ç–µ –≤–æ–ø—Ä–æ—Å—ã.")
        return

    df = pd.read_csv(BENCHMARK_FILE)
    print(f"üöÄ –ó–∞–ø—É—Å–∫ —Ç–µ—Å—Ç–∞ –Ω–∞ {len(df)} –≤–æ–ø—Ä–æ—Å–∞—Ö...")

    results = []
    correct_count = 0

    # –ò—Å–ø–æ–ª—å–∑—É–µ–º tqdm –¥–ª—è –æ—Ç–æ–±—Ä–∞–∂–µ–Ω–∏—è –ø—Ä–æ–≥—Ä–µ—Å—Å–∞
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        question = row['question']
        ground_truth = str(row['ground_truth'])
        
        start_time = time.time()
        
        try:
            # 1. –ó–∞–ø—É—Å–∫ RAG
            response = rag_chain.invoke({"input": question})
            prediction = response['answer']
            
            # –ò–∑–≤–ª–µ–∫–∞–µ–º –Ω–∞–π–¥–µ–Ω–Ω—ã–µ –¥–æ–∫—É–º–µ–Ω—Ç—ã –¥–ª—è –∞–Ω–∞–ª–∏–∑–∞
            retrieved_docs = [doc.page_content for doc in response['context']]
            context_text = " || ".join(retrieved_docs) # –°–æ—Ö—Ä–∞–Ω—è–µ–º –∫–æ–Ω—Ç–µ–∫—Å—Ç –æ–¥–Ω–æ–π —Å—Ç—Ä–æ–∫–æ–π
            
            # 2. –û—Ü–µ–Ω–∫–∞
            is_correct = evaluate_answer(question, ground_truth, prediction)
            
            if is_correct:
                correct_count += 1
                
        except Exception as e:
            prediction = f"ERROR: {e}"
            is_correct = False
            context_text = ""

        elapsed_time = time.time() - start_time

        # –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç
        results.append({
            "question": question,
            "ground_truth": ground_truth,
            "prediction": prediction,
            "is_correct": is_correct,
            "retrieved_context": context_text, # –í–∞–∂–Ω–æ –≤–∏–¥–µ—Ç—å, —á—Ç–æ –Ω–∞—à–µ–ª —Ä–µ—Ç—Ä–∏–≤–µ—Ä
            "time_sec": round(elapsed_time, 2),
            "question_type": row.get('question_type', 'unknown')
        })

    # --- 5. –ê–Ω–∞–ª–∏–∑ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤ ---
    
    results_df = pd.DataFrame(results)
    results_df.to_csv(RESULTS_FILE, index=False)
    
    accuracy = (correct_count / len(df)) * 100
    
    print("\n" + "="*30)
    print(f"üèÅ –ë–ï–ù–ß–ú–ê–†–ö –ó–ê–í–ï–†–®–ï–ù")
    print(f"üìä –¢–æ—á–Ω–æ—Å—Ç—å (Accuracy): {accuracy:.2f}%")
    print(f"üíæ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤: {RESULTS_FILE}")
    print("="*30)
    
    # –í—ã–≤–æ–¥ –ø—Ä–æ–±–ª–µ–º–Ω—ã—Ö –º–µ—Å—Ç (—Ç–æ–ø-5 –æ—à–∏–±–æ–∫)
    print("\n–ü—Ä–∏–º–µ—Ä—ã –æ—à–∏–±–æ–∫:")
    errors = results_df[results_df['is_correct'] == False].head(5)
    for i, row in errors.iterrows():
        print(f"Q: {row['question']}")
        print(f"Truth: {row['ground_truth']}")
        print(f"Pred: {row['prediction']}")
        print("-" * 20)

if __name__ == "__main__":
    run_benchmark()

üöÄ –ó–∞–ø—É—Å–∫ —Ç–µ—Å—Ç–∞ –Ω–∞ 100 –≤–æ–ø—Ä–æ—Å–∞—Ö...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [18:56<00:00, 11.36s/it]


üèÅ –ë–ï–ù–ß–ú–ê–†–ö –ó–ê–í–ï–†–®–ï–ù
üìä –¢–æ—á–Ω–æ—Å—Ç—å (Accuracy): 42.00%
üíæ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤: benchmark_results.csv

–ü—Ä–∏–º–µ—Ä—ã –æ—à–∏–±–æ–∫:
Q: What is the current market capitalization of TSCO?
Truth: 28121638912
Pred: I don't know based on the context.
--------------------
Q: Where is the headquarters of American Tower Corporation (REI located?
Truth: Boston, MA
Pred: I don't know based on the context.
--------------------
Q: What is the official website for AbbVie Inc.?
Truth: https://www.abbvie.com
Pred: I don't know based on the context.
--------------------
Q: How many full-time employees work for Workday, Inc.?
Truth: 20588.0
Pred: I don't know based on the context.
--------------------
Q: What is the stock ticker symbol for Baxter International Inc.?
Truth: BAX
Pred: I don't know based on the context.
--------------------



