In [None]:
# LLM context similarity testing
#
# For every model below, run the similarity evaluation over similarity_ds.json,
# store the per-model results as JSON, then score that results file and print
# the returned metrics.

import os

from rag_evaluation import evaluate_llm_similarity,evaluate_results_file

# Models whose similarity judgements are evaluated (one results file per model).
models = [
    "mistral",
    "llama3",
    "gemma2:2b",
    "gemma2:9b",
    "qwen2:1.5b",
    "qwen2:7b",
    "qwen2.5:0.5b",
    "qwen2.5:1.5b",
    "zephyr:7b",
    "deepseek-r1:1.5b",
    "deepseek-r1:7b",
    "phi3:3.8b",
    "granite3-moe:1b",
    "granite3-moe:3b"
]

# Directory that receives all per-model results files.
results_dir = "llm_similarity_results"

# Make sure the target directory exists before the first model runs.
# NOTE(review): whether evaluate_llm_similarity creates missing directories is
# not visible from this notebook; creating it here is harmless either way and
# avoids losing a long run on a fresh checkout.
os.makedirs(results_dir, exist_ok=True)

for model_name in models:
    # Per-model results file; the model tag (including any ':') is part of the name.
    output_file = f"{results_dir}/{model_name}_evaluation_results.json"

    # Run the similarity evaluation for this model and persist its results.
    evaluate_llm_similarity("similarity_ds.json", model_name=model_name, output_file=output_file)

    # Score the file that was just written.
    metrics = evaluate_results_file(output_file)

    # Print the returned metrics for the current model.
    print(f"Metrics for {model_name}: {metrics}")

Evaluation Metrics (from llm_similarity_results_singlerun/mistral_evaluation_results.json):
  Accuracy: 0.9900
  Precision: 0.9804
  Recall: 1.0000
  F1 Score: 0.9901
  True Positives (TP): 100
  True Negatives (TN): 98
  False Positives (FP): 2
  False Negatives (FN): 0
Metrics for mistral: {'accuracy': 0.99, 'precision': 0.9803921568627451, 'recall': 1.0, 'f1': 0.9900990099009901, 'tp': 100, 'tn': 98, 'fp': 2, 'fn': 0}
Evaluation Metrics (from llm_similarity_results_singlerun/llama3_evaluation_results.json):
  Accuracy: 0.9350
  Precision: 0.8850
  Recall: 1.0000
  F1 Score: 0.9390
  True Positives (TP): 100
  True Negatives (TN): 87
  False Positives (FP): 13
  False Negatives (FN): 0
Metrics for llama3: {'accuracy': 0.935, 'precision': 0.8849557522123894, 'recall': 1.0, 'f1': 0.9389671361502347, 'tp': 100, 'tn': 87, 'fp': 13, 'fn': 0}
Evaluation Metrics (from llm_similarity_results_singlerun/gemma2:2b_evaluation_results.json):
  Accuracy: 0.9700
  Precision: 0.9434
  Recall: 1.0000

In [None]:
# RAG evaluation by single EVAL_LLM for every RAG_LLM
#
# Each RAG model's answers are judged by one fixed evaluator model, and the
# resulting accuracies are reported in list order and then ranked.

from rag_evaluation import evaluate_questions,calculate_accuracy


# RAG models whose answers are being judged.
rag_models = [
    "mistral",
    "llama3",
    "gemma2:2b",
    "gemma2:9b",
    "qwen2:1.5b",
    "qwen2:7b",
    "qwen2.5:0.5b",
    "qwen2.5:1.5b",
    "zephyr:7b",
    "deepseek-r1:1.5b",
    "deepseek-r1:7b",
    "phi3:3.8b",
    "granite3-moe:1b",
    "granite3-moe:3b"
]

# Output path template: first slot is the evaluator model, second the RAG model.
output_file = "eval_results_rag_model_individual/evaluation_results_{}_{}.json"
# Input path template, one file per RAG model. These paths match the files
# written by the "Every RAG_LLM evaluated by 3-comb." cell of this notebook, so
# that cell must have been run first.
# NOTE(review): presumably this re-judges the answers stored there instead of
# regenerating them from rag_eval_ds.json -- confirm against evaluate_questions.
json_file = "eval_results_rag_model_3comb/evaluation_results_{}.json"

# The single evaluator model used for every RAG model.
eval_model = "qwen2.5:1.5b"

model_accuracies = {}
for rag_model in rag_models:
    print(f"Evaluating model: {rag_model}")
    # Build the result path once so evaluation and scoring use the same file.
    result_path = output_file.format(eval_model, rag_model)
    evaluate_questions(json_file=json_file.format(rag_model), output_file=result_path, rag_model=rag_model,eval_models=[eval_model])

    accuracy = calculate_accuracy(result_path)
    model_accuracies[rag_model] = accuracy

# calculate_accuracy yields a fraction (the recorded run printed e.g. 0.97), so
# use the '%' presentation type, which multiplies by 100, rather than appending
# a literal '%' to the raw fraction.
for model, accuracy in model_accuracies.items():
    print(f"Model: {model}, Accuracy: {accuracy:.2%}")

# Rank from highest to lowest accuracy; ties keep their original list order.
sorted_models = sorted(model_accuracies.items(), key=lambda x: x[1], reverse=True)
print("\nSorted by accuracy:")
for model, accuracy in sorted_models:
    print(f"RAG Model: {model}, Accuracy: {accuracy:.2%}")

Evaluating model: mistral


[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: llama3


[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: gemma2:2b


[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: gemma2:9b


[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: qwen2:1.5b


[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: qwen2:7b


[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: qwen2.5:0.5b


[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: qwen2.5:1.5b


[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: zephyr:7b


[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: deepseek-r1:1.5b


[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: deepseek-r1:7b


[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: phi3:3.8b


[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: granite3-moe:1b


[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m


Evaluating model: granite3-moe:3b


[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m


Model: mistral, Accuracy: 0.97%
Model: llama3, Accuracy: 0.99%
Model: gemma2:2b, Accuracy: 0.95%
Model: gemma2:9b, Accuracy: 0.98%
Model: qwen2:1.5b, Accuracy: 0.94%
Model: qwen2:7b, Accuracy: 0.98%
Model: qwen2.5:0.5b, Accuracy: 0.87%
Model: qwen2.5:1.5b, Accuracy: 0.94%
Model: zephyr:7b, Accuracy: 0.99%
Model: deepseek-r1:1.5b, Accuracy: 0.90%
Model: deepseek-r1:7b, Accuracy: 0.98%
Model: phi3:3.8b, Accuracy: 0.94%
Model: granite3-moe:1b, Accuracy: 0.77%
Model: granite3-moe:3b, Accuracy: 0.86%

Sorted by accuracy:
RAG Model: llama3, Accuracy: 0.99%
RAG Model: zephyr:7b, Accuracy: 0.99%
RAG Model: gemma2:9b, Accuracy: 0.98%
RAG Model: qwen2:7b, Accuracy: 0.98%
RAG Model: deepseek-r1:7b, Accuracy: 0.98%
RAG Model: mistral, Accuracy: 0.97%
RAG Model: gemma2:2b, Accuracy: 0.95%
RAG Model: qwen2:1.5b, Accuracy: 0.94%
RAG Model: qwen2.5:1.5b, Accuracy: 0.94%
RAG Model: phi3:3.8b, Accuracy: 0.94%
RAG Model: deepseek-r1:1.5b, Accuracy: 0.90%
RAG Model: qwen2.5:0.5b, Accuracy: 0.87%
RAG Model

In [None]:
# Single RAG_LLM evaluated by every EVAL_LLM
#
# The RAG model is fixed to 'mistral'; each model below takes a turn as the sole
# evaluator, and the resulting accuracies are reported and then ranked.
from rag_evaluation import evaluate_questions,calculate_accuracy


# Candidate evaluator models, tried one at a time.
eval_models = [
    "mistral",
    "llama3",
    "gemma2:2b",
    "gemma2:9b",
    "qwen2:1.5b",
    "qwen2:7b",
    "qwen2.5:0.5b",
    "qwen2.5:1.5b",
    "zephyr:7b",
    "deepseek-r1:1.5b",
    "deepseek-r1:7b",
    "phi3:3.8b",
    "granite3-moe:1b",
    "granite3-moe:3b"
]



# Output path template; the slot is filled with the evaluator model name.
output_file = "eval_results_eval_model/evaluation_results_{}.json"

model_accuracies = {}
for eval_model in eval_models:
    print(f"Evaluating model: {eval_model}")
    # Build the result path once so evaluation and scoring use the same file.
    result_path = output_file.format(eval_model)
    evaluate_questions("rag_eval_ds.json", output_file=result_path, rag_model='mistral',eval_models=[eval_model])
    accuracy = calculate_accuracy(result_path)
    model_accuracies[eval_model] = accuracy

# calculate_accuracy yields a fraction (other cells of this notebook print
# values such as 0.98), so use the '%' presentation type, which multiplies by
# 100, rather than appending a literal '%' to the raw fraction.
for model, accuracy in model_accuracies.items():
    print(f"Model: {model}, Accuracy: {accuracy:.2%}")

# Rank from highest to lowest accuracy; ties keep their original list order.
sorted_models = sorted(model_accuracies.items(), key=lambda x: x[1], reverse=True)
print("\nSorted by accuracy:")
for model, accuracy in sorted_models:
    print(f"Eval Model: {model}, Accuracy: {accuracy:.2%}")

In [None]:
# FINAL - Every RAG_LLM evaluated by 3-comb.
#
# Each RAG model is run on rag_eval_ds.json and judged by the same three
# evaluator models; accuracies are reported in list order and then ranked.

from rag_evaluation import evaluate_questions,calculate_accuracy


# RAG models under evaluation.
rag_models = [
    "mistral",
    "llama3",
    "gemma2:2b",
    "gemma2:9b",
    "qwen2:1.5b",
    "qwen2:7b",
    "qwen2.5:0.5b",
    "qwen2.5:1.5b",
    "zephyr:7b",
    "deepseek-r1:1.5b",
    "deepseek-r1:7b",
    "phi3:3.8b",
    "granite3-moe:1b",
    "granite3-moe:3b"
]


# Output path template; the slot is filled with the RAG model name.
output_file = "eval_results_rag_model_3comb/evaluation_results_{}.json"

# The fixed three-model evaluator combination applied to every RAG model.
three_comb_evaluators = ['mistral','gemma2:9b','qwen2.5:1.5b']

model_accuracies = {}
for rag_model in rag_models:
    print(f"Evaluating model: {rag_model}")
    # Build the result path once so evaluation and scoring use the same file.
    result_path = output_file.format(rag_model)
    evaluate_questions("rag_eval_ds.json", output_file=result_path, rag_model=rag_model,eval_models=three_comb_evaluators)

    accuracy = calculate_accuracy(result_path)
    model_accuracies[rag_model] = accuracy

# calculate_accuracy yields a fraction (other cells of this notebook print
# values such as 0.98), so use the '%' presentation type, which multiplies by
# 100, rather than appending a literal '%' to the raw fraction.
for model, accuracy in model_accuracies.items():
    print(f"Model: {model}, Accuracy: {accuracy:.2%}")

# Rank from highest to lowest accuracy; ties keep their original list order.
sorted_models = sorted(model_accuracies.items(), key=lambda x: x[1], reverse=True)
print("\nSorted by accuracy:")
for model, accuracy in sorted_models:
    print(f"RAG Model: {model}, Accuracy: {accuracy:.2%}")

In [None]:
# RAG evaluation by 3-comb. for a single RAG_LLM
#
# One-off run: 'mistral' as the RAG model, judged by the three-model evaluator
# combination. The raw value returned by calculate_accuracy is printed as-is.

from rag_evaluation import evaluate_questions,calculate_accuracy

# Evaluator models consulted for every question.
# NOTE(review): the recorded output includes "Majority voting result" lines, so
# the verdicts are presumably combined by majority vote -- confirm in
# rag_evaluation.
judge_panel = ['mistral', 'gemma2:9b', 'qwen2.5:1.5b']

# Results file for this run.
output_file = "eval_results_rag_model_3comb/evaluation_results_gemma2:9b_mistral_qwen2.5:1.5b.json"

evaluate_questions(
    "rag_eval_ds.json",
    output_file=output_file,
    rag_model='mistral',
    eval_models=judge_panel,
)

# Score the file that was just written and show the result.
accuracy = calculate_accuracy(output_file)
print(accuracy)

[91mResponse from gemma2:9b: false[0m
[91mResponse from gemma2:9b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from gemma2:9b: false[0m
[91mResponse from mistral: false[0m
[91mResponse from mistral: false[0m
[91mResponse from gemma2:9b: false[0m
[91mMajority voting result: False[0m
[91mResponse from gemma2:9b: false[0m
[91mResponse from mistral: false[0m
[91mResponse from gemma2:9b: false[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mMajority voting result: False[0m
[91mResponse from qwen2.5:1.5b: false[0m
[91mResponse from mistral: false[0m
[91mResponse from gemma2:9b: false[0m
[91mResponse from gemma2:9b: false[0m


0.98
