In [None]:
from ollama import Client
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from dotenv import load_dotenv
from datasets import load_dataset
import os
from ragas import evaluate
from ragas.run_config import RunConfig
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

In [None]:
# Load environment variables from the project-local .env file before any cell
# reads os.environ (download_models looks up OLLAMA_BASE_URL there).
load_dotenv("./.env")

In [None]:
# Fetch the "english_v3" configuration of the Amnesty QA dataset.
# NOTE(review): trust_remote_code=True permits code shipped with the dataset
# repository to run locally -- confirm the source is trusted.
dataset = load_dataset(
    path="explodinggradients/amnesty_qa",
    name="english_v3",
    trust_remote_code=True,
)

# Evaluate on a tiny slice of the "eval" split: the rows at index 1 and 2.
eval_dataset = dataset["eval"].select(range(1, 3))

In [None]:
def download_models(models):
    """Make sure every requested model is available on the Ollama server.

    Connects to the server at ``os.environ["OLLAMA_BASE_URL"]``, asks it once
    for the models it already has, and pulls each requested model that is
    missing from that listing.

    Args:
        models: Iterable of Ollama model tags (e.g. ``"gemma2:2b"``).

    Raises:
        KeyError: If ``models`` is non-empty and ``OLLAMA_BASE_URL`` is not
            set in the environment.
    """
    models = list(models)
    if not models:
        # Nothing to do; avoid touching the environment or the network.
        return

    # One client and one listing for the whole run instead of one per model.
    client = Client(host=os.environ["OLLAMA_BASE_URL"])
    available = {entry["model"] for entry in client.list()["models"]}

    for model in models:
        if model in available:
            print(f"{model} is already downloaded.")
            continue
        print(f"Downloading model: {model}")
        client.pull(model=model)
        # Remember the pull so a repeated name is not downloaded twice.
        available.add(model)


def evaluate_models(models, out_dir):
    """Score each Ollama model with the Ragas metrics and save the results.

    For every name in ``models`` the same Ollama model is used both as the
    LLM and as the embedding model passed to ``ragas.evaluate``. The
    evaluation runs on the module-level ``eval_dataset`` with the
    context-precision, faithfulness, answer-relevancy and context-recall
    metrics. Each result is converted with ``to_pandas()`` and written to
    ``{out_dir}/{model_name with ':' replaced by '_'}.json``.

    The Ollama endpoint is taken from the ``OLLAMA_BASE_URL`` environment
    variable when it is set, so evaluation targets the same server that
    ``download_models`` pulls to; otherwise the library default is used.

    Args:
        models: Iterable of Ollama model tags (e.g. ``"qwen2.5:0.5b"``).
        out_dir: Directory for the per-model JSON files; created if missing.

    Returns:
        dict: Maps each model name to the object returned by
        ``result.to_pandas()`` for that model.
    """
    # to_json does not create parent directories, so do it up front.
    os.makedirs(out_dir, exist_ok=True)
    # None falls back to the langchain_ollama default endpoint.
    base_url = os.environ.get("OLLAMA_BASE_URL")

    results = {}
    for model_name in models:
        print(f"Evaluating model: {model_name}")
        model = OllamaLLM(
            model=model_name,
            base_url=base_url,
            verbose=False,
            timeout=600,
            num_ctx=4096,
            disable_streaming=False,
        )
        embeddings = OllamaEmbeddings(model=model_name, base_url=base_url)
        result = evaluate(
            eval_dataset,
            metrics=[
                context_precision,
                faithfulness,
                answer_relevancy,
                context_recall,
            ],
            llm=model,
            embeddings=embeddings,
            # Generous limits: small local models can be slow to respond.
            run_config=RunConfig(timeout=2000, max_retries=20, max_wait=120),
        )
        result_df = result.to_pandas()
        # ':' is not a valid filename character on every platform. Computing
        # the name outside the f-string also avoids nested same-type quotes,
        # which are a SyntaxError before Python 3.12.
        safe_name = model_name.replace(":", "_")
        result_df.to_json(f"{out_dir}/{safe_name}.json")
        results[model_name] = result_df
    return results

In [None]:
# Ollama model tags to benchmark; each is pulled if missing and then evaluated.
models = [
    "gemma2:2b",
    "llama3.2:1b",
    "phi3:3.8b",
    "qwen2.5:0.5b",
    "qwen2.5:1.5b",
]

# Step 1: make sure every model is present on the Ollama server.
print("### PREPARING MODELS...")
download_models(models=models)

# Step 2: run the Ragas evaluation; per-model JSON files go to ./results/ragas.
print("### EVALUATING MODELS...")
evaluation = evaluate_models(models=models, out_dir="./results/ragas")