In [1]:
!pip install llama-index-embeddings-huggingface \
               llama-index-vector-stores-chroma \
               groq \
               llama-index-llms-groq \
               datasets \
               chromadb==0.5.17 \
               peft==0.10.0 \
               transformers==4.41.0 \
               llama-index-readers-file \
               xformers

import os
import json
import torch
import uuid
import time
import random
import asyncio
import logging
import pandas as pd
from typing import List, Dict
from dataclasses import dataclass

# Core components from Llama Index
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    ServiceContext,
    Settings,
    StorageContext,
)
from llama_index.core.evaluation import (
    RetrieverEvaluator,
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    BatchEvalRunner,
    CorrectnessEvaluator,
)
from llama_index.core.node_parser import SentenceSplitter, SimpleNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

from datasets import load_dataset
from groq import Groq as GroqClient
from llama_index.llms.groq import Groq
import chromadb
from google.colab import userdata

# Drop log records below ERROR from this llama_index logger to keep cell output readable.
logging.getLogger("llama_index.llms.openai.utils").setLevel(logging.ERROR)

# Patch asyncio so the `asyncio.run(...)` call in the driver cell can be used
# from a notebook -- presumably because the kernel already runs an event loop.
import nest_asyncio
nest_asyncio.apply()

Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.4.0-py3-none-any.whl.metadata (767 bytes)
Collecting llama-index-vector-stores-chroma
  Downloading llama_index_vector_stores_chroma-0.4.1-py3-none-any.whl.metadata (696 bytes)
Collecting groq
  Downloading groq-0.13.1-py3-none-any.whl.metadata (14 kB)
Collecting llama-index-llms-groq
  Downloading llama_index_llms_groq-0.3.1-py3-none-any.whl.metadata (2.3 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting chromadb==0.5.17
  Downloading chromadb-0.5.17-py3-none-any.whl.metadata (6.8 kB)
Collecting peft==0.10.0
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers==4.41.0
  Downloading transformers-4.41.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-index-readers-file
  Downloading llam

In [3]:
@dataclass
class QADataset:
    """Generated question set used to evaluate retrieval and responses.

    Built by `RAGExperimentPipeline.generate_questions` and handed to
    `RetrieverEvaluator.aevaluate_dataset` in `evaluate_retriever`.
    """
    # question id (uuid4 string) -> question text
    queries: Dict[str, str]
    # chunk id -> chunk text, for the chunks the questions were generated from
    corpus: Dict[str, str]
    # question id -> ids of the chunk(s) the question was generated from
    relevant_docs: Dict[str, List[str]]
    # presumably the retrieval mode read by the evaluator -- TODO confirm
    mode: str = "text"

@dataclass
class ExperimentConfig:
    """One combination of settings evaluated by the experiment loop."""
    # model name passed to HuggingFaceEmbedding(model_name=...)
    embedding_model: str
    # model name passed to the Groq LLM wrapper
    llm_model: str
    # chunk size given to SentenceSplitter; the overlap is derived from it
    chunk_size: int
    # article tag the corpus was built from; also part of the Chroma persist directory name
    dataset_name: str

class RAGExperimentPipeline:
    """End-to-end RAG experiment: build a corpus, index it, generate questions, evaluate.

    Typical call order for one configuration:
    `load_dataset` -> `setup_vector_store` -> `generate_questions` -> `evaluate`.
    `setup_vector_store` stores the index, persist directory and collection name
    on the instance; the later steps read them from there.
    """

    def __init__(self, groq_api_key: str = None, data_dir: str = "./dane"):
        """Create the Groq client, the experiment grid and the output directories.

        Args:
            groq_api_key: Key handed to the Groq client and, later, to the Groq
                LLM wrapper.
            data_dir: Directory the article corpus is written to and read from.
        """
        self.groq_api_key = groq_api_key
        self.data_dir = data_dir
        self.groq_client = GroqClient(api_key=groq_api_key)
        self.persist_dir = "./chroma_db"
        self.collection_name = "collection"

        # Experiment configurations
        self.embedding_models = {
            "minilm": "sentence-transformers/all-MiniLM-L6-v2",
            "mpnet-base": "sentence-transformers/all-mpnet-base-v2",
            "stella": "dunzhang/stella_en_400M_v5",
            "jina_v3": "jinaai/jina-embeddings-v3",
        }
        self.llm_models = ["llama3-8b-8192"]
        self.chunk_sizes = [128, 256, 512, 1024]

        os.makedirs("Result", exist_ok=True)
        os.makedirs(data_dir, exist_ok=True)

    def load_dataset(self, tag_of_interest, num_samples=10):
        """Write up to `num_samples` tagged articles to `<data_dir>/articles.txt`.

        The file is overwritten on every call. Inside this method the name
        `load_dataset` refers to the module-level function imported from
        `datasets`, not to this method.

        Args:
            tag_of_interest: Tag an article must carry to be included.
            num_samples: Maximum number of articles to write.

        Returns:
            Number of articles actually written (may be lower than `num_samples`).
        """
        dataset = load_dataset("fabiochiu/medium-articles")

        num_of_articles = 0

        with open(f"{self.data_dir}/articles.txt", "w", encoding="utf-8") as f:
            for article in dataset["train"]:
                # Checked first so that num_samples <= 0 writes nothing at all.
                if num_of_articles >= num_samples:
                    break
                if not article["text"]:
                    continue
                # NOTE(review): `in` is a membership test if `tags` is a list but a
                # substring test if it is a string -- confirm the column type.
                if tag_of_interest in article["tags"]:
                    f.write(article["text"] + "\n")
                    num_of_articles += 1

        return num_of_articles

    def setup_vector_store(self, config: "ExperimentConfig"):
        """Embed the documents in `data_dir` into a fresh Chroma collection.

        Sets `self.persist_dir`, `self.collection_name` and `self.index`.

        Args:
            config: Experiment settings; `embedding_model` must be one of the
                values in `self.embedding_models`.

        Returns:
            The `VectorStoreIndex` built for this configuration.

        Raises:
            ValueError: If `config.embedding_model` is not a known model.
        """
        start_time = time.time()

        self.persist_dir = f"./chroma_db_{config.dataset_name}"
        # Reverse lookup: the short alias keeps the collection name compact.
        embd = next(
            (alias for alias, model in self.embedding_models.items() if model == config.embedding_model),
            None,
        )
        if embd is None:
            raise ValueError(
                f"Unknown embedding model {config.embedding_model!r}; "
                f"expected one of {sorted(self.embedding_models.values())}"
            )
        self.collection_name = f"collection_{embd}_{config.llm_model}_{config.chunk_size}"

        documents = SimpleDirectoryReader(self.data_dir).load_data()
        db = chromadb.PersistentClient(path=self.persist_dir)

        # The store is persistent, so a re-run would otherwise append a second copy
        # of every chunk to the existing collection. Drop any leftover collection
        # first. list_collections() yields Collection objects or plain names
        # depending on the chromadb version, hence the getattr fallback.
        existing_names = {getattr(entry, "name", entry) for entry in db.list_collections()}
        if self.collection_name in existing_names:
            db.delete_collection(self.collection_name)
        chroma_collection = db.get_or_create_collection(self.collection_name)

        embed_model = HuggingFaceEmbedding(
            model_name=config.embedding_model,
            trust_remote_code=True,
            device="cuda" if torch.cuda.is_available() else "cpu",
        )
        # Overlap is a quarter of the chunk size.
        splitter = SentenceSplitter(chunk_size=config.chunk_size, chunk_overlap=config.chunk_size // 4)

        self.index = VectorStoreIndex.from_documents(
            documents,
            storage_context=StorageContext.from_defaults(
                vector_store=ChromaVectorStore(chroma_collection=chroma_collection)
            ),
            embed_model=embed_model,
            transformations=[splitter]
        )

        elapsed_time = time.time() - start_time
        print(f"Vector store for {config.embedding_model} creation completed in {elapsed_time:.2f} seconds.")

        return self.index

    def generate_questions(self, llm_model="llama3-8b-8192", n_of_chunks: int = 10, n_questions_per_chunk: int = 2, seed: int = 42) -> "QADataset":
        """Generate questions for randomly selected chunks.

        Reads every chunk of the current collection, samples up to `n_of_chunks`
        of them and asks the Groq chat model for questions about each one.

        Args:
            llm_model: Groq model name used for question generation.
            n_of_chunks: Number of chunks to sample; capped at the number of
                chunks available in the collection.
            n_questions_per_chunk: Maximum number of questions kept per chunk.
            seed: Seed for the chunk sampling.

        Returns:
            A `QADataset` linking each generated question to its source chunk.
        """
        prompt_template = f"""Generate {n_questions_per_chunk} distinct questions from the following text.
        Requirements:
        1. Each question should focus on a different aspect of the text.
        2. Questions must be specific to the given content.
        3. Make questions concise and direct.
        4. Avoid yes/no questions.

        Format:
        [Question]

        [Question]"""

        db = chromadb.PersistentClient(path=self.persist_dir)
        collection = db.get_collection(self.collection_name)
        result = collection.get()
        chunks = list(zip(result['ids'], result['documents']))
        print(f"Total num of chunks: {len(chunks)}")

        # A private generator gives the same sample as seeding the global module
        # but leaves the global random state untouched. Capping the sample size
        # avoids a ValueError when the collection is smaller than requested.
        sampler = random.Random(seed)
        selected_chunks = sampler.sample(chunks, min(n_of_chunks, len(chunks)))

        queries = {}
        corpus = {}
        relevant_docs = {}

        for chunk_idx, (chunk_id, chunk_text) in enumerate(selected_chunks):
            print(f"Processing chunk {chunk_idx + 1}/{len(selected_chunks)}...")
            corpus[chunk_id] = chunk_text

            response = self.groq_client.chat.completions.create(
                messages=[
                    {"role": "user", "content": prompt_template},
                    {"role": "user", "content": chunk_text}
                ],
                model=llm_model
            )

            response_text = response.choices[0].message.content or ""
            # Keep non-empty lines and drop a "Here are ..." preamble. The check runs
            # on the stripped line so an indented preamble is dropped as well.
            stripped_lines = (line.strip() for line in response_text.splitlines())
            questions = [
                text for text in stripped_lines
                if text and not text.lower().startswith("here are")
            ]
            for question in questions[:n_questions_per_chunk]:
                question_id = str(uuid.uuid4())
                queries[question_id] = question
                relevant_docs[question_id] = [chunk_id]

        return QADataset(queries=queries, corpus=corpus, relevant_docs=relevant_docs)

    async def evaluate_response(self, qa_dataset: "QADataset"):
        """Score generated answers for faithfulness and relevancy.

        Runs every question text through a query engine (top 3 retrieved chunks)
        and evaluates the answers with one worker. No LLM is passed explicitly;
        `evaluate` sets `Settings.llm` beforehand -- presumably that is the model
        used for answering and judging.

        Returns:
            The result of `BatchEvalRunner.aevaluate_queries`; `display_results`
            reads it by the keys "faithfulness" and "relevancy".
        """
        query_engine = self.index.as_query_engine(similarity_top_k=3)

        faithfulness_evaluator = FaithfulnessEvaluator()
        relevancy_evaluator = RelevancyEvaluator()

        runner = BatchEvalRunner(
            {"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
            workers=1,
        )

        eval_results = await runner.aevaluate_queries(query_engine, queries=list(qa_dataset.queries.values()))

        return eval_results

    async def evaluate_retriever(self, qa_dataset: "QADataset"):
        """Evaluate the retriever (top 3) on `qa_dataset` with MRR and hit rate.

        Prints and returns the per-query results of
        `RetrieverEvaluator.aevaluate_dataset`.
        """
        retriever = self.index.as_retriever(similarity_top_k=3)
        evaluator = RetrieverEvaluator.from_metric_names(
            ["mrr", "hit_rate"],
            retriever=retriever
        )

        eval_results = await evaluator.aevaluate_dataset(qa_dataset)
        print(eval_results)
        return eval_results

    async def evaluate(self, qa_dataset: "QADataset", config: "ExperimentConfig"):
        """Run retriever and response evaluation for one configuration.

        Side effect: replaces the global `Settings.llm` with a Groq LLM for
        `config.llm_model`.

        Returns:
            Tuple `(retriever_results, response_results)`.
        """
        Settings.llm = Groq(model=config.llm_model, api_key=self.groq_api_key)
        retriever_results = await self.evaluate_retriever(qa_dataset)
        response_results = await self.evaluate_response(qa_dataset)

        return retriever_results, response_results

def display_results(config: "ExperimentConfig", retriever_results, response_results):
    """Summarise one experiment run as a long-format table of four metrics.

    Args:
        config: Settings of the run; copied into every row.
        retriever_results: Iterable of objects exposing `metric_vals_dict`
            with the keys "hit_rate" and "mrr".
        response_results: Mapping with the keys "faithfulness" and "relevancy",
            each holding a list of objects exposing `passing`.

    Returns:
        DataFrame with one row per metric ("Hit Rate", "MRR", "Faithfulness",
        "Relevancy") and the columns Metric, Score, dataset, embedding_model,
        chunk_size and llm_model. A metric without any underlying results is
        reported as NaN instead of raising.
    """
    nan = float("nan")

    def _mean_metric(frame, name):
        # An empty result list produces a frame without any metric columns.
        if frame.empty or name not in frame.columns:
            return nan
        return frame[name].mean()

    def _pass_rate(evaluations):
        evaluations = list(evaluations or [])
        if not evaluations:
            return nan
        # `passing` may be None when an evaluator gave no verdict; count it as a failure.
        return sum(bool(item.passing) for item in evaluations) / len(evaluations)

    full_df = pd.DataFrame([result.metric_vals_dict for result in retriever_results])

    hit_rate = _mean_metric(full_df, "hit_rate")
    mrr = _mean_metric(full_df, "mrr")
    faithfulness_score = _pass_rate(response_results['faithfulness'])
    relevancy_score = _pass_rate(response_results['relevancy'])

    results_df = pd.DataFrame({
            "Metric": ["Hit Rate", "MRR", "Faithfulness", "Relevancy"],
            "Score": [hit_rate, mrr, faithfulness_score, relevancy_score],
            "dataset": config.dataset_name,
            "embedding_model": config.embedding_model,
            "chunk_size": config.chunk_size,
            "llm_model": config.llm_model,
        })

    return results_df

async def run_experiments():
    """Run the full experiment grid and persist the combined results.

    For every article tag the corpus file is rebuilt, then every combination
    of embedding model, chunk size and LLM is indexed, questioned and
    evaluated. The per-configuration tables are concatenated, written to
    "Result/experiment_results.csv" and returned.

    Returns:
        DataFrame with four metric rows per configuration and a unique,
        consecutive row index.
    """
    pipeline = RAGExperimentPipeline(groq_api_key=userdata.get("GROQ_API_KEY3"))
    results = []

    for tag_name in ["Psychology", "Google", "Books"]:
        print(f"Processing tag: {tag_name}")
        # Called for its side effect only: it rewrites the corpus file for this tag.
        pipeline.load_dataset(tag_name, num_samples=70)

        for embed_model in pipeline.embedding_models.values():
            for chunk_size in pipeline.chunk_sizes:
                for llm_model in pipeline.llm_models:
                    config = ExperimentConfig(
                        embedding_model=embed_model,
                        llm_model=llm_model,
                        chunk_size=chunk_size,
                        dataset_name=tag_name
                    )

                    print(f"Running configuration: {config}")
                    # The index is kept on the pipeline; the return value is not needed here.
                    pipeline.setup_vector_store(config)
                    qa_dataset = pipeline.generate_questions(llm_model=config.llm_model, n_of_chunks=10)
                    retriever_results, response_results = await pipeline.evaluate(qa_dataset, config)

                    results.append(
                        display_results(config, retriever_results, response_results)
                    )

    # Each per-configuration table is indexed 0..3; without ignore_index the
    # combined frame (and the CSV's index column) would repeat those labels.
    final_results = pd.concat(results, ignore_index=True)
    final_results.to_csv("Result/experiment_results.csv")
    return final_results

if __name__ == "__main__":
    # Drive the whole experiment grid, then show one row per configuration
    # with the four metrics side by side.
    results = asyncio.run(run_experiments())

    summary = results.pivot_table(
        index=["dataset", "embedding_model", "chunk_size", "llm_model"],
        columns=["Metric"],
        values=["Score"],
    )
    print("\nExperiment Summary:", summary, sep="\n")


