In [None]:
# Install the packages listed in requirements.txt; the %pip magic (rather than
# !pip) targets the environment of the running kernel.
%pip install -r requirements.txt

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste an access token into the notebook itself: a saved or shared
# notebook would leak it. Take the token from the HF_TOKEN environment
# variable, and fall back to a hidden interactive prompt when it is not set.
# The token is passed straight to login() so it is not kept in a global name.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: "))

In [None]:
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex, set_global_tokenizer

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.base.llms.types import CompletionResponse
from llama_index.llms.ollama import Ollama as LlamaIndexOllama
from llama_index.core.retrievers import QueryFusionRetriever
from transformers import AutoTokenizer, BitsAndBytesConfig
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.llms import CustomLLM, LLMMetadata
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.evaluation import (
    EmbeddingQAFinetuneDataset,
    RetrieverEvaluator,
    generate_question_context_pairs
)
import nest_asyncio
import warnings
import argparse
import torch

# Blanket-suppress Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Patch asyncio via nest_asyncio so async code can be driven from inside the
# notebook's already-running event loop.
nest_asyncio.apply()



# Prompt scaffolding handed to the question-generation LLM below.
system_prompt = (
    "You are a Q&A assistant. "
    "Your goal is to answer questions as accurately as possible based on the instructions and context provided. "
    "Only use the context provided and STRICTLY say you dont know if you dont know."
)
query_wrapper_prompt = "<|USER|>{query_str}<|ASSISTANT|>"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# LLM used to write the evaluation questions (quantized Qwen 2.5 1.5B Instruct).
llm_quest = HuggingFaceLLM(
    model_name="Qwen/Qwen2.5-1.5B-Instruct",
    tokenizer_name="Qwen/Qwen2.5-1.5B-Instruct",
    device_map="auto",
    context_window=3900,
    max_new_tokens=256,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    model_kwargs=dict(quantization_config=bnb_config),
    generate_kwargs=dict(temperature=0.1),
)

# Register the same model's tokenizer as llama_index's global tokenizer.
set_global_tokenizer(AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct").encode)

# Embedding model; it is installed into Settings by the driver cell.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [None]:
def parse_args(argv=None):
    """Parse the run configuration from command-line style arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, exactly as before. Pass an
            explicit list (``[]`` for pure defaults) when calling from a
            notebook, where ``sys.argv`` typically holds the kernel's own
            launch arguments and argparse would exit on them as unrecognized.

    Returns:
        argparse.Namespace with ``dataset_dir``, ``chunk_size``, ``top_k``,
        ``model_name``, ``embedder_name``, ``dataset_name`` and
        ``chunk_questions``.
    """
    parser = argparse.ArgumentParser(description="Tune RAG Model MLflow")
    parser.add_argument('--dataset_dir', type=str, default="./data", help="Directory for the dataset")
    parser.add_argument('--chunk_size', type=int, default=512, help="Chunk size for splitting documents")
    parser.add_argument('--top_k', type=int, default=5, help="Top K similar nodes to retrieve")
    parser.add_argument('--model_name', type=str, default='llama3.2:1b', help="Model name")
    parser.add_argument('--embedder_name', type=str, default='nomic-embed-text:latest', help="Embedder name")
    parser.add_argument('--dataset_name', type=str, default='pg_eval_dataset_index.json', help="Dataset name")
    parser.add_argument('--chunk_questions', type=int, default=2, help="Number of questions per chunk")
    return parser.parse_args(argv)

async def tune_rag(args):
    """Generate a question/context evaluation dataset and save it as JSON.

    Loads every document under ``args.dataset_dir``, splits the documents
    into chunks, asks the module-level ``llm_quest`` model to write questions
    for each chunk, and saves the resulting dataset to ``args.dataset_name``.

    No vector index or retriever is built here: generating the dataset does
    not use one, and embedding every chunk for an unused index is wasted work.

    Args:
        args: Object exposing ``dataset_dir``, ``chunk_size``,
            ``chunk_questions`` and ``dataset_name`` attributes.

    Returns:
        None. The dataset is written to disk as a side effect.
    """
    documents = SimpleDirectoryReader(args.dataset_dir).load_data()
    splitter = SentenceSplitter(chunk_size=args.chunk_size, chunk_overlap=100)
    nodes = splitter.get_nodes_from_documents(documents)

    # Give each chunk a predictable, position-based id so the ids stored in
    # the dataset are reproducible across runs over the same documents.
    for idx, node in enumerate(nodes):
        node.id_ = f"node_{idx}"

    # Generate new dataset
    qa_dataset = generate_question_context_pairs(
        nodes,
        llm=llm_quest,
        num_questions_per_chunk=args.chunk_questions,
    )
    # Report a short summary rather than dumping the whole dataset object.
    print(f"Generated {len(qa_dataset.queries)} questions from {len(nodes)} chunks")

    # Save the newly generated dataset
    qa_dataset.save_json(args.dataset_name)

    print(f"Generated and saved new dataset to {args.dataset_name}")

In [None]:
# Manually define args in Colab
from types import SimpleNamespace

Settings.embed_model = embed_model
args = SimpleNamespace(
    dataset_dir="./data",
    chunk_size=512,
    top_k=5,
    model_name="Qwen/Qwen2.5-1.5B-Instruct",
    embedder_name="nomic-embed-text:latest",
    dataset_name="pg_eval_dataset_index_BERT.json",
    chunk_questions=2,

)
await tune_rag(args)