<a href="https://colab.research.google.com/github/KeerthuBalu/AI-project/blob/main/RAG%20experiment%20tracking%20using%20mlflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# One-time dependency setup with pinned versions. The command is left commented
# out; uncomment and run it on a fresh runtime where these packages are missing.
# !pip install chromadb==0.5.5 langchain-chroma==0.1.2 langchain==0.2.11 langchain-community==0.2.10 langchain-text-splitters==0.2.2 langchain-groq==0.1.6 transformers==4.43.2 sentence-transformers==3.0.1 unstructured==0.15.0 unstructured[pdf]==0.15.0


In [1]:

import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA

In [2]:
# Never hardcode credentials in a notebook: anything committed here is readable
# by everyone with access to the repository.
# NOTE: the key that was previously pasted into this cell should be treated as
# compromised and revoked.
# Reuse GROQ_API_KEY when the environment already provides it; otherwise prompt
# for it without echoing, so the secret is never saved in the notebook.
from getpass import getpass

if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")


In [3]:
# Demo knowledge base: each entry pairs a question with its reference
# (ground-truth) answer.
qa_pairs = [
    dict(
        question="What is MLflow?",
        answer="MLflow is an open-source platform for managing machine learning workflows.",
    ),
    dict(
        question="What is Apache Spark?",
        answer="Apache Spark is an open-source distributed computing system for big data processing.",
    ),
]


In [12]:
# Embedding model that is later handed to the vector store.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Wrap every answer (not the question) in a LangChain Document, labelled with
# its 1-based position in qa_pairs so the source can be reported later.
from langchain.schema import Document

documents = [
    Document(
        metadata={"source": f"QA Pair {position}"},
        page_content=entry["answer"],
    )
    for position, entry in enumerate(qa_pairs, start=1)
]

In [13]:
# Build the Chroma vector store from the documents, persisted on disk under
# `persist_directory`.
#
# Every document gets a deterministic id. Without explicit ids a fresh random
# id is generated per document, so re-running this cell against the same
# persisted directory stores the same documents again and retrieval starts
# returning duplicates. With stable ids a re-run overwrites the existing
# entries instead.
# NOTE: entries written by earlier runs (under random ids) remain in the
# directory; delete it once to start from a clean store.
persist_directory = "memory_db"
document_ids = [f"qa-pair-{position}" for position in range(1, len(documents) + 1)]
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    ids=document_ids,
    persist_directory=persist_directory,
)

In [14]:
# Groq-hosted chat model; temperature 0 to minimise sampling randomness.
# NOTE(review): confirm this model id is still served by Groq.
llm = ChatGroq(temperature=0, model="llama-3.1-70b-versatile")

# Expose the vector store through the retriever interface (default search settings).
retriever = vectordb.as_retriever()

# Retrieval QA chain of type "stuff". Source documents are returned with each
# answer so they can be reported alongside it.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    return_source_documents=True,
    chain_type="stuff",
)



In [15]:
# MLflow tracking
!pip install MLflow
import mlflow
mlflow.set_experiment("RAG_with_LangChain_and_Groq")

with mlflow.start_run(run_name="RAG Experiment"):
    mlflow.log_param("Embedding Model", "sentence-transformers/all-MiniLM-L6-v2")
    mlflow.log_param("LLM Model", "llama-3.1-70b-versatile")
    mlflow.log_param("Retriever Type", "ChromaDB")




In [9]:
# Evaluation set: every query comes with the answer it is expected to produce.
queries = [
    dict(
        query="What is MLflow?",
        ground_truth="MLflow is an open-source platform for managing machine learning workflows.",
    ),
    dict(
        query="What is Apache Spark?",
        ground_truth="Apache Spark is an open-source distributed computing system for big data processing.",
    ),
]

In [10]:
# Run every evaluation query through the QA chain and print the generated
# answer next to its ground truth, followed by the source label of the first
# retrieved document. The two texts are only printed side by side; no
# automatic comparison or scoring is performed here.
for idx, item in enumerate(queries):
    print(f"Query {idx+1}: {item['query']}")
    response = qa_chain.invoke({"query": item["query"]})

    # Pull the generated answer out of the chain output.
    llm_response = response["result"]
    ground_truth = item["ground_truth"]

    # Retrieval can come back empty; indexing [0] unconditionally would then
    # abort the whole loop with an IndexError, so report that case instead.
    source_documents = response["source_documents"]
    if source_documents:
        source = source_documents[0].metadata['source']
    else:
        source = "<no source documents retrieved>"

    print(f"LLM Response: {llm_response}")
    print(f"Ground Truth: {ground_truth}")
    print(f"Source: {source}")
    print("-" * 50)



Query 1: What is MLflow?




LLM Response: MLflow is an open-source platform for managing machine learning workflows.
Ground Truth: MLflow is an open-source platform for managing machine learning workflows.
Source: QA Pair 1
--------------------------------------------------
Query 2: What is Apache Spark?
LLM Response: Apache Spark is an open-source distributed computing system for big data processing.
Ground Truth: Apache Spark is an open-source distributed computing system for big data processing.
Source: QA Pair 2
--------------------------------------------------
