<a href="https://colab.research.google.com/github/HRI328/AXA-CASE-1/blob/main/AI_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.3.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?

In [None]:
import chromadb
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
class FreeRAGSystem:
    """Small retrieval-augmented generation (RAG) helper built on free models.

    Documents are embedded with a SentenceTransformer model and stored in a
    ChromaDB collection named "rag_collection"; answers are produced by a
    Hugging Face "text-generation" pipeline.
    """

    def __init__(self, embedding_model="all-MiniLM-L6-v2", llm_model="microsoft/DialoGPT-medium"):
        """Load the embedding model, the generation pipeline and the vector store.

        Args:
            embedding_model: Model name handed to ``SentenceTransformer``.
            llm_model: Model name handed to the text-generation pipeline
                (used for both the model and its tokenizer).

        Raises:
            ValueError: If ``llm_model`` contains "openai" (case-insensitive).
        """
        # Validate first so a rejected name fails before any model is loaded.
        if "openai" in llm_model.lower():
            raise ValueError("Please use a free model like microsoft/DialoGPT-medium, facebook/blenderbot-400M-distill, or similar")

        # Initialize embedding model
        self.embedding_model = SentenceTransformer(embedding_model)

        # Initialize text generation pipeline: half precision on GPU 0 when
        # CUDA is available, full precision on CPU otherwise.
        self.llm_model = llm_model
        use_cuda = torch.cuda.is_available()
        self.generator = pipeline(
            "text-generation",
            model=llm_model,
            tokenizer=llm_model,
            torch_dtype=torch.float16 if use_cuda else torch.float32,
            device=0 if use_cuda else -1
        )

        # Initialize vector database; get_or_create avoids failing when the
        # collection already exists (e.g. the cell is run a second time).
        self.chroma_client = chromadb.Client()
        self.collection = self.chroma_client.create_collection(name="rag_collection", get_or_create=True)

    def add_documents(self, documents, metadata=None):
        """Embed ``documents`` and append them to the vector database.

        Args:
            documents: List of strings to store.
            metadata: Optional per-document metadata, passed to the collection
                unchanged as ``metadatas``.

        IDs continue from the collection's current size ("id_0", "id_1", ...),
        so successive calls never reuse an ID that is already stored. Calling
        this twice with the same documents therefore stores them twice.
        """
        if len(documents) == 0:
            # Nothing to embed or store; still report the current size.
            print(f"There are {self.collection.count()} records in the vector database")
            return

        embeddings = self.embedding_model.encode(documents).tolist()

        # Offset the IDs by what is already stored so a second batch does not
        # collide with the first one.
        first_index = self.collection.count()
        self.collection.add(
            embeddings=embeddings,
            documents=documents,
            metadatas=metadata,
            ids=[f"id_{first_index + offset}" for offset in range(len(documents))]
        )
        print(f"There are {self.collection.count()} records in the vector database")

    def retrieve(self, query, n_results=3):
        """Return the stored documents most similar to ``query``.

        Args:
            query: Question or search text.
            n_results: Maximum number of documents to return.

        Returns:
            A list of document strings; empty when nothing has been stored.
        """
        stored = self.collection.count()
        if stored == 0:
            return []

        query_embedding = self.embedding_model.encode([query]).tolist()

        results = self.collection.query(
            query_embeddings=query_embedding,
            # Never ask for more neighbours than the collection holds.
            n_results=min(n_results, stored)
        )
        # One query was sent, so the first (only) result list is the answer.
        return results['documents'][0]

    def generate(self, query, retrieved_docs, max_new_tokens=200):
        """Generate an answer to ``query`` grounded in ``retrieved_docs``.

        Args:
            query: The user's question.
            retrieved_docs: Iterable of context strings, joined with newlines.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The generated continuation with surrounding whitespace removed, or
            a string starting with "Error generating response:" if the
            pipeline raised.
        """
        context = "\n".join(retrieved_docs)

        prompt = (
            "Based on the following information:\n\n"
            f"{context}\n\n"
            f"Question: {query}\n\n"
            "Please provide a helpful answer:"
        )

        try:
            response = self.generator(
                prompt,
                max_new_tokens=max_new_tokens,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.generator.tokenizer.eos_token_id,
                truncation=True,  # Explicitly activate truncation for the input prompt
                # Ask for the continuation only. Stripping the prompt with
                # str.replace is unreliable: once the prompt is truncated the
                # full prompt text no longer appears in the output.
                return_full_text=False
            )

            return response[0]['generated_text'].strip()
        except Exception as e:
            # Best-effort: surface the failure as text instead of raising.
            return f"Error generating response: {str(e)}"


In [None]:
# Alternative implementation with more model options
class AdvancedFreeRAG:
    """RAG helper that drives the tokenizer and causal LM directly.

    Unlike a pipeline-based setup, this class calls ``model.generate`` itself,
    which gives direct control over tokenization and decoding. Documents are
    embedded with a SentenceTransformer model and stored in a ChromaDB
    collection named "advanced_rag".
    """

    def __init__(self, embedding_model="all-MiniLM-L6-v2", llm_model="microsoft/DialoGPT-medium"):
        """Load the embedding model, the vector store, the tokenizer and the LM.

        Args:
            embedding_model: Model name handed to ``SentenceTransformer``.
            llm_model: Model name loaded with ``AutoTokenizer`` and
                ``AutoModelForCausalLM``.
        """
        self.embedding_model = SentenceTransformer(embedding_model)

        # Initialize ChromaDB; get_or_create avoids failing when the
        # collection already exists (e.g. the cell is run a second time).
        self.chroma_client = chromadb.Client()
        self.collection = self.chroma_client.create_collection(name="advanced_rag", get_or_create=True)

        # Model configuration
        self.llm_model_name = llm_model
        self.tokenizer = AutoTokenizer.from_pretrained(llm_model)
        self.model = AutoModelForCausalLM.from_pretrained(llm_model)

        # Add padding token if it doesn't exist
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def add_documents(self, documents, metadata=None):
        """Embed ``documents`` and append them to the vector database.

        Args:
            documents: List of strings to store.
            metadata: Optional per-document metadata, passed to the collection
                unchanged as ``metadatas``.

        IDs continue from the collection's current size ("doc_0", "doc_1",
        ...), so successive calls never reuse an ID that is already stored.
        Calling this twice with the same documents therefore stores them twice.
        """
        if len(documents) == 0:
            # Nothing to embed or store; still report the current size.
            print(f"There are {self.collection.count()} records in the vector database")
            return

        embeddings = self.embedding_model.encode(documents).tolist()

        # Offset the IDs by what is already stored so a second batch does not
        # collide with the first one.
        first_index = self.collection.count()
        self.collection.add(
            embeddings=embeddings,
            documents=documents,
            metadatas=metadata,
            ids=[f"doc_{first_index + offset}" for offset in range(len(documents))]
        )
        print(f"There are {self.collection.count()} records in the vector database")

    def retrieve(self, query, n_results=3):
        """Return the stored documents most similar to ``query``.

        Args:
            query: Question or search text.
            n_results: Maximum number of documents to return.

        Returns:
            A list of document strings; empty when nothing has been stored.
        """
        stored = self.collection.count()
        if stored == 0:
            return []

        query_embedding = self.embedding_model.encode([query]).tolist()

        results = self.collection.query(
            query_embeddings=query_embedding,
            # Never ask for more neighbours than the collection holds.
            n_results=min(n_results, stored)
        )
        # One query was sent, so the first (only) result list is the answer.
        return results['documents'][0]

    def generate(self, query, retrieved_docs, max_new_tokens=150):
        """Generate an answer to ``query`` grounded in ``retrieved_docs``.

        Args:
            query: The user's question.
            retrieved_docs: Iterable of context strings, joined with newlines.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The decoded newly generated tokens (prompt excluded), stripped of
            surrounding whitespace.
        """
        context = "\n".join(retrieved_docs)

        prompt = (
            f"Context: {context}\n\n"
            f"Question: {query}\n\n"
            "Answer:"
        )

        # Tokenize with truncation, leaving room for the tokens that will be
        # generated so prompt + answer stay within the tokenizer's limit.
        prompt_budget = max(1, self.tokenizer.model_max_length - max_new_tokens)
        encoded = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=prompt_budget
        )
        input_ids = encoded["input_ids"]
        prompt_length = input_ids.shape[-1]

        # Generate response
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=input_ids,
                # Pad and EOS share an id here, so pass the mask explicitly
                # rather than letting it be inferred from the pad token.
                attention_mask=encoded["attention_mask"],
                max_new_tokens=max_new_tokens,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                repetition_penalty=1.1
            )

        # The returned sequence starts with the prompt tokens; decode only what
        # follows. Removing the prompt by string replacement is fragile because
        # decoding a (possibly truncated) prompt need not reproduce its text.
        new_tokens = outputs[0][prompt_length:]
        answer = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        return answer


In [None]:
# Example usage with different free models
if __name__ == "__main__":
    # Option 1: Basic RAG with DialoGPT (pipeline-based class)
    print("=== Basic RAG with DialoGPT ===")
    rag_basic = FreeRAGSystem(
        embedding_model="all-MiniLM-L6-v2",
        llm_model="microsoft/DialoGPT-medium"
    )

    # Small demo corpus shared by both examples
    documents = [
        "Python is a high-level programming language known for its simplicity and readability.",
        "Vector databases store data as mathematical vectors for efficient similarity search.",
        "RAG (Retrieval-Augmented Generation) combines information retrieval with language models.",
        "Open source models like those from Hugging Face provide free access to AI technology.",
        "Sentence transformers are used to convert text into numerical vector representations.",
        "RAG is AI combines a large language model's (LLM) ability to generate text with an information retrieval system",
        "RAG pulls in relevant, external information before generating a response"
    ]

    rag_basic.add_documents(documents)

    # Test query
    query = "What is RAG?"
    retrieved_docs = rag_basic.retrieve(query)
    answer = rag_basic.generate(query, retrieved_docs)

    print("Question:", query)
    print("Retrieved documents:", retrieved_docs)
    print("Generated answer:", answer)
    print()

    # Option 2: Advanced RAG with different model
    print("=== Advanced RAG with Different Model ===")
    query_adv = "What are vector databases?"
    try:
        rag_advanced = AdvancedFreeRAG(
            embedding_model="all-MiniLM-L6-v2",
            # Larger checkpoint than the DialoGPT-medium used in Option 1,
            # so expect a bigger download and slower inference.
            llm_model="microsoft/DialoGPT-large"
        )

        rag_advanced.add_documents(documents)
        retrieved_docs_adv = rag_advanced.retrieve(query_adv)
        answer_adv = rag_advanced.generate(query_adv, retrieved_docs_adv)

        print("Question:", query_adv)
        print("Retrieved documents:", retrieved_docs_adv)
        print("Generated answer:", answer_adv)
    except Exception as e:
        # Best-effort demo: any failure in loading, indexing, retrieval or
        # generation lands here, so the note does not blame the download alone.
        print(f"Note: Advanced RAG example failed, but you can use other models. Error: {e}")

# Free models you can use with the RAG classes in this notebook.

# Sentence-embedding models
FREE_EMBEDDING_MODELS = [
    # Good balance of speed and quality
    "all-MiniLM-L6-v2",
    # Higher quality, slower
    "all-mpnet-base-v2",
    # Fastest
    "paraphrase-MiniLM-L3-v2",
    # Optimized for QA
    "multi-qa-MiniLM-L6-cos-v1",
]

# Generation models
# NOTE(review): the Blenderbot checkpoints are encoder-decoder models —
# confirm they work with the causal-LM / text-generation loading used in
# this notebook before relying on them.
FREE_LLM_MODELS = [
    "microsoft/DialoGPT-small",
    "microsoft/DialoGPT-medium",
    "microsoft/DialoGPT-large",
    "facebook/blenderbot-400M-distill",
    "facebook/blenderbot-1B-distill",
    # Small but fast
    "gpt2",
    # Even smaller
    "distilgpt2",
]

=== Basic RAG with DialoGPT ===


Device set to use cuda:0


There are 7 records in the vector database
Question: What is RAG?
Retrieved documents: ['RAG pulls in relevant, external information before generating a response', 'RAG (Retrieval-Augmented Generation) combines information retrieval with language models.', "RAG is AI combines a large language model's (LLM) ability to generate text with an information retrieval system"]
Generated answer: D

=== Advanced RAG with Different Model ===
There are 7 records in the vector database
Question: What are vector databases?
Retrieved documents: ['Vector databases store data as mathematical vectors for efficient similarity search.', 'Sentence transformers are used to convert text into numerical vector representations.', 'Open source models like those from Hugging Face provide free access to AI technology.']
Generated answer: 
