<a href="https://colab.research.google.com/github/HarshSonaiya/DL/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## RAG ARCHITECTURES

1. RAG Token   
2. RAG Sequence
3. RAG End to End
4. RAG with Dense Retriver
5. RAG with Hybrid Retriever
6. RAG with knowledge Base  
7. RAG with Multi-Hop Retriever
8. Iterative RAG
9. RAG for Multi-Modal Retriever
10. RAG with Dual-Encoder


## TYPES OF RETREIVERS

1. Sparse Retriever
2. Dense Retreiver
3. Neural Retreiver
4. Lexical Matching Retriver
5. Knowledge-Base Retriver
6. Filter Retriver
7. Embedded Retriver
8. Cross-Encoders
9. Multi-Hop Retriver
10. Memory Augmented Retrievers

In [91]:
! pip install langchain
! pip install -U langchain-community
! pip install sentence_transformers
! pip install chromadb



In [156]:
import os
import uuid
import getpass
import chromadb
from langchain_community.document_loaders import PyMuPDFLoader as pymupdf
from sentence_transformers import SentenceTransformer
from textwrap import wrap  # For chunking

In [149]:
# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')
client = chromadb.Client()

In [46]:
# os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

OpenAI API Key:··········


In [68]:
def extract_text_from_pdf():

    file1_elements = pymupdf("/content/LSTMs.pdf")
    file2_elements = pymupdf("/content/Weekly Internship Report 3.pdf")
    file3_elements = pymupdf("/content/Kolkata-The-Statesman-04-TH-SEPTEMBER-2024.pdf")

    text_content1 = file1_elements.load()
    text_content2 = file2_elements.load()
    text_content3 =file3_elements.load()

    return text_content1, text_content2, text_content3

In [111]:
def chunk_text(text):
    """Chunk the text into smaller segments."""
    return text.split('\n')


In [112]:
def process_text(documents):
    """Process the documents: chunk, encode, and store."""
    chunks = []
    for doc in documents:
        text = doc.page_content
        doc_chunks = chunk_text(text)
        chunks.extend(doc_chunks)

    embeddings = model.encode(chunks)

    # Convert embeddings to lists
    embeddings_as_lists = [embedding.tolist() for embedding in embeddings]

    return chunks, embeddings_as_lists

In [113]:
file1, file2, file3 = extract_text_from_pdf()

In [148]:
file1_chunks, file1_embeddings = process_text(file1)
file2_chunks, file2_embeddings = process_text(file2)
file3_chunks, file3_embeddings = process_text(file3)


In [157]:
# Create a collection for each PDF
def create_and_populate_collection(collection_name, chunks, embeddings):
    collection = client.create_collection(name=collection_name)
    # Generate unique IDs for each chunk
    ids = [str(uuid.uuid4()) for _ in range(len(chunks))]  # Import uuid module if not already done
    collection.add(documents=chunks, embeddings=embeddings, ids=ids)  # Use documents argument for chunks

In [159]:
# Example usage
def populate_collections(file1_chunks, file1_embeddings, file2_chunks, file2_embeddings, file3_chunks, file3_embeddings):
    create_and_populate_collection("file001_collection", file1_chunks, file1_embeddings)
    create_and_populate_collection("file002_collection", file2_chunks, file2_embeddings)
    create_and_populate_collection("file003_collection", file3_chunks, file3_embeddings)

populate_collections(file1_chunks, file1_embeddings, file2_chunks, file2_embeddings, file3_chunks, file3_embeddings)

## DENSE RETRIEVER

In [168]:
client.list_collections()

[Collection(id=77ad0f87-9610-4eef-9c14-2fea5e8ee494, name=file003_collection),
 Collection(id=880d27b5-0537-4e45-8498-5bdb897a33f6, name=file002_collection),
 Collection(id=e7963c68-1524-4c50-833d-d96dfc1544e7, name=file001_collection)]

In [176]:
collection_names=[]
for name in client.list_collections():
  collection_names.append(name.name)

print(collection_names)

['file003_collection', 'file002_collection', 'file001_collection']


In [177]:
collections = {name: client.get_collection(name=name) for name in collection_names}


In [178]:
print(collections)

{'file003_collection': Collection(id=77ad0f87-9610-4eef-9c14-2fea5e8ee494, name=file003_collection), 'file002_collection': Collection(id=880d27b5-0537-4e45-8498-5bdb897a33f6, name=file002_collection), 'file001_collection': Collection(id=e7963c68-1524-4c50-833d-d96dfc1544e7, name=file001_collection)}


In [211]:
def apply_dense_retriever(collection_name, query):
    collection = client.get_or_create_collection(collection_name)
    for doc in collection.get():
        # Check if 'documents' is a valid key in doc and if doc is a dictionary-like object
        if isinstance(doc, str) and 'documents' in doc:
            print(doc[3])
        else:
            print("Unexpected format for 'doc':", type(doc))
            print(doc) # Print the entire doc for inspection
    # ... rest of your code ...
    return 1

In [212]:
query = "What is the main topic of the document?"

# Apply Dense Retriever
print("\nDense Retriever Results:")
for name in collection_names:
    print(f"\nCollection: {name}")
    dense_results = apply_dense_retriever(name, query)




Dense Retriever Results:

Collection: file003_collection
Unexpected format for 'doc': <class 'str'>
ids
Unexpected format for 'doc': <class 'str'>
embeddings
Unexpected format for 'doc': <class 'str'>
metadatas
u
Unexpected format for 'doc': <class 'str'>
uris
Unexpected format for 'doc': <class 'str'>
data
Unexpected format for 'doc': <class 'str'>
included

Collection: file002_collection
Unexpected format for 'doc': <class 'str'>
ids
Unexpected format for 'doc': <class 'str'>
embeddings
Unexpected format for 'doc': <class 'str'>
metadatas
u
Unexpected format for 'doc': <class 'str'>
uris
Unexpected format for 'doc': <class 'str'>
data
Unexpected format for 'doc': <class 'str'>
included

Collection: file001_collection
Unexpected format for 'doc': <class 'str'>
ids
Unexpected format for 'doc': <class 'str'>
embeddings
Unexpected format for 'doc': <class 'str'>
metadatas
u
Unexpected format for 'doc': <class 'str'>
uris
Unexpected format for 'doc': <class 'str'>
data
Unexpected format 