<a href="https://colab.research.google.com/github/Hamza-Chekireb/llama11b-vision-rag-api/blob/main/API_of_MVP_Retrieval_Augmented_Generation_with_ColPali_and_Llama_3_2_11b_v11062024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 0. Requirements and Imports

In [None]:
!pip install pymilvus

In [None]:
!pip install colpali_engine

In [None]:
import torch
from PIL import Image
from colpali_engine.models import ColPali, ColPaliProcessor
# from colpali_engine.utils import process_images, process_queries

In [None]:
# !pip install colpali_engine

In [None]:
from tqdm import tqdm
from PIL import Image
import os

In [None]:
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, TextStreamer
from IPython.display import Markdown,display
import os

In [None]:
# Colpali Model
from colpali_engine.models import ColPali

# Colpali queries and images preprocessing
from colpali_engine.models.paligemma.colpali.processing_colpali import ColPaliProcessor

# Retriever Processor
from colpali_engine.utils.processing_utils import BaseVisualRetrieverProcessor

# Accelerate calculations
from colpali_engine.utils.torch_utils import ListDataset, get_torch_device

# Pytorch Data Loader Object
from torch.utils.data import DataLoader

# Pytorch Library
import torch

# Type Validation
from typing import List, cast

#### Check the availability of GPUs.

In [None]:
torch.cuda.is_available()
torch.cuda.device_count()
torch.cuda.get_device_name(0)

### 1. Embedding Process

#### 1.0. Downloading and Configuring the Colpali Model

In [None]:
# Resolve the torch device and load the ColPali retriever checkpoint in bfloat16.
# `.eval()` puts the module in inference mode. `device`, `model_name` and `model`
# are module-level names that the embedding cells below read.
device = get_torch_device('auto')
model_name = "vidore/colpali-v1.2"
model = ColPali.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=device,
).eval()

In [None]:
# Used to process queries and images to fit the model's input requirements beforehand
processor = cast(ColPaliProcessor, ColPaliProcessor.from_pretrained(model_name))

#### 1.1 Queries Embeddings

In [None]:
# Embed the queries.
def queries_embedding(queries : list):
  """Embed text queries with the module-level ColPali `model` and `processor`.

  Args:
    queries: List of query strings. A single bare string is also accepted and
      is treated as a one-element list; without that, the string would be
      iterated and embedded one character at a time.

  Returns:
    A list of CPU tensors obtained by unbinding each model output along its
    first dimension (one tensor per query, since batches hold one query).
  """
  if isinstance(queries, str):
    queries = [queries]

  # Create a DataLoader to iterate over the list of queries, processing each query
  # individually to fit model input requirements
  dataloader = DataLoader(
      dataset=ListDataset[str](queries),
      batch_size=1,
      shuffle=False,
      collate_fn=lambda x: processor.process_queries(x),
  )
  qs: List[torch.Tensor] = []

  for batch_query in dataloader:
      with torch.no_grad():
          # Inputs must live on the same device as the model weights.
          batch_query = {k: v.to(model.device) for k, v in batch_query.items()}
          embeddings_query = model(**batch_query)
      # Results are kept on the CPU so they can later be converted to numpy.
      qs.extend(torch.unbind(embeddings_query.to("cpu")))
  return qs

#### 1.2. Document Embedding Process

In [None]:
# !pip install pdf2image
# !pip install pdf2jpg

In [None]:
from pymilvus import connections, Collection, utility
from pdf2jpg import pdf2jpg
from pdf2image import convert_from_path
import os

##### 1.2.1. Convert PDF to PNG images.

In [None]:
!apt-get install -y poppler-utils

In [None]:
!pdfinfo --version

In [None]:
# Document Path
pdf_path = "/content/VRAG_Test_Documents.pdf"

# The Output Folder Path
output_folder = "/content/"
def conver_pdf2image(pdf_path, output_folder):
    """Render every page of a PDF and save each page as a PNG file.

    Args:
        pdf_path: Path of the PDF document to convert.
        output_folder: Directory that receives the PNG files. The same folder
            is handed to ``convert_from_path`` together with ``fmt="jpg"``
            (presumably that call also leaves its own JPEG renderings there —
            the later cells load ``.jpg`` files from this folder).

    Returns:
        The list of PNG paths written, in page order.
    """
    # Images List
    images = convert_from_path(pdf_path=pdf_path, output_folder=output_folder, dpi=300, fmt="jpg")

    # splitext() returns a (root, extension) tuple; only the root belongs in the
    # file name, otherwise the tuple's repr ends up in every output name.
    basename = os.path.splitext(os.path.basename(pdf_path))[0]

    # Save each page as an image (once per page).
    saved_paths = []
    for page_num, page in enumerate(images, start=1):
        image_path = os.path.join(output_folder, f"{basename}_page_{page_num}.png")
        page.save(image_path, "PNG")
        saved_paths.append(image_path)
    return saved_paths

In [None]:
conver_pdf2image(pdf_path, output_folder)

In [None]:
image_folder = "/content"
# Open every JPEG found directly inside image_folder, in directory-listing order.
image_list = [
    Image.open(os.path.join(image_folder, entry))
    for entry in os.listdir(image_folder)
    if entry.lower().endswith(".jpg")
]
image_list

In [None]:
image_list[3]

In [None]:
# Feed the page images to the processor one at a time, in list order.
dataloader = DataLoader(
    ListDataset[str](image_list),
    batch_size=1,
    shuffle=False,
    collate_fn=processor.process_images,
)

In [None]:
# Embed every page image: one forward pass per batch, outputs collected on the CPU.
ds: List[torch.Tensor] = []
for batch_doc in tqdm(dataloader):
    with torch.no_grad():
        # Move all tensors in batch_doc to the same device as the model weights,
        # but keep 'input_ids' as Long or Int.
        # Every other entry is also cast to bfloat16, the dtype the model was loaded with.
        batch_doc = {
            k: v.to(dtype=torch.bfloat16, device=device) if k != "input_ids" else v.to(device=device)
            for k, v in batch_doc.items()
        }
        embeddings_doc = model(**batch_doc)
    # Split the batch output along its first dimension and keep one tensor per element.
    ds.extend(list(torch.unbind(embeddings_doc.to("cpu"))))

In [None]:
ds

### 2. Retrieval Process

#### 2.0. Set up connection to Milvus DB


In [None]:
from pymilvus import MilvusClient, DataType, connections, utility
import numpy as np
import concurrent.futures
from pymilvus import Collection, connections
collection_name = "VRAG_BRD_Content"
client = MilvusClient(uri="tcp://0.tcp.in.ngrok.io:14024")

In [None]:
# Open an ORM-style connection under the alias "default" (in addition to the
# MilvusClient created above) and bind a Collection handle to collection_name.
# NOTE(review): this handle is created before the cell that drops and re-creates
# the collection — confirm the collection already exists when this cell runs and
# that the handle is still valid after the re-creation.
connections.connect("default", host="0.tcp.in.ngrok.io", port="14024")
collection = Collection(collection_name)

#### 2.1. Create a collection

In [None]:
client.list_collections()

In [None]:
# (Re)create the collection that stores one row per embedding vector.
# An existing collection with the same name is dropped, not loaded.
collection_name = "VRAG_BRD_Content"
# if client.has_collection(collection_name=collection_name):
#     client.load_collection(collection_name)
# Dimensionality of each stored vector.
dim = 128

if client.has_collection(collection_name=collection_name):
    client.drop_collection(collection_name=collection_name)

schema = client.create_schema(
    auto_id=True,
    # The option is spelled in the singular; "enable_dynamic_fields" is not a
    # recognised keyword and leaves dynamic fields disabled.
    enable_dynamic_field=True,
)
schema.add_field(field_name="pk", datatype=DataType.INT64, is_primary=True)
schema.add_field(
    field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=dim
)
# Position of the vector inside its page, and the id of the page it belongs to.
schema.add_field(field_name="seq_id", datatype=DataType.INT16)
schema.add_field(field_name="doc_id", datatype=DataType.INT64)

# The page path is stored here.
schema.add_field(field_name="doc", datatype=DataType.VARCHAR, max_length=65535)

client.create_collection(
    collection_name=collection_name, schema=schema
)

In [None]:
client.list_collections()

#### 2.2. Create an index

In [None]:
collection.num_entities

In [None]:
# Release from Memory
client.release_collection(collection_name=collection_name)
# Drop the index registered under the name "vector".
# NOTE(review): the index created further down is named "vector_index" — confirm
# which index name actually exists on the server at this point.
client.drop_index(
    collection_name=collection_name, index_name="vector"
)

[Index Algorithms](https://milvus.io/docs/index.md?tab=floating)

In [None]:
index_params = client.prepare_index_params()

index_params.add_index(
    field_name="vector",
    index_name="vector_index",
    index_type="HNSW",  # or any other index type you want
    metric_type="IP",  # or the appropriate metric type
    params={
        "M": 16,
        "efConstruction": 500,
    },  # adjust these parameters as needed
)

In [None]:
collection_name

In [None]:
client.create_index(
    collection_name=collection_name, index_params=index_params, sync=True #to be certain that the indexing process has completed before performing further operations
)

In [None]:
collection.indexes

#### 2.3. Insert Data

In [None]:
# Paths of every JPEG in image_folder, in directory-listing order.
filepaths = [
    os.path.join(image_folder, entry)
    for entry in os.listdir(image_folder)
    if entry.lower().endswith(".jpg")
]

In [None]:
def insert(self, data):
    """Insert the multi-vector embedding of one page into the Milvus collection.

    One row is written per vector, using the module-level `client` and
    `collection_name`.

    Args:
        self: Not used by the body; kept so the signature stays unchanged.
        data: Mapping with the keys "colbert_vecs" (iterable of vectors),
            "doc_id" (id shared by all rows of the page) and "filepath"
            (path of the page image).
    """
    # Embeddings List
    colbert_vecs = list(data["colbert_vecs"])

    rows = [
        {
            "vector": vec,
            # Position of the vector inside the page.
            "seq_id": seq_id,
            "doc_id": data["doc_id"],
            # Every row carries the full path. Indexing the path string itself
            # would store a single character per row (and fail for short paths).
            "doc": data["filepath"],
        }
        for seq_id, vec in enumerate(colbert_vecs)
    ]

    # Insert the data as multiple vectors (one for each sequence) along with the corresponding metadata.
    client.insert(collection_name, rows)


In [None]:
# For every page: take its embedding matrix and write one row per vector,
# tagging each row with the page index and the page's file path.
for page_idx, page_path in enumerate(filepaths):
    page_vectors = list(ds[page_idx].float().numpy())

    page_rows = [
        {
            "vector": patch_vec,
            "seq_id": patch_idx,
            "doc_id": page_idx,
            "doc": page_path,
        }
        for patch_idx, patch_vec in enumerate(page_vectors)
    ]

    # Insert the data as multiple vectors (one for each sequence) along with the corresponding metadata.
    client.insert(collection_name, page_rows)

#### Optional: Check the Data in Milvus Collections

In [None]:
# collection.load()
# collection.query(expr="pk >= 0")

In [None]:
collection.flush()
collection.num_entities

#### 2.4. Similarity Search

In [None]:
# Defaults used by the similarity search below.
topk = 3
collection_name = "VRAG_BRD_Content"
# Inner-product metric (the same metric the vector index is created with); no extra parameters.
search_params = {"metric_type": "IP", "params": {}}
# Load the collection through the ORM handle before it is searched and queried.
collection.load()

def retriever(qs, collection_name, topk=3, search_params= search_params):
  """Retrieve the page images that best match each query embedding.

  For every query:
    0. a vector search returns, per query token, the closest stored vectors;
    1. the ids of the documents owning those vectors form the candidate set;
    2. every candidate is re-scored against its own stored vectors: for each
       query token the best similarity is taken, and these maxima are summed;
    3. the ``topk`` best candidates are kept and their image paths looked up.

  Args:
    qs: Iterable of query embeddings; each item must support
      ``.float().numpy()`` (e.g. the tensors returned by queries_embedding).
    collection_name: Name of the Milvus collection to search.
    topk: Maximum number of documents kept per query.
    search_params: Search parameters forwarded to ``client.search``.

  Returns:
    A tuple ``(images_paths, scores)``. ``images_paths`` lists the image path
    of every kept document and ``scores`` the matching ``(score, doc_id)``
    pairs, best first per query. With several queries the per-query results
    are concatenated in query order; with no query both lists are empty.
  """
  def rerank_single_doc(doc_id, data, client, collection_name):
    # Rerank a single document by retrieving its embeddings and calculating the similarity with the query.
    # Only the vectors of this very document are fetched, so the score is not
    # polluted by a neighbouring document.
    # NOTE(review): limit=1000 caps the vectors fetched per document — confirm a
    # page never stores more than that, otherwise the score uses a truncated set.
    doc_colbert_vecs = client.query(
        collection_name=collection_name,
        filter=f"doc_id == {doc_id}",
        output_fields=["seq_id", "vector", "doc"],
        limit=1000,
    )
    doc_vecs = np.vstack([row["vector"] for row in doc_colbert_vecs])
    # Rows: query tokens, columns: document vectors -> best match per token, summed.
    score = np.dot(data, doc_vecs.T).max(1).sum()
    return (score, doc_id)

  images_paths = []
  all_scores = []
  for query in qs:
      #0. Get all the documents that contain at least 1 similar (token-patch)
      query = query.float().numpy()
      results = client.search(
          collection_name,
          query,
          limit=5,
          output_fields=["vector", "seq_id", "doc_id"],
          search_params=search_params,
      )

      #1. Retrieve all document IDs that contain at least one similarity between the query tokens and the document patches
      doc_ids = {hit["entity"]["doc_id"] for token_hits in results for hit in token_hits}

      #2. Score every candidate document against the whole query.
      # Example: if image one scores 20 and image two scores 12 for this query,
      # image one is more similar to the query than image two.
      # The rerank tasks run in parallel, with at most 300 worker threads.
      with concurrent.futures.ThreadPoolExecutor(max_workers=300) as executor:
          futures = [
              executor.submit(rerank_single_doc, doc_id, query, client, collection_name)
              for doc_id in doc_ids
          ]
          scores = [future.result() for future in concurrent.futures.as_completed(futures)]

      # Best score first; slicing already copes with fewer than topk candidates.
      scores.sort(key=lambda x: x[0], reverse=True)
      scores = scores[:topk]

      #3. Resolve the image path of every kept document in the collection that was
      # passed in (any row of the document carries the path, so one row is enough).
      for _, doc_id in scores:
          rows = client.query(
              collection_name=collection_name,
              filter=f"doc_id == {doc_id}",
              output_fields=["doc"],
              limit=1,
          )
          images_paths.append(rows[0]["doc"])
      all_scores.extend(scores)

  # Returned after the loop so that every query is processed, not just the first.
  return images_paths, all_scores

In [None]:
qs = queries_embedding(["List of business definitions ?"])

In [None]:
qs

In [None]:
images_paths, scores = retriever(qs = qs, collection_name=collection_name)

In [None]:
images_paths

#### 2.5. Display the top similar results

In [None]:
# Show each retrieved page, followed by a two-line separator.
for page_path in images_paths:
  display(Image.open(page_path))
  for _ in range(2):
    print("*"*150)

### 4. Vision Model Usage

#### 4.1. unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit

In [None]:
!pip install transformers bitsandbytes

In [None]:
from transformers import BitsAndBytesConfig

In [None]:
!pip install -U bitsandbytes

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import bitsandbytes as bnb

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit")

# Load model with 4-bit quantization.
# The result gets its own name: `model` is the ColPali retriever that
# queries_embedding() reads at call time, so rebinding `model` here would break
# query embedding (and therefore vrag) for the rest of the session.
# NOTE(review): this is a vision checkpoint loaded through AutoModelForCausalLM —
# confirm that class exposes the image inputs you need.
llm_90b_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit",
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    ),
)

In [None]:
# torch has no is_bitsandbytes_available(); the availability helper is provided
# by transformers.
from transformers.utils import is_bitsandbytes_available
is_bitsandbytes_available()

#### 4.2. Llama 3.2 11b

In [None]:
# image_path = images_paths[0:3]
# image_path

In [None]:
#0. Setup the token to download the model
# The token is read from the environment instead of being written into the
# notebook; when the variable is missing, it is asked for interactively
# (getpass does not echo the input).
from getpass import getpass

token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if not token:
    token = getpass("Hugging Face token: ").strip()
# If no token was provided either way, raise an error
if not token:
    raise ValueError("HUGGING_FACE_HUB_TOKEN environment variable not set. Please set it to your Hugging Face token.")
# Keep the environment variable populated for code that reads the token from there.
os.environ["HUGGING_FACE_HUB_TOKEN"] = token

In [None]:
#1. Download the model
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

llm_model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=token,
    # NOTE(review): temperature is a sampling setting and the generate() call in
    # this notebook does not enable sampling — confirm this value has any effect.
    temperature = 0.1
)

# 2. Set up the processor: the processor is used to handle different types of inputs (images and queries).
# The same token is passed explicitly so both downloads authenticate the same way.
llm_processor = AutoProcessor.from_pretrained(model_id, token=token)

In [None]:
#3. Initialize the streamer
streamer = TextStreamer(llm_processor.tokenizer, skip_prompt=True, skip_special_tokens=True)

In [None]:
# query = "Who is Kaizad ? "
# qs = queries_embedding(query)
# images_paths = retriever(qs = qs, collection_name=collection_name)
# images_paths

In [None]:
images_paths = ['/content/BUSINESS REQUIREMENT DOCUMENT v2 110420241840_page-0001.jpg',
                '/content/BUSINESS REQUIREMENT DOCUMENT v2 110420241840_page-0002.jpg',
                '/content/BUSINESS REQUIREMENT DOCUMENT v2 110420241840_page-0003.jpg']

##### 4.2.1. Inference Generation

In [None]:
!pip install ngrok
!pip install pyngrok
!pip install uvicorn

In [None]:
def vrag(query):
  """Answer a question from the document pages retrieved for it.

  The query is embedded, the best matching page images are retrieved from the
  "VRAG_BRD_Content" collection, and the vision LLM is asked the question with
  those images attached.

  Args:
    query: The user's question as a string.

  Returns:
    A tuple ``(answer, images_paths_, scores)``: ``answer`` is
    ``{"answer": <generated text>}``, the other two are the values returned
    by ``retriever``.
  """
  collection_name = "VRAG_BRD_Content"
  qs = queries_embedding([query])
  images_paths_, scores = retriever(qs = qs, collection_name=collection_name)

  images_ = [Image.open(image_path_) for image_path_ in images_paths_]

  # One image placeholder per retrieved page: the number of placeholders must
  # match the number of images handed to the processor, and retrieval can
  # return fewer than three pages.
  content = [{"type": "image"} for _ in images_]
  content.append({"type": "text", "text": query})
  messages = [{"role": "user", "content": content}]

  input_text = llm_processor.apply_chat_template(messages, add_generation_prompt=True)

  inputs = llm_processor(
      # No retrieved page: fall back to a text-only prompt.
      images_ if images_ else None,
      input_text,
      # The chat template has already inserted the special tokens.
      add_special_tokens=False,
      return_tensors="pt",
  ).to(llm_model.device)
  output = llm_model.generate(**inputs, max_new_tokens=256, streamer=streamer)

  # generate() returns prompt + continuation; keep only the newly generated
  # tokens so the answer does not echo the prompt.
  prompt_length = inputs["input_ids"].shape[-1]
  answer = {"answer": llm_processor.decode(output[0][prompt_length:], skip_special_tokens=True)}
  return answer, images_paths_, scores



In [None]:
answer, images_paths, scores = vrag("")

In [None]:
from IPython.display import Markdown, display

In [None]:
display(Markdown(answer['answer']))

In [None]:
from pydantic import BaseModel
# Payload model for the API endpoint: a single string field holding the question.
class Item(BaseModel):
    query: str  # text passed on to vrag()

In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    """Stand-in for ``locale.getpreferredencoding`` that always reports UTF-8.

    ``do_setlocale`` is accepted for signature compatibility and ignored.
    """
    encoding = "UTF-8"
    return encoding
locale.getpreferredencoding = getpreferredencoding
!pip install fastapi

In [None]:
from fastapi import FastAPI

app = FastAPI()

@app.post('/vrag_llama_11b')
def api(item: Item):
    # Run the retrieval + generation pipeline for the submitted question.
    answer, images_paths_, scores = vrag(item.query)
    # The scores may be numpy scalars; convert them to built-in numbers so the
    # response can always be serialised to JSON.
    plain_scores = [(float(score), int(doc_id)) for score, doc_id in scores]
    # Same three-part result as vrag(): answer, image paths, (score, doc_id) pairs.
    return answer, images_paths_, plain_scores

In [None]:
from pyngrok import ngrok
import uvicorn
import nest_asyncio

In [None]:
!ngrok authtoken ****************************************************
# Allow nested asyncio loops
nest_asyncio.apply()
# Open an ngrok tunnel to local port 8000 and print its public URL.
ngrok_tunnel = ngrok.connect(8000)
print('Public URL:', ngrok_tunnel.public_url)
# Serve the FastAPI app on all interfaces, on the same port the tunnel points to.
uvicorn.run(app, host='0.0.0.0', port=8000)

In [None]:
# # Main loop
# while True:
#     query = input("Enter your query (or type 'exit' to quit): ")
#     if query.lower() == "exit":
#         print("Exiting the loop.")
#         break

#     # Step 1: Embed the query
#     qs = queries_embedding(query)

#     # Step 2: Retrieve similar images based on the query embedding
#     images_paths = retriever(qs = qs, collection_name="VRAG_BRD_Content")
#     for i in images_paths:
#       img = Image.open(i)
#       display(img)
#       print("*"*150)
#       print("*"*150)

#     # Step 3: Process and prepare input for the model
#     model_input = input_processing(query = query, images_paths = images_paths)

#     # Step 4: Feed the input to your model (pseudo code here)
#     model_output = llm_model.generate(**model_input, max_new_tokens=128,  streamer=streamer)  # Replace with your actual model prediction

#     # Display or process the model output as needed
#     display(Markdown(llm_processor.decode(model_output[0],skip_special_tokens=True)))

In [None]:
# # Load the Llama 3.2 11B Vision model from Hugging Face
# def load_model():
#     model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
#     processor = AutoProcessor.from_pretrained(model_id)

#     # Get your Hugging Face token from the environment variable
#     token = os.environ.get("HUGGING_FACE_HUB_TOKEN")

#     # If token is not found, raise an error
#     if token is None:
#         raise ValueError("HUGGING_FACE_HUB_TOKEN environment variable not set. Please set it to your Hugging Face token.")

#     model = MllamaForConditionalGeneration.from_pretrained(
#         model_id,
#         torch_dtype=torch.bfloat16,
#         device_map="auto",
#         token=token,
#     )

#     return model, processor

# # Test the model with an image
# def test_model(model, processor, image_path, query):

#     # Initialize the streamer
#     streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
#     # Load the image
#     image = Image.open(image_path)

#     # Prepare input for the model
#     messages = [
#         {"role": "user", "content": [
#             {"type": "image"},
#             {"type": "text", "text": query}
#         ]}
#     ]
#     input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
#     inputs = processor(
#         image,
#         input_text,
#         add_special_tokens=False,
#         return_tensors="pt",
#     ).to(model.device)

#     # Generate output
#     output = model.generate(**inputs, max_new_tokens=128,  streamer=streamer)

#     # Decode and print the output
#     return output
#     # print(processor.decode(output[0]))

In [None]:
# # Load the model and processor
# llm_model, llm_processor = load_model()

In [None]:
# queries[0]

In [None]:
# # Specify an image to test (local path in Colab)
# image_path = images_paths[0]
# # Test the model with the specified image
# query = queries[0]
# output = test_model(llm_model, llm_processor, image_path, query = query)