# Multimodal RAG using LlamaIndex, CLIP, & KDB.AI

Note: This example requires a KDB.AI endpoint and API key. Sign up for a free [KDB.AI account](https://kdb.ai/offerings/).

This example explores preparing, embedding (with CLIP), and storing both text and image data within a KDB.AI vector database using LlamaIndex.

## Install dependencies

In [None]:
!pip install llama-index llama-index-embeddings-huggingface llama-index-llms-openai llama-index-readers-file llama-index-vector-stores-kdbai llama-index-embeddings-clip
!pip install llama-index-embeddings-clip
!pip install 'git+https://github.com/openai/CLIP.git'
!pip install kdbai_client matplotlib wikipedia tqdm

## Download data

**Libraries**

In [2]:
import os
from tqdm import tqdm
import wikipedia
import urllib.request

**Data directories and paths**

In [3]:
# Root path: absolute path of the directory the notebook is executed from
root_path = os.path.abspath(os.getcwd())

# Data directory: relative name and the absolute path derived from it
data_dir = "data"
data_path = os.path.join(root_path, data_dir)

# Create the directory when missing. exist_ok=True replaces the separate
# "exists?" check, so there is no window between checking and creating.
os.makedirs(data_path, exist_ok=True)

**Download data - wikipedia images & texts**

In [4]:
def download_data(WIKI_TITLES, MAX_IMAGES_PER_TITLE):
    """Download the text and a capped number of images for each Wikipedia title.

    For every title the page content is written to ``<data_path>/<title>.txt``
    and up to ``MAX_IMAGES_PER_TITLE`` images whose URL ends in ``.jpg`` or
    ``.png`` are saved as ``<data_path>/<n>.jpg``, where ``n`` is a counter
    shared across all titles. ``data_path`` is read from module scope.

    Args:
        WIKI_TITLES: Iterable of Wikipedia page titles to fetch.
        MAX_IMAGES_PER_TITLE: Maximum number of images saved per page.

    Returns:
        A tuple ``(image_metadata, text_metadata)``. Both are dicts keyed by a
        1-based integer id whose values hold ``"filename"`` and ``"file_path"``.

    Any exception raised while processing a title is reported and the loop
    moves on to the next title (best effort); data already recorded for that
    title is kept.
    """
    # Text metadata
    text_uuid = 0
    text_metadata = dict()

    # Image metadata
    image_uuid = 0
    image_metadata = dict()

    # Download data - text and images from wiki pages
    for title in tqdm(WIKI_TITLES):
        images_per_wiki_page = 0
        print(title)
        try:
            wiki_page = wikipedia.page(title)

            # Text - data and metadata
            text_uuid += 1
            text_file_name = f"{title}.txt"
            content_path = os.path.join(data_path, text_file_name)
            # Explicit UTF-8: page text routinely contains non-ASCII characters,
            # which a platform default encoding may be unable to write.
            with open(content_path, 'w', encoding="utf-8") as f:
                f.write(wiki_page.content)

            text_metadata[text_uuid] = {
                "filename": text_file_name,
                "file_path": content_path
            }

            for url in wiki_page.images:
                if not url.endswith((".jpg", ".png")):
                    continue

                # Limit the number of images downloaded per wikipedia page
                images_per_wiki_page += 1
                if images_per_wiki_page > MAX_IMAGES_PER_TITLE:
                    break

                # Image - data and metadata.
                # NOTE: the file is always named "<n>.jpg", even when the URL
                # points at a PNG; the name does not reflect the real format.
                image_uuid += 1
                image_file_name = f"{image_uuid}.jpg"
                image_path = os.path.join(data_path, image_file_name)

                urllib.request.urlretrieve(url, image_path)

                image_metadata[image_uuid] = {
                    "filename": image_file_name,
                    "file_path": image_path,
                }
        except Exception as exc:
            # Report what actually failed; the failure may be in the page
            # lookup, the text write or an image download, not only "no images".
            print(f"Failed to download data for Wikipedia page: {title} ({exc!r})")
            continue

    return image_metadata, text_metadata

In [None]:
# Cap on images saved per page, and the Wikipedia pages to fetch
MAX_IMAGES_PER_TITLE = 10
WIKI_TITLES = ["Machiavelli", "Beethoven", "Burj khalifa", "Cherry Blossom", "Video Game"]

# Fetch page texts and images; this runs a fresh download every time the cell executes
image_metadata, text_metadata = download_data(
    WIKI_TITLES=WIKI_TITLES,
    MAX_IMAGES_PER_TITLE=MAX_IMAGES_PER_TITLE,
)

**Resize Images**

In [6]:
import os
import math
from PIL import Image
from tqdm import tqdm

In [7]:
def resize_images(data_path):
    """Downscale, in place, every ``.jpg`` file in ``data_path`` wider than 2048 px.

    Images at or below 2048 px wide are left untouched. Wider images are
    resized to a width of 2048 px with the height scaled by the original
    aspect ratio, then written back over the original file.

    Args:
        data_path: Directory whose ``.jpg`` files are inspected.
    """
    max_width = 2048
    for file in tqdm(os.listdir(data_path)):
        if not file.endswith('.jpg'):
            continue

        img_path = os.path.join(data_path, file)
        # Context manager closes the underlying file handle once we are done.
        with Image.open(img_path) as img:
            img_width, img_height = img.size
            if img_width <= max_width:
                continue

            # Remember the format detected from the file content: a file named
            # ".jpg" may hold PNG data, and letting save() infer JPEG from the
            # extension fails for modes JPEG cannot store (e.g. RGBA).
            img_format = img.format

            aspect_ratio = img_width / img_height
            # Guard against a zero height for extremely wide images.
            new_height = max(1, round(max_width / aspect_ratio))
            resized = img.resize((max_width, new_height))

        # Save after the source file is closed; format=None falls back to the extension.
        resized.save(img_path, format=img_format)

In [None]:
# Downscale oversized .jpg files in the data directory (files are overwritten in place)
resize_images(data_path)

**Show images**

In [9]:
from PIL import Image
import matplotlib.pyplot as plt
import os

In [10]:
def show_images(image_paths):
    """Plot up to nine of the given images on a 3x3 matplotlib grid.

    Paths that do not point to an existing file are skipped silently. Axis
    ticks are hidden on every subplot.

    Args:
        image_paths: Iterable of image file paths.
    """
    images_shown = 0
    plt.figure(figsize=(16, 9))
    for img_path in image_paths:
        if not os.path.isfile(img_path):
            continue

        # Close each file once it has been handed to matplotlib instead of
        # leaving the handle open. NOTE(review): relies on imshow reading the
        # pixel data when called rather than lazily - confirm.
        with Image.open(img_path) as image:
            plt.subplot(3, 3, images_shown + 1)
            plt.imshow(image)
            plt.xticks([])
            plt.yticks([])

        images_shown += 1
        # The grid has nine cells; stop once it is full.
        if images_shown >= 9:
            break

In [None]:
# Gather every .jpg in the data directory and preview them
image_paths = [
    os.path.join(data_path, name)
    for name in os.listdir(data_path)
    if name.endswith('.jpg')
]
show_images(image_paths)

**Show texts**

In [12]:
def show_texts(text_paths):
    """Print the first 512 characters of up to three text files.

    Each excerpt is followed by a line of 80 ``=`` characters. Paths that do
    not point to an existing file are skipped silently.

    Args:
        text_paths: Iterable of text file paths.
    """
    texts_shown = 0
    for text_path in text_paths:
        if not os.path.isfile(text_path):
            continue

        # Decode as UTF-8 regardless of platform default, and substitute any
        # undecodable byte so a preview never aborts on a stray character.
        with open(text_path, 'r', encoding="utf-8", errors="replace") as text_file:
            # Only the excerpt is needed, so do not read the whole file.
            content = text_file.read(512)
        print(content)
        print("=" * 80)

        texts_shown += 1
        if texts_shown >= 3:
            break

In [None]:
# Gather every .txt in the data directory and print a preview of each
text_paths = [
    os.path.join(data_path, name)
    for name in os.listdir(data_path)
    if name.endswith('.txt')
]
show_texts(text_paths)

## KDB.AI session and tables

**Libraries**

In [14]:
import kdbai_client as kdbai
from getpass import getpass

**KDB.AI session**

Before creating the embeddings, we set up the vector database that will store them and enable efficient searching.

### Define KDB.AI Session

KDB.AI comes in two offerings:

1. [KDB.AI Cloud](https://trykdb.kx.com/kdbai/signup/) - For experimenting with smaller generative AI projects with a vector database in our cloud.
2. [KDB.AI Server](https://trykdb.kx.com/kdbaiserver/signup/) - For evaluating large scale generative AI applications on-premises or on your own cloud provider.

Depending on which you use there will be different setup steps and connection details required.

##### Option 1. KDB.AI Cloud

To use KDB.AI Cloud, you will need two session details - a URL endpoint and an API key.
To get these you can sign up for free [here](https://trykdb.kx.com/kdbai/signup).

You can connect to a KDB.AI Cloud session using `kdbai.Session` and passing the session URL endpoint and API key details from your KDB.AI Cloud portal.

If the environment variables `KDBAI_ENDPOINT` and `KDBAI_API_KEY` exist on your system containing your KDB.AI Cloud portal details, these variables will automatically be used to connect.
If these do not exist, it will prompt you to enter your KDB.AI Cloud portal session URL endpoint and API key details.

In [None]:
# Endpoint: taken from the environment when set, otherwise asked for interactively
try:
    KDBAI_ENDPOINT = os.environ["KDBAI_ENDPOINT"]
except KeyError:
    KDBAI_ENDPOINT = input("KDB.AI endpoint: ")

# API key: same lookup, but the prompt does not echo what is typed
try:
    KDBAI_API_KEY = os.environ["KDBAI_API_KEY"]
except KeyError:
    KDBAI_API_KEY = getpass("KDB.AI API key: ")

# Open the KDB.AI session with the resolved connection details
session = kdbai.Session(endpoint=KDBAI_ENDPOINT, api_key=KDBAI_API_KEY)

##### Option 2. KDB.AI Server

To use KDB.AI Server, you will need to download and run your own container.
To do this, you will first need to sign up for free [here](https://trykdb.kx.com/kdbaiserver/signup/).

You will receive an email with the required license file and bearer token needed to download your instance.
Follow instructions in the signup email to get your session up and running.

Once the [setup steps](https://code.kx.com/kdbai/gettingStarted/kdb-ai-server-setup.html) are complete you can then connect to your KDB.AI Server session using `kdbai.Session` and passing your local endpoint.

In [16]:
# session = kdbai.Session(endpoint="http://localhost:8082")

**Table for storing Text data**

In [17]:
# Name of the table that stores text data, and its column layout
text_table_name = "wiki_texts"
text_table_schema = [
    {"name": "document_id", "type": "bytes"},
    {"name": "text", "type": "bytes"},
    {"name": "embeddings", "type": "float32s"},
    {"name": "filename", "type": "str"},
    {"name": "file_path", "type": "str"},
]

# Flat index on the "embeddings" column: 1536 dimensions, L2 metric
indexFlat = dict(
    name="flat",
    type="flat",
    column="embeddings",
    params=dict(dims=1536, metric="L2"),
)

In [18]:
# Get a handle to the database named "default" on the open KDB.AI session
db = session.database("default")

In [19]:
# Drop the texts table if it already exists, so it can be created afresh below
try:
    db.table(text_table_name).drop()
except kdbai.KDBAIException:
    # presumably raised when the table does not exist yet; nothing to drop then
    pass

In [20]:
# Create the texts table from the schema above, with the flat index on its embeddings column
texts_table = db.create_table(text_table_name, text_table_schema, indexes=[indexFlat])

In [None]:
# Show the texts table schema (displayed as the cell output)
texts_table.schema

**Table for storing Image data**

In [22]:
# Name of the table that stores image data, and its column layout
image_table_name = "wiki_imgs"
image_table_schema = [
    {"name": "document_id", "type": "bytes"},
    {"name": "text", "type": "bytes"},
    {"name": "embeddings", "type": "float32s"},
    {"name": "filename", "type": "bytes"},
    {"name": "file_path", "type": "bytes"},
]

# Flat index on the "embeddings" column: 512 dimensions, L2 metric.
# Note: In this multi-index setup, ensure that both the text and image indexes share the same name.
indexFlatImg = dict(
    name="flat",
    type="flat",
    column="embeddings",
    params=dict(dims=512, metric="L2"),
)

In [23]:
# Drop the images table if it already exists, so it can be created afresh below
try:
    db.table(image_table_name).drop()
except kdbai.KDBAIException:
    # presumably raised when the table does not exist yet; nothing to drop then
    pass

In [24]:
# Create the images table from the schema above, with the flat index on its embeddings column
imgs_table = db.create_table(image_table_name, image_table_schema, indexes=[indexFlatImg])

## Loading data

**Consolidating both image and text metadata**

In [25]:
# Maps "<data_dir>/<file name>" to the metadata stored alongside that file.
# The same strings are later used as the reader's input paths, so the key
# format is kept exactly as "<data_dir>/<file name>".
metadata_dict = dict()

# Image metadata first, then text metadata (insertion order is preserved).
for source_metadata in (image_metadata, text_metadata):
    for v in source_metadata.values():
        # basename() copes with the separator os.path.join used when the
        # path was built, whereas splitting on '/' only works on POSIX paths.
        ondisk_file_name = os.path.basename(v["file_path"])
        metadata_key = f"{data_dir}/{ondisk_file_name}"
        # Values are stored UTF-8 encoded (bytes).
        metadata_dict[metadata_key] = {
            "filename": v["filename"].encode("utf-8"),
            "file_path": metadata_key.encode("utf-8"),
        }

**Consolidating input paths**

In [None]:
# The files to load are exactly the keys of the metadata dictionary, in insertion order
local_files = list(metadata_dict)
print(len(local_files))

**Loading data**

In [27]:
from llama_index.core import SimpleDirectoryReader

In [28]:
def get_metadata(filepath: str) -> dict:
    """Return the metadata recorded for ``filepath`` in ``metadata_dict``.

    ``filepath`` must be one of the dictionary's keys; an unknown path
    raises ``KeyError``.
    """
    return metadata_dict[filepath]

In [None]:
%%time

# Despite its name, `documents` is the reader object; the loaded documents are in `docs`.
# get_metadata is passed as the per-file metadata callback.
documents = SimpleDirectoryReader(input_files=local_files, file_metadata=get_metadata)

# Load the files and show how many documents were produced
docs = documents.load_data()
len(docs)

## Creating Multi Modal Vector Index for data

**OpenAI API Key (for the OpenAI text embeddings and the GPT-4o model)**

In [30]:
import os
from getpass import getpass

In [31]:
# Read the OpenAI API key from the environment, or prompt for it without echoing
OPENAI_API_KEY = (
    os.environ["OPENAI_API_KEY"]
    if "OPENAI_API_KEY" in os.environ
    else getpass("OpenAI API key: ")
)
# Export the key: later code reads os.environ["OPENAI_API_KEY"] directly, which
# would raise KeyError if the key had only been typed at the prompt.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


**Create vector store, storage context and the index for retrieval, query purposes**

In [32]:
from llama_index.vector_stores.kdbai import KDBAIVectorStore
from llama_index.core import StorageContext
from llama_index.core import Settings
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter

# Store: one vector store per KDB.AI table (texts and images)
text_store = KDBAIVectorStore(texts_table)
image_store = KDBAIVectorStore(imgs_table)

# Storage context: text vectors go to text_store, image vectors to image_store
storage_context = StorageContext.from_defaults(
    vector_store=text_store,
    image_store=image_store,
)

# Settings: split text with a chunk size of 500 and no overlap between chunks
Settings.transformations = [SentenceSplitter(chunk_size=500, chunk_overlap=0)]

# Multi Modal Vector Store Index built from the loaded documents, backed by the stores above
index = MultiModalVectorStoreIndex.from_documents(
    docs,
    storage_context=storage_context,
)

#### Now that the data is inserted into the KDB.AI vector database, let's take a look:

In [None]:
# Query the texts table with no arguments and display the result as the cell output
texts_table.query()

In [None]:
# Query the images table with no arguments and display the result as the cell output
imgs_table.query()

## Multi Modal Retrieval from user's query

**Using index as retriever**

In [35]:
# Retriever configured for the top 1 text match and the top 4 image matches;
# "flat" is the index name used when both tables were created.
retriever_engine = index.as_retriever(
    similarity_top_k=1,
    image_similarity_top_k=4,
    vector_store_kwargs={"index": "flat"},
)

**Retrieve most similar text/image embeddings based on query**

In [36]:
from llama_index.core.response.notebook_utils import display_source_node

def retrieve(retriever_engine, query_str):
    """Run a retrieval and split the results into image paths and text chunks.

    A result whose ``text`` is the empty string is treated as an image and
    contributes the ``"file_path"`` entry of its node metadata; every other
    result contributes its text.

    Args:
        retriever_engine: Object exposing ``retrieve(query_str)``.
        query_str: The query to retrieve for.

    Returns:
        A tuple ``(retrieved_image, retrieved_text)``: a list of image file
        paths as ``str`` and a list of text strings, both in result order.
    """
    retrieved_image = []
    retrieved_text = []
    for res_node in retriever_engine.retrieve(query_str):
        if res_node.text == '':
            file_path = res_node.node.metadata["file_path"]
            # The metadata value may come back as bytes; always hand callers
            # a str path so they need not care which form was stored.
            if isinstance(file_path, bytes):
                file_path = file_path.decode("utf-8")
            retrieved_image.append(file_path)
        else:
            retrieved_text.append(res_node.text)

    return retrieved_image, retrieved_text

In [None]:
# Example query used for the retrieval demo
query_str = "Tell me about the machiavelli, where he lived and what he did?"

# Retrieve the matching image paths and text chunks
imgs, txts = retrieve(retriever_engine=retriever_engine, query_str=query_str)

# Show retrieved texts: concatenate them and print the first 512 characters
context_str = "".join(txts)
print(context_str[:512])

# Show retrieved images
show_images(imgs)

## Multi Modal RAG

**Multi Modal Model: GPT-4o**

In [63]:
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

In [64]:
# Multimodal model that answers from the retrieved text and images
mm_model_name = "gpt-4o"
openai_mm_llm = OpenAIMultiModal(
    model=mm_model_name,
    # Use the exported variable when present, otherwise the key captured
    # earlier in the notebook: a key typed at the prompt is not necessarily
    # in os.environ, so indexing os.environ directly could raise KeyError.
    api_key=os.environ.get("OPENAI_API_KEY") or OPENAI_API_KEY,
    max_new_tokens=1500,
)

**Setting Prompt Template and Query Engine for MM-RAG**

In [61]:
from llama_index.core import PromptTemplate
from llama_index.core.query_engine import SimpleMultiModalQueryEngine

In [65]:
# Question-answering prompt: the context is placed between two dashed rules,
# followed by the query; {context_str} and {query_str} are the template fields.
qa_tmpl_str = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)
qa_tmpl = PromptTemplate(qa_tmpl_str)

# Query engine over the multimodal index, using the multimodal LLM and the prompt above;
# "flat" is the index name used when both tables were created.
query_engine = index.as_query_engine(
    llm=openai_mm_llm,
    text_qa_template=qa_tmpl,
    vector_store_kwargs={
            "index" : "flat",
            },
)

**MM-RAG: Question Answering**

In [None]:
# Query by the user
query_str = "Tell me more about the machiavelli, where did he lived and what did he do?"

# Response from Multi Modal model (runs retrieval, then generation)
response = query_engine.query(query_str)

# Printing response: only the generated answer text
print(response.response)

**Showing the retrieved documents, i.e., the images and text used for answering the query**

In [None]:
# Populate retrieved documents - texts and images.
# Entries under "text_nodes" with empty text are treated as images and
# contribute their file path; all other entries contribute their text.
imgs = []
txts = []
for source_node in response.metadata["text_nodes"]:
    if source_node.text == '':
        file_path = source_node.metadata['file_path']
        # Decode only when the stored value is bytes; calling .decode() on a
        # str path would raise AttributeError.
        if isinstance(file_path, bytes):
            file_path = file_path.decode('utf-8')
        imgs.append(file_path)
    else:
        txts.append(source_node.text)

# Display retrieved documents - texts and images
for txt in txts:
    print(txt)
show_images(imgs)

## Delete the KDB.AI Table
Once finished with the table, it is best practice to drop it.

In [None]:
# Remove both tables created by this notebook
texts_table.drop()
imgs_table.drop()