In [1]:
!pip install pinecone-client sentence-transformers==2.7.0 faiss-cpu==1.8.0 -qqq

# How a Vector Database Works

## KNN retreive and Limitations

**Practice Dataset**

In [2]:
!wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
!tar -xf sift.tar.gz
!mkdir data/sift1M -p
!mv sift/* data/sift1M

--2024-11-27 04:29:59--  ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
           => ‘sift.tar.gz’
Resolving ftp.irisa.fr (ftp.irisa.fr)... 131.254.254.45, 2001:660:7303:254::45
Connecting to ftp.irisa.fr (ftp.irisa.fr)|131.254.254.45|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /local/texmex/corpus ... done.
==> SIZE sift.tar.gz ... 168280445
==> PASV ... done.    ==> RETR sift.tar.gz ... done.
Length: 168280445 (160M) (unauthoritative)


2024-11-27 04:30:59 (2.86 MB/s) - ‘sift.tar.gz’ saved [168280445]



**Load Practice Dataset**

In [3]:
import psutil

def get_memory_usage_mb():
    process = psutil.Process()
    memory_info = process.memory_info()
    return memory_info.rss / (1024 * 1024)

In [4]:
import time
import faiss
from faiss.contrib.datasets import DatasetSIFT1M

ds = DatasetSIFT1M()

xq = ds.get_queries() # data for using
xb = ds.get_database() # saved vector data
gt = ds.get_groundtruth() # ground truth label

**Changes in index/search time and memory usages as data grows**

In [8]:
k = 1
d = xq.shape[1]
nq = 1000
xq = xq[:nq]

for i in range(1, 10, 2):
    start_memory = get_memory_usage_mb()
    start_indexing = time.time()
    index = faiss.IndexFlatL2(d)
    index.add(xb[:(i+1)*100000])
    end_indexing = time.time()
    end_memory = get_memory_usage_mb()

    t0 = time.time()
    D, I = index.search(xq, k)
    t1 = time.time()
    print(f"데이터: {(i+1)*100000}개")
    print(f"색인: {(end_indexing - start_indexing) * 10000 :.3f} ms {(end_memory - start_memory):.3f} MB 검색: {(t1 - t0) * 1000 / nq :.3f} ms")

데이터: 200000개
색인: 687.654 ms -390.668 MB 검색: 1.630 ms
데이터: 400000개
색인: 964.215 ms 97.562 MB 검색: 3.251 ms
데이터: 600000개
색인: 1472.666 ms 97.668 MB 검색: 4.910 ms
데이터: 800000개
색인: 1964.438 ms 97.688 MB 검색: 6.555 ms
데이터: 1000000개
색인: 2472.897 ms 97.777 MB 검색: 8.147 ms


# Key parameters of HNSW Index

## Paramter: `m`

In [11]:
import numpy as np

k = 1
d = xq.shape[1]
nq = 1000
xq = xq[:nq]

for m in [8, 16, 32, 64]:
    index = faiss.IndexHNSWFlat(d, m)
    time.sleep(3)
    start_memory = get_memory_usage_mb()
    start_index = time.time()
    index.add(xb)
    end_memory = get_memory_usage_mb()
    end_index = time.time()
    print(f"M: {m} - Indexing time: {end_index - start_index} s, Memory usage: {end_memory - start_memory} MB")

    t0 = time.time()
    D, I = index.search(xq, k)
    t1 = time.time()

    recall_at_1 = np.equal(I, gt[:nq, :1]).sum() / float(nq)
    print(f"{(t1 - t0) * 1000.0 / nq:.3f} ms per query, R@1 {recall_at_1:.3f}")

M: 8 - Indexing time: 10.826627492904663 s, Memory usage: 598.71875 MB
2121.793 ms per query, R@1 0.684
M: 16 - Indexing time: 13.72940731048584 s, Memory usage: 621.51171875 MB
2138.547 ms per query, R@1 0.774
M: 32 - Indexing time: 26.921372652053833 s, Memory usage: 736.640625 MB
2168.507 ms per query, R@1 0.891
M: 64 - Indexing time: 37.020442485809326 s, Memory usage: 1011.65234375 MB
2208.576 ms per query, R@1 0.928


## Parameter: `ef_construction`

In [12]:
k = 1
d = xq.shape[1]
nq = 1000
xq = xq[:nq]

for ef_construction in [40, 80, 160, 320]:
    index = faiss.IndexHNSWFlat(d, 32)
    index.hnsw.efConstruction = ef_construction
    start_memory = get_memory_usage_mb()
    start_index = time.time()
    index.add(xb)
    end_memory = get_memory_usage_mb()
    end_index = time.time()
    print(f"efConstruction: {ef_construction} - Indexing time: {end_index - start_index} s, Memory usage: {end_memory - start_memory} MB")

    t0 = time.time()
    D, I = index.search(xq, k)
    t1 = time.time()

    recall_at_1 = np.equal(I, gt[:nq, :1]).sum() / float(nq)
    print(f"{(t1 - t0) * 1000.0 / nq:.3f} ms per query, R@1 {recall_at_1:.3f}")

efConstruction: 40 - Indexing time: 26.81549048423767 s, Memory usage: 748.63671875 MB
0.019 ms per query, R@1 0.909
efConstruction: 80 - Indexing time: 33.101176261901855 s, Memory usage: 736.4296875 MB
0.014 ms per query, R@1 0.873
efConstruction: 160 - Indexing time: 68.20233583450317 s, Memory usage: 736.23828125 MB
0.016 ms per query, R@1 0.883
efConstruction: 320 - Indexing time: 134.2638123035431 s, Memory usage: 736.45703125 MB
0.018 ms per query, R@1 0.903


## Parameter: `ef_search`

In [13]:
for ef_search in [16, 32, 64, 128]:
    index.hnsw.efSearch = ef_search
    t0 = time.time()
    D, I = index.search(xq, k)
    t1 = time.time()

    recall_at_1 = np.equal(I, gt[:nq, :1]).sum() / float(nq)
    print(f"{(t1 - t0) * 1000.0 / nq:.3f} ms per query, R@1 {recall_at_1:.3f}")

0.018 ms per query, R@1 0.903
0.028 ms per query, R@1 0.963
0.049 ms per query, R@1 0.988
0.086 ms per query, R@1 0.994


# Pinecone

## Pinecone Client

**Connect account & Create index** 

In [14]:
from pinecone import Pinecone, ServerlessSpec

pinecone_api_key = ""

pc = Pinecone(api_key=pinecone_api_key)
pc.create_index("llm-book", spec=ServerlessSpec("aws", "us-east-1"), dimension=768)

index = pc.Index('llm-book') # Load Index

**Create embeddings**

In [38]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')
klue_dp_train = load_dataset('klue', 'dp', split='train[:100]')

embeddings = sentence_model.encode(klue_dp_train['sentence'])

In [39]:
klue_dp_train['sentence'][0]

'해당 그림을 보면 디즈니 공주들이 브리트니 스피어스의 앨범이나 뮤직비디오, 화보 속 모습을 똑같이 재연했다.'

**Data processing**

In [40]:
embeddings = embeddings.tolist()

# Data format for pinecone: {"id": document ID(str), "values": embeddings(List[float]), "metadata": metadata(dict)}
insert_data = []
for idx, (embedding, text) in enumerate(zip(embeddings, klue_dp_train['sentence'])):
    insert_data.append({"id": str(idx), "values": embedding, "metadata": {'text': text}})

**Save embedding to index**

In [41]:
upsert_response = index.upsert(vectors=insert_data, namespace='llm-book-sub')

**Retrieve index**

In [42]:
query_response = index.query(
    namespace='llm-book-sub',
    top_k=10,
    include_values=True,
    include_metadata=True,
    vector=embeddings[0]
)

query_response

{'matches': [{'id': '0',
              'metadata': {'text': '해당 그림을 보면 디즈니 공주들이 브리트니 스피어스의 앨범이나 뮤직비디오, '
                                   '화보 속 모습을 똑같이 재연했다.'},
              'score': 1.00000012,
              'values': [-1.10073423,
                         0.220477536,
                         0.742353499,
                         0.407551408,
                         0.408452392,
                         -0.566836,
                         0.120408706,
                         0.961137474,
                         0.122868158,
                         0.0803638846,
                         -0.269531369,
                         1.06352246,
                         0.799611807,
                         -0.65263629,
                         0.280845642,
                         0.298936188,
                         0.34992364,
                         -0.300159127,
                         -0.220105439,
                         0.143383831,
                         0.499212712,
    

**document update & delete**

In [None]:
new_text = 'new text for updating'
new_embedding = sentence_model.encode(new_text).tolist()

# update
update_response = index.update(
    id='existing_document_id',
    valeus=new_embedding,
    set_metadata={'text': new_text},
    namespace='llm-book-sub'
)

# delete
delete_response = index.delete(ids=['existing_document_id'], namespace='llm-book-sub')

## LlamaIndex

**Use different vector database in LlamaIndex**

In [None]:
# Pinecone setting
from pinecone import Pinecone

pc = Pinecone(api_key=pinecone_api_key)
pc.create_index(
    "quickstart", dimension=1536, metric="euclidean", spec=ServerlessSpec("aws", "us-east-1"))
pinecone_index = pc.Index("quickstart")

In [None]:
# Connect pinecone index to LlamaIndex
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

# Implementation Multi-modal Retrieve using Pinecone

## Dataset

In [7]:
from datasets import load_dataset

dataset = load_dataset("poloclub/diffusiondb", "2m_first_1k", split='train')

example_index = 867
original_image = dataset[example_index]['image']
original_prompt = dataset[example_index]['prompt']
print(original_prompt)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


cute fluffy baby cat rabbit lion hybrid mixed creature character concept, with long flowing mane blowing in the wind, long peacock feather tail, wearing headdress of tribal peacock feathers and flowers, detailed painting, renaissance, 4 k 


## Image Explanation using GPT-4o

In [8]:
import base64
import requests
from io import BytesIO

def make_base64(image):
    buffered = BytesIO()
    image.save(buffered, format='JPEG')
    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return img_str

def generate_description_from_image_gpt4(prompt, image64):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {client.api_key}"
    }

    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image64}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 300
    }
    response_oai = requests.post("https://api.openai.com/v1/chat/completions",
                                 headers=headers, json=payload)
    result = response_oai.json()['choices'][0]['message']['content']
    return result

In [9]:
import os

os.environ["OPENAI_API_KEY"] = ''

In [10]:
from openai import OpenAI
client = OpenAI()

In [11]:
# Image explanation
image_base64 = make_base64(original_image)
described_result = generate_description_from_image_gpt4("Describe provided image", image_base64)

In [12]:
described_result

'The image depicts a majestic lion with an elaborate and colorful mane that appears to be adorned with vibrant, peacock-like feathers. The lion is centered in a lush, natural setting with a soft-focus background featuring greenery and purple flowers. The artwork combines elements of realism with fantastical features, creating a visually striking and imaginative scene.'

## Save prompt

In [13]:
from pinecone import Pinecone, ServerlessSpec

pinecone_api_key = ""

pc = Pinecone(api_key=pinecone_api_key)
client = OpenAI()

In [18]:
print(pc.list_indexes())

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 768,
              'host': 'llm-book-9pwawwd.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'llm-book',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'deletion_protection': 'disabled',
              'dimension': 1536,
              'host': 'quickstart-9pwawwd.svc.aped-4627-b74a.pinecone.io',
              'metric': 'euclidean',
              'name': 'quickstart',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}


In [19]:
index_name = "llm-multimodal"
try:
    pc.create_index(
        name=index_name,
        dimension=512,
        metric='cosine',
        spec=ServerlessSpec('aws', 'us-east-1')
    )
    print(f"Index '{index_name}' created successfully.")
except Exception as e:
    print(f"Failed to create index: {e}")
    
index = pc.Index(index_name)

Index 'llm-multimodal' created successfully.


In [20]:
print(pc.list_indexes())

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 768,
              'host': 'llm-book-9pwawwd.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'llm-book',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'deletion_protection': 'disabled',
              'dimension': 512,
              'host': 'llm-multimodal-9pwawwd.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'llm-multimodal',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'deletion_protection': 'disabled',
              'dimension': 1536,
              'host': 'quickstart-9pwawwd.svc.aped-4627-b74a.pinecone.io',
              'metric': 'euclidean',
              'name': 'quickstart',
              'spec': {'serverless':

**prompt text to embedding vector**

In [21]:
import torch

from tqdm.auto import trange
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, CLIPTextModelWithProjection

device = "cuda" if torch.cuda.is_available() else "cpu"

text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

tokens = tokenizer(dataset['prompt'], padding=True, return_tensors='pt', truncation=True)
batch_size = 16
text_embs = []
for start_idx in trange(0, len(dataset), batch_size):
    with torch.no_grad():
        outputs = text_model(
            input_ids = tokens['input_ids'][start_idx: start_idx+batch_size],
            attention_mask = tokens['attention_mask'][start_idx: start_idx+batch_size])
        text_emb_tmp = outputs.text_embeds
    text_embs.append(text_emb_tmp)
text_embs = torch.cat(text_embs, dim=0)
text_embs.shape

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

  0%|          | 0/63 [00:00<?, ?it/s]

torch.Size([1000, 512])

**Text embedding to pinecone index**

In [22]:
input_data = []

for id_int, emb, prompt in zip(range(0, len(dataset)), text_embs.tolist(), dataset['prompt']):
    input_data.append(
        {
            "id": str(id_int),
            "values": emb,
            "metadata": {
                "prompt": prompt
            }
        }
    )

index.upsert(
    vectors=input_data
)

{'upserted_count': 1000}

## Retrieve image embeddings

**similar prompt retrieve using image embedding**

In [23]:
from transformers import AutoProcessor, CLIPVisionModelWithProjection

vision_model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

inputs = processor(images=original_image, return_tensors="pt")

outputs = vision_model(**inputs)
image_embeds = outputs.image_embeds

search_results = index.query(
    vector=image_embeds[0].tolist(),
    top_k=3,
    include_values=False,
    include_metadata=True
)

search_idx = int(search_results['matches'][0]['id'])

2024-11-28 00:35:29.502425: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-28 00:35:29.600139: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-28 00:35:29.604015: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-11-28 00:35:29.604025: I tensorflow/stream_executor/cuda

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

In [24]:
search_results

{'matches': [{'id': '918',
              'metadata': {'prompt': 'cute fluffy bunny cat lion hybrid mixed '
                                     'creature character concept, with long '
                                     'flowing mane blowing in the wind, long '
                                     'peacock feather tail, wearing headdress '
                                     'of tribal peacock feathers and flowers, '
                                     'detailed painting, renaissance, 4 k '},
              'score': 0.37472102,
              'values': []},
             {'id': '817',
              'metadata': {'prompt': 'cute fluffy baby cat lion hybrid mixed '
                                     'creature character concept, with long '
                                     'flowing mane blowing in the wind, long '
                                     'peacock feather tail, wearing headdress '
                                     'of tribal peacock feathers and flowers, '
           

## Generate image using DALL-E 3

**Func. Genrate and save image using prompt**

In [25]:
from PIL import Image

def generate_image_dalle3(prompt):
    response_oai = client.images.generate(
        model="dall-e-3",
        prompt=str(prompt),
        size="1024x1024",
        quality="standard",
        n=1,
    )
    result = response_oai.data[0].url
    return result

def get_generated_image(image_url):
    generated_image = requests.get(image_url).content
    image_filename = 'gen_img.png'
    with open(image_filename, 'wb') as image_file:
        image_file.write(generated_image)
    return Image.open(image_filename)

In [27]:
# GPT-4o prompt
gpt_described_image_url = generate_image_dalle3(described_result)
gpt4o_prompt_image = get_generated_image(gpt_described_image_url)

# Original prompt
original_prompt_image_url = generate_image_dalle3(original_prompt)
original_prompt_image = get_generated_image(original_prompt_image_url)

# Retrieved prompt
searched_prompt_image_url = generate_image_dalle3(dataset[search_idx]['prompt'])
searched_prompt_image = get_generated_image(searched_prompt_image_url)