# Populate Vector DB with Image Description Embeddings

## Fetch Images from Pickle file

We will generate a text description of each image using a vision LLM. That description will be stored as an embedding for future search.

#### Function to Generate Text Description

In [None]:
import base64
import os
from PIL import Image
import io
import requests
from openai import OpenAI

from config import Config


# Shared OpenAI client for the notebook. The API key is read from the
# OPENAI_API_KEY environment variable (None is passed when it is unset).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def encode_image(image_url):
    """Download an image and return it as a base64-encoded JPEG string.

    The image is shrunk so that neither side exceeds 512 pixels (aspect
    ratio preserved) and is then re-encoded as JPEG.

    Args:
        image_url: HTTP(S) URL of the image to download.

    Returns:
        The resized image's JPEG bytes, base64-encoded and decoded to a
        UTF-8 string.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # A timeout keeps one stalled download from hanging the whole run.
    response = requests.get(image_url, timeout=30)
    # Fail with a clear HTTP error instead of handing an error page to PIL.
    response.raise_for_status()

    # response.content is fully downloaded and content-decoded, unlike
    # response.raw, which exposes the undecoded socket stream.
    with Image.open(io.BytesIO(response.content)) as img:
        # Resize the image to maximum dimension of 512 pixels
        img.thumbnail((512, 512))

        # JPEG cannot store alpha or palette images; normalise those modes
        # (and anything else exotic) to RGB so the save below cannot fail.
        jpeg_ready = img if img.mode in ("RGB", "L") else img.convert("RGB")

        buffered = io.BytesIO()
        jpeg_ready.save(buffered, format="JPEG")

    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def analyse_image_file(image_url, title):
    """Ask the vision model for a text description of the item in an image.

    The image is downloaded and base64-encoded with encode_image, then sent
    as a data URL together with the system prompt from Config.SYSTEM_PROMPT
    and a user prompt that names the item to describe.

    Args:
        image_url: URL of the image to describe.
        title: Name of the item; interpolated into the user prompt so the
            model is told which item to describe.

    Returns:
        The message content of the first choice in the model's response.
    """
    encoded_string = encode_image(image_url)
    # The f-prefix is required: without it the literal text "{title}" is
    # sent to the model instead of the actual product title.
    user_prompt = f"Describe in detail only the {title}."
    response = client.chat.completions.create(
        model=Config.VISION_MODEL,
        #max_tokens=Config.MAX_TOKENS,
        messages=[
            {
                "role": "system",
                "content": [{"type": "text", "text": Config.SYSTEM_PROMPT}],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            # Inline the image as a data URL rather than
                            # passing the remote URL.
                            "url": f"data:image/jpeg;base64,{encoded_string}",
                            #"detail": Config.DETAIL,
                        },
                    }
                ],
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": user_prompt}],
            },
        ],
    )
    return response.choices[0].message.content

#### Read Pickle File

In [None]:
import pickle

# Restore the previously saved documents from disk. The pickle holds a list
# of LangChain Document objects, which is bound to "data".
with open("website_data.pkl", "rb") as pickle_file:
    data = pickle.load(pickle_file)

In [None]:
"""
page_content='Description:\nYour next special occasion outfit calls for our A-Line Lace Midi Skirt in Bright Chartreuse. Featuring a scalloped hem with eyelash lace, this is a soft and vibrant skirt that will light up your outfit. Create an alluring set with our Lace Shirt in matching Bright Chartreuse.Style/SKU: 416763CBM.3265\n\nDetails:\nCotton blend lace/viscose jersey lining\nMidi-length\nGarment length: 80cm (size XS)\nA-line silhouette\nCentre back invisible zip with hook and eye\nfastening\nScalloped hem with eyelash lace\nFully lined\nMain: 45% cotton, 35% polyamide, 20% viscose\nLining: 95% viscose, 5% elastane\nBright Chartreuse\nModel is 175cm tall and wearing a size XS' 
metadata={
    'source': 'https://blueillusion.com/products/an-a-line-lace-midi-skirt-416763cbm-bright-chartreuse', 
    'title': 'A-Line Lace Midi Skirt', 
    'price': '25.00', 
    'img': '//blueillusion.com/cdn/shop/files/23.04.26.BlueIllusion_s10_1077.jpg', 
    'language': 'en'
}
"""

#### Update page_content with image description

- The Document data imported from the pickle file has two attributes: page_content and metadata.
- The page_content is formatted with a Description section and a Details section.
- We will add a new section called Image Description, which will hold the LLM generated image description.

In [None]:
# Load the entries of images_done.txt, one per line. They are compared
# against full image URLs later to skip images that were already handled.
with open("images_done.txt", "r") as done_file:
    images_done = done_file.read().splitlines()

# Show how many images are already marked as done.
print(len(images_done))

In [None]:
def _add_image_description(page_content, text_description):
    """Return page_content with an "Image Description" section inserted.

    The new section is placed after the Description text (with everything
    from "Style/SKU" onwards removed) and before the Details section.

    Args:
        page_content: Original document text, normally containing a
            "Details:" marker.
        text_description: Image description generated by the vision model.

    Returns:
        The rebuilt page content. If no "Details:" marker is present, the
        result ends after the Image Description section.
    """
    # partition never raises: it splits on the first "Details:" only, and
    # returns an empty separator when the marker is absent. Unpacking
    # split("Details:") into two names fails in both of those cases.
    description, separator, details = page_content.partition("Details:")

    # remove everything after Style/SKU
    description = description.split("Style/SKU")[0]

    # append text_description to the description
    description += f"\n\nImage Description:\n{text_description}"

    if not separator:
        return description

    # recombine the description and details
    return description + f"\n\nDetails: {details}"


for document in data:
    # get the image URL from the metadata
    image_url = f"https:{document.metadata['img']}"

    # check if the image has already been processed
    if image_url in images_done:
        continue

    print(f"Processing: {image_url} ... ", end=" ")

    # skip anything whose URL does not end in a JPEG extension
    if not image_url.endswith((".jpg", ".jpeg")):
        print("Not JPG")
        continue

    # generate the text description from the image
    title = document.metadata['title']
    text_description = analyse_image_file(image_url, title)

    # insert the description between the Description and Details sections
    document.page_content = _add_image_description(
        document.page_content, text_description
    )

    print("Done")

    # checkpoint after every document so an interrupted run keeps its progress
    with open("pickle_progress201.pkl", "wb") as f:
        pickle.dump(data, f)

#### Write New Pickle File

In [None]:
import pickle

# Persist the document list with pickle so that a later session can restore
# it as a list of documents via pickle.load.
with open("website_data_with_image_descriptions.pkl", "wb") as out_file:
    pickle.dump(data, out_file)

### Join Pickle Files

In [None]:
import pickle

# Merge several progress pickles into one list, keeping only documents that
# already carry an image description and at most one document per image URL.
data_new = []
image_files = []
# Set mirror of image_files: duplicate checks are constant-time instead of
# scanning the list once per document.
seen_images = set()

files = [
    "pickle_progress0.pkl",
    "pickle_progress201.pkl",
]

for file in files:

    # load pickle file
    with open(file, "rb") as f:
        data = pickle.load(f)

    # per-file counters: documents kept (yes) and documents skipped because
    # they have no image description yet (no)
    yes = 0
    no = 0
    for doc in data:
        page_content = doc.page_content
        image_file = f"https:{doc.metadata['img']}"

        # a document for this image was already kept, so skip it (not counted)
        if image_file in seen_images:
            continue

        # no Image Description yet for this document: count it and move on
        if "Image Description:" not in page_content:
            no += 1
            continue

        # first described document for this image: remember the image and
        # keep the document
        seen_images.add(image_file)
        image_files.append(image_file)
        data_new.append(doc)
        yes += 1
    print(f"yes: {yes}, no: {no}")
    print(len(data_new))

In [None]:
# Optional spot check of a single merged document:
#print (data_new[50].page_content)

# Save the merged document list (data_new) to the final pickle file.
with open("website_data_with_image_descriptions.pkl", "wb") as merged_file:
    pickle.dump(data_new, merged_file)