In [1]:
import pandas as pd
import numpy as np

In [10]:
amazon = pd.read_csv("metadata.csv", index_col = 0)

In [11]:
amazon.head()

Unnamed: 0,Image,Text_Description
0,https://images-na.ssl-images-amazon.com/images...,"DB Longboards CoreFlex Crossbow 41"" Bamboo Fib..."
1,https://images-na.ssl-images-amazon.com/images...,"Electronic Snap Circuits Mini Kits Classpack, ..."
2,https://images-na.ssl-images-amazon.com/images...,3Doodler Create Flexy 3D Printing Filament Ref...
3,https://images-na.ssl-images-amazon.com/images...,Guillow Airplane Design Studio with Travel Cas...
4,https://images-na.ssl-images-amazon.com/images...,Woodstock- Collage 500 pc Puzzle|Toys & Games ...


In [12]:
amazon.shape

(1776, 2)

In [13]:
# import os
# import getpass

# # Prompt for OpenAI API key securely
# api_key = getpass.getpass("Enter your OpenAI API key: ")

# # Set the API key as an environment variable
# os.environ["OPENAI_API_KEY"] = api_key

In [14]:
pd.set_option('display.max_colwidth', None)

In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch

# Load pretrained CLIP model and processor
model_name = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(model_name)
clip_processor = CLIPProcessor.from_pretrained(model_name)

# Ensure the proper device is set (CUDA if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device
clip_model.to(device)
clip_model.eval()  # Set model to evaluation mode

In [16]:
import torch
from tqdm import tqdm

def truncate_text(text, max_length=77):
    """
    Truncates a text to a specified maximum number of tokens.
    
    Parameters:
        text (str): The input text to be truncated.
        max_length (int): Maximum number of tokens allowed.

    Returns:
        str: Truncated text.
    """
    words = text.split()
    return " ".join(words[:max_length])


def get_text_embeddings_with_truncation(texts, max_length=77, batch_size=32):
    """
    Generates embeddings for text using truncation to fit the maximum token length.
    
    Parameters:
        texts (List[str]): List of input texts.
        max_length (int): Maximum length of tokens for CLIP (default is 77 tokens).
        batch_size (int): Batch size for processing text.

    Returns:
        List[Dict]: A list where each entry contains `truncated_text`, `embedding`, and `text_id`.
    """
    all_text_embeddings = []
    for text_id, text in enumerate(tqdm(texts, desc="Processing texts")):
        # Step 1: Truncate the text to the maximum allowed length
        truncated_text = truncate_text(text, max_length=max_length)
        
        # Step 2: Process the truncated text in batches
        inputs = clip_processor(
            text=[truncated_text], return_tensors="pt", padding=True, truncation=True
        ).to(device)
        with torch.no_grad():
            text_features = clip_model.get_text_features(**inputs)

        # Step 3: Store the embedding with metadata
        all_text_embeddings.append({
            "text_id": text_id,
            "truncated_text": truncated_text,
            "embedding": text_features.cpu()
        })

    return all_text_embeddings


# Example usage
text_descriptions = amazon["Text_Description"].tolist()  # Replace with your DataFrame column
text_embeddings = get_text_embeddings_with_truncation(
    text_descriptions, max_length=77, batch_size=64
)

Processing texts: 100%|██████████| 1776/1776 [00:16<00:00, 108.86it/s]


In [17]:
len(text_embeddings)

1776

In [18]:
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import torch

# Generate image embeddings in batches
def get_image_embeddings_in_batches(image_urls, batch_size=16):
    all_embeddings = []

    for i in tqdm(range(0, len(image_urls), batch_size)):
        batch_urls = image_urls[i:i + batch_size]
        
        # Fetch and preprocess the images in the batch
        image_list = []
        for url in batch_urls:
            try:
                response = requests.get(url, timeout=10)
                image = Image.open(BytesIO(response.content)).convert("RGB")
                image_list.append(image)
            except Exception as e:
                print(f"Error fetching or processing image from {url}: {e}")
                continue
        
        if not image_list:
            continue
        
        inputs = clip_processor(images=image_list, return_tensors="pt", padding=True).to(device)
        
        with torch.no_grad():
            # Generate embeddings for the batch
            image_embeddings = clip_model.get_image_features(**inputs)
        
        # Move to CPU and append to results
        all_embeddings.append(image_embeddings.cpu())
    
    # Concatenate all batch embeddings
    return torch.cat(all_embeddings, dim=0)

# Example usage
image_urls = amazon["Image"].tolist()  # Replace `amazon` with your DataFrame variable
image_embeddings = get_image_embeddings_in_batches(image_urls, batch_size=64)

100%|██████████| 28/28 [03:08<00:00,  6.72s/it]


In [19]:
image_embeddings.shape

torch.Size([1776, 512])

In [20]:
import faiss
import numpy as np

# Initialize FAISS index
embedding_dim = 512
index = faiss.IndexFlatIP(embedding_dim)  # Use Inner Product for cosine similarity

# Initialize metadata storage
metadata_faiss = []

# Normalize and add text embeddings
for entry in text_embeddings:
    embedding = entry["embedding"].numpy()
    norm = np.linalg.norm(embedding)
    if norm > 0:
        normalized_embedding = embedding / norm  # Normalize
    else:
        normalized_embedding = embedding  # Handle zero vector case

    normalized_embedding = normalized_embedding.astype(np.float32)  # Ensure float32
    index.add(normalized_embedding.reshape(1, -1))  # Add to FAISS
    metadata_faiss.append({
        "type": "text",
        "text_id": entry["text_id"],
        "content": entry["truncated_text"]  # Store truncated text for reference
    })

# Normalize and add image embeddings
for i, embedding in enumerate(image_embeddings):
    embedding = embedding.numpy()
    norm = np.linalg.norm(embedding)
    if norm > 0:
        normalized_embedding = embedding / norm  # Normalize
    else:
        normalized_embedding = embedding  # Handle zero vector case

    normalized_embedding = normalized_embedding.astype(np.float32)  # Ensure float32
    index.add(normalized_embedding.reshape(1, -1))  # Add to FAISS
    metadata_faiss.append({
        "type": "image",
        "image_id": i,
        "path": image_urls[i]  # Store the image file path or URL for reference
    })

# Ensure metadata and FAISS index are aligned
assert len(metadata_faiss) == index.ntotal, "Metadata and FAISS index embeddings count mismatch!"

print(f"FAISS index built with {index.ntotal} embeddings.")

FAISS index built with 3552 embeddings.


In [21]:
# Save FAISS index to a file
faiss.write_index(index, "faiss_index.bin")

In [22]:
import pickle

# Save metadata to a file
with open("metadata_faiss.pkl", "wb") as f:
    pickle.dump(metadata_faiss, f)

In [23]:
# Load FAISS index from the file
index = faiss.read_index("faiss_index.bin")

In [24]:
# Load metadata from the file
with open("metadata_faiss.pkl", "rb") as f:
    metadata_faiss = pickle.load(f)

In [87]:
def retrieve_top_k_by_type(query_embedding, k=5, target_type="text"):
    """
    Retrieves the top-k nearest neighbors from the FAISS index filtered by type.

    Parameters:
        query_embedding (torch.Tensor): The query embedding.
        k (int): Number of top results to retrieve.
        target_type (str): The type of results to retrieve ("text" or "image").

    Returns:
        List[Dict]: A list of metadata for the top-k results of the specified type.
    """
    # Step 1: Convert the query embedding to numpy and normalize
    query_embedding = query_embedding.reshape(1, -1).astype(np.float32)
    
    # Step 2: Perform the search in the FAISS index
    distances, indices = index.search(query_embedding, index.ntotal)  # Search all entries
    
    # Step 3: Filter results by the desired type
    filtered_results = []
    for idx, i in enumerate(indices[0]):
        result = {**metadata[i], "distance": distances[0][idx]}
        if result["type"] == target_type:
            filtered_results.append(result)
        if len(filtered_results) >= k:  # Stop after retrieving enough results
            break
    
    return filtered_results

In [88]:
from PIL import Image
import torch
import numpy as np

def generate_query_embedding(text_query=None, image_query_path=None, text_weight=0.5, image_weight=0.5):
    """
    Generates a query embedding based on the provided text and/or image query.

    Parameters:
        text_query (str): Text query for the search (optional).
        image_query_path (str): Path to the query image (optional).
        text_weight (float): Weight for the text embedding (default 0.5).
        image_weight (float): Weight for the image embedding (default 0.5).

    Returns:
        np.ndarray: Normalized query embedding for FAISS search.
    """
    text_embedding = None
    image_embedding = None

    # Generate text embedding if text query is provided
    if text_query:
        text_inputs = clip_processor(text=[text_query], return_tensors="pt").to(device)
        with torch.no_grad():
            text_embedding = clip_model.get_text_features(**text_inputs)
        text_embedding = text_embedding.cpu().numpy()  # Convert to NumPy array

    # Generate image embedding if image query is provided
    if image_query_path:
        response = requests.get(image_query_path, timeout=10)
        query_image = Image.open(BytesIO(response.content)).convert("RGB")
        image_inputs = clip_processor(images=[query_image], return_tensors="pt").to(device)
        with torch.no_grad():
            image_embedding = clip_model.get_image_features(**image_inputs)
        image_embedding = image_embedding.cpu().numpy()  # Convert to NumPy array

    # Handle the three cases: only text, only image, or both
    if text_embedding is not None and image_embedding is not None:
        # Combine text and image embeddings
        combined_embedding = text_weight * text_embedding + image_weight * image_embedding
        query_embedding = combined_embedding
    elif text_embedding is not None:
        query_embedding = text_embedding  # Only text query
    elif image_embedding is not None:
        query_embedding = image_embedding  # Only image query
    else:
        raise ValueError("At least one of text_query or image_query_path must be provided.")

    # Normalize the query embedding (L2 normalization)
    norm = np.linalg.norm(query_embedding)
    if norm > 0:
        normalized_query_embedding = query_embedding / norm
    else:
        normalized_query_embedding = query_embedding  # Handle zero vector case

    return normalized_query_embedding.astype(np.float32)

In [89]:

# Example usage

# # Case 1: Text query only
# query_text = "Can you provide a detailed introduction to DB Longboards and their product range?"
# query_embedding = generate_query_embedding(text_query=query_text)

# # Case 2: Image query only
# query_image_path = image_urls[0]  # Replace with your image path
# query_embedding = generate_query_embedding(image_query_path=query_image_path)

# Case 3: Both text and image query
query_text = "Can you provide a detailed introduction to DB Longboards and their product range?"
query_image_path = image_urls[0]
query_embedding = generate_query_embedding(text_query=query_text, image_query_path=query_image_path)

# Retrieve top-5 relevant items
top_k_results_text = retrieve_top_k_by_type(query_embedding, k=3, target_type="text")
top_k_results_image = retrieve_top_k_by_type(query_embedding, k=3, target_type="image")

print(top_k_results_text)
print(top_k_results_image)

[{'type': 'text', 'text_id': 227, 'content': 'Rayne Longboards Demonseed Longboard Complete | Sports & Outdoors | Outdoor Recreation | Skates, Skateboards & Scooters | Skateboarding | Standard Skateboards & Longboards | Longboards | $249.95', 'distance': 0.5939774}, {'type': 'text', 'text_id': 903, 'content': 'Yocaher New VW Vibe Beach Series Longboard Complete Cruiser and Decks Available for All Shapes (Complete-Oldschool-Blue) | Sports & Outdoors | Outdoor Recreation | Skates, Skateboards & Scooters | Skateboarding | Standard Skateboards & Longboards | Longboards | $69.99', 'distance': 0.5876462}, {'type': 'text', 'text_id': 0, 'content': 'DB Longboards CoreFlex Crossbow 41" Bamboo Fiberglass Longboard Complete | Sports & Outdoors | Outdoor Recreation | Skates, Skateboards & Scooters | Skateboarding | Standard Skateboards & Longboards | Longboards | $237.68', 'distance': 0.58075}]
[{'type': 'image', 'image_id': 0, 'path': 'https://images-na.ssl-images-amazon.com/images/I/51j3fPQTQkL.

In [90]:
def rag_with_llm(query_text, query_image_path=None, k=3):
    """
    Retrieves relevant text and image results and generates a response using an LLM.

    Parameters:
        query_text (str): The text query from the user.
        query_image_path (str): Path to an optional image query.
        k (int): Number of top results to retrieve per type.

    Returns:
        str: The LLM's response (text, image references, or both).
    """
    # Step 1: Generate query embedding
    query_embedding = generate_query_embedding(
        text_query=query_text, image_query_path=query_image_path
    )
    
    # Step 2: Retrieve top-k text and image results
    text_results = retrieve_top_k_by_type(query_embedding, k=k, target_type="text")
    image_results = retrieve_top_k_by_type(query_embedding, k=k, target_type="image")
    
    # Step 3: Build the LLM prompt
    prompt = "You are a helpful assistant. Respond based on the query and retrieved results.\n\n"
    prompt += f"User Query: {query_text}\n\n"
    
    if query_image_path:
        prompt += f"User also uploaded an image: {query_image_path}\n\n"
    
    prompt += "Retrieved Text Results:\n"
    for i, result in enumerate(text_results):
        prompt += f"{i+1}. {result['content']} (Score: {result['distance']:.2f})\n"
    
    prompt += "\nRetrieved Image Results:\n"
    for i, result in enumerate(image_results):
        prompt += f"{i+1}. Image URL: {result['path']} (Score: {result['distance']:.2f})\n"
    
    prompt += "\nPlease respond appropriately with text, images, or both, based on the query and retrieved results.\n"

    # Step 4: Generate LLM response
    response = llm_generate(prompt)  # Replace with your LLM's generate method
    
    return response

In [106]:
from openai import OpenAI

def llm_generate(prompt):
    """
    Generates a response from an LLM (e.g., OpenAI GPT) given a prompt.

    Parameters:
        prompt (str): The input prompt for the LLM.

    Returns:
        str: The generated response.
    """
    client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Replace with your model
        messages=[
            {"role": "system", "content": "You are an AI assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.7,
        max_tokens=500,
    )
    return response.choices[0].message.content

In [107]:
# Query Example
query_text = "Tell me about DB Longboards and show me related products."
query_image_path = None  # Example: Add a path to an image if applicable

# Run the RAG pipeline
response = rag_with_llm(query_text, query_image_path=query_image_path, k=5)

# Output the response
print("LLM Response:")
print(response)

LLM Response:
It seems that the retrieved results do not provide specific information about DB Longboards. However, I can offer a general overview of longboards and what makes DB Longboards popular.

### About DB Longboards
DB Longboards is known for crafting high-quality longboards that cater to a variety of riding styles, including cruising, carving, and downhill racing. Their boards often feature unique designs and are made from durable materials to ensure a smooth ride and longevity. DB Longboards typically focus on providing a balance of performance and aesthetics, making them a favorite among both casual riders and serious enthusiasts.

### Related Products
While I couldn't find specific longboards from DB, here are some general products related to games and accessories that were retrieved:

1. **Nuts** - $9.95
   ![Nuts](https://images-na.ssl-images-amazon.com/images/I/41ZVAsZsdNL.jpg)

2. **What's In the Box** - $19.25
   ![What's In the Box](https://images-na.ssl-images-amazon

In [18]:
from huggingface_hub import login

# Replace with your token
login(token="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")

In [1]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

  from .autonotebook import tqdm as notebook_tqdm
Downloading shards: 100%|██████████| 4/4 [06:21<00:00, 95.32s/it] 
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.02s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


In [32]:
def prepare_prompt_with_texts_and_images(user_query, retrieved_items):
    """
    Combines retrieved text and images into a prompt for the LLM.
    """
    text_chunks = [item["chunk"] for item in retrieved_items if item["type"] == "text"]
    image_paths = [item["path"] for item in retrieved_items if item["type"] == "image"]

    prompt = f"""
    User Query: {user_query}

    Context:
    - Relevant Texts: {" ".join(text_chunks)}
    - Relevant Images: {", ".join(image_paths)}

    Provide a detailed response based on the context.
    """
    return prompt


# Generate the prompt
prompt = prepare_prompt_with_texts_and_images(query_text, top_k_results)

# Generate the LLM response
response = pipeline(prompt, max_new_tokens=50,)
print(response[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 1002.00 MiB. GPU 0 has a total capacity of 14.57 GiB of which 166.75 MiB is free. Process 76593 has 756.00 MiB memory in use. Including non-PyTorch memory, this process has 13.66 GiB memory in use. Of the allocated memory 12.93 GiB is allocated by PyTorch, and 627.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from langchain.vectorstores import Chroma
from langchain.schema import Document

text_documents = [Document(page_content=text) for text in text_descriptions]
image_documents = [Document(page_content=path) for path in image_paths]

documents = text_documents + image_documents
embeddings = np.vstack([text_embeddings, image_embeddings]).tolist()  # Stack text and image embeddings

In [None]:
import chromadb
from chromadb.config import Settings

# Initialize Chroma client
persist_dir = "chroma_multimodal_db"
client = chromadb.Client(Settings(persist_directory=persist_dir))

# Create or load a collection
collection = client.get_or_create_collection("multimodal_collection")

# Add documents with embeddings
documents = [doc.page_content for doc in text_documents + image_documents]  # Document content
metadatas = [{"type": "text"}] * len(text_documents) + [{"type": "image"}] * len(image_documents)  # Metadata
ids = [f"id_{i}" for i in range(len(documents))]  # Unique IDs

# Add data to the collection
collection.add(
    embeddings=embeddings,  # Precomputed embeddings
    documents=documents,  # Corresponding text or image paths
    metadatas=metadatas,  # Metadata (optional)
    ids=ids  # Unique IDs
)

print("Data successfully added to the collection!")

In [57]:
# Generate a query embedding
query_embedding = get_text_embeddings_in_batches(["Can you show me a picture of the DB Longboards?"])[0]

# Convert the PyTorch tensor to a Python list
query_embedding_list = query_embedding.cpu().tolist()

# Query the collection
results = collection.query(
    query_embeddings=[query_embedding_list],
    n_results=5  # Number of top results
)

# Display the results
for document, metadata in zip(results["documents"], results["metadatas"]):
    print(f"Document: {document}, Metadata: {metadata}")

NameError: name 'get_text_embeddings_in_batches' is not defined

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load the Chroma vector store
vectorstore = Chroma(
    collection_name="multimodal_collection",
    persist_directory=persist_dir,
    embedding_function=None  # Already embedded documents
)

In [None]:
print(f"Collection Name (Chroma client): {collection.name}")
print(f"Persist Directory (Chroma client): {persist_directory}")
print(f"Collection Name (LangChain): {vectorstore._collection.name}")
print(f"Persist Directory (LangChain): {persist_directory}")

In [None]:
collection_v = vectorstore._collection

collection_item_count = len(collection_v.get(include=["documents"])["documents"])
print(f"Number of items in the Chroma collection: {collection_item_count}")

In [None]:
# Check the number of items in the LangChain vector store
vectorstore_retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
vectorstore_results = vectorstore_retriever.get_relevant_documents("test query")
print(f"Number of items in the LangChain vector store: {len(vectorstore_results)}")

In [None]:
# Load the LLM
model_name = "meta-llama-3b"  # Replace with your LLM model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a pipeline
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1
)

# Wrap the pipeline for LangChain
llm = HuggingFacePipeline(pipeline=llm_pipeline)

In [None]:
# Define a prompt template
template = """
You are a helpful assistant capable of answering questions based on provided context.

Context:
{context}

User Query:
{question}

Your Response:
"""
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

# Create a conversational retrieval chain
retrieval_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    return_source_documents=True,  # Include retrieved documents in the output
    combine_docs_chain_kwargs={"prompt": prompt}
)

In [None]:
from langchain.embeddings.base import Embeddings

class CLIPEmbeddingFunction(Embeddings):
    def __init__(self, clip_model, clip_processor, device, batch_size=16):
        self.clip_model = clip_model
        self.clip_processor = clip_processor
        self.device = device
        self.batch_size = batch_size  # Set batch size for processing

    def embed_documents(self, texts_or_images):
        embeddings = []

        for i in range(0, len(texts_or_images), self.batch_size):
            batch = texts_or_images[i:i + self.batch_size]

            if all(isinstance(x, str) and x.endswith((".jpg", ".png")) for x in batch):  # Image paths
                images = [Image.open(path).convert("RGB") for path in batch]
                inputs = self.clip_processor(images=images, return_tensors="pt", padding=True).to(self.device)
                with torch.no_grad():
                    batch_embeddings = self.clip_model.get_image_features(**inputs).cpu().tolist()
            elif all(isinstance(x, str) for x in batch):  # Text
                inputs = self.clip_processor(text=batch, return_tensors="pt", padding=True, truncation=True).to(self.device)
                with torch.no_grad():
                    batch_embeddings = self.clip_model.get_text_features(**inputs).cpu().tolist()
            else:
                raise ValueError("Inputs must be a list of text strings or image paths.")

            embeddings.extend(batch_embeddings)

        return embeddings

    def embed_query(self, query):
        inputs = self.clip_processor(text=[query], return_tensors="pt", padding=True, truncation=True).to(self.device)
        with torch.no_grad():
            embeddings = self.clip_model.get_text_features(**inputs).cpu().tolist()
        return embeddings[0]

In [None]:
# Initialize the custom embedding function
clip_embedding_function = CLIPEmbeddingFunction(clip_model, clip_processor, device)

In [None]:
text_documents = [Document(page_content=text) for text in text_descriptions]
image_documents = [Document(page_content=path) for path in image_paths]

documents = text_documents + image_documents

In [None]:
persist_dir = "chroma_embeddings_db"

# Create the vector store with dynamic embeddings
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=clip_embedding_function,
    persist_directory=persist_dir
)

# Persist the vectorstore
vectorstore.persist()
print("Vectorstore created and persisted!")

In [None]:
from langchain.vectorstores import Chroma
from langchain.schema import Document
import chromadb
from chromadb.config import Settings
import os

text_documents = [Document(page_content=text) for text in text_descriptions]
image_documents = [Document(page_content=path) for path in image_paths]

# Combine embeddings and documents
documents = text_documents + image_documents
embeddings = np.vstack([text_embeddings, image_embeddings]).tolist()  # Stack text and image embeddings

# Initialize Chroma database
persist_dir = "chroma_embeddings_db"
client = chromadb.Client(Settings(persist_directory=persist_dir, chroma_db_impl="duckdb+parquet"))

# Create a collection in Chroma
collection = client.get_or_create_collection("my_embeddings_collection")

# Insert precomputed embeddings into the collection
documents = [doc.page_content for doc in text_documents + image_documents]
metadata = [{"type": "text"}] * len(text_documents) + [{"type": "image"}] * len(image_documents)

collection.add(
    embeddings=embeddings,  # Precomputed embeddings
    documents=documents,  # Corresponding documents
    metadatas=metadata,  # Metadata for each document
    ids=[f"id_{i}" for i in range(len(documents))]  # Unique IDs for each document
)

In [None]:
import getpass

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()

In [None]:
os.environ["OPENAI_API_KEY"] = getpass.getpass()

In [None]:
os.environ["COHERE_API_KEY"] = getpass.getpass()

In [None]:
# Index
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.vectorstores import Chroma

# vectorstore = Chroma.from_documents(documents=chunks,
#                                     embedding=embedding)

vectorstore_openai_embed = Chroma.from_documents(documents=chunks,
                                    embedding=OpenAIEmbeddings())

# retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

retriever_openai_embed = vectorstore_openai_embed.as_retriever(search_kwargs={"k": 10})