<a href="https://colab.research.google.com/github/HowieG/TaiLOR/blob/vector-db/notebooks/semantic_text_search_using_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Semantic text search using embeddings

We can search through all our reviews semantically in a very efficient manner and at very low cost, by embedding our search query, and then finding the most similar reviews. The dataset is created in the [Obtain_dataset Notebook](Obtain_dataset.ipynb).

In [None]:
!pip install openai transformers plotly tiktoken weaviate-client

In [None]:
import os
import weaviate

def create_client():
    """Build a Weaviate client for the TaiLOR cluster.

    The Weaviate API key is read from the ``TAILOR_WEAVIATE`` environment
    variable and the OpenAI key, forwarded to Weaviate through the
    ``X-OpenAI-Api-Key`` header, from ``OPENAI_API_KEY``.

    Returns:
        The configured ``weaviate.Client``.

    Raises:
        RuntimeError: If either environment variable is missing or empty.
            Failing here is clearer than handing ``None`` credentials to the
            client and getting an authentication error later.
    """
    required = ("TAILOR_WEAVIATE", "OPENAI_API_KEY")
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    return weaviate.Client(
        url="https://tailor-wiu5z0lk.weaviate.network",
        auth_client_secret=weaviate.AuthApiKey(api_key=os.environ["TAILOR_WEAVIATE"]),
        additional_headers={
            # Inference API key used by the server-side OpenAI vectorizer
            "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"],
        },
    )

In [None]:
# Connect to the Weaviate cluster using the credentials read by create_client().
client = create_client()

# ===== add schema =====
# "Product" objects are configured with the text2vec-openai vectorizer.
product_obj = {
    "class": "Product",
    "vectorizer": "text2vec-openai"
}

# Register the Product class with the cluster.
client.schema.create_class(product_obj)

# NOTE(review): image_obj is only defined here; unlike product_obj it is never
# passed to client.schema.create_class in this cell. Confirm whether the
# "Image" class is meant to be registered explicitly.
image_obj = {
    "class": "Image"
}


In [None]:
import json
import os

import requests

# POST an image URL to TheNextLeg's v2 "describe" endpoint.
url = "https://api.thenextleg.io/v2/describe"

payload = json.dumps({
  "url": "https://encrypted-tbn0.gstatic.com/shopping?q=tbn:ANd9GcQFpRS4kjadav4d65qRFEUa9m5DEN_fmphyU7wO_ssGjA7cWY0lKwPIfVdYN5bNS_hNUX5jV_V6dk2ZthW3ix5h04g-jD8ukbPZWH3sAsngi3gnZVMSMlnk3A&usqp=CAE",
  "ref": "",
  "webhookOverride": ""
})
# Read the bearer token from the environment instead of pasting it into the
# notebook; the empty fallback reproduces the previous placeholder header.
tnl_token = os.environ.get("TNL_API_KEY", "")
headers = {
  'Authorization': 'Bearer ' + tnl_token,
  'Content-Type': 'application/json'
}

# A timeout keeps the cell from hanging indefinitely if the API never answers.
response = requests.request("POST", url, headers=headers, data=payload, timeout=30)

print(response.json())

In [None]:
import json
import os

import requests

# GET the status/result of a previously submitted TheNextLeg message by id.
url = "https://api.thenextleg.io/v2/message/XJhtBQAwsPo1kISrvDC3?expireMins=2"

# Read the bearer token from the environment instead of pasting it into the
# notebook; the empty fallback reproduces the previous placeholder header.
tnl_token = os.environ.get("TNL_API_KEY", "")
headers = {
  'Authorization': 'Bearer ' + tnl_token,
  'Content-Type': 'application/json'
}

# A timeout keeps the cell from hanging indefinitely if the API never answers.
response = requests.request("GET", url, headers=headers, timeout=30)

print(response.text)
                

In [None]:
!pip install replicate

In [None]:
import os

# Keep a token that is already exported in the environment; only fall back to
# the empty placeholder when none is set, instead of overwriting a valid token.
# This stays ahead of the `replicate` import, as in the original cell.
os.environ.setdefault("REPLICATE_API_TOKEN", "")

import replicate

# Run the img2prompt model on a local image. The context manager closes the
# file handle once the call has finished.
with open("2358824670_woman, skirt ends at knees _xl-beta-v2-2-2.png", "rb") as image_file:
    output = replicate.run(
        "methexis-inc/img2prompt:50adaf2d3ad20a6f911a8a9e3ccf777b263b8596fbd2c8fc26e8888f8a0edbb5",
        input={"image": image_file}
    )
print(output)

In [None]:
!pip install midjourney-api

In [None]:
import os

from midjourney_api import TNL

# Read the API key from the environment instead of pasting it into the
# notebook; the empty fallback matches the previous placeholder value.
TNL_API_KEY = os.environ.get("TNL_API_KEY", "")
tnl = TNL(TNL_API_KEY)
image_url = "https://encrypted-tbn0.gstatic.com/shopping?q=tbn:ANd9GcQFpRS4kjadav4d65qRFEUa9m5DEN_fmphyU7wO_ssGjA7cWY0lKwPIfVdYN5bNS_hNUX5jV_V6dk2ZthW3ix5h04g-jD8ukbPZWH3sAsngi3gnZVMSMlnk3A&usqp=CAE"
# Submit the image URL to the client's describe call.
response = tnl.describe(image_url)


print(response)

In [None]:
def download_image(url, filename):
    """Download the image at ``url`` and write it to ``filename``.

    Args:
        url: HTTP(S) address of the image.
        filename: Destination path. When it has an extension, PIL picks the
            output format from it (as before). When it has none, the image is
            written in its source format instead of failing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=30)
    # Surface HTTP errors here instead of letting PIL fail on an error page.
    response.raise_for_status()

    with Image.open(BytesIO(response.content)) as img:
        # PIL infers the output format from the extension and raises
        # ValueError for extension-less names such as 'image'.
        has_extension = bool(os.path.splitext(filename)[1])
        img.save(filename, format=None if has_extension else img.format)

In [None]:
import os
from serpapi import GoogleSearch
import replicate
import requests
from PIL import Image
from io import BytesIO
import uuid


def get_product_urls(query, num, image_vector_fn=None):
    """Search Google Shopping for ``query`` and index the hits in Weaviate.

    For every shopping result that has both a thumbnail and a product link,
    the thumbnail is downloaded, captioned with Replicate's img2prompt model,
    and queued as a ``Product`` object (description, product URL, image URL)
    plus an ``Image`` object that references the product by UUID.

    Relies on the notebook-level Weaviate ``client`` and on ``SERPAPI_KEY``
    being set in the environment. ``REPLICATE_API_TOKEN`` is no longer
    overwritten here, so it must be set before calling.

    Args:
        query: Search text sent to SerpApi.
        num: Number of results to request.
        image_vector_fn: Optional callable taking the downloaded image path
            and returning the vector to store on the ``Image`` object. The
            original code read an undefined name (``d["Vector"]``) here; when
            this is left as ``None`` no explicit vector is sent.

    Returns:
        list: The product URLs that were queued for insertion.
    """
    params = {
        "engine": "google",
        "tbm": "shop",
        "q": query,
        "num": num,
        "api_key": os.environ.get("SERPAPI_KEY")
    }

    # Keep the SerpApi search under its own name: rebinding `client` here
    # shadowed the Weaviate client that the batching below needs.
    search = GoogleSearch(params)
    results = search.get_dict()

    # Shopping hits live under "shopping_results"; iterating the response
    # dict itself only walks its top-level keys.
    items = results.get("shopping_results", [])

    namespace = uuid.NAMESPACE_URL  # or any other predefined namespace
    # The temp file needs an extension so PIL can pick an output format.
    image_path = "image.png"
    product_urls = []

    with client.batch as batch:
        for item in items:
            if "thumbnail" not in item or "link" not in item:
                continue

            download_image(item["thumbnail"], image_path)

            # Close the image handle as soon as the model call returns.
            with open(image_path, "rb") as image_file:
                description = replicate.run(
                    "methexis-inc/img2prompt:50adaf2d3ad20a6f911a8a9e3ccf777b263b8596fbd2c8fc26e8888f8a0edbb5",
                    input={"image": image_file}
                )

            # Deterministic ids: re-indexing the same URL maps to the same
            # object. Stored as strings so they serialize in the properties.
            description_uuid = str(uuid.uuid5(namespace, item["link"]))
            image_uuid = str(uuid.uuid5(namespace, item["thumbnail"]))

            desc_properties = {
                "description": description,
                "url": item["link"],
                "image_url": item["thumbnail"]
            }
            batch.add_data_object(desc_properties, "Product", uuid=description_uuid)

            image_properties = {
                "description_id": description_uuid
            }
            image_vector = image_vector_fn(image_path) if image_vector_fn else None
            batch.add_data_object(image_properties, "Image", uuid=image_uuid, vector=image_vector)

            product_urls.append(item["link"])

    return product_urls
    
    

In [None]:
# imports
import pandas as pd
import tiktoken

from openai.embeddings_utils import get_embedding
import openai


In [None]:
from google.colab import files

# Run Colab's file upload and keep its return value in `uploaded`.
# NOTE(review): presumably used to upload the key.txt read in the next cell; confirm.
uploaded = files.upload()

In [None]:
import openai
# Point the openai library at the local file "key.txt" for its API key.
openai.api_key_path = "key.txt"

In [None]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
# descriptions longer than max_tokens are filtered out before embedding
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191


In [None]:
# Captions that make up the searchable corpus
image_descriptions = [
    "3 - Piece Upholstered Sectional couch, gray",
    "a woman in a wedding dress posing for a picture, a digital rendering by Lü Ji, trending on pinterest, romanesque, rococo, white background, elegant",
    "a black backpack with a thin blue line on it, a digital rendering by Jeff A. Menges, reddit contest winner, cobra, contest winner, sabattier filter, sabattier effect",
    "a gray and green shirt hanging on a white wall, a stock photo by Jerry Wilkerson, pinterest contest winner, verdadism, y2k aesthetic, contest winner, clean",
    "a woman in a wedding dress holding a bouquet of flowers, a digital rendering by Thomas Millie Dow, trending on cg society, arabesque, made of flowers, detailed, ornate"
]

# One row per caption, in a single 'description' column
df = pd.DataFrame({'description': image_descriptions})

# Show the resulting frame
print(df)


In [None]:
# Keep at most the last `top_n` descriptions and drop any that are too long to embed
top_n = 1000

encoding = tiktoken.get_encoding(embedding_encoding)

# Count tokens per description with the embedding model's tokenizer
df["n_tokens"] = df.description.apply(lambda x: len(encoding.encode(x)))
# .copy() makes the filtered frame independent of the original, so adding the
# "embedding" column to it later does not trigger pandas' SettingWithCopyWarning.
df = df[df.n_tokens <= max_tokens].tail(top_n).copy()
len(df)


## 2. Get embeddings and save them for future reuse

In [None]:
# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage

# This may take a few minutes
# Calls get_embedding once per row and stores the result in a new "embedding" column.
df["embedding"] = df.description.apply(lambda x: get_embedding(x, engine=embedding_model))

Here we compare the cosine similarity of the embeddings of the query and the documents, and show top_n best matches.

In [None]:
from openai.embeddings_utils import get_embedding, cosine_similarity

# search through the reviews for a specific product
def search_reviews(df, product_description, n=3, pprint=True):
    """Return the ``n`` descriptions in ``df`` closest to ``product_description``.

    The query is embedded with text-embedding-ada-002 and compared by cosine
    similarity against every value in ``df.embedding``. The scores are written
    to a ``similarity`` column on ``df`` itself.

    Args:
        df: Frame with ``description`` and ``embedding`` columns.
        product_description: Query text.
        n: Number of best matches to return.
        pprint: When true, print the first 200 characters of each match,
            each followed by a blank line.

    Returns:
        The ``description`` values of the ``n`` highest-scoring rows, best first.
    """
    query_vector = get_embedding(product_description, engine="text-embedding-ada-002")

    def _score(row_vector):
        return cosine_similarity(row_vector, query_vector)

    df["similarity"] = df.embedding.apply(_score)

    ranked = df.sort_values("similarity", ascending=False)
    top_descriptions = ranked.head(n).description

    if not pprint:
        return top_descriptions

    for text in top_descriptions:
        print(text[:200])
        print()
    return top_descriptions


# Example query: print and return the 3 stored descriptions most similar to this caption.
results = search_reviews(df, "a woman in a white dress holding a bouquet of flowers, a digital rendering by Irene and Laurette Patten, trending on pinterest, neoclassicism, elegant, white background, full body", n=3)


We can search through these reviews easily. To speed up computation, we can use a special algorithm, aimed at faster search through embeddings.

As we can see, this can immediately deliver a lot of value. In this example, we show how to quickly find examples of delivery failures.

In [None]:
from transformers import CLIPProcessor, FlaxCLIPModel

# model id of the pretrained CLIP checkpoint shared by the processor and the model
model_path = "openai/clip-vit-large-patch14"
# processor: turns raw images into model inputs (called below with images=...)
processor = CLIPProcessor.from_pretrained(model_path)
# model: Flax CLIP model loaded from the same pretrained checkpoint
model = FlaxCLIPModel.from_pretrained(model_path)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the image folder used below lives under it.
drive.mount('/content/drive')

In [None]:
import os

def list_files_in_folder(folder_path):
    """Return the path of every file below ``folder_path``, searched recursively.

    Paths are produced in ``os.walk`` order and are prefixed with the
    directory in which each file was found.
    """
    return [
        os.path.join(current_dir, file_name)
        for current_dir, _subdirs, file_names in os.walk(folder_path)
        for file_name in file_names
    ]

folder_path = '/content/drive/My Drive/tailor_images'
# Skip macOS ".DS_Store" metadata files: they are not images, and one landing
# in position 0 would be picked as the target image.
files_list = [
    path
    for path in list_files_in_folder(folder_path)
    if os.path.basename(path) != '.DS_Store'
]
if not files_list:
    # Clearer than the bare IndexError that files_list[0] would raise.
    raise FileNotFoundError("No image files found under " + folder_path)
# The first file is the target image; every other file is a candidate.
image_paths = files_list[1:]
target_image_path = files_list[0]

In [None]:
import jax
import jax.numpy as jnp
from flax import linen as nn
import optax

def embed_images(images, target_image):
    """Score ``images`` against ``target_image`` by CLIP cosine similarity.

    Both inputs are embedded with the notebook-level ``processor`` and
    ``model``. Each embedding row is scaled to unit length before the dot
    product, so every entry of the result is a cosine similarity.

    Args:
        images: Images to score, passed to the processor as ``images=``.
        target_image: Reference image passed to the processor the same way.
            Each target row is normalized on its own; the previous
            whole-array norm was only correct for a single target.

    Returns:
        list: Nested list built from ``emb @ target_emb.T``; presumably one
        row per input image and one column per target (TODO confirm shapes).
    """
    def _unit_features(batch):
        # Convert the batch into model-acceptable input and apply padding
        inputs = processor(images=batch, return_tensors="jax", padding=True)
        features = jnp.array(model.get_image_features(**inputs).tolist())
        # Scale every embedding row to unit length
        return features / jnp.linalg.norm(features, axis=-1, keepdims=True)

    # Target first, then the candidates, as in the original call order
    target_emb = _unit_features(target_image)
    emb = _unit_features(images)

    # Dot products of unit vectors are cosine similarities
    cos_similarities = jnp.dot(emb, target_emb.T)
    return cos_similarities.tolist()

In [None]:
from PIL import Image

def load_images(paths):
    """Open each path in ``paths`` and return the images converted to RGB.

    Args:
        paths: Iterable of image file paths.

    Returns:
        list: One RGB ``PIL.Image`` per path, in input order.
    """
    images = []
    for path in paths:
        # convert() returns a new, fully loaded image, so the source image and
        # its file handle can be closed immediately by the context manager.
        with Image.open(path) as source:
            images.append(source.convert("RGB"))
    return images

In [None]:
from PIL import Image

def load_images(paths):
    """Load every file in ``paths`` as an RGB ``PIL.Image``.

    Args:
        paths: Iterable of image file paths.

    Returns:
        list: The converted images, in the same order as ``paths``.
    """
    images = []
    for path in paths:
        # The RGB copy made by convert() is independent of the opened file;
        # the `with` block closes the source image as soon as it exists.
        with Image.open(path) as opened:
            rgb_image = opened.convert("RGB")
        images.append(rgb_image)
    return images

# Load the target image and the candidate images from the paths collected earlier
target_image = load_images([target_image_path])[0]
images = load_images(image_paths)

# Score every candidate against the target with the two-argument embed_images
cos_similarities = embed_images(images, target_image)
print(cos_similarities)


In [None]:
# Drop macOS ".DS_Store" metadata files wherever they sit in the folder tree,
# not only at the top level of tailor_images.
files_list = [f for f in files_list if os.path.basename(f) != '.DS_Store']
images = load_images(files_list)
# NOTE(review): this needs the one-argument embed_images, which is defined in
# the cell after this one. On a fresh top-to-bottom run the two-argument
# version is still bound here and this call raises TypeError.
embeddings = embed_images(images)

In [None]:
def embed_images(images):
    """Return the CLIP image features of ``images`` as nested Python lists.

    Uses the notebook-level ``processor`` and ``model``.

    NOTE: this shares its name with the two-argument ``embed_images`` defined
    earlier in the notebook and replaces it once this cell has run.
    """
    # Preprocess the images into padded JAX inputs for the model
    model_inputs = processor(images=images, return_tensors="jax", padding=True)
    # Run the CLIP image tower to get one feature vector per image
    image_features = model.get_image_features(**model_inputs)
    # Hand back plain Python lists rather than a JAX array
    return image_features.tolist()