## Gemini Embedding

In [1]:
import os
from dotenv import load_dotenv
from pydantic import BaseModel
from google import genai
from google.genai.types import ContentEmbedding, EmbedContentConfig
from enum import Enum
from typing import List, Union

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Requested length of every embedding vector (sent as output_dimensionality).
embedding_size = 768

# Both values are None when the corresponding environment variable is unset.
api_key = os.environ.get('GEMINI_API_KEY')
model_id = os.environ.get('GEMINI_EMBEDDING_ID')

# Gemini client shared by embed_content().
client = genai.Client(api_key=api_key)


class EmbeddingTaskTypeEnum(str, Enum):
    """Task-type hints that can be sent along with an embedding request.

    Every member also subclasses ``str`` and its value equals its name, so a
    member can be used wherever the plain string is expected.
    """

    RETRIEVAL_DOCUMENT = "RETRIEVAL_DOCUMENT"
    RETRIEVAL_QUERY = "RETRIEVAL_QUERY"
    QUESTION_ANSWERING = "QUESTION_ANSWERING"
    FACT_VERIFICATION = "FACT_VERIFICATION"


In [2]:
def embed_content(
    contents: Union[str, List[str]],
    task_type: EmbeddingTaskTypeEnum = EmbeddingTaskTypeEnum.RETRIEVAL_DOCUMENT
) -> List[ContentEmbedding]:
    """Embed one text or a batch of texts with the configured Gemini model.

    Args:
        contents: A single string, or a list of strings to embed in one request.
        task_type: Task-type hint forwarded to the API. Defaults to
            ``RETRIEVAL_DOCUMENT``.

    Returns:
        The embeddings from the API response. If the request fails, a
        ``RuntimeWarning`` describing the error is emitted and one empty
        placeholder (``ContentEmbedding(values=[])``) is returned per input
        text, so the result still lines up with the inputs.
    """
    # Local import keeps this cell self-contained.
    import warnings

    try:
        response = client.models.embed_content(
            model=model_id,
            contents=contents,
            config=EmbedContentConfig(
                task_type=task_type,
                output_dimensionality=embedding_size,
            ),
        )
        return response.embeddings
    except Exception as exc:
        # Stay best-effort, but never silently: `except Exception` lets
        # KeyboardInterrupt/SystemExit through and the warning surfaces the cause.
        warnings.warn(
            f"embed_content failed ({type(exc).__name__}: {exc}); "
            "returning empty placeholder embeddings",
            RuntimeWarning,
            stacklevel=2,
        )
        # One placeholder per input so positional pairing with the inputs holds.
        n_inputs = 1 if isinstance(contents, str) else len(contents)
        return [ContentEmbedding(values=[]) for _ in range(n_inputs)]

# Smoke test: embed a single sentence and look at the vector that comes back.
content = """
The quick brown fox jumps over the lazy dog.
"""
embeds = embed_content(content)
sample_values = embeds[0].values
print("Embedding size:", len(sample_values))
print("Embedding values:", sample_values)

Embedding size: 768
Embedding values: [-0.06868978, 0.005816352, -0.0044413465, -0.012453203, -0.016220396, 0.0378335, 0.016380973, 0.022189835, 0.013305066, -0.000575397, -0.0052356906, 0.02310427, 0.0678747, 0.04785653, -0.008731612, -0.005142602, 0.025083635, 0.033237115, -0.07198565, 0.034525197, 0.04022084, -0.008876277, 0.017015086, -0.037441514, -0.0067318017, -0.024688154, -0.004185732, -0.013715649, -0.0084399665, -0.021793142, 0.041767, 0.09108016, 0.0041992585, -0.024113636, 0.05226678, 0.0021381488, -0.04233077, 0.027580375, 0.07004266, -0.015475536, -0.08674333, 0.008302095, -0.029740943, 0.03891059, 0.0010970772, -0.060369544, 0.010995038, 0.019837605, -0.02963292, 0.038032256, 0.062986806, 0.017882522, -0.049972247, -0.00549779, -0.011984985, 0.030878022, -0.024744753, 0.013024645, 0.0007624969, 0.0006026833, 0.009504737, 0.0025120068, -0.030736525, 0.023924567, 0.027402163, -0.05704332, -0.05121925, -0.034580123, -0.010532605, 0.0311681, 0.0001340585, 0.05318172, -0.014

## Supabase pgvector

In [3]:
from supabase import create_client, Client
from supabase.client import ClientOptions

# Supabase client used for every table/RPC call below.
# postgrest_client_timeout=20 bounds each PostgREST request
# (unit presumably seconds — TODO confirm against the supabase-py docs).
supabase: Client = create_client(
    supabase_url=os.environ.get('SUPABASE_URL'),
    supabase_key=os.environ.get('SUPABASE_KEY'),
    options=ClientOptions(postgrest_client_timeout=20),
)

### Indexing

In [4]:
# Example corpus: 100 one-sentence animal facts.
# The list order matters: the position of each sentence becomes its chunk id
# (`the_zoo_chunk_<index>`) when the corpus is indexed below.
the_zoo_corpus = [
    # For retrieval
    "The quick brown fox jumps over the lazy dog.",
    "Elephants are the largest land mammals on Earth.",
    "Dolphins are known for their intelligence and playful behavior.",
    "Tigers are the largest species of cat and are endangered.",
    "Penguins can't fly but are excellent swimmers.",
    "Giraffes have the longest necks of any living animal.",
    "Koalas sleep for up to 22 hours a day.",
    "Cheetahs are the fastest land animals, reaching speeds of 70 mph.",
    "Octopuses have three hearts and blue blood.",
    "Bats are the only mammals capable of sustained flight.",
    "Sloths move so slowly that algae grows on their fur.",
    "Hummingbirds can fly backwards and hover in mid-air.",
    "Chameleons can move their eyes independently of each other.",
    "Polar bears have black skin underneath their white fur.",
    "Kangaroos cannot walk backwards.",
    "Owls can rotate their heads up to 270 degrees.",
    "Hippopotamuses secrete a red oily substance that acts as sunscreen.",
    "Flamingos get their pink color from the food they eat.",
    "Beavers can hold their breath underwater for up to 15 minutes.",
    "Butterflies taste with their feet.",

    "Honeybees communicate through a dance language to indicate food locations.",
    "Axolotls can regenerate entire limbs and parts of their vital organs.",
    "Narwhals' tusks are actually a long, protruding canine tooth.",
    "Mantis shrimp can strike with the force of a bullet.",
    "Tardigrades can survive in space and extreme environments.",
    "Crows recognize human faces and remember people who threaten them.",
    "Platypuses are among the few mammals that produce venom.",
    "African elephants have a pregnancy that lasts 22 months.",
    "Peacocks' tail feathers can make up 60 percent of their body length.",
    "Naked mole rats are resistant to cancer and can live for 30 years.",
    "Jellyfish have existed for over 650 million years on Earth.",
    "Ostriches have the largest eyes of any land animal.",
    "Pistol shrimp create bubbles that reach temperatures hotter than the sun.",
    "Goats have rectangular pupils that allow for wide-angle vision.",
    "Pufferfish are the second most poisonous vertebrate in the world.",
    "Lyrebirds can mimic almost any sound they hear in their environment.",
    "Pangolins are the only mammals completely covered in scales.",
    "Archerfish spit water jets to knock insects into the water.",
    "Mantis shrimp have 16 color receptors compared to humans' three.",
    "Horned lizards can squirt blood from their eyes as defense.",

    "Seahorses are monogamous and mate for life with their partners.",
    "Snails can sleep for up to three years when conditions are unfavorable.",
    "Elephants are the only mammals that cannot jump.",
    "Wombats produce cube-shaped feces to mark their territory.",
    "Hummingbirds are the only birds that can fly backwards.",
    "Blue whales are the largest animals to have ever existed on Earth.",
    "Ants can lift up to 50 times their own body weight.",
    "Sloths take two weeks to digest a single meal.",
    "Sharks have been on Earth longer than trees have existed.",
    "Camels have three eyelids to protect their eyes from sand.",
    "Flamingos can only eat with their heads upside down.",
    "Koalas have fingerprints nearly identical to human fingerprints.",
    "Octopuses have nine brains - one central brain and eight in their arms.",
    "Reindeer eyes change color from gold to blue in winter months.",
    "Vultures have stomach acid so strong they can digest anthrax.",
    "Frogs completely shed their skin every few weeks and then eat it.",
    "Dolphins call each other by unique names or whistles.",
    "Armadillos always give birth to identical quadruplets.",
    "Squirrels plant thousands of trees annually by forgetting buried nuts.",
    "Rhinoceros horns are made of keratin, the same as human fingernails.",

    "Cuttlefish can change the texture of their skin as well as color.",
    "Woodpeckers wrap their tongues around their skulls when not in use.",
    "Zebras have unique stripe patterns, like human fingerprints.",
    "Rabbits cannot vomit due to their digestive system structure.",
    "Albatrosses can sleep while flying over ocean waters.",
    "Tasmanian devils store fat in their tails for times of scarcity.",
    "Crocodiles cannot stick out their tongues, as they are attached to the mouth.",
    "Gorillas can catch human colds and other respiratory infections.",
    "Prairie dogs have one of the most complex animal languages.",
    "Komodo dragons can reproduce asexually when no males are available.",
    "Pigeons can recognize all 26 letters of the English alphabet.",
    "Lobsters taste with their feet and can live for over 100 years.",
    "Hyenas' spots are unique to each individual, like fingerprints.",
    "Dragonflies have been on Earth for 300 million years.",
    "Macaws eat clay to neutralize toxins in their fruit-based diet.",
    "Elephants are the only animals with four knees.",
    "Otters hold hands while sleeping to avoid drifting apart.",
    "Galapagos tortoises can survive a year without food or water.",
    "Barn owls have asymmetrical ears to better locate prey by sound.",
    "Rattlesnakes add a new rattle segment each time they shed their skin.",
    "Moose can dive up to 20 feet underwater to feed on aquatic plants.",

    # For question answering
    "Falcons can dive at speeds over 200 mph, making them the fastest flying animals.",
    "Whales and dolphins use echolocation to navigate and hunt in dark ocean waters.",
    "Chimpanzees use tools like sticks to extract termites from mounds for food.",
    "Electric eels can generate shocks of up to 600 volts to stun prey and deter predators.",
    "Migratory birds use Earth's magnetic field, stars, and landmarks to navigate thousands of miles.",
    "Bombardier beetles spray boiling hot toxic chemicals at predators when threatened.",
    "Vampire bats can detect infrared radiation, allowing them to find blood vessels in prey.",
    "Orangutans create and use tools, and can learn sign language to communicate with humans.",
    "Deep-sea anglerfish females have bioluminescent lures to attract prey in the darkness.",
    "Honey badgers have thick, loose skin that allows them to turn and attack predators holding them.",
    "Spiders can detect vibrations so sensitive they can feel a fly landing on their web from across a room.",
    "Elephants communicate through infrasound that can travel several miles through ground and air.",
    "Peregrine falcons have specialized air sacs that protect their lungs during high-speed dives.",
    "Orcas have distinct dialects and hunting techniques that are passed down through generations.",
    "Bees can recognize human faces and remember people who have harmed their hive.",
    "Box jellyfish have 24 eyes, including four that can see in color and detect obstacles.",
    "African grey parrots have the intelligence equivalent to a 5-year-old human child.",
    "Octopuses can solve complex puzzles and remember solutions for months.",
    "Dung beetles navigate using the Milky Way, the first insects known to use stars for orientation.",
]

import time

# Embed the corpus (one batched request) and time it
start_time = time.time()
embeds = embed_content(the_zoo_corpus)
embed_time = (time.time() - start_time) * 1000
print(f"Embedding completed in {int(embed_time)} ms")

# embed_content() falls back to empty placeholder embeddings when the API call
# fails. Validate before writing so a failed request can never store a
# mismatched or empty vector in the table.
if len(embeds) != len(the_zoo_corpus):
    raise RuntimeError(
        f"Expected {len(the_zoo_corpus)} embeddings but got {len(embeds)}; "
        "the embedding request probably failed."
    )
empty_chunks = [i for i, embed in enumerate(embeds) if not embed.values]
if empty_chunks:
    raise RuntimeError(f"Empty embeddings returned for chunk indexes: {empty_chunks}")

# One row per sentence; the chunk id is derived from the sentence's position.
upsert_data = [
    {
        "id"       : f"the_zoo_chunk_{i}",
        "file_id"  : "the_zoo",
        "content"  : sentence,
        "embedding": embed.values,
    }
    for i, (sentence, embed) in enumerate(zip(the_zoo_corpus, embeds))
]

# Upsert all embeddings to Supabase
start_time = time.time()
response = supabase.table("file_embeddings").upsert(upsert_data).execute()
upsert_time = (time.time() - start_time) * 1000
print(f"Upserting completed in {int(upsert_time)} ms")

# Show only the first few stored rows; each one carries a full embedding vector.
response.data[:3]

Embedding completed in 4925 ms
Upserting completed in 8681 ms


[{'id': 'the_zoo_chunk_0',
  'file_id': 'the_zoo',
  'content': 'The quick brown fox jumps over the lazy dog.',
  'fts': "'brown':3 'dog':9 'fox':4 'jump':5 'lazi':8 'quick':2",
  'embedding': '[-0.06868978,0.005816352,-0.0044413465,-0.012453203,-0.016220396,0.0378335,0.016380973,0.022189835,0.013305066,-0.000575397,-0.0052356906,0.02310427,0.0678747,0.04785653,-0.008731612,-0.005142602,0.025083635,0.033237115,-0.07198565,0.034525197,0.04022084,-0.008876277,0.017015086,-0.037441514,-0.0067318017,-0.024688154,-0.004185732,-0.013715649,-0.0084399665,-0.021793142,0.041767,0.09108016,0.0041992585,-0.024113636,0.05226678,0.0021381488,-0.04233077,0.027580375,0.07004266,-0.015475536,-0.08674333,0.008302095,-0.029740943,0.03891059,0.0010970772,-0.060369544,0.010995038,0.019837605,-0.02963292,0.038032256,0.062986806,0.017882522,-0.049972247,-0.00549779,-0.011984985,0.030878022,-0.024744753,0.013024645,0.0007624969,0.0006026833,0.009504737,0.0025120068,-0.030736525,0.023924567,0.027402163,-0.057

### Query (Full-text search)
https://supabase.com/docs/guides/database/full-text-search?queryGroups=language&language=python

In [6]:
query = "animal with white fur"
# Turn the free-text query into an OR expression ("animal | with | white | fur").
# split() collapses runs of whitespace and drops leading/trailing blanks, so no
# empty operand (e.g. "white |  | fur") can slip into the tsquery and break it.
processed_query = " | ".join(query.split())

# Match the OR expression against the `fts` column, restricted to this corpus.
response = (
    supabase.table("file_embeddings")
    .select("*")
    .eq("file_id", "the_zoo")
    .text_search("fts", processed_query)
    .execute()
)

for row in response.data:
    print(row["content"])

Giraffes have the longest necks of any living animal.
Cheetahs are the fastest land animals, reaching speeds of 70 mph.
Sloths move so slowly that algae grows on their fur.
Polar bears have black skin underneath their white fur.
Ostriches have the largest eyes of any land animal.
Blue whales are the largest animals to have ever existed on Earth.
Prairie dogs have one of the most complex animal languages.
Elephants are the only animals with four knees.
Falcons can dive at speeds over 200 mph, making them the fastest flying animals.


### Vector Search (Semantic Search)

https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types

In [7]:
# The default task type for queries
# Topic-style search phrases (not full questions).
# Intended for RETRIEVAL_QUERY, the default task type for queries.
retrieval_queries = [
    "flying animals and their abilities",
    "aquatic animals and their adaptations",
    "animals with exceptional speed",
    "animals with unusual sleeping habits",
    "animals with unique defensive capabilities",
    "animals with remarkable vision systems",
    "animals with extraordinary lifespans",
    "intelligent animals and their behaviors",
    "animals with specialized hunting techniques",
    "animals with distinctive physical features"
]

# Queries phrased as proper questions.
# Intended for QUESTION_ANSWERING: use it when all queries are formatted as questions.
question_answering_queries = [
    "How do animals protect themselves from predators?",
    "What animals have unusual sleeping patterns?",
    "How do different animals communicate?",
    "Which animals have the most unique physical adaptations?",
    "What makes certain animals excellent swimmers?",
    "How do animals use their specialized senses?",
    "Which animals demonstrate remarkable intelligence?",
    "How do different animals hunt their prey?",
    "What unusual eating habits do some animals have?",
    "How do animals survive in extreme conditions?"
]

# Declarative statements to check against the corpus.
# Intended for FACT_VERIFICATION: use it to retrieve a document from the corpus
# that proves or disproves a statement.
fact_verification_queries = [
    "Some animals can change their body color for protection",
    "Several mammals have evolved the ability to fly",
    "Certain animals can survive without water for extended periods",
    "Many marine creatures have multiple hearts",
    "Some animals can regenerate parts of their bodies",
    "Various animals can see types of light invisible to humans",
    "Certain animals use tools to obtain food",
    "Some animals have developed venomous capabilities",
    "Many animals can sleep while remaining partially alert",
    "Several animals have specialized methods for catching prey"
]

In [8]:
import random
import pandas as pd

# Show long `content` strings in full instead of truncating them.
# Use the fully-qualified option name: abbreviated keys are resolved by pattern
# matching and can break if pandas adds another option with a similar name.
pd.set_option('display.max_colwidth', 2000)

In [9]:

# Query and task type used for this run. Only one assignment of each is kept
# (earlier ones were immediately overwritten). Other combinations to try:
#   query     -> "animals with white fur", random.choice(retrieval_queries)
#   task_type -> EmbeddingTaskTypeEnum.QUESTION_ANSWERING, EmbeddingTaskTypeEnum.FACT_VERIFICATION
query     = question_answering_queries[1]
task_type = EmbeddingTaskTypeEnum.RETRIEVAL_QUERY
print(f"Search query: {query}")

print("\nEmbedding query...")
embeds = embed_content(query, task_type)

# embed_content() returns an empty placeholder when the API call fails; stop
# here with a clear message rather than sending an empty vector to the database.
query_vector = embeds[0].values
if not query_vector:
    raise RuntimeError("Embedding the query failed; nothing to search with.")

print("Searching for matches...")
result = supabase.rpc("match_file_embeddings", params={
    "p_query_embedding" : query_vector,
    "p_match_threshold" : 0.5,
    "p_match_count"     : 7,
}).execute()


# Dataframe
display_columns = ['id', 'file_id', 'content', 'similarity']
df = pd.DataFrame(result.data)
if df.empty:
    # No rows means no 'similarity' column; show an empty table, not a KeyError.
    print("No matches found.")
    df = pd.DataFrame(columns=display_columns)
else:
    df['similarity'] = df['similarity'].apply(lambda x: f"{x:.3f}")
df[display_columns]

Search query: What animals have unusual sleeping patterns?

Embedding query...
Searching for matches...


Unnamed: 0,id,file_id,content,similarity
0,the_zoo_chunk_6,the_zoo,Koalas sleep for up to 22 hours a day.,0.653
1,the_zoo_chunk_64,the_zoo,Albatrosses can sleep while flying over ocean waters.,0.634
2,the_zoo_chunk_76,the_zoo,Otters hold hands while sleeping to avoid drifting apart.,0.611
3,the_zoo_chunk_47,the_zoo,Sloths take two weeks to digest a single meal.,0.597
4,the_zoo_chunk_10,the_zoo,Sloths move so slowly that algae grows on their fur.,0.58
5,the_zoo_chunk_41,the_zoo,Snails can sleep for up to three years when conditions are unfavorable.,0.57
6,the_zoo_chunk_9,the_zoo,Bats are the only mammals capable of sustained flight.,0.552


### Compare Task Types

In [10]:
# Task types to compare: the same query text is embedded once per task type.
task_types = [
    EmbeddingTaskTypeEnum.RETRIEVAL_QUERY,
    EmbeddingTaskTypeEnum.QUESTION_ANSWERING,
    EmbeddingTaskTypeEnum.FACT_VERIFICATION
]

query  = random.choice(retrieval_queries) # retrieval_queries, question_answering_queries, fact_verification_queries
print(f"Search query: '{query}'")

print(f"\nEmbedding query...")
# No embedding call here: the query is embedded inside the loop, once per task
# type. A call at this point would depend on a `task_type` left over from an
# earlier cell and its result would be thrown away.

results = {}

for task_type in task_types:
    print(f"Runing task type: {task_type.name}")
    embeds = embed_content(query, task_type)

    # print(f"Searching for matches...")
    result = supabase.rpc("match_file_embeddings", params={
        "p_query_embedding" : embeds[0].values,
        "p_match_threshold" : 0.5,
        "p_match_count"     : 5,
    }).execute()

    # Dataframe
    df = pd.DataFrame(result.data)
    if df.empty:
        # No rows means no 'similarity' column; skip this task type.
        print(f"  no matches for {task_type.name}")
        continue
    df['similarity'] = df['similarity'].apply(lambda x: f"{x:.3f}")
    df['task_type'] = task_type.name
    results[task_type.name] = df

if not results:
    raise RuntimeError("No task type returned any match; nothing to compare.")

# Create a combined dataframe for comparison
combined_df = pd.concat(results.values())
combined_df = combined_df.sort_values(['id', 'task_type'])

# Display comparison of how the same documents rank differently by task type
print("\nComparison of similarity scores across task types:")
# Create pivot table: one row per document, one column per task type.
# A document that is not in a task type's top matches gets NaN in that column.
pivot_df = combined_df.pivot(index=['id', 'content'], columns='task_type', values='similarity')

# The column only exists when RETRIEVAL_QUERY returned at least one match.
if 'RETRIEVAL_QUERY' in pivot_df.columns:
    pivot_df = pivot_df.sort_values(by='RETRIEVAL_QUERY', ascending=False)

# Find the highest value in each row and apply bold formatting
def highlight_max(row):
    """Return one CSS string per cell, highlighting the largest score in the row.

    Args:
        row: A pandas Series of similarity scores, given as numbers or numeric
            strings. Missing scores (NaN) are allowed.

    Returns:
        A list with the bold/green style for every cell equal to the row
        maximum and an empty string for every other cell, including NaN cells.
    """
    # Convert string values to float for comparison
    row_float = row.astype(float)
    # Series.max() skips NaN. The builtin max() returns NaN whenever the first
    # cell is NaN, which would leave the whole row without a highlight.
    max_val = row_float.max()
    return ['font-weight: bold; color: green;' if v == max_val else '' for v in row_float]

# Render the pivot table, styling each row (axis="columns") with highlight_max
pivot_df.style.apply(highlight_max, axis="columns")

Search query: 'intelligent animals and their behaviors'

Embedding query...
Runing task type: RETRIEVAL_QUERY
Runing task type: QUESTION_ANSWERING
Runing task type: FACT_VERIFICATION

Comparison of similarity scores across task types:


Unnamed: 0_level_0,task_type,FACT_VERIFICATION,QUESTION_ANSWERING,RETRIEVAL_QUERY
id,content,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
the_zoo_chunk_2,Dolphins are known for their intelligence and playful behavior.,0.631,0.624,0.619
the_zoo_chunk_98,Octopuses can solve complex puzzles and remember solutions for months.,0.559,0.561,0.56
the_zoo_chunk_68,Prairie dogs have one of the most complex animal languages.,0.561,0.559,0.555
the_zoo_chunk_97,African grey parrots have the intelligence equivalent to a 5-year-old human child.,0.545,0.547,0.538
the_zoo_chunk_88,"Orangutans create and use tools, and can learn sign language to communicate with humans.",0.538,0.538,0.528
