In [2]:
# Bootstrap Django first: the cells below import project models (blog.models),
# so this has to run before them.
# NOTE(review): `setup` is a local helper module; presumably init_django()
# points Django at the project settings and calls django.setup() — confirm there.
import setup 
setup.init_django()

In [3]:
from google import genai
from google.genai import types
from decouple import config

# Name of the Gemini embedding model. Read through decouple's config() from the
# EMBEDDING_MODEL setting; falls back to the experimental 03-07 model when unset.
EMBEDDING_MODEL = config("EMBEDDING_MODEL", default="gemini-embedding-exp-03-07")

In [4]:
from blog.models import BlogPost
# Gate for the sample-data seeding cell further down (`if RECREATE_DATA:`).
# Set to False to keep the rows that are already in the database.
RECREATE_DATA = True

In [5]:
# Delete every BlogPost flagged can_delete=True.
# delete() is the cell's last expression, so its return value
# (total rows removed, per-model counts) is what the cell displays.
qs = BlogPost.objects.filter(can_delete=True)
qs.delete()

(4, {'blog.BlogPost': 4})

In [6]:
# Sample sentences; each one becomes the content of one seeded BlogPost.
docs = [
    "The man is looking at the woman",
    "The woman is looking at the man",
    "The day is bright",
    "The man is dark and woman is light",
]

In [7]:
# Build one unsaved BlogPost per sample sentence; titles are numbered from 1.
new_data = [
    BlogPost(title=f"Blog Post {index}", content=content, can_delete=True)
    for index, content in enumerate(docs, start=1)
]

if RECREATE_DATA:
    # Remove the previously seeded (can_delete=True) rows before inserting, so
    # re-running this cell does not pile up duplicates. delete() must actually
    # be *called*: a bare `qs.delete` only looks the method up and does nothing.
    qs = BlogPost.objects.filter(can_delete=True)
    qs.delete()
    # bulk_create inserts the rows without calling Model.save() on each one.
    BlogPost.objects.bulk_create(new_data)
    

In [8]:
# Unfiltered queryset of all posts. `qs` stays bound after this cell, and the
# embedding loop further down iterates whatever `qs` currently refers to.
qs = BlogPost.objects.all()
# Last expression: the row count is what the cell displays.
qs.count()
# Destructive reset, intentionally left disabled:
# BlogPost.objects.all().delete()

4

In [9]:
# Read the Gemini API key once and build the client from it.
# `key` is a plain assignment; a chained `key = api_key = ...` would also bind
# a stray global named `api_key`.
key = config("GEMINI_API_KEY")
client = genai.Client(api_key=key)
# Do not echo `key` as cell output: notebook outputs are saved with the file.

In [10]:
def get_embedding(text, task_type="retrieval_document"):
    """
    Return the Gemini embedding vector for ``text``.

    The model name comes from the notebook-level ``EMBEDDING_MODEL`` setting and
    the vector is requested with 3072 dimensions. The API key is read from
    ``GEMINI_API_KEY`` through decouple's ``config``.

    Args:
        text: Content to embed; forwarded unchanged as ``contents``.
        task_type: Embedding task type, case-insensitive (for example
            ``"retrieval_document"`` or ``"retrieval_query"``). It is
            upper-cased before being sent. ``None`` sends no task type.

    Returns:
        The ``values`` of the first embedding in the response, or ``None`` when
        the response carries no embeddings or the request fails. Failures are
        printed, not raised, so callers must check for ``None``.
    """
    try:
        client = genai.Client(api_key=config("GEMINI_API_KEY"))
        embed_config = types.EmbedContentConfig(
            output_dimensionality=3072,
            # Honour the caller's choice instead of a fixed document task type.
            task_type=task_type.upper() if task_type else None,
        )
        response = client.models.embed_content(
            model=EMBEDDING_MODEL,
            contents=text,
            config=embed_config,
        )
        # response.embeddings is a list of embedding objects; each exposes the
        # float vector on its `values` attribute. Empty or missing -> no result.
        if not response.embeddings:
            print("Warning: No embeddings found in the response.")
            return None
        return response.embeddings[0].values

    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"Error generating Gemini embedding: {e}")
        return None

In [10]:
# Embed an ad-hoc query string using get_embedding's default task type.
# get_embedding returns None on failure, so `query_embedding` may be None.
# NOTE(review): no later cell in view reads `query_embedding`; `query` itself is
# reassigned before the similarity search.
query = "we are opened today"
query_embedding = get_embedding(text=query)

In [12]:
import time

# Pause after each save.
# NOTE(review): presumably this keeps the loop under the embedding API's rate
# limit — confirm the value that is actually needed.
SAVE_DELAY_SECONDS = 10

for obj in qs:
    # Only touch posts that do not have an embedding yet.
    if obj.embedding is not None:
        continue
    # NOTE(review): the embedding is not assigned here; presumably
    # BlogPost.save() computes it (the explicit alternative would be
    # `obj.embedding = get_embedding(obj.get_embedding_text_raw())`) — confirm.
    obj.save()
    # Report only after save() has returned, so a failed save is never
    # announced as embedded.
    print(f"Embedded: {obj.title}")
    time.sleep(SAVE_DELAY_SECONDS)
    # You can use batch embedding
        

Embedded: Blog Post 1
Embedded: Blog Post 2
Embedded: Blog Post 3
Embedded: Blog Post 4


In [13]:
# BlogPost.objects.all().delete()

In [14]:
# This query text is identical to the content seeded for "Blog Post 4", so that
# post is expected to rank first in the similarity search.
# NOTE: the variable is spelled `query_emedding` (sic). The similarity-search
# cell below refers to it by this exact name, and the correctly spelled
# `query_embedding` holds the vector of a *different* query, so do not rename
# one without the other.
query = "The man is dark and woman is light"
query_emedding = get_embedding(query)

In [15]:
# BlogPost.objects.filter(embedding=query_emedding)

In [16]:
from pgvector.django import CosineDistance
from django.db.models import F
# Rank all posts by cosine distance between their stored embedding and the
# query vector, closest first. `similarity` is computed in the database as
# 1 - distance and shown below as a percentage.
qs = (
    BlogPost.objects
    .annotate(distance=CosineDistance("embedding", query_emedding))
    .annotate(similarity=1 - F("distance"))
    .order_by("distance")
)

for obj in qs:
    print(obj.title, obj.distance, obj.similarity * 100)

Blog Post 4 0.0 100.0
Blog Post 1 0.14984181969886445 85.01581803011355
Blog Post 2 0.1519971489906311 84.80028510093689
Blog Post 3 0.21231000533491107 78.76899946650889


In [17]:
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
from google.genai.types import EmbedContentConfig


# Settings for the LlamaIndex embedding wrapper, read through decouple.
EMBEDDING_MODEL = config("EMBEDDING_MODEL", default="gemini-embedding-exp-03-07")
EMBEDDING_LENGTH = config("EMBEDDING_LENGTH", default=3072, cast=int)
GEMINI_API_KEY = config("GEMINI_API_KEY", cast=str)

# Request the output size explicitly through the embedding config.
# NOTE(review): a bare `dimensions=` keyword does not appear among the
# documented GoogleGenAIEmbedding constructor arguments, so it may be silently
# ignored; `embedding_config` is the documented route — confirm against the
# installed llama-index-embeddings-google-genai version.
embed_model_test = GoogleGenAIEmbedding(
    model_name=EMBEDDING_MODEL,
    api_key=GEMINI_API_KEY,
    embedding_config=EmbedContentConfig(output_dimensionality=EMBEDDING_LENGTH),
)

# Embed one sample sentence and check the vector length that comes back.
sample_text = "Verify this model's output dimension."
embedding_test = embed_model_test.get_text_embedding(sample_text)
print(f"Test with {EMBEDDING_MODEL}: {len(embedding_test)} dimensions")

# Fail loudly if the model did not honour the configured size; a mismatched
# vector length would not fit the stored embeddings.
assert len(embedding_test) == EMBEDDING_LENGTH, (
    f"expected {EMBEDDING_LENGTH} dimensions, got {len(embedding_test)}"
)



Test with models/gemini-embedding-exp-03-07: 3072 dimensions
