In [1]:
import setup
setup.init_django()
from decouple import config
from sentence_transformers import SentenceTransformer

# Embedding settings. EMBEDDING_MODEL is looked up through decouple's config()
# and falls back to the default given here.
# NOTE(review): the default model name and the 1536 dimension look like they
# describe a different model than the "all-MiniLM-L6-v2" loaded below, and
# neither constant is used in the visible cells — confirm which values are
# intended before relying on them.
EMBEDDING_MODEL = config("EMBEDDING_MODEL", default="text-embedding-3-small")
EMBEDDING_DIMENSIONS = 1536
# When True, the posts flagged can_delete=True are deleted and re-created
# from the sample sentences later in the notebook.
RECREATE_DATA = True

# 1. Load a pretrained Sentence Transformer model.
# The model name is hard-coded here; EMBEDDING_MODEL above is not consulted.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [2]:
# Sample sentences; each one is used as the content of a demo Post further down.
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

In [3]:
from blog.models import Post

# Build one unsaved Post per sample sentence, titled with a 1-based counter.
new_data = [
    Post(title=f"Blog Post #{number}", content=sentence, can_delete=True)
    for number, sentence in enumerate(sentences, start=1)
]

# Optionally reset the demo rows: remove every post flagged as deletable,
# then insert the freshly built batch in a single bulk_create call.
if RECREATE_DATA:
    qs = Post.objects.filter(can_delete=True)
    qs.delete()
    Post.objects.bulk_create(new_data)

In [4]:
# Re-query the deletable demo posts; `qs` is reused by the embedding loop below.
# The bare last expression makes the notebook display the row count.
qs = Post.objects.filter(can_delete=True)
qs.count()

3

In [5]:
# def get_embedding(text):
#     text = text.replace("\n", " ").strip()
#     return model.encode([text])

In [6]:
# Fill in the embedding for every post in `qs` that does not have one yet.
for obj in qs:
    # Posts that already carry an embedding are left untouched.
    if obj.embedding is not None:
        continue

    vector = model.encode(obj.get_embedding_text_raw())

    # Reject anything that is not a flat (1-D) vector before storing it.
    if len(vector.shape) > 1:
        raise ValueError(f"Expected 1D embedding, got shape {vector.shape}")

    # Store the vector as a plain Python list and persist the post.
    obj.embedding = vector.tolist()
    obj.save()

In [13]:
# Encode the search text with the same `model` that produced the post embeddings,
# so the query vector and the stored vectors are comparable.
query = "The weather is lovely today."
query_embedding = model.encode(query)

In [14]:
from pgvector.django import CosineDistance
from django.db.models import F


# Rank every post by cosine distance to the query embedding, closest first.
# `similarity` is computed in the database as 1 - distance, so it is annotated
# after `distance` exists on the queryset.
qs = (
    Post.objects
    .annotate(distance=CosineDistance("embedding", query_embedding))
    .annotate(similarity=1 - F("distance"))
    .order_by("distance")
)

# Print title, distance and similarity for each ranked post.
for obj in qs:
    print(obj.title, obj.distance, obj.similarity)

Blog Post #1 0.0 1.0
Blog Post #2 0.3340447942627902 0.6659552057372098
Blog Post #3 0.8954160033290387 0.1045839966709613
