## Part 1

In [1]:
# Bootstrap Django before the `blog` imports in the next cell. `setup` is a
# project-local helper; presumably init_django() configures settings and calls
# django.setup() -- TODO confirm in setup.py.
import setup

setup.init_django()

In [2]:
from decouple import config
from blog.models import BlogPost 
from blog import services

In [3]:
# qs = BlogPost.objects.filter(can_delete=True)
# qs

In [4]:
# !pip install llama-index sqlalchemy llama-index-vector-stores-postgres

In [5]:
# !pip install llama-index llama-index-llms-gemini llama-index-embeddings-gemini google-generativeai

In [6]:
# !pip install llama-index-llms-google-genai llama-index llama-index-embeddings-google-genai

In [7]:
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.vector_stores.postgres import PGVectorStore
import google.generativeai as genai
from llama_index.core import Settings

from google import genai
from google.genai import types

In [8]:
# Runtime configuration, read through decouple's `config` (imported above).
# EMBEDDING_LENGTH is also passed as `embed_dim` to the vector store further
# down, so it has to equal the length of the vectors the embedding model returns.
EMBEDDING_MODEL = config("EMBEDDING_MODEL", default="gemini-embedding-exp-03-07")
EMBEDDING_LENGTH = config("EMBEDDING_LENGTH", default=3072, cast=int)
# No default is supplied for the API key.
GEMINI_API_KEY = config("GEMINI_API_KEY", cast=str)
LLM_MODEL = config("LLM_MODEL", default="gemini-2.0-flash")

# Bare expression: echoes the resolved embedding length as the cell output.
EMBEDDING_LENGTH

3072

In [9]:
# LLM client; registered as the llama-index default in a later cell.
llm = GoogleGenAI(
    model=LLM_MODEL,
    api_key=GEMINI_API_KEY,
)
# Embedding client. The dimension is taken from EMBEDDING_LENGTH rather than a
# literal so it cannot drift from the `embed_dim` given to the vector store
# when the value is overridden through the environment.
embed_model = GoogleGenAIEmbedding(
    model_name=EMBEDDING_MODEL,
    api_key=GEMINI_API_KEY,
    dimensions=EMBEDDING_LENGTH,
)

In [10]:
from typing import List
class MyGoogleGenAIEmbedding(GoogleGenAIEmbedding):
    """GoogleGenAIEmbedding variant that prints each input before embedding it.

    A debugging aid: every hook prints what it was asked to embed and then
    delegates, unchanged, to the parent implementation.
    """

    def _get_query_embedding(self, query: str) -> List[float]:
        """Print the query, then return the parent's embedding for it."""
        print(f"My Query: {query}")
        return super()._get_query_embedding(query)

    def _get_text_embedding(self, text: str) -> List[float]:
        """Print the text, then return the parent's embedding for it."""
        print(f"Text: {text}")
        # Forward the string itself. The parent method is annotated to take a
        # single str; wrapping it in a list hands it a different type than the
        # one this signature promises.
        return super()._get_text_embedding(text)

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Print the batch of texts, then return the parent's embeddings."""
        print(f"Texts: {texts}")
        return super()._get_text_embeddings(texts)

# Swap in the logging subclass. The dimension comes from EMBEDDING_LENGTH so it
# stays in step with the `embed_dim` used for the vector store.
embed_model = MyGoogleGenAIEmbedding(
    model_name=EMBEDDING_MODEL,
    api_key=GEMINI_API_KEY,
    dimensions=EMBEDDING_LENGTH,
)

In [11]:
# Rebind `embed_model` to the plain client (this replaces the logging subclass
# instance created in the previous cell) and embed one sample string.
embed_model = GoogleGenAIEmbedding(
    model_name=EMBEDDING_MODEL,
    api_key=GEMINI_API_KEY,
    dimensions=EMBEDDING_LENGTH,
)
sample_text = "Test text"
embedding = embed_model.get_text_embedding(sample_text)
print(len(embedding))
# Fail here, with a readable message, instead of later inside the vector store
# if the model's output length and the configured length disagree.
assert len(embedding) == EMBEDDING_LENGTH, (
    f"embedding has {len(embedding)} dimensions, expected {EMBEDDING_LENGTH}"
)

3072


In [12]:
from llama_index.core import Settings

# Register the clients on llama-index's global Settings object. The index and
# query engine created later in this notebook are not handed any model
# explicitly.
Settings.llm = llm
Settings.embed_model = embed_model

In [13]:
# Name of the PostgreSQL database that holds the vector data.
vector_db_name = "vector_db"
# Table name handed to PGVectorStore. The raw SQL later in this notebook reads
# the rows back from `data_<table name>`.
vector_db_table_name = "blogpost" # -> data_blogpost

In [14]:
DATABASE_URL = config("DATABASE_URL_POOL")
# Rewrite the legacy "postgres://" scheme prefix to "postgresql://";
# any other URL is left exactly as it was read.
DATABASE_URL = (
    "postgresql://" + DATABASE_URL[len("postgres://"):]
    if DATABASE_URL.startswith("postgres://")
    else DATABASE_URL
)

In [15]:
from sqlalchemy import create_engine, text

# One-time bootstrap: make sure the database named by `vector_db_name` exists.
# AUTOCOMMIT is requested because PostgreSQL refuses to run CREATE DATABASE
# inside a transaction block.
engine = create_engine(DATABASE_URL, isolation_level="AUTOCOMMIT")
with engine.connect() as connection:
    # Look the database up in the catalog; the name travels as a bound parameter.
    result = connection.execute(text("SELECT 1 FROM pg_database WHERE datname = :db_name"), {"db_name": vector_db_name})
    db_exists = result.scalar() == 1
    if not db_exists:
        # NOTE(review): this statement runs on the connection opened from
        # DATABASE_URL, so the extension is created in that database rather
        # than in the one created on the next line -- confirm that is intended.
        connection.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))
        # The database name is interpolated into the statement text, so this is
        # only safe while vector_db_name is a trusted constant.
        connection.execute(text(f"CREATE DATABASE {vector_db_name}"))

In [16]:
from sqlalchemy import make_url

# Parse the connection URL so its parts can be passed individually; `url` is
# reused later in the notebook to build the raw-SQL connection string.
url = make_url(DATABASE_URL)
# Point the store at `vector_db_name` on the same host with the same
# credentials. Only the fields listed here are forwarded; anything else carried
# by DATABASE_URL (for example query-string options) is not passed on.
vector_store = PGVectorStore.from_params(
    database=vector_db_name,
    host=url.host,
    password=url.password,
    # Fall back to 5432 when the URL carries no explicit port.
    port=url.port or 5432,
    user=url.username,
    table_name=vector_db_table_name,
    # Must equal the length of the vectors produced by the embedding model.
    embed_dim=EMBEDDING_LENGTH,
)

In [17]:
from llama_index.core import VectorStoreIndex, StorageContext

# Wrap the vector store in a storage context and build an index on top of it,
# then derive a query engine with default options.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_vector_store(vector_store, storage_context=storage_context)
query_engine = index.as_query_engine()

In [18]:
# Smoke test: run a placeholder query against whatever the table already holds
# (this cell runs before the documents are inserted in Part 2).
query_engine.query("My query")

Response(response='I am sorry, I cannot answer your query as it is empty.\n', source_nodes=[NodeWithScore(node=TextNode(id_='5e2471bc-e539-4d24-b1dd-828eed7883e4', embedding=None, metadata={'pk': 272, 'title': 'Blog Post 3'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='272', node_type='4', metadata={'pk': 272, 'title': 'Blog Post 3'}, hash='a24f0a8a2afac5e73456f07599c23e73ad3ffea30d4b47994521c7316a22bb2b')}, metadata_template='{key}: {value}', metadata_separator='\n', text='The day is bright', mimetype='text/plain', start_char_idx=0, end_char_idx=17, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=0.642418876198892), NodeWithScore(node=TextNode(id_='067d3b72-0973-4a12-908f-10a5c4a4b076', embedding=None, metadata={'pk': 273, 'title': 'Blog Post 4'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNo

## Part 2

In [19]:
from llama_index.core import Document

# One llama-index Document per deletable blog post, carrying the embedding
# already stored on the model instance plus the pk/title as metadata.
qs = BlogPost.objects.filter(can_delete=True)
docs = [
    Document(
        text=f"{post.get_embedding_text_raw()}",
        doc_id=str(post.id),
        embedding=post.embedding.tolist(),
        metadata={"pk": post.pk, "title": post.title},
    )
    for post in qs
]

# docs

In [20]:
# Re-index every document: first remove whatever is stored under the same
# reference id, then insert the fresh copy. The delete comes first on purpose,
# so the order of these two calls must not be swapped.
for doc in docs:
    index.delete_ref_doc(f"{doc.id_}", delete_from_docstore=True)
    index.insert(doc)

In [21]:
# Query the freshly populated index; `response` is inspected in the next cells.
response = query_engine.query("The day")

In [22]:
# Show only the answer text of the response as the cell output.
str(response.response)

'The day is bright\n'

In [23]:
# Print every key/value pair of each metadata entry attached to the response.
for node_metadata in response.metadata.values():
    for key, value in node_metadata.items():
        print(key, value)

pk 272
title Blog Post 3
pk 273
title Blog Post 4


## Part 3

In [24]:
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
import numpy as np

port = url.port or 5432
# Assemble the URL through SQLAlchemy instead of an f-string so that special
# characters in the user name or password (e.g. "@", ":" or "/") are escaped
# and cannot corrupt the connection string. It is rendered back to a plain
# string, so `db_url` keeps the type it had before.
db_url = URL.create(
    drivername="postgresql",
    username=url.username,
    password=url.password,
    host=url.host,
    port=port,
    database=vector_db_name,
).render_as_string(hide_password=False)

engine = create_engine(db_url)

with engine.connect() as connection:
    # Only the metadata and the stored vector are needed for the comparison
    # done in the following cells.
    query = text(f"SELECT metadata_, embedding FROM data_{vector_db_table_name}")

    # Execute the query and pull every row into memory.
    result = connection.execute(query)
    rows = result.fetchall()

In [25]:
def calculate_cosine_metrics(v1, v2):
    """Return cosine similarity and cosine distance of two vectors as percentages.

    Args:
        v1: First vector (any 1-D array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        A tuple ``(similarity_pct, distance_pct)`` of ints, where
        ``similarity_pct`` is ``100 * cos(v1, v2)`` and ``distance_pct`` is
        ``100 * (1 - cos(v1, v2))``, each rounded to the nearest integer.

    Raises:
        ValueError: If either vector has zero magnitude, for which the cosine
            is undefined.
    """
    magnitude1 = np.linalg.norm(v1)
    magnitude2 = np.linalg.norm(v2)
    if magnitude1 == 0 or magnitude2 == 0:
        # Without this guard the division yields NaN and the int conversion
        # fails with an unrelated-looking message.
        raise ValueError("cosine similarity is undefined for a zero-magnitude vector")
    cosine_similarity = np.dot(v1, v2) / (magnitude1 * magnitude2)
    cosine_distance = 1 - cosine_similarity
    # Round instead of truncating: floating-point error can put the similarity
    # of identical vectors at 0.9999999..., which truncation reports as 99.
    similarity_pct = int(round(float(cosine_similarity) * 100))
    distance_pct = int(round(float(cosine_distance) * 100))
    return similarity_pct, distance_pct

In [26]:
# Sanity check: for every stored row, compare the vector held in the vector
# table with the embedding stored on the matching BlogPost.
for row in rows:
    metadata_, embedding = row[0], row[1]

    blog_post_pk = metadata_.get("pk")
    obj = BlogPost.objects.get(pk=blog_post_pk)
    # The raw query hands the vector back as text (apparently "[x,y,...]"),
    # so strip the brackets and parse the comma-separated numbers.
    embedding_array = np.array(embedding.strip('[]').split(','), dtype=float)
    obj_embedding_array = np.array(obj.embedding, dtype=float)
    # Compare the vectors themselves. Passing the `.shape` tuples compares
    # "(n,)" with "(n,)" and therefore always reports a perfect match.
    print(calculate_cosine_metrics(embedding_array, obj_embedding_array))
    

(100, 0)
(100, 0)
(100, 0)
(100, 0)
