<a href="https://colab.research.google.com/github/Lucky-victory/devsend-app/blob/main/Code_snippets_for_Vector_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Prerequisites

In [1]:
%%capture
!pip install PyMySQL==1.1.0
!pip install openai==1.27.0
!pip install SQLAlchemy==2.0.30
!pip install tidb-vector==0.0.9

from sqlalchemy import Column, Integer, String, Text, create_engine, URL
from google.colab import userdata
from sqlalchemy.orm import Session, declarative_base
from tidb_vector.sqlalchemy import VectorType

import openai

# 1. Connect to a TiDB Cloud Cluster

In [4]:
def get_db_url():
    """Assemble the SQLAlchemy URL for the TiDB cluster from Colab secrets.

    Reads TIDB_USER, TIDB_PASSWORD, TIDB_HOST, TIDB_PORT and TIDB_DB_NAME
    through ``userdata.get`` (in that order) and returns a ``mysql+pymysql``
    URL whose query options request certificate and identity verification.
    """
    user = userdata.get("TIDB_USER")
    secret = userdata.get("TIDB_PASSWORD")
    hostname = userdata.get('TIDB_HOST')
    # The secret is stored as text; the URL needs a numeric port.
    port_number = int(userdata.get("TIDB_PORT"))
    db_name = userdata.get("TIDB_DB_NAME")

    tls_options = {"ssl_verify_cert": True, "ssl_verify_identity": True}

    return URL(
        drivername="mysql+pymysql",
        username=user,
        password=secret,
        host=hostname,
        port=port_number,
        database=db_name,
        query=tls_options,
    )

# Engine used by every database cell below. pool_recycle=300 asks SQLAlchemy to
# replace pooled connections that have been open for more than 300 seconds.
engine = create_engine(get_db_url(), pool_recycle=300)

# 2. Create Table and Vector Index

In [6]:
# OpenAI embedding model used for both the stored content and the search query.
model_name = "text-embedding-3-small"
# Length of the vectors held in the VectorType column. It has to equal the
# embedding size returned by model_name (assumed 1536 here; re-check if the
# model is changed).
dim_of_embedding_model = 1536

# Declarative base class that the ORM model(s) in this notebook inherit from.
Base = declarative_base()
class Entity(Base):
    """ORM model for one stored text snippet together with its embedding.

    Columns:
        id: integer primary key.
        content: the original text.
        content_vec: embedding of ``content`` with ``dim_of_embedding_model``
            dimensions, covered by an HNSW vector index.
    """

    __tablename__ = "entity"

    id = Column(Integer, primary_key=True)
    content = Column(Text)
    # The column comment is how the HNSW index is requested. Its distance
    # metric is set to cosine so that it matches the cosine_distance ordering
    # used by the retrieval query in this notebook; an index declared with l2
    # would not correspond to that query.
    content_vec = Column(
        VectorType(dim=dim_of_embedding_model),
        comment="hnsw(distance=cosine)"
    )

# Create the tables for every model registered on Base (here: Entity) using the
# shared engine. NOTE(review): create_all is expected to skip tables that
# already exist, so schema edits need the old table dropped first — confirm.
Base.metadata.create_all(engine)

# 3. Create Embedding Vector and Save Data


In [7]:
def embedding(content):
    """Return the OpenAI embedding vector for ``content``.

    A client is built on every call from the ``OPENAI_API_KEY`` Colab secret.
    The text is submitted as a single-element batch to the model named by
    ``model_name`` and the embedding of that one item is returned.
    """
    api_key = userdata.get('OPENAI_API_KEY')
    client = openai.OpenAI(api_key=api_key)
    response = client.embeddings.create(input=[content], model=model_name)
    # Only one input was sent, so the result of interest is the first entry.
    first_item = response.data[0]
    return first_item.embedding

# Sample documents to index. Adjacent literals are concatenated by the parser,
# so each value is a single-line string.
tidb_content = (
    'TiDB is an open-source distributed SQL database that supports '
    'Hybrid Transactional and Analytical Processing (HTAP) workloads.'
)
tikv_content = (
    'TiKV is an open-source, distributed, and transactional '
    'key-value database. Unlike other traditional NoSQL systems.'
)
pd_content = (
    'The Placement Driver (PD) server is the metadata '
    'managing component of the entire cluster.'
)

# Embed each document and stage it in insertion order, then persist all rows
# with one commit.
with Session(engine) as session:
    for text in (tidb_content, tikv_content, pd_content):
        row = Entity(content=text, content_vec=embedding(text))
        session.add(row)
    session.commit()

SecretNotFoundError: Secret OPENAI_API_KEY does not exist.

# 4. Retrieve Content via Vector Cosine Distance

In [None]:
query = 'What is TiDB?'

# Embed the question with the same helper used for the stored content. The
# OpenAI client only exists as a local variable inside embedding(), so it
# cannot be referenced directly from this cell.
embedding_query = embedding(query)

with Session(engine) as session:
    # Order rows by cosine distance to the query vector and keep the closest one.
    entity = session.query(Entity).order_by(
        Entity.content_vec.cosine_distance(embedding_query)
    ).limit(1).first()

    # first() returns None when the table has no rows (for example, if the
    # insert cell failed), so guard before reading the attribute.
    if entity is None:
        print("No matching content found.")
    else:
        print(entity.content)