## Vector Database setup

Remove old Weaviate DB files

In [1]:
# Destructive reset: delete the old Weaviate DB files under
# ~/.local/share/weaviate so the notebook starts from an empty database.
!rm -rf ~/.local/share/weaviate


### Step 1 - Download sample data

In [2]:
import requests
import json

# Download the sample data (a small JSON file of Jeopardy questions).
DATA_URL = 'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json'
resp = requests.get(DATA_URL, timeout=30)  # bound the wait instead of hanging forever
resp.raise_for_status()  # fail loudly on 4xx/5xx rather than parsing an error page as JSON
data = resp.json()  # Parse the JSON payload into Python objects

# Preview the parsed payload: container type and number of records
print(type(data), len(data))

def json_print(data):
    """Pretty-print ``data`` to stdout as JSON, indented by two spaces."""
    rendered = json.dumps(data, indent=2)
    print(rendered)

# Preview the first record of the downloaded data.
json_print(data[0])

<class 'list'> 10
{
  "Category": "SCIENCE",
  "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
  "Answer": "Liver"
}


### Step 2 - Create an embedded instance of Weaviate vector database

In [3]:
import weaviate, os
from weaviate import EmbeddedOptions
import openai

from dotenv import load_dotenv, find_dotenv
# Load variables from the local .env file into the process environment.
_ = load_dotenv(find_dotenv())
openai.api_key = os.environ['OPENAI_API_KEY']

# Extra headers handed to the Weaviate client. Both values are read from the
# environment, so no credential is hard-coded in the notebook.
# NOTE(review): presumably consumed by Weaviate's OpenAI modules - confirm.
openai_headers = {
    "X-OpenAI-BaseURL": os.environ['OPENAI_API_BASE'],
    "X-OpenAI-Api-Key": openai.api_key,
}

# Start an embedded Weaviate instance and connect to it.
client = weaviate.Client(
    embedded_options=EmbeddedOptions(),
    additional_headers=openai_headers,
)
is_ready = client.is_ready()
print(f"Client created? {is_ready}")

Started /home/jovyan/.cache/weaviate-embedded: process ID 442
Client created? True


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2025-09-26T01:59:28Z"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2025-09-26T01:59:28Z"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50060","time":"2025-09-26T01:59:28Z"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2025-09-26T01:59:28Z"}
            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [4]:
# Show the instance metadata (hostname and the list of available modules).
json_print(client.get_meta())

{
  "hostname": "http://127.0.0.1:8079",
  "modules": {
    "generative-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "Generative Search - OpenAI"
    },
    "qna-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "OpenAI Question & Answering Module"
    },
    "ref2vec-centroid": {},
    "reranker-cohere": {
      "documentationHref": "https://txt.cohere.com/rerank/",
      "name": "Reranker - Cohere"
    },
    "text2vec-cohere": {
      "documentationHref": "https://docs.cohere.ai/embedding-wiki/",
      "name": "Cohere Module"
    },
    "text2vec-huggingface": {
      "documentationHref": "https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task",
      "name": "Hugging Face Module"
    },
    "text2vec-openai": {
      "documentationHref": "https://platform.openai.com/docs/guides/embeddings/what-are-embeddings",
      "nam

### Step 3 - Create Question collection

In [5]:
# Reset the schema. CAUTION: this deletes the existing "Question" collection.
if client.schema.exists("Question"):
    client.schema.delete_class("Question")

# Settings passed to the text2vec-openai module for this class.
vectorizer_settings = {
    "model": "ada",
    "modelVersion": "002",
    "type": "text",
    "baseURL": os.environ["OPENAI_API_BASE"],
}

# Class definition: "Question" objects use OpenAI as the vectorizer.
class_obj = {
    "class": "Question",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {"text2vec-openai": vectorizer_settings},
}

client.schema.create_class(class_obj)

{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"question_uIyjRAT1kPYW","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2025-09-26T01:59:28Z","took":55660}


### Step 4 - Load sample data and generate vector embeddings

In [6]:
# Reminder of the record structure: the first record has "Category",
# "Question" and "Answer" keys.
json_print(data[0])

{
  "Category": "SCIENCE",
  "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
  "Answer": "Liver"
}


In [7]:
# Batch import of the sample data (batch_size=5).
with client.batch.configure(batch_size=5) as batch:
    for position, record in enumerate(data, start=1):
        print(f"importing question: {position}")

        # Map the capitalised source keys onto lowercase property names.
        batch.add_data_object(
            data_object={
                "answer": record["Answer"],
                "question": record["Question"],
                "category": record["Category"],
            },
            class_name="Question",
        )

importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10


In [8]:
# Aggregate query: how many "Question" objects are stored?
count = (
    client.query
    .aggregate("Question")
    .with_meta_count()
    .do()
)
json_print(count)

{
  "data": {
    "Aggregate": {
      "Question": [
        {
          "meta": {
            "count": 10
          }
        }
      ]
    }
  }
}


## Let's extract the vector that represents each question!

In [24]:
# Fetch a single "Question" object and request its embedding through the
# "vector" additional property.
vector_query = client.query.get("Question", ["category", "question", "answer"])
vector_query = vector_query.with_additional("vector").with_limit(1)
result = vector_query.do()

json_print(result)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "vector": [
              -0.015748857,
              0.020325335,
              -0.0012039483,
              -0.005848451,
              -0.008818572,
              0.024311723,
              0.015447254,
              0.0085235275,
              -0.022121832,
              -0.011251056,
              -0.014240847,
              0.020718727,
              0.006048426,
              0.018751761,
              0.0026177063,
              0.008372727,
              0.021426836,
              0.0015039109,
              -0.0054058833,
              -0.0060287565,
              -0.022305416,
              -0.001745684,
              0.00792688,
              -0.00632708,
              -0.00794655,
              0.027222835,
              0.019420529,
              -0.04665648,
              -0.00024054377,
              -0.012667273,
              -0.009369324,
              0.00173421,
        

In [25]:
# Inspect the top-level keys of the response dict.
result.keys()

dict_keys(['data'])

In [27]:
# One level down: the keys under 'data'.
result['data'].keys()

dict_keys(['Get'])

In [28]:
# The keys under 'Get' (here the queried class, 'Question').
result['data']['Get'].keys()

dict_keys(['Question'])

In [32]:
# Number of returned objects and the container type that holds them.
len(result['data']['Get']['Question']), type(result['data']['Get']['Question'])

(1, list)

In [34]:
# The first returned object, including its '_additional' vector.
result['data']['Get']['Question'][0]

{'_additional': {'vector': [-0.015748857,
   0.020325335,
   -0.0012039483,
   -0.005848451,
   -0.008818572,
   0.024311723,
   0.015447254,
   0.0085235275,
   -0.022121832,
   -0.011251056,
   -0.014240847,
   0.020718727,
   0.006048426,
   0.018751761,
   0.0026177063,
   0.008372727,
   0.021426836,
   0.0015039109,
   -0.0054058833,
   -0.0060287565,
   -0.022305416,
   -0.001745684,
   0.00792688,
   -0.00632708,
   -0.00794655,
   0.027222835,
   0.019420529,
   -0.04665648,
   -0.00024054377,
   -0.012667273,
   -0.009369324,
   0.00173421,
   -0.034671087,
   -0.02589841,
   -0.011788694,
   -0.0009974166,
   -0.007159763,
   0.00249641,
   0.027065478,
   -0.029425839,
   0.024731342,
   0.016535643,
   -0.004091293,
   -0.016483191,
   -0.02060071,
   -0.0030209348,
   0.013506512,
   -0.00962503,
   -0.011093698,
   -0.003855257,
   -0.0055534057,
   0.0407818,
   -0.020994103,
   -0.013532739,
   -0.0071204235,
   -0.008425179,
   0.0060222,
   0.0046682702,
   -0.008792

In [35]:
# Fields of the returned object: the requested properties plus '_additional'.
result['data']['Get']['Question'][0].keys()

dict_keys(['_additional', 'answer', 'category', 'question'])

In [39]:
# '_additional' carries the extra fields requested with with_additional().
result['data']['Get']['Question'][0]['_additional'].keys()

dict_keys(['vector'])

In [40]:
# The embedding itself: a list of floats.
result['data']['Get']['Question'][0]['_additional']['vector']

[-0.015748857,
 0.020325335,
 -0.0012039483,
 -0.005848451,
 -0.008818572,
 0.024311723,
 0.015447254,
 0.0085235275,
 -0.022121832,
 -0.011251056,
 -0.014240847,
 0.020718727,
 0.006048426,
 0.018751761,
 0.0026177063,
 0.008372727,
 0.021426836,
 0.0015039109,
 -0.0054058833,
 -0.0060287565,
 -0.022305416,
 -0.001745684,
 0.00792688,
 -0.00632708,
 -0.00794655,
 0.027222835,
 0.019420529,
 -0.04665648,
 -0.00024054377,
 -0.012667273,
 -0.009369324,
 0.00173421,
 -0.034671087,
 -0.02589841,
 -0.011788694,
 -0.0009974166,
 -0.007159763,
 0.00249641,
 0.027065478,
 -0.029425839,
 0.024731342,
 0.016535643,
 -0.004091293,
 -0.016483191,
 -0.02060071,
 -0.0030209348,
 0.013506512,
 -0.00962503,
 -0.011093698,
 -0.003855257,
 -0.0055534057,
 0.0407818,
 -0.020994103,
 -0.013532739,
 -0.0071204235,
 -0.008425179,
 0.0060222,
 0.0046682702,
 -0.008792346,
 -0.018686194,
 0.0064877155,
 0.0018063321,
 -0.023170881,
 0.0056091365,
 -0.016837245,
 0.0019079588,
 0.002873412,
 0.009690595,
 -0.0

In [41]:
# Length of the embedding: a 1536-dimensional vector.
len(result['data']['Get']['Question'][0]['_additional']['vector'])

1536

## Query time
What is the distance between the query `biology` and each of the returned objects?

In [10]:
# Semantic search driven by free text. The near_text input is converted to an
# embedding by the vectorizer at query time, so "concepts" does not have to be
# a field of the stored objects.
biology_query = (
    client.query
    .get("Question", ["question", "answer", "category"])
    .with_near_text({"concepts": "biology"})
    .with_additional('distance')  # also return the 'distance' property
    .with_limit(2)
)
response = biology_query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "distance": 0.1980502
          },
          "answer": "DNA",
          "category": "SCIENCE",
          "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance"
        },
        {
          "_additional": {
            "distance": 0.20255089
          },
          "answer": "species",
          "category": "SCIENCE",
          "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
        }
      ]
    }
  }
}


In [11]:
# Retrieve up to 10 questions nearest to the concept "animals".
near_animals = {"concepts": ["animals"]}
animals_query = client.query.get("Question", ["question", "answer"])
animals_query = (
    animals_query
    .with_near_text(near_animals)
    .with_limit(10)
    .with_additional(["distance"])
)
response = animals_query.do()

# Results are sorted by ascending 'distance', i.e. the nearest vectors first.
json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "distance": 0.19061601
          },
          "answer": "Elephant",
          "question": "It's the only living mammal in the order Proboseidea"
        },
        {
          "_additional": {
            "distance": 0.19258726
          },
          "answer": "the nose or snout",
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
        },
        {
          "_additional": {
            "distance": 0.20495284
          },
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        },
        {
          "_additional": {
            "distance": 0.21570086
          },
          "answer": "species",
          "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
        },
        {
    

## We can tell the vector database to drop results beyond a threshold distance!

In [12]:
# Same "animals" search, filtered by distance: only vectors below the 0.24
# threshold distance are wanted.
near_animals = {"concepts": ["animals"], "distance": 0.24}
response = (
    client.query
    .get("Question", ["question", "answer"])
    .with_near_text(near_animals)
    .with_limit(10)
    .with_additional(["distance"])
    .do()
)

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "distance": 0.19064832
          },
          "answer": "Elephant",
          "question": "It's the only living mammal in the order Proboseidea"
        },
        {
          "_additional": {
            "distance": 0.1926108
          },
          "answer": "the nose or snout",
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
        },
        {
          "_additional": {
            "distance": 0.20497632
          },
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        },
        {
          "_additional": {
            "distance": 0.21571934
          },
          "answer": "species",
          "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
        },
        {
     

## Vector database support for CRUD operations

### Create

In [13]:
# Create one object in the "Question" collection and keep the returned
# identifier for the read / update / delete cells.
new_question = {
    'question': "Leonardo da Vinci was born in this country.",
    'answer': "Italy",
    'category': "Culture",
}
object_uuid = client.data_object.create(
    data_object=new_question,
    class_name="Question",
)

In [14]:
# Show the identifier returned by the create call.
print(object_uuid)

f364746b-8ee8-4972-b676-2643106b73ab


### Read

In [15]:
# Read the object back by its UUID (no vector requested here).
data_object = client.data_object.get_by_id(
    object_uuid,
    class_name="Question",
)
json_print(data_object)

{
  "class": "Question",
  "creationTimeUnix": 1758851969608,
  "id": "f364746b-8ee8-4972-b676-2643106b73ab",
  "lastUpdateTimeUnix": 1758851969608,
  "properties": {
    "answer": "Italy",
    "category": "Culture",
    "question": "Leonardo da Vinci was born in this country."
  },
  "vectorWeights": null
}


In [16]:
# Same lookup, this time asking for the stored vector as well.
fetch_options = {"class_name": 'Question', "with_vector": True}
data_object = client.data_object.get_by_id(object_uuid, **fetch_options)

json_print(data_object)

{
  "class": "Question",
  "creationTimeUnix": 1758851969608,
  "id": "f364746b-8ee8-4972-b676-2643106b73ab",
  "lastUpdateTimeUnix": 1758851969608,
  "properties": {
    "answer": "Italy",
    "category": "Culture",
    "question": "Leonardo da Vinci was born in this country."
  },
  "vector": [
    0.022487657,
    -0.013111404,
    -0.003148336,
    -0.04778947,
    -0.0038502757,
    0.011966554,
    -0.017409386,
    -0.0040165666,
    -0.0045857937,
    -0.034588523,
    0.010476332,
    0.025634395,
    -0.003177117,
    -0.0072912197,
    0.010303645,
    -0.0015126084,
    0.013009071,
    0.005532373,
    0.03962842,
    -0.007988363,
    0.002385636,
    0.021246871,
    0.0117554935,
    -0.010975204,
    0.0050878646,
    0.0018499872,
    0.016603515,
    -0.032055784,
    0.014915022,
    -0.031851117,
    -0.007847655,
    0.0007734929,
    -0.009983854,
    -0.013367237,
    -0.018854838,
    -0.014544065,
    -0.0076621766,
    -0.028115967,
    0.0082378,
    0.01538

### Update

In [17]:
# Update the object: only the 'answer' property is sent.
answer_patch = {'answer': "Florence, Italy"}
client.data_object.update(
    data_object=answer_patch,
    class_name="Question",
    uuid=object_uuid,
)

In [18]:
# Re-read the object to confirm the update took effect.
lookup_kwargs = {"class_name": 'Question'}
data_object = client.data_object.get_by_id(object_uuid, **lookup_kwargs)

json_print(data_object)

{
  "class": "Question",
  "creationTimeUnix": 1758851969608,
  "id": "f364746b-8ee8-4972-b676-2643106b73ab",
  "lastUpdateTimeUnix": 1758851970189,
  "properties": {
    "answer": "Florence, Italy",
    "category": "Culture",
    "question": "Leonardo da Vinci was born in this country."
  },
  "vectorWeights": null
}


### Delete

In [19]:
# Check the object count before the delete.
count_before = client.query.aggregate("Question").with_meta_count().do()
json_print(count_before)

{
  "data": {
    "Aggregate": {
      "Question": [
        {
          "meta": {
            "count": 11
          }
        }
      ]
    }
  }
}


In [20]:
# Delete the object, addressing it by UUID within the "Question" class.
client.data_object.delete(
    class_name="Question",
    uuid=object_uuid,
)

In [21]:
# Check the object count after the delete.
count_after = client.query.aggregate("Question").with_meta_count().do()
json_print(count_after)

{
  "data": {
    "Aggregate": {
      "Question": [
        {
          "meta": {
            "count": 10
          }
        }
      ]
    }
  }
}
