In [1]:
pip install -U weaviate-client

Defaulting to user installation because normal site-packages is not writeable
Collecting weaviate-client
  Downloading weaviate_client-4.16.10-py3-none-any.whl.metadata (3.7 kB)
Collecting httpx<0.29.0,>=0.26.0 (from weaviate-client)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting validators<1.0.0,>=0.34.0 (from weaviate-client)
  Downloading validators-0.35.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pydantic<3.0.0,>=2.8.0 (from weaviate-client)
  Downloading pydantic-2.11.9-py3-none-any.whl.metadata (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.4/68.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting deprecation<3.0.0,>=2.1.0 (from weaviate-client)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting httpcore==1.* (from httpx<0.29.0,>=0.26.0->weaviate-client)
  Downloading httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Collecting h11>=0.16 (from httpcore==1.*->httpx<0.29.0,>=0.26.0->weavia

In [1]:
import weaviate
# Show which client version the kernel actually imports.
weaviate.__version__

'4.16.10'

## Vector Database setup

Remove old Weaviate DB files

In [None]:
# CAUTION: irreversibly deletes ~/.local/share/weaviate and everything under it.
!rm -rf ~/.local/share/weaviate


### Step 1 - Download sample data

In [2]:
import requests
import json

# Download the data
DATA_URL = 'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json'
resp = requests.get(DATA_URL, timeout=30)  # bounded wait so a stalled connection cannot hang the kernel
resp.raise_for_status()  # surface HTTP errors here instead of as a confusing JSON decode error
data = resp.json()  # Load data

# Parse the JSON and preview it
print(type(data), len(data))

def json_print(data):
    """Pretty-print ``data`` to stdout as JSON with a two-space indent.

    Args:
        data: Any JSON-serializable value. Values that ``json`` cannot encode
            natively (for example ``uuid.UUID`` or ``datetime`` objects) are
            rendered with ``str()`` instead of raising ``TypeError``.
    """
    print(json.dumps(data, indent=2, default=str))

# Pretty-print the first record to see which keys each entry carries.
json_print(data[0])

<class 'list'> 10
{
  "Category": "SCIENCE",
  "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
  "Answer": "Liver"
}


### Step 2 - Create an embedded instance of the Weaviate vector database

In [5]:
import weaviate, os
# from weaviate import EmbeddedOptions
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
# Indexing os.environ raises KeyError when OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']


In [6]:
# The API key is mandatory: fail fast with a KeyError if it is missing.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
# The base URL is optional (only needed for a proxy/Azure endpoint), so read it
# without raising; it is None when the variable is not set.
OPENAI_API_BASE = os.environ.get('OPENAI_API_BASE')

In [7]:
import weaviate.classes as wvc  # config + query helpers

# Hand the OpenAI API key to Weaviate as a request header.
headers = {"X-OpenAI-Api-Key": OPENAI_API_KEY}

# Optional: use a non-default base URL (proxy/Azure).
# Test the same constant that is sent, so there is a single source of truth.
if OPENAI_API_BASE:
    headers["X-OpenAI-Baseurl"] = OPENAI_API_BASE

# Start an embedded server (or replace with connect_to_local / connect_to_weaviate_cloud)
client = weaviate.connect_to_embedded(
    # pin the binary version if you want deterministic behavior; otherwise omit
    # version="1.30.0",
    headers=headers,
    # environment_variables={"LOG_LEVEL": "error"},
)

print("is_ready?", client.is_ready())       # v4 client health check

{"action":"startup","build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","level":"info","msg":"Feature flag LD integration disabled: could not locate WEAVIATE_LD_API_KEY env variable","time":"2025-09-26T02:46:25Z"}
{"action":"startup","build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2025-09-26T02:46:25Z"}
{"action":"startup","auto_schema_enabled":{},"build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","level":"info","msg":"auto schema enabled setting is set to \"\u0026{\u003cnil\u003e {{{} {0 0}} 0 0 {{} 0} {{} 0}} true}\"","time":"2025-09-26T02:46:25Z"}
{"build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"",

is_ready? True


{"action":"bootstrap","build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","level":"info","msg":"node reporting ready, exiting bootstrap process","time":"2025-09-26T02:46:27Z"}
{"action":"telemetry_push","build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","level":"info","msg":"telemetry started","payload":"\u0026{MachineID:e26a9af1-c200-4367-a120-f6e435fd4dde Type:INIT Version:1.30.5 ObjectsCount:0 OS:linux Arch:amd64 UsedModules:[] CollectionsCount:0}","time":"2025-09-26T02:46:28Z"}


In [8]:
# Pretty-print the metadata the client reports for the running server.
json_print(client.get_meta())

{
  "grpcMaxMessageSize": 104858000,
  "hostname": "http://127.0.0.1:8079",
  "modules": {
    "generative-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "Generative Search - OpenAI"
    },
    "qna-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "OpenAI Question & Answering Module"
    },
    "ref2vec-centroid": {},
    "reranker-cohere": {
      "documentationHref": "https://txt.cohere.com/rerank/",
      "name": "Reranker - Cohere"
    },
    "text2vec-cohere": {
      "documentationHref": "https://docs.cohere.ai/embedding-wiki/",
      "name": "Cohere Module"
    },
    "text2vec-huggingface": {
      "documentationHref": "https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task",
      "name": "Hugging Face Module"
    },
    "text2vec-openai": {
      "documentationHref": "https://platform.openai.com/docs/guides/embeddin

### Step 3 - Create the Question collection

In [9]:
# --- (Re)create collection "Question" ----------------------------------------
# Delete if exists (CAUTION: removes data)
# Dropping any existing "Question" collection lets the create cell that follows
# start from an empty collection on every run.
if client.collections.exists("Question"):
    client.collections.delete("Question")


In [10]:
from weaviate.classes.config import Property, DataType, Configure

# Create the "Question" collection, vectorized by the text2vec-openai module.
# You can pick a model & dimensions (examples below); if none provided, Weaviate uses defaults.
questions = client.collections.create(
    name="Question",
    properties=[
        Property(name="question", data_type=DataType.TEXT),
        Property(name="answer",   data_type=DataType.TEXT),
        Property(name="category", data_type=DataType.TEXT),
    ],
    vector_config=Configure.Vectors.text2vec_openai(
        # Examples of model config (uncomment one that matches your setup):

        # Available models
        # https://docs.weaviate.io/weaviate/model-providers/openai/embeddings#available-models
        # model="text-embedding-3-small", dimensions=512,    # v3-family, small (512/1536)
        # model="text-embedding-3-large", dimensions=1024,   # v3-family, large (256/1024/3072)

        # NOTE: the keyword is `type_` (trailing underscore). Passing `type=` fails with:
        #   TypeError: _Vectors.text2vec_openai() got an unexpected keyword argument 'type'
        # https://docs.weaviate.io/weaviate/model-providers/openai/embeddings#for-older-model-families-eg-ada
        model="ada", model_version="002", type_="text",     # legacy ada example
        # base_url=os.getenv("OPENAI_API_BASE"),             # if you want to hard-set base here
    ),
    # generative_config=Configure.Generative.openai(),      # Optional: for RAG later
)

{"action":"hnsw_prefill_cache_async","build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","level":"info","msg":"not waiting for vector cache prefill, running in background","time":"2025-09-26T02:47:13Z","wait_for_cache_prefill":false}
{"build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","level":"info","msg":"Created shard question_37mWipilqZbW in 1.043723ms","time":"2025-09-26T02:47:13Z"}
{"action":"hnsw_vector_cache_prefill","build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","count":1000,"index_id":"vectors_default","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2025-09-26T02:47:13Z","took":49197}


### Step 4 - Load sample data and generate vector embeddings

In [11]:
# Reminder of the raw record shape: the keys are capitalized
# ("Category", "Question", "Answer"), unlike the lowercase collection properties.
json_print(data[0])

{
  "Category": "SCIENCE",
  "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
  "Answer": "Liver"
}


In [12]:
# --- Batch import -------------------------------------------------------------
# Use the collection's batcher; v4 gRPC batching is much faster than v3 REST.
with questions.batch.fixed_size(batch_size=5) as batch:
    for position, record in enumerate(data, start=1):
        print(f"importing question: {position}")
        # The source JSON uses capitalized keys; the collection properties are lowercase.
        batch.add_object(
            properties={
                "answer":   record["Answer"],
                "question": record["Question"],
                "category": record["Category"],
            }
        )

# Report objects the batcher could not import; without this check a failed import
# (for example a rejected API key during vectorization) would go unnoticed.
failed_objects = questions.batch.failed_objects
if failed_objects:
    print(f"Failed to import {len(failed_objects)} of {len(data)} objects")
    print("First failure:", failed_objects[0])


importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10


In [13]:

# --- Aggregate count ----------------------------------------------------------
# Request only the total object count to confirm how many objects the collection holds.
agg = questions.aggregate.over_all(total_count=True)
print("Total objects:", agg.total_count)


Total objects: 10


## Let's extract the vector that represents a stored question!

In [14]:

# Fetch a single stored object together with its vector.
sample = questions.query.fetch_objects(limit=1, include_vector=True).objects[0]

# Report the length of the "default" vector, or None when that key is absent.
if "default" in sample.vector:
    vector_len = len(sample.vector["default"])
else:
    vector_len = None

json_print({
    "question": sample.properties.get("question"),
    "answer":   sample.properties.get("answer"),
    "category": sample.properties.get("category"),
    "vector_len": vector_len,
})


{
  "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification",
  "answer": "species",
  "category": "SCIENCE",
  "vector_len": 1536
}


## Query time
What is the distance between the `query`: `biology` and the returned objects?

In [15]:
# --- Vector search: near_text (with distance in metadata) --------------------
from weaviate.classes.query import MetadataQuery

# Ask for the two closest objects to "biology", requesting each hit's distance.
biology_hits = questions.query.near_text(
    query="biology",
    limit=2,
    return_metadata=MetadataQuery(distance=True),
)

# Flatten each hit into a plain dict so it can be pretty-printed as JSON.
biology_rows = []
for hit in biology_hits.objects:
    biology_rows.append({
        "question": hit.properties["question"],
        "answer": hit.properties["answer"],
        "category": hit.properties["category"],
        "distance": hit.metadata.distance,
    })
json_print(biology_rows)




[
  {
    "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification",
    "answer": "species",
    "category": "SCIENCE",
    "distance": 0.198769211769104
  },
  {
    "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance",
    "answer": "DNA",
    "category": "SCIENCE",
    "distance": 0.20344388484954834
  }
]


In [16]:
# Same search for "animals", asking for up to 10 hits with their distances.
animal_hits = questions.query.near_text(
    query="animals",
    limit=10,
    return_metadata=MetadataQuery(distance=True),
)

# Flatten each hit into a plain dict so it can be pretty-printed as JSON.
animal_rows = []
for hit in animal_hits.objects:
    animal_rows.append({
        "question": hit.properties["question"],
        "answer":   hit.properties["answer"],
        "distance": hit.metadata.distance,
    })
json_print(animal_rows)

[
  {
    "question": "The gavial looks very much like a crocodile except for this bodily feature",
    "answer": "the nose or snout",
    "distance": 0.18551617860794067
  },
  {
    "question": "It's the only living mammal in the order Proboseidea",
    "answer": "Elephant",
    "distance": 0.18998634815216064
  },
  {
    "question": "Weighing around a ton, the eland is the largest species of this animal in Africa",
    "answer": "Antelope",
    "distance": 0.19844287633895874
  },
  {
    "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification",
    "answer": "species",
    "distance": 0.2177758812904358
  },
  {
    "question": "Heaviest of all poisonous snakes is this North American rattlesnake",
    "answer": "the diamondback rattler",
    "distance": 0.22018253803253174
  },
  {
    "question": "A metal that is ductile can be pulled into this while cold & under pressure",
    "answer": "wire",
    "distance"

## We can tell the vector database to drop results beyond a distance threshold!

In [17]:
# Optional: filter results by a max distance threshold (depends on distance metric; cosine is default)
close_animal_hits = questions.query.near_text(
    query="animals",
    limit=10,
    return_metadata=MetadataQuery(distance=True),
    distance=0.24,  # keep only results within this distance
)

# Flatten each remaining hit into a plain dict so it can be pretty-printed as JSON.
close_animal_rows = []
for hit in close_animal_hits.objects:
    close_animal_rows.append({
        "question": hit.properties["question"],
        "answer":   hit.properties["answer"],
        "distance": hit.metadata.distance,
    })
json_print(close_animal_rows)


[
  {
    "question": "The gavial looks very much like a crocodile except for this bodily feature",
    "answer": "the nose or snout",
    "distance": 0.18551617860794067
  },
  {
    "question": "It's the only living mammal in the order Proboseidea",
    "answer": "Elephant",
    "distance": 0.18998634815216064
  },
  {
    "question": "Weighing around a ton, the eland is the largest species of this animal in Africa",
    "answer": "Antelope",
    "distance": 0.19844287633895874
  },
  {
    "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification",
    "answer": "species",
    "distance": 0.2177758812904358
  },
  {
    "question": "Heaviest of all poisonous snakes is this North American rattlesnake",
    "answer": "the diamondback rattler",
    "distance": 0.22018253803253174
  }
]


## Vector database support for CRUD operations

### Create

In [18]:

# --- Create one object (CRUD) -------------------------------------------------
# Build the property payload first, then insert it; insert() returns the new object's UUID.
leonardo_props = {
    "question": "Leonardo da Vinci was born in this country.",
    "answer":   "Italy",
    "category": "Culture",
}
new_id = questions.data.insert(properties=leonardo_props)
print("New UUID:", new_id)


New UUID: 1b9ced9f-582a-4fe4-b84a-7aa2ecfd26d8


### Read

In [19]:
# Read it back (w/o vector)
# Look the object up by the UUID returned from insert and print only its properties.
obj = questions.query.fetch_object_by_id(new_id)
json_print(obj.properties)


{
  "answer": "Italy",
  "question": "Leonardo da Vinci was born in this country.",
  "category": "Culture"
}


In [20]:
# Read with vector
obj = questions.query.fetch_object_by_id(new_id, include_vector=True)

# Report the length of the "default" vector, or None when that key is absent.
if "default" in obj.vector:
    obj_vector_len = len(obj.vector["default"])
else:
    obj_vector_len = None

json_print({"properties": obj.properties, "vector_len": obj_vector_len})


{
  "properties": {
    "answer": "Italy",
    "question": "Leonardo da Vinci was born in this country.",
    "category": "Culture"
  },
  "vector_len": 1536
}


### Update

In [21]:
# Update (partial)
# Only "answer" is sent; the object is then re-read so the stored properties
# after the update can be inspected.
questions.data.update(
    uuid=new_id,
    properties={"answer": "Florence, Italy"}
)
obj = questions.query.fetch_object_by_id(new_id)
json_print(obj.properties)

{
  "answer": "Florence, Italy",
  "question": "Leonardo da Vinci was born in this country.",
  "category": "Culture"
}


### Delete

In [22]:
# Count before/after delete
# The same total-count aggregate runs on both sides of delete_by_id to show its effect.
print("Count before delete:", questions.aggregate.over_all(total_count=True).total_count)
questions.data.delete_by_id(new_id)
print("Count after delete:", questions.aggregate.over_all(total_count=True).total_count)


Count before delete: 11
Count after delete: 10


In [23]:
# --- Close client -------------------------------------------------------------
# Closing the client is followed by the embedded server's "Shutting down..." log
# lines, i.e. the embedded instance stops with it.
client.close()


{"action":"restapi_management","build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","level":"info","msg":"Shutting down... ","time":"2025-09-26T02:51:45Z","version":"1.30.5"}
{"action":"restapi_management","build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","level":"info","msg":"Stopped serving weaviate at http://127.0.0.1:8079","time":"2025-09-26T02:51:45Z","version":"1.30.5"}
{"action":"telemetry_push","build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","level":"info","msg":"telemetry terminated","payload":"\u0026{MachineID:e26a9af1-c200-4367-a120-f6e435fd4dde Type:TERMINATE Version:1.30.5 ObjectsCount:11 OS:linux Arch:amd64 UsedModules:[text2vec-openai] CollectionsCount:1}","time":"2025-09-26T02:51:46Z"}
{"build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","level":"info","msg":"closing raft 