In [None]:
pip install -U weaviate-client

Collecting weaviate-client
  Downloading weaviate_client-4.16.10-py3-none-any.whl.metadata (3.7 kB)
Collecting validators<1.0.0,>=0.34.0 (from weaviate-client)
  Downloading validators-0.35.0-py3-none-any.whl.metadata (3.9 kB)
Collecting deprecation<3.0.0,>=2.1.0 (from weaviate-client)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Downloading weaviate_client-4.16.10-py3-none-any.whl (583 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m583.8/583.8 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Downloading validators-0.35.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.7/44.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: validators, deprecation, weaviate-client
Successfully installed deprecation-2.1.0 validators-0.35.0 weaviate-client-4.16.10


# Applications

RAG
* Reduces hallucinations – the generated text can be compared against the retrieved source text
* The LLM can cite its sources
* Well suited to knowledge-intensive tasks

In [None]:
from google.colab import userdata
# Read the Weaviate and Cohere API keys from Colab's `userdata` store,
# so the key values themselves are never written into the notebook.
WEAVIATE_API_KEY = userdata.get('WEAVIATE_API_KEY')
COHERE_API_KEY = userdata.get('COHERE_API_KEY')

In [None]:
def json_print(data):
    """Pretty-print `data` to stdout as JSON, indented by two spaces.

    `data` must be serialisable by `json.dumps`; nothing is returned.
    """
    rendered = json.dumps(data, indent=2)
    print(rendered)

In [None]:
import weaviate, os, json
import openai
from dotenv import load_dotenv, find_dotenv
# Load variables from a local .env file (if one is found) into the process environment.
_ = load_dotenv(find_dotenv())

# auth_config = weaviate.auth.AuthApiKey(api_key=os.getenv("WEAVIATE_API_KEY"))
auth_config = WEAVIATE_API_KEY

# Service endpoints. Each one can be overridden through an environment variable (or a
# .env entry loaded above) of the same name; when the variable is unset or empty the
# original jupyter-api-proxy.internal.dlai address is used, so default behaviour is unchanged.
# NOTE(review): the proxy host name suggests it is only resolvable inside the original
# course environment - confirm, and set WEAVIATE_API_URL / CO_API_URL when running elsewhere.
WEAVIATE_API_URL = (
    os.getenv("WEAVIATE_API_URL")
    or 'http://jupyter-api-proxy.internal.dlai/rev-proxy/weaviate'
)
CO_API_URL = (
    os.getenv("CO_API_URL")
    or 'http://jupyter-api-proxy.internal.dlai/rev-proxy/cohere'
)

# client = weaviate.Client(
#     url=os.getenv("WEAVIATE_API_URL"),
#     auth_client_secret=auth_config,
#     additional_headers={
#         "X-Cohere-Api-Key": os.getenv("COHERE_API_KEY"),
#         "X-Cohere-BaseURL": os.getenv("CO_API_URL")
#     }
# )

# client.is_ready() #check if True

In [None]:
# v4 imports
from weaviate.classes.init import Auth
from weaviate.classes.query import Filter, MetadataQuery
from weaviate.classes.generate import GenerativeConfig


In [None]:
# ---------- connect ----------
# If you're on Weaviate Cloud (WCD / Serverless), pass the cluster URL & API key.
# For local dev, you can switch to: client = weaviate.connect_to_local()
#
# NOTE(review): the recorded run of this cell ended in WeaviateStartUpError
# ("Could not connect to Weaviate"); presumably WEAVIATE_API_URL was not reachable
# from where the notebook was executed - confirm the URL before running.
cohere_headers = {
    # Cohere embeddings and/or generative modules expect this header
    "X-Cohere-Api-Key": COHERE_API_KEY,       # os.getenv("COHERE_API_KEY"),
    # If you're using a Cohere-compatible proxy/base URL, the documented header is
    # spelled with a lowercase 'url' ('Baseurl'):
    "X-Cohere-Baseurl": CO_API_URL,           # os.getenv("CO_API_URL"),
}

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_API_URL,                      # os.getenv("WEAVIATE_API_URL"),
    # Wrap the raw key in Auth.api_key(...), the credential form shown in the v4 client docs.
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),   # Auth.api_key(os.getenv("WEAVIATE_API_KEY")),
    headers=cohere_headers,
)


WeaviateStartUpError: Could not connect to Weaviate:Connection to Weaviate failed. Details: .

In [None]:
# Health check: is_ready() is True when the instance is healthy
weaviate_ready = client.is_ready()
print("is_ready:", weaviate_ready)

# Handle to the "Wikipedia" collection (v4 pattern), reused by the queries that follow
wikipedia = client.collections.get("Wikipedia")


### 2. How many vectors are stored in this database?

In [None]:
# print(json.dumps(client.query.aggregate("Wikipedia").with_meta_count().do(), indent=2))

In [None]:
# ---------- aggregate: count ----------
# v3 equivalent: client.query.aggregate("Wikipedia").with_meta_count().do()
# v4: aggregate over the whole collection and request only the total object count
agg = wikipedia.aggregate.over_all(total_count=True)
wikipedia_total = agg.total_count
print("Wikipedia total count:", wikipedia_total)

In [None]:
# 4 306 800

### 3. Perform search over them to find concepts you are interested in!

In [None]:
# response = (client.query
#             .get("Wikipedia",['text','title','url','views','lang'])
#             .with_near_text({"concepts": "Vacation spots in california"})
#             .with_limit(5)
#             .do()
#            )

# json_print(response)

In [None]:
# response = (client.query
#             .get("Wikipedia",['text','title','url','views','lang'])
#             .with_near_text({"concepts": "Vacation spots in california"})
#             .with_where({
#                 "path" : ['lang'],
#                 "operator" : "Equal",
#                 "valueString":'en'
#             })
#             .with_limit(3)
#             .do()
#            )

# json_print(response)

In [None]:
# response = (client.query
#             .get("Wikipedia",['text','title','url','views','lang'])
#             .with_near_text({"concepts": "Miejsca na wakacje w Kalifornii"})
#             .with_where({
#                 "path" : ['lang'],
#                 "operator" : "Equal",
#                 "valueString":'en'
#             })
#             .with_limit(3)
#             .do()
#            )

# json_print(response)

In [None]:
# response = (client.query
#             .get("Wikipedia",['text','title','url','views','lang'])
#             .with_near_text({"concepts": "أماكن العطلات في كاليفورنيا"})
#             .with_where({
#                 "path" : ['lang'],
#                 "operator" : "Equal",
#                 "valueString":'en'
#             })
#             .with_limit(3)
#             .do()
#            )

# json_print(response)

In [None]:
# ---------- vector search: near_text ----------
# v3: .get("Wikipedia", [...]).with_near_text(...).with_limit(5).do()
# v4: collection.query.near_text(...)
wanted_properties = ["text", "title", "url", "views", "lang"]
distance_metadata = MetadataQuery(distance=True)  # optional, lets us inspect the distance

resp = wikipedia.query.near_text(
    query="Vacation spots in california",
    limit=5,
    return_properties=wanted_properties,
    return_metadata=distance_metadata,
)

# No language filter here, so results may come back in any language.
print("\nTop 5 (near_text, any language):")
for hit in resp.objects:
    title = hit.properties.get('title')
    lang = hit.properties.get('lang')
    print(f"- {title} ({lang}) | dist={hit.metadata.distance}")

In [None]:
# output response objects can be in multiple languages

In [None]:
# ---------- vector search + filter (lang == 'en') ----------
# Same English query as before, but only objects whose `lang` property equals "en" are returned.
english_only = Filter.by_property("lang").equal("en")
wanted_properties = ["text", "title", "url", "views", "lang"]

resp = wikipedia.query.near_text(
    query="Vacation spots in california",
    filters=english_only,
    limit=3,
    return_properties=wanted_properties,
)

print("\nTop 3 (near_text EN, query in EN):")
for hit in resp.objects:
    title = hit.properties.get('title')
    lang = hit.properties.get('lang')
    print(f"- {title} ({lang})")


In [None]:
# output response objects in EN only

In [None]:
# Query text is Polish this time; the `lang == "en"` filter still restricts results to English.
english_only = Filter.by_property("lang").equal("en")
wanted_properties = ["text", "title", "url", "views", "lang"]

resp = wikipedia.query.near_text(
    query="Miejsca na wakacje w Kalifornii",
    filters=english_only,
    limit=3,
    return_properties=wanted_properties,
)

print("\nTop 3 (near_text EN, query in PL):")
for hit in resp.objects:
    title = hit.properties.get('title')
    lang = hit.properties.get('lang')
    print(f"- {title} ({lang})")


In [None]:
# output response objects in EN only
# even though input query was in Polish

In [None]:
# Query text is Arabic this time; the `lang == "en"` filter still restricts results to English.
english_only = Filter.by_property("lang").equal("en")
wanted_properties = ["text", "title", "url", "views", "lang"]

resp = wikipedia.query.near_text(
    query="أماكن العطلات في كاليفورنيا",
    filters=english_only,
    limit=3,
    return_properties=wanted_properties,
)

print("\nTop 3 (near_text EN, query in AR):")
for hit in resp.objects:
    title = hit.properties.get('title')
    lang = hit.properties.get('lang')
    print(f"- {title} ({lang})")


In [None]:
# output response objects in EN only
# even though input query was in Arabic

## Retrieval Augmented Generation

### Single Prompt

In [None]:
# prompt = "Write me a facebook ad about {title} using information inside {text}"
# result = (
#   client.query
#   .get("Wikipedia", ["title","text"])
#   # input prompt
#   .with_generate(single_prompt=prompt)
#   .with_near_text({
#     "concepts": ["Vacation spots in california"]
#   })
#   .with_limit(3)
# ).do()

# json_print(result)

In [None]:
# single_prompt template; it references the {title} and {text} properties of the results
prompt = "Write me a facebook ad about {title} using information inside {text}"

# Collection handle (v4 pattern)
wikipedia = client.collections.get("Wikipedia")

# Semantic search combined with single-prompt generation.
# Optional arguments that can be added to the call:
#   return_metadata=MetadataQuery(distance=True)   -> see vector distance
#   target_vector="my_vector_name"                 -> if using named vectors
#   generative_provider=GenerativeConfig.cohere()  -> set provider per call
shown_properties = ["title", "text"]  # properties to show in results
resp = wikipedia.generate.near_text(
    query="Vacation spots in california",
    limit=3,
    single_prompt=prompt,
    return_properties=shown_properties,
)

# Show each retrieved title followed by the text generated for it
for hit in resp.objects:
    print("----")
    title = hit.properties.get("title")
    print("TITLE:", title)
    generated_text = hit.generative.text
    print("GENERATED:\n", generated_text)


In [None]:
# JSON-like view of the single-prompt results (similar to the v3 json_print(result) output):
# one record per retrieved object, carrying its title, text and generated text.
generated_records = []
for hit in resp.objects:
    generated_records.append({
        "title": hit.properties.get("title"),
        "text": hit.properties.get("text"),
        "generated": hit.generative.text,
    })

json_print({"objects": generated_records})

### Group Task

In [None]:
# generate_prompt = "Summarize what these posts are about in two paragraphs."

# result = (
#   client.query
#   .get("Wikipedia", ["title","text"])
#   .with_generate(grouped_task=generate_prompt) # Pass in all objects at once
#   .with_near_text({
#     "concepts": ["Vacation spots in california"]
#   })
#   .with_limit(3)
# ).do()

# json_print(result)


In [None]:

# ---------- generative: grouped task ----------
# v3: .with_generate(grouped_task=...)
# v4: collection.generate.near_text(..., grouped_task=...)
#
# If the collection is not pre-configured with a generative provider, one can be set
# explicitly per query, e.g. Cohere:
#   generative_provider=GenerativeConfig.cohere(model="command-r")
# Optionally, grouped_properties=["title", "text"] reduces the prompt length.
generate_prompt = "Summarize what these posts are about in two paragraphs."
result_properties = ["title", "text"]  # what to include in the results payload

gen = wikipedia.generate.near_text(
    query="Vacation spots in california",
    limit=3,
    grouped_task=generate_prompt,
    return_properties=result_properties,
)

print("\n--- Grouped generative summary ---")
# A single summary covering all retrieved objects
grouped_summary = gen.generative.text
print(grouped_summary)


In [None]:
# JSON-like view of the grouped-task result: the retrieved objects plus the one summary
# generated across all of them.
# This reads from `gen` (the grouped_task response). The earlier `resp` variable holds the
# single-prompt response, which was queried without a grouped_task and therefore is the
# wrong source for a grouped summary.
json_print({
    "objects": [
        {"title": o.properties.get("title"), "text": o.properties.get("text")}
        for o in gen.objects
    ],
    "generated": gen.generative.text,
})


In [None]:
# v4: close the client once all queries are done so the resources it holds are freed.
# Run this cell last - the query cells above use `client` and handles obtained from it.
client.close()
