In [1]:
import requests
import json

# Download the tiny Jeopardy sample that the rest of the notebook imports
resp = requests.get(
    'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json',
    timeout=30,  # fail instead of hanging the kernel on a stalled connection
)
resp.raise_for_status()  # surface HTTP errors here rather than as a confusing JSON decode error
data = json.loads(resp.text)  # Load data

# Preview the parsed JSON: container type, record count, and the first record
print(type(data), len(data))
print(json.dumps(data[0], indent=2))

def json_print(data):
    """Print ``data`` serialized as JSON with a two-space indent."""
    rendered = json.dumps(data, indent=2)
    print(rendered)

<class 'list'> 10
{
  "Category": "SCIENCE",
  "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
  "Answer": "Liver"
}


In [2]:
import weaviate
from weaviate import EmbeddedOptions
import os

# The OpenAI key is read from the OPENAI_API_KEY environment variable, so it
# never has to be written into the notebook; a missing variable raises KeyError.
openai_headers = {"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}

# Client configured with EmbeddedOptions and the OpenAI key header.
client = weaviate.Client(
    embedded_options=EmbeddedOptions(),
    additional_headers=openai_headers,
)

embedded weaviate is already listening on port 6666


In [3]:
# Pretty-print the instance metadata returned by the client
instance_meta = client.get_meta()
json_print(instance_meta)

{
  "hostname": "http://127.0.0.1:6666",
  "modules": {
    "generative-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "Generative Search - OpenAI"
    },
    "qna-openai": {
      "documentationHref": "https://platform.openai.com/docs/api-reference/completions",
      "name": "OpenAI Question & Answering Module"
    },
    "ref2vec-centroid": {},
    "reranker-cohere": {
      "documentationHref": "https://txt.cohere.com/rerank/",
      "name": "Reranker - Cohere"
    },
    "text2vec-cohere": {
      "documentationHref": "https://docs.cohere.ai/embedding-wiki/",
      "name": "Cohere Module"
    },
    "text2vec-huggingface": {
      "documentationHref": "https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task",
      "name": "Hugging Face Module"
    },
    "text2vec-openai": {
      "documentationHref": "https://platform.openai.com/docs/guides/embeddings/what-are-embeddings",
      "nam

In [4]:
# Remove an existing "Question" class, if there is one, before it is (re)created
question_class_exists = client.schema.exists("Question")
if question_class_exists:
    client.schema.delete_class("Question")

In [5]:
# Schema definition for the "Question" class.
class_obj = {"class": "Question"}
# If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
class_obj["vectorizer"] = "text2vec-openai"

client.schema.create_class(class_obj)

In [6]:
# Batch import: add every downloaded record to the batch as a "Question" object
with client.batch.configure() as batch:
    for position, record in enumerate(data, start=1):
        print(f"importing question: {position}")

        # Object properties use the lower-cased form of the source record's keys
        question_object = {
            source_key.lower(): record[source_key]
            for source_key in ("Answer", "Question", "Category")
        }

        batch.add_data_object(data_object=question_object, class_name="Question")

importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10


In [8]:
# Aggregate query returning the meta count for the "Question" class
count_query = client.query.aggregate("Question").with_meta_count()
json_print(count_query.do())

{
  "data": {
    "Aggregate": {
      "Question": [
        {
          "meta": {
            "count": 10
          }
        }
      ]
    }
  }
}


In [9]:
# Show the question and answer text of up to three stored objects
preview_query = client.query.get("Question", ["question", "answer"]).with_limit(3)
json_print(preview_query.do())

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        },
        {
          "answer": "Liver",
          "question": "This organ removes excess glucose from the blood & stores it as glycogen"
        },
        {
          "answer": "Sound barrier",
          "question": "In 70-degree air, a plane traveling at about 1,130 feet per second breaks it"
        }
      ]
    }
  }
}


----

## Let's extract the vector that represents each question!

In [10]:
# Fetch a single Question object and request its vector as an additional field
vector_query = client.query.get("Question", ["category", "question", "answer"])
vector_query = vector_query.with_additional("vector").with_limit(1)

result = vector_query.do()

In [11]:
# Question text of the single object returned by the vector query
first_hit = result["data"]["Get"]["Question"][0]
first_hit["question"]

'Weighing around a ton, the eland is the largest species of this animal in Africa'

In [12]:
# Answer belonging to that same question
first_hit = result["data"]["Get"]["Question"][0]
first_hit["answer"]

'Antelope'

In [13]:
# Vector representation stored for the question/answer pair above
first_hit = result["data"]["Get"]["Question"][0]
first_hit["_additional"]["vector"]

[0.015716804,
 -0.0069466173,
 -0.0014841097,
 -0.027707754,
 0.010213299,
 0.020085499,
 -0.019337704,
 -0.011138203,
 -0.019928068,
 -0.020754578,
 0.018563671,
 0.017238632,
 0.0007514843,
 -0.023102915,
 0.005408391,
 0.011689209,
 0.016189095,
 0.005519904,
 0.013106083,
 0.009045689,
 -0.019534491,
 0.0025467651,
 -0.011984391,
 0.0030305358,
 0.017514134,
 0.0190622,
 0.023522729,
 -0.02549061,
 -0.011787603,
 -0.013958831,
 -0.013375026,
 -0.0012143461,
 -0.008789865,
 -0.014680387,
 -0.04158787,
 -0.024244286,
 0.013945712,
 -0.018944127,
 0.037337247,
 0.0007969915,
 0.008691471,
 0.023771994,
 0.0024549307,
 -0.015401944,
 0.006854783,
 0.025083914,
 0.017763399,
 0.0035356248,
 -0.004493327,
 0.006694073,
 0.0015661047,
 0.0031699273,
 -0.0035585836,
 0.019613206,
 -0.007700972,
 -0.011964712,
 0.0032814404,
 0.0030305358,
 0.027760232,
 -0.008147025,
 -0.0019383621,
 0.012797781,
 -0.026408954,
 0.0077993656,
 -0.027970139,
 0.008579958,
 0.007202442,
 0.0018678464,
 -0.00

In [14]:
# How many numbers make up the vector? Bind it to a name, then count its entries.
vector = result["data"]["Get"]["Question"][0]["_additional"]["vector"]
len(vector)

1536

## We have successfully extracted the vector for this datapoint!

## Let's see if we can search for a relevant answer using vector search!

In [15]:
# Vector search: up to two Question objects (question, answer, category) for the concept "biology"
biology_filter = {"concepts": ["biology"]}
biology_query = (
    client.query
    .get("Question", ["question", "answer", "category"])
    .with_near_text(biology_filter)
    .with_limit(2)
)
response = biology_query.do()

print(json.dumps(response, indent=4))

{
    "data": {
        "Get": {
            "Question": [
                {
                    "answer": "DNA",
                    "category": "SCIENCE",
                    "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance"
                },
                {
                    "answer": "species",
                    "category": "SCIENCE",
                    "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
                }
            ]
        }
    }
}


## What is the distance between the `query`: `biology` and the returned objects?

In [16]:
# Same "biology" search, additionally requesting each hit's distance from the query
biology_filter = {"concepts": ["biology"]}
biology_query = (
    client.query
    .get("Question", ["question", "answer", "category"])
    .with_near_text(biology_filter)
    .with_additional('distance')
    .with_limit(2)
)
response = biology_query.do()

print(json.dumps(response, indent=4))

{
    "data": {
        "Get": {
            "Question": [
                {
                    "_additional": {
                        "distance": 0.19695157
                    },
                    "answer": "DNA",
                    "category": "SCIENCE",
                    "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance"
                },
                {
                    "_additional": {
                        "distance": 0.20147645
                    },
                    "answer": "species",
                    "category": "SCIENCE",
                    "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
                }
            ]
        }
    }
}


In [17]:
# Retrieve all 10 questions with their distance to the query "animals" so they can be compared
animals_filter = {"concepts": ["animals"]}
animals_query = (
    client.query
    .get("Question", ["question", "answer"])
    .with_near_text(animals_filter)
    .with_limit(10)
    .with_additional(["distance"])
)
response = animals_query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "distance": 0.189655
          },
          "answer": "Elephant",
          "question": "It's the only living mammal in the order Proboseidea"
        },
        {
          "_additional": {
            "distance": 0.19135147
          },
          "answer": "the nose or snout",
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
        },
        {
          "_additional": {
            "distance": 0.20417404
          },
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        },
        {
          "_additional": {
            "distance": 0.2142871
          },
          "answer": "species",
          "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
        },
        {
       

## Notice that as the responses become less relevant to the query, the distance between the `query`: `"animals"` and each response increases! - *The vectors are getting farther from each other!*

---

## We can tell the vector database to drop results beyond a threshold distance!

In [18]:
# Maximum distance passed to the near-text search - what should this cut-off be?
max_distance = 0.24

# Same "animals" search as before, now with the distance threshold in the filter
animals_filter = {"concepts": ["animals"], "distance": max_distance}
animals_query = (
    client.query
    .get("Question", ["question", "answer"])
    .with_near_text(animals_filter)
    .with_limit(10)
    .with_additional(["distance"])
)
response = animals_query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "distance": 0.18969738
          },
          "answer": "Elephant",
          "question": "It's the only living mammal in the order Proboseidea"
        },
        {
          "_additional": {
            "distance": 0.19144505
          },
          "answer": "the nose or snout",
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
        },
        {
          "_additional": {
            "distance": 0.20419747
          },
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        },
        {
          "_additional": {
            "distance": 0.21444535
          },
          "answer": "species",
          "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
        },
        {
    

## Now we have prevented irrelevant results by removing vectors farther than `max_distance` away!