In [1]:
import requests
import json

# Download the data
DATA_URL = 'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json'
resp = requests.get(DATA_URL, timeout=30)  # Bound the wait so a stalled connection cannot hang the cell
resp.raise_for_status()  # Stop on an HTTP error instead of handing an error page to the JSON parser
data = json.loads(resp.text)  # Load data

# Parse the JSON and preview it
print(type(data), len(data))
print(json.dumps(data[0], indent=2))

def jprint(data):
    """Pretty-print ``data`` to stdout as JSON indented by two spaces."""
    formatted = json.dumps(data, indent=2)
    print(formatted)

<class 'list'> 10
{
  "Category": "SCIENCE",
  "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
  "Answer": "Liver"
}


In [2]:
import weaviate
from weaviate import EmbeddedOptions
import os

# Create a client backed by embedded Weaviate. The OpenAI key is read from the
# OPENAI_API_KEY environment variable (a missing variable raises KeyError), so the
# key itself never has to be written into the notebook.
openai_headers = {
    "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"],
}
client = weaviate.Client(
    embedded_options=EmbeddedOptions(),
    additional_headers=openai_headers,
)

embedded weaviate is already listening on port 6666


In [3]:
# Fetch the instance metadata and pretty-print it
server_meta = client.get_meta()
jprint(server_meta)

{
  "hostname": "http://127.0.0.1:6666",
  "modules": {
    "generative-openai": {
      "documentationHref": "https://beta.openai.com/docs/api-reference/completions",
      "name": "Generative Search - OpenAI"
    },
    "qna-openai": {
      "documentationHref": "https://beta.openai.com/docs/api-reference/completions",
      "name": "OpenAI Question & Answering Module"
    },
    "ref2vec-centroid": {},
    "text2vec-cohere": {
      "documentationHref": "https://docs.cohere.ai/embedding-wiki/",
      "name": "Cohere Module"
    },
    "text2vec-huggingface": {
      "documentationHref": "https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task",
      "name": "Hugging Face Module"
    },
    "text2vec-openai": {
      "documentationHref": "https://beta.openai.com/docs/guides/embeddings/what-are-embeddings",
      "name": "OpenAI Module"
    }
  },
  "version": "1.19.12"
}


In [4]:
# Remove the "Question" class if it is already present in the schema
question_class_exists = client.schema.exists("Question")
if question_class_exists:
    client.schema.delete_class("Question")

In [5]:
# Class definition: only the class name and the vectorizer module are set here.
class_obj = {"class": "Question"}
# If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
class_obj["vectorizer"] = "text2vec-openai"

client.schema.create_class(class_obj)

In [6]:
# Batch import: queue one "Question" object per downloaded record
with client.batch.configure() as batch:
    for position, record in enumerate(data, start=1):
        print(f"importing question: {position}")

        # Source keys are capitalised; the stored property names are lower-case
        batch.add_data_object(
            data_object={
                "answer": record["Answer"],
                "question": record["Question"],
                "category": record["Category"],
            },
            class_name="Question",
        )

importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10


In [7]:
# Ask for the total number of "Question" objects and pretty-print the reply
count_query = client.query.aggregate("Question").with_meta_count()
jprint(count_query.do())

{
  "data": {
    "Aggregate": {
      "Question": [
        {
          "meta": {
            "count": 10
          }
        }
      ]
    }
  }
}


In [8]:
# Preview up to three objects, returning only the question and answer fields
preview_query = client.query.get("Question", ["question", "answer"]).with_limit(3)
jprint(preview_query.do())

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "DNA",
          "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance"
        },
        {
          "answer": "wire",
          "question": "A metal that is ductile can be pulled into this while cold & under pressure"
        },
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        }
      ]
    }
  }
}


----

## Let's extract the vector that represents each question!

In [21]:
# Fetch a single object and request its vector alongside the regular properties
result = (
    client.query
    .get("Question", ["category", "question", "answer"])
    .with_additional("vector")
    .with_limit(1)
    .do()
)

In [22]:
# Question text of the first returned object
first_object = result["data"]["Get"]["Question"][0]
first_object["question"]

'In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance'

In [23]:
# Answer text of the first returned object
first_object = result["data"]["Get"]["Question"][0]
first_object["answer"]

'DNA'

In [25]:
# Vector of the first returned object, found under the "_additional" key
first_object = result["data"]["Get"]["Question"][0]
first_object["_additional"]["vector"]

[-0.015701957,
 0.020315554,
 -0.0010935994,
 -0.0058915108,
 -0.008866756,
 0.024129637,
 0.0154136075,
 0.008414572,
 -0.022124294,
 -0.011284963,
 -0.014273316,
 0.020761186,
 0.006022579,
 0.018755844,
 0.0026393838,
 0.008434232,
 0.02140342,
 0.0016465429,
 -0.0054000053,
 -0.0060618995,
 -0.022281576,
 -0.0017710576,
 0.007955833,
 -0.0061602006,
 -0.00800826,
 0.027157309,
 0.019411184,
 -0.04671267,
 -0.00033790994,
 -0.012674285,
 -0.009286175,
 0.00160968,
 -0.03478547,
 -0.025990803,
 -0.011789575,
 -0.0011124405,
 -0.0072152987,
 0.0025345292,
 0.027078668,
 -0.029568963,
 0.024758764,
 0.01647526,
 -0.0040762178,
 -0.016553901,
 -0.020669438,
 -0.0029473938,
 0.0134213725,
 -0.009718699,
 -0.011003166,
 -0.0036764601,
 -0.0055114133,
 0.040683538,
 -0.021219924,
 -0.013565548,
 -0.0071759783,
 -0.008388358,
 0.0058784042,
 0.0045939367,
 -0.008761902,
 -0.01865099,
 0.0063142055,
 0.0018398684,
 -0.023133518,
 0.005603161,
 -0.01688157,
 0.002020087,
 0.002845816,
 0.0097

In [26]:
# Keep the vector in its own variable, then display how many dimensions it has
vector = result["data"]["Get"]["Question"][0]["_additional"]["vector"]
len(vector)

1536

## We have successfully extracted the vector for this datapoint!

## Let's see if we can search for a relevant answer using vector search!

In [28]:
# Semantic search: ask for the two objects nearest to the concept "biology"
biology_query = client.query.get("Question", ["question", "answer", "category"])
biology_query = biology_query.with_near_text({"concepts": ["biology"]})
biology_query = biology_query.with_limit(2)
response = biology_query.do()

print(json.dumps(response, indent=4))

{
    "data": {
        "Get": {
            "Question": [
                {
                    "answer": "DNA",
                    "category": "SCIENCE",
                    "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance"
                },
                {
                    "answer": "species",
                    "category": "SCIENCE",
                    "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
                }
            ]
        }
    }
}


## What is the distance between the query `biology` and the returned objects?

In [29]:
# Same "biology" search, additionally requesting each hit's distance to the query
returned_fields = ["question", "answer", "category"]
near_biology = {"concepts": ["biology"]}
distance_query = client.query.get("Question", returned_fields).with_near_text(near_biology)
response = distance_query.with_additional('distance').with_limit(2).do()

print(json.dumps(response, indent=4))

{
    "data": {
        "Get": {
            "Question": [
                {
                    "_additional": {
                        "distance": 0.19684237
                    },
                    "answer": "DNA",
                    "category": "SCIENCE",
                    "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance"
                },
                {
                    "_additional": {
                        "distance": 0.20149803
                    },
                    "answer": "species",
                    "category": "SCIENCE",
                    "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
                }
            ]
        }
    }
}


In [35]:
# Search for "animals" with a limit of 10, requesting the distance of every hit
near_animals = {"concepts": ["animals"]}
animals_query = client.query.get("Question", ["question", "answer"]).with_near_text(near_animals)
response = animals_query.with_limit(10).with_additional(["distance"]).do()

# jprint renders with the same two-space indentation as json.dumps(..., indent=2)
jprint(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "distance": 0.1895833
          },
          "answer": "Elephant",
          "question": "It's the only living mammal in the order Proboseidea"
        },
        {
          "_additional": {
            "distance": 0.19156313
          },
          "answer": "the nose or snout",
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
        },
        {
          "_additional": {
            "distance": 0.20412415
          },
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        },
        {
          "_additional": {
            "distance": 0.21429819
          },
          "answer": "species",
          "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
        },
        {
     

## Notice how the distance between the query `"animals"` and each response increases as the responses become less relevant to the query! - *The vectors are getting farther from each other!*

---

## We can tell the vector database to drop results beyond a threshold distance!

In [36]:
# Distance threshold passed to the near-text search below
max_distance = 0.24
near_animals = {"concepts": ["animals"], "distance": max_distance}
thresholded_query = client.query.get("Question", ["question", "answer"])
thresholded_query = thresholded_query.with_near_text(near_animals)
response = thresholded_query.with_additional(["distance"]).do()

# jprint renders with the same two-space indentation as json.dumps(..., indent=2)
jprint(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "distance": 0.1895833
          },
          "answer": "Elephant",
          "question": "It's the only living mammal in the order Proboseidea"
        },
        {
          "_additional": {
            "distance": 0.19156313
          },
          "answer": "the nose or snout",
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
        },
        {
          "_additional": {
            "distance": 0.20412415
          },
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        },
        {
          "_additional": {
            "distance": 0.21429819
          },
          "answer": "species",
          "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
        },
        {
     

## Now we have prevented irrelevant results by removing vectors farther than `max_distance` away!