In [None]:
import requests
import json

# Download the tiny Jeopardy dataset that the rest of the notebook imports.
DATA_URL = 'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json'
resp = requests.get(DATA_URL, timeout=30)  # bounded wait so the cell cannot hang forever
resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page as JSON
data = json.loads(resp.text)  # Load data

# Preview the parsed JSON: container type, number of records, and the first record
print(type(data), len(data))
print(json.dumps(data[0], indent=2))

def jprint(data):
    """Pretty-print ``data`` to stdout as JSON with a two-space indent.

    Args:
        data: Any object that ``json.dumps`` can serialize.
    """
    rendered = json.dumps(data, indent=2)
    print(rendered)

In [None]:
import weaviate
from weaviate import EmbeddedOptions
import os

# Connect to an embedded Weaviate instance. The OpenAI key is read from the
# OPENAI_API_KEY environment variable (a KeyError is raised if it is not set)
# and passed along in the "X-OpenAI-Api-Key" header.
embedded_config = EmbeddedOptions()
openai_headers = {"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}

client = weaviate.Client(
    embedded_options=embedded_config,
    additional_headers=openai_headers,
)

In [None]:
# Fetch the instance metadata and pretty-print it.
server_meta = client.get_meta()
jprint(server_meta)

In [None]:
# Drop the "Question" class if it already exists, so the class can be created
# again further down.
question_class_exists = client.schema.exists("Question")
if question_class_exists:
    client.schema.delete_class("Question")

In [None]:
# Definition of the "Question" class.
# "vectorizer": if set to "none" you must always provide vectors yourself;
# any other "text2vec-*" module could be used instead.
class_obj = {"class": "Question", "vectorizer": "text2vec-openai"}

client.schema.create_class(class_obj)

In [None]:
# Batch import: add one "Question" object per record in `data`.
with client.batch.configure() as batch:
    # Count from 1 so the progress message is human-friendly.
    for position, record in enumerate(data, start=1):
        print(f"importing question: {position}")

        # Map the dataset's capitalized keys onto lowercase property names.
        batch.add_data_object(
            data_object={
                "answer": record["Answer"],
                "question": record["Question"],
                "category": record["Category"],
            },
            class_name="Question",
        )

In [None]:
# Aggregate query with a meta count over the "Question" class.
question_count = (
    client.query
    .aggregate("Question")
    .with_meta_count()
    .do()
)
jprint(question_count)

In [None]:
# Fetch the "question" and "answer" properties of up to three objects.
sample_questions = (
    client.query
    .get("Question", ["question", "answer"])
    .with_limit(3)
    .do()
)
jprint(sample_questions)

----

## Let's extract the vector that represents each question!

In [None]:
# write a query to extract the vector for a question

# ADD CODE HERE

In [None]:
# This is the question corresponding to this vector.
# NOTE: `result` is expected to be assigned by the query written in the exercise cell above.
first_hit = result["data"]["Get"]["Question"][0]
first_hit["question"]

In [None]:
# This is the answer to this question.
# NOTE: `result` is expected to be assigned by the query written in the earlier exercise cell.
first_hit = result["data"]["Get"]["Question"][0]
first_hit["answer"]

In [None]:
#Now display the vector representation of the above question and answer

# ADD CODE HERE

In [None]:
#How many numbers are there in this vector?

# ADD CODE HERE

## We have successfully extracted the vector for this datapoint!

## Let's see if we can search for a relevant answer using vector search!

In [None]:
# Build a vector search query to extract questions, answers and categories related to "biology".
# The code you add below is expected to assign its result to `response`.

# ADD CODE HERE

rendered = json.dumps(response, indent=4)
print(rendered)

## What is the distance between the `query`: `biology` and the returned objects?

In [None]:
# Write code to extract the distance between the query and the returned object vectors.
# The code you add below is expected to assign its result to `response`.

# ADD CODE HERE

rendered = json.dumps(response, indent=4)
print(rendered)

In [None]:
# Extract all 10 questions and analyze them based on distance/similarity to the query vector.

# Build the query step by step: properties to return, the near-text concept,
# the result limit, and the additional "distance" field.
animals_query = client.query.get("Question", ["question", "answer"])
animals_query = animals_query.with_near_text({"concepts": ["animals"]})
animals_query = animals_query.with_limit(10)
animals_query = animals_query.with_additional(["distance"])

response = animals_query.do()

rendered = json.dumps(response, indent=2)
print(rendered)

## Notice that as the responses become less relevant to the query, the distance between the `query`: `"animals"` and the response increases! - *The vectors are getting farther from each other!*

---

## We can let the vector database know to remove results after a threshold distance!

In [None]:
#Set a max distance threshold - What should the max distance be?

# ADD CODE HERE

## We have now filtered out irrelevant results by removing vectors farther than `max_distance` away!