In [10]:
import sys

import chromadb
import spacy

In [15]:
# !python -m spacy download en_core_web_trf
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 1.9 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
# Load the small English spaCy pipeline (`en_core_web_sm`). This is not a
# transformer-based pipeline; the transformer package is `en_core_web_trf`.
# NOTE(review): spaCy documents the `sm` packages as shipping without static word
# vectors, so `Doc.vector` from this pipeline is presumably a weak similarity signal;
# consider `en_core_web_md`/`en_core_web_lg` -- confirm against the spaCy model docs.
nlp = spacy.load("en_core_web_sm")  # small pipeline, not transformer-based

In [39]:
# Create the ChromaDB client and fetch the collection, creating it on first use.
# `create_collection` raises when a collection with this name is still present,
# which breaks re-running this cell; `get_or_create_collection` is idempotent.
client = chromadb.Client()
collection = client.get_or_create_collection(name="my_collection")

# Sample documents (the questions) and the answer stored alongside each one
documents = ["What is the capital of France?", "What is the largest planet in the solar system?", "Who wrote 'Hamlet'?"]
answers = ["Paris", "Jupiter", "William Shakespeare"]
# Documents and answers are paired by position, so they must line up one-to-one.
assert len(documents) == len(answers), "each document needs exactly one answer"

# Generate one embedding per document using spaCy
embeddings = [nlp(doc).vector for doc in documents]

# Store documents, embeddings, and metadata in ChromaDB with a single batched call.
# `upsert` (rather than `add`) overwrites ids "0".."n-1" on a re-run instead of
# colliding with the records written by a previous execution of this cell.
collection.upsert(
    ids=[str(i) for i in range(len(documents))],
    embeddings=embeddings,
    metadatas=[{'answer': answer} for answer in answers],
    documents=documents,
)



In [41]:
# Query the collection with a paraphrase of one of the stored questions
query = "Who is the author of 'Hamlet'?"
query_embedding = nlp(query).vector
results = collection.query(query_embeddings=[query_embedding], n_results=1)

# Query results are nested per query: results['documents'][0] is the list of hits
# for our single query, so the best-matching document is that list's first element.
# Guard against an empty hit list (e.g. an empty collection) instead of indexing blindly.
matched_documents = results['documents'][0]
top_result = matched_documents[0] if matched_documents else None

# Display the raw query response as the cell output
results



{'ids': [['0']],
 'embeddings': None,
 'documents': [['What is the capital of France?']],
 'uris': None,
 'data': None,
 'metadatas': [[{'answer': 'Paris'}]],
 'distances': [[5.011672496795654]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [42]:
# Pull the answer stored in the metadata of the best hit for the (single) query
# and show it next to the question that was asked.
# NOTE(review): the output recorded for this cell answers the Hamlet question with
# "Paris", i.e. the nearest neighbour is the wrong document -- the embedding quality
# should be checked before relying on this lookup.
best_hit_metadata = results['metadatas'][0][0]
answer = best_hit_metadata['answer']
print(f"Question: {query}")
print(f"Answer: {answer}")

Question: Who is the author of 'Hamlet'?
Answer: Paris
