In [None]:
# import libs for embedding generation
import os
from dotenv import find_dotenv, load_dotenv
import json
import cohere

In [None]:
# resolve the ".env" file first, then load its variables into the environment
# (the load call stays last so the cell still displays its return value)
dotenv_path = find_dotenv(".env")
load_dotenv(dotenv_path)

True

In [None]:
# look up the Cohere API key in the process environment (None when it is unset)
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

In [None]:
# build the Cohere v2 client, handing it the key read from the environment
api_key = COHERE_API_KEY
cohere_client = cohere.ClientV2(api_key=api_key)

In [None]:
# load chunked documents from chunks.json
# Decode explicitly as UTF-8 (the standard JSON encoding): without an explicit
# encoding, open() uses the platform's locale encoding, which can mis-decode
# non-ASCII chunk text on some systems.
with open("chunks.json", "r", encoding="utf-8") as file:
    data = json.load(file)

In [None]:
# collect the 'content' field of every chunk, in order, as the texts to embed
text_inputs = []
for chunk in data:
    text_inputs.append(chunk['content'])

In [None]:
# request embeddings for all chunks
# The Cohere embed endpoint limits how many texts one call may carry (96 per
# the API reference), so the texts are sent in consecutive batches instead of
# one oversized request.
EMBED_BATCH_SIZE = 96

response = None
# max(..., 1) guarantees at least one request, so an input that fits in a
# single batch (including an empty one) results in exactly the same single
# call as before batching was introduced.
for batch_start in range(0, max(len(text_inputs), 1), EMBED_BATCH_SIZE):
    batch_response = cohere_client.embed(
        texts=text_inputs[batch_start:batch_start + EMBED_BATCH_SIZE],
        model="embed-v4.0",
        input_type="search_document",
        embedding_types=["float"],
    )
    if response is None:
        # keep the first batch's response object as `response`, so code that
        # reads `response.embeddings.float_` keeps working unchanged
        response = batch_response
    else:
        # append later batches in input order; NOTE: any other fields on
        # `response` still describe only the first batch
        response.embeddings.float_.extend(batch_response.embeddings.float_)

In [None]:
# attach embeddings back to chunk objects
embeddings = response.embeddings.float_
# zip() stops at the shorter input, so check the counts first; otherwise a
# short response would leave trailing chunks without an 'embedding' key and
# the mismatch would go unnoticed.
if len(embeddings) != len(data):
    raise ValueError(
        f"expected {len(data)} embeddings but received {len(embeddings)}"
    )
# pair each chunk with the embedding at the same position
for chunk, embedding in zip(data, embeddings):
    chunk['embedding'] = embedding

In [None]:
# write the chunks (now carrying their embeddings) out as indented JSON
output_path = "chunks_embeddings.json"
with open(output_path, "w") as out_file:
    json.dump(data, out_file, indent=4)