##### Copyright 2023 Google LLC.

In [32]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Document Q&A with ChromaDB

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/google/generative-ai-docs/blob/main/examples/gemini/python/vectordb_with_chroma/vectordb_with_chroma.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/google/generative-ai-docs/blob/main/examples/gemini/python/vectordb_with_chroma/vectordb_with_chroma.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>


## Overview

This tutorial demonstrates how to use the Gemini API to create a vector database and retrieve answers to questions from the database. Moreover, you will use [ChromaDB](https://docs.trychroma.com/){:.external}, an open-source Python tool that creates embedding databases. ChromaDB allows you to:

* Store embeddings as well as their metadata
* Embed documents and queries
* Search through the database of embeddings

In this tutorial, you'll use embeddings to retrieve an answer from a database of vectors created with ChromaDB.

## Prerequisites

You can run this quickstart in Google Colab.

To complete this quickstart on your own development environment, ensure that your environment meets the following requirements:

-  Python 3.9+
-  An installation of `jupyter` to run the notebook.

## Setup

First, download and install ChromaDB and the Gemini API Python library.

In [1]:
pip install google-ai-generativelanguage==0.6.15




In [2]:
!pip install -U -q google-generativeai

In [3]:
!pip install -q chromadb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m76.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:0

Then import the modules you'll use in this tutorial.

In [4]:
import textwrap
import chromadb
import numpy as np
import pandas as pd

import google.generativeai as genai

# Used to securely store your API key
from google.colab import userdata

from IPython.display import Markdown
from chromadb import Documents, EmbeddingFunction, Embeddings

### Grab an API Key

Before you can use the Gemini API, you must first obtain an API key. If you don't already have one, create a key with one click in Google AI Studio.

<a class="button button-primary" href="https://makersuite.google.com/app/apikey" target="_blank" rel="noopener noreferrer">Get an API key</a>

In Colab, add the key to the secrets manager under the "🔑" in the left panel. Give it the name `API_KEY`.

Once you have the API key, pass it to the SDK. You can do this in two ways:

* Put the key in the `GOOGLE_API_KEY` environment variable (the SDK will automatically pick it up from there).
* Pass the key to `genai.configure(api_key=...)`

In [7]:
# Or use `os.getenv('API_KEY')` to fetch an environment variable.
API_KEY="AIzaSyCTQ3T6iqXECoROGGQri9-H2scItu_-n68"

genai.configure(api_key=API_KEY)

Key Point: Next, you will choose a model. Any embedding model will work for this tutorial, but for real applications it's important to choose a specific model and stick with it. The outputs of different models are not compatible with each other.

**Note**: At this time, the Gemini API is [only available in certain regions](https://ai.google.dev/available_regions).

In [8]:
for m in genai.list_models():
  if 'embedContent' in m.supported_generation_methods:
    print(m.name)

models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp


### Data

Here is a small set of documents you will use to create an embedding database:

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import os
import pandas as pd

folder_path = "/content/drive/MyDrive/vectorDB"  # Path to your Excel files
documents = []

for filename in os.listdir(folder_path):
    if filename.endswith(".xlsx"):
        path = os.path.join(folder_path, filename)

        # Read the Excel file
        df = pd.read_excel(path, header=None)  # or specify sheet name
        text = "\n".join(df.astype(str).stack().tolist())  # Flatten and join all cells as string
        documents.append(text)

print(f"Loaded {len(documents)} documents from Excel files.")


Loaded 3 documents from Excel files.


## Creating the embedding database with ChromaDB

You will create a [custom function](https://docs.trychroma.com/embeddings#custom-embedding-functions){:.external} for performing embedding using the Gemini API. By inputting a set of documents into this custom function, you will receive vectors, or embeddings of the documents.


### API changes to Embeddings with model embedding-001

For the new embeddings model, embedding-001, there is a new task type parameter and the optional title (only valid with task_type=`RETRIEVAL_DOCUMENT`).

These new parameters apply only to the newest embeddings models.The task types are:

Task Type | Description
---       | ---
RETRIEVAL_QUERY	| Specifies the given text is a query in a search/retrieval setting.
RETRIEVAL_DOCUMENT | Specifies the given text is a document in a search/retrieval setting.
SEMANTIC_SIMILARITY	| Specifies the given text will be used for Semantic Textual Similarity (STS).
CLASSIFICATION	| Specifies that the embeddings will be used for classification.
CLUSTERING	| Specifies that the embeddings will be used for clustering.

In [10]:
class GeminiEmbeddingFunction(EmbeddingFunction):
  def __call__(self, input: Documents) -> Embeddings:
    model = 'models/embedding-001'
    title = "Custom query"
    return genai.embed_content(model=model,
                                content=input,
                                task_type="retrieval_document",
                                title=title)["embedding"]

Now you will create the vector database. In the `create_chroma_db` function, you will instantiate a [Chroma client](https://docs.trychroma.com/getting-started){:.external}. From there, you will create a collection, which is where you store your embeddings, documents, and any metadata. Note that the embedding function from above is passed as an argument to the `create_collection`.

Next, you use the `add` method to add the documents to the collection.

In [11]:
from google.colab import drive
import shutil
import os

def create_chroma_db(documents, name, batch_size=5000, drive_path='/content/drive/MyDrive/chroma_store'):
    if not documents:
        raise ValueError("The 'documents' list is empty.")

    # Step 1: Mount Drive
    drive.mount('/content/drive')

    # Step 2: Copy from Drive if exists
    local_path = "./chroma_store"
    if os.path.exists(drive_path):
        if not os.path.exists(local_path):
            shutil.copytree(drive_path, local_path)
            print("Loaded Chroma DB from Drive.")

    # Step 3: Use PersistentClient
    chroma_client = chromadb.PersistentClient(path=local_path)

    existing_collections = [col.name for col in chroma_client.list_collections()]
    if name in existing_collections:
        db = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())
        print(f"Loaded existing collection: {name}")
    else:
        db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())
        print(f"Created new collection: {name}")

        splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
        all_chunks = []
        all_ids = []

        for i, doc in enumerate(documents):
            chunks = splitter.split_text(doc)
            all_chunks.extend(chunks)
            all_ids.extend([f"{i}_{j}" for j in range(len(chunks))])

        # Add in batches
        for start in range(0, len(all_chunks), batch_size):
            end = start + batch_size
            batch_chunks = all_chunks[start:end]
            batch_ids = all_ids[start:end]
            db.add(documents=batch_chunks, ids=batch_ids)
            print(f"Added batch {start} to {end} to collection '{name}'.")

        # Step 4: Save to Drive
        if os.path.exists(drive_path):
            shutil.rmtree(drive_path)
        shutil.copytree(local_path, drive_path)
        print("Saved Chroma DB to Drive.")

    return db




In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Call the function to create and assign the db variable
db = create_chroma_db(documents, "my_document_collection")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())


Created new collection: my_document_collection
Added batch 0 to 5000 to collection 'my_document_collection'.
Added batch 5000 to 10000 to collection 'my_document_collection'.
Added batch 10000 to 15000 to collection 'my_document_collection'.
Added batch 15000 to 20000 to collection 'my_document_collection'.
Added batch 20000 to 25000 to collection 'my_document_collection'.
Added batch 25000 to 30000 to collection 'my_document_collection'.
Added batch 30000 to 35000 to collection 'my_document_collection'.
Added batch 35000 to 40000 to collection 'my_document_collection'.
Saved Chroma DB to Drive.


Confirm that the data was inserted by looking at the database:

## Getting the relevant document

`db` is a Chroma collection object. You can call `query` on it to perform a nearest neighbors search to find similar embeddings or documents.


In [13]:
def get_relevant_passage(query, db):
  passage = db.query(query_texts=[query], n_results=1)['documents'][0][0]
  return passage

In [18]:
# Perform embedding search
passage = get_relevant_passage(query, db)
Markdown(passage)

NameError: name 'query' is not defined

Now that you have found the relevant passage in your set of documents, you can use it make a prompt to pass into the Gemini API.

In [16]:
# Show a few records (documents) from the collection
peek_data = db.peek(3)
print(peek_data)


{'ids': ['0_0', '0_1', '0_2'], 'embeddings': array([[ 0.04806398, -0.02096214, -0.06218296, ...,  0.03389169,
        -0.02237787, -0.01042494],
       [ 0.04533244, -0.01592427, -0.0636674 , ...,  0.01667201,
        -0.03433314,  0.01253321],
       [ 0.05976871, -0.02252777, -0.07497181, ...,  0.02067008,
        -0.03486569,  0.00891464]]), 'documents': ['종목코드\n회사명\n시장구분\n업종\n업종명\n항목코드\n결산기준일\n항목명\n당기\n[001040]\nCJ\n유가증권시장상장법인\n649\n기타 금융업\ndart_NonOperatingProfitLoss\n2024-12-31 00:00:00\n      기타순손익\n160081113000\n[001040]\nCJ\n유가증권시장상장법인\n649\n기타 금융업\ndart_OperatingIncomeLoss\n2024-12-31 00:00:00\n      영업이익(손실)\n136262575000\n[001040]\nCJ\n유가증권시장상장법인\n649\n기타 금융업\ndart_BasicEarningsLossPerSharePreferredStock\n2024-12-31 00:00:00\n         우선주기본주당순이익(손실)\n6767\n[001040]\nCJ\n유가증권시장상장법인\n649\n기타 금융업\ndart_DilutedEarningsLossPerSharePreferredStock\n2024-12-31 00:00:00\n         우선주희석주당순이익(손실)\n6767', '2024-12-31 00:00:00\n         우선주희석주당순이익(손실)\n6767\n[001040]\nCJ\n유가증권시장상장법인\n64

In [17]:
def make_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = ("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
  Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
  However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
  strike a friendly and converstional tone. \
  If the passage is irrelevant to the answer, you may ignore it.
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

    ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt

Pass a query to the prompt:

In [19]:
query = "듀켐바이오 당기?"
prompt = make_prompt(query, passage)
Markdown(prompt)

NameError: name 'passage' is not defined

Now use the `generate_content` method to to generate a response from the model.

## Next steps

To learn more about how you can use the embeddings, check out the [examples](https://ai.google.dev/examples?keywords=embed) available. To learn how to use other services in the Gemini API, visit the [Python quickstart](https://ai.google.dev/gemini-api/docs/get-started/python).