In [2]:
# Import the Pinecone library
from pinecone import Pinecone
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec 
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os

# Merge the variables declared in a local .env file into the process
# environment, so they can be read like any other environment variable.
load_dotenv()

# Pinecone API key taken from the PINECONE variable; None when it is unset.
PINECONE_API = os.environ.get('PINECONE')

## PINECONE TO CHROMADB

In [None]:

# Connect to Pinecone and open a handle to the existing source index.
pc = Pinecone(api_key=PINECONE_API)
index = pc.Index("db")

# index.list() is consumed one page at a time; flatten every page into a
# single list of IDs. namespace=None is passed through as-is -- adjust it if
# the vectors live in a named namespace.
all_ids = [
    vector_id
    for id_page in index.list(limit=100, namespace=None)
    for vector_id in id_page
]
print(f"Found {len(all_ids)} vector IDs.")
def fetch_vectors(index, ids, batch_size=100, namespace=None):
    """Fetch full vector records for ``ids`` from ``index`` in batches.

    Args:
        index: Object exposing ``fetch(ids=...)`` whose response has a
            ``vectors`` mapping of ID -> record, each record carrying
            ``values`` and ``metadata`` attributes.
        ids: Sequence of vector IDs to fetch (must support ``len`` and slicing).
        batch_size: Maximum number of IDs sent per ``fetch`` call; must be
            positive.
        namespace: Optional namespace forwarded to ``fetch``. When ``None``
            (the default) the argument is omitted, so the call is exactly
            ``index.fetch(ids=...)``.

    Returns:
        A list of ``{"id", "vector", "metadata"}`` dicts, one per record the
        index returned. Falsy metadata (e.g. ``None``) is normalised to ``{}``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A negative step would make range() empty and silently return nothing;
    # reject it up front together with zero.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Only pass `namespace` when the caller asked for one, so index objects
    # that accept nothing but `ids` keep working.
    fetch_kwargs = {} if namespace is None else {"namespace": namespace}

    all_vectors = []
    for start in range(0, len(ids), batch_size):
        response = index.fetch(ids=ids[start:start + batch_size], **fetch_kwargs)
        all_vectors.extend(
            {
                "id": vid,
                "vector": record.values,
                "metadata": record.metadata or {},
            }
            for vid, record in response.vectors.items()
        )
    return all_vectors

# Download the full records (values + metadata) for every listed ID.
vectors = fetch_vectors(index=index, ids=all_ids)


Found 100 vector IDs.


In [9]:
# Inspect the field names of one fetched record (the one at index 1).
vectors[1].keys()

dict_keys(['id', 'vector', 'metadata'])

In [8]:
import chromadb
# Client object used for every Chroma call in the rest of the notebook.
# NOTE(review): chromadb.Client() is presumably the in-memory, non-persistent
# client, so the copied data would be lost when the kernel restarts --
# confirm, and use a persistent client if the copy has to be kept.
chroma_client = chromadb.Client()


In [9]:
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises once "my_collection" already exists on the client.
collection = chroma_client.get_or_create_collection(name="my_collection")


In [14]:
# Record IDs, in the same order as `vectors`.
ids = [record["id"] for record in vectors]

In [15]:
# Embedding values of each record, in the same order as `vectors`.
embeddings = [record["vector"] for record in vectors]

In [16]:
# Metadata dict of each record, in the same order as `vectors`.
metadatas = [record["metadata"] for record in vectors]

In [17]:
# Raw text of each record, read from the "text" metadata key; records without
# that key get an empty string.
documents = [record["metadata"].get("text", "") for record in vectors]

In [18]:
# Load all records into the Chroma collection in a single call. The four
# lists are parallel: position i in each one describes the same record.
# `documents` is optional and only useful when the raw text is available.
collection.add(
    ids=ids,
    documents=documents,
    metadatas=metadatas,
    embeddings=embeddings,
)

In [16]:
# Smoke-test the copy with a free-text query, asking for the two closest
# matches. Chroma embeds the query text itself.
# NOTE(review): the stored embeddings were copied from Pinecone; if they were
# produced by a different model than the one Chroma uses for the query text,
# the similarity ranking is not meaningful -- confirm the models match.
results = collection.query(
    n_results=2,
    query_texts=["This is a query document about hawaii"],
)
print(results)

C:\Users\HP\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:54<00:00, 1.51MiB/s]


{'ids': [['5e1f7f7f-d5d7-47ca-8529-873d59f0c97b', 'dd3c0921-8b5c-4446-b024-d56641f648c9']], 'embeddings': None, 'documents': [['The gathered data were subjected to qualitative and quant itative analysis. The analysis \nstarted with partial transcription of the important parts of the data (Dörnyei, 2007) on a \ncomputer word processor program  Microsoft Word 2016 . Then the transcribed parts of \nthe data were read several times in order to look f or common themes and frequently \noccurring information. The recurring ideas were coded and recoded, revised and', 'addition, my smartphone can successfully replace a traditional paper dictionary \nand I don’t have to waste time in thumbing through a lot of pages to find words \nI’m looking for. \n4.3. Resources and tools \nThe analysis of the data revealed that the students made use of both online resources \nand mobile apps. The most frequently used language tools were online dictionaries \n(e.g. diki, ColorDict Dictionary ) and a variety of

## CHROMADB TO PINECONE

In [10]:
# Look the collection up again by name for the Chroma -> Pinecone direction.
chromadb_collection = chroma_client.get_collection(name="my_collection")

In [19]:
# Export everything stored in the collection. Embeddings have to be requested
# explicitly through `include`; the IDs are part of the result regardless.
# Reads through `chromadb_collection`, the handle looked up by name for this
# Chroma -> Pinecone section, rather than the variable left over from the
# Pinecone -> Chroma cells.
results = chromadb_collection.get(include=["embeddings", "documents", "metadatas"])


In [None]:
# Reshape the Chroma export into the record dicts that are upserted to
# Pinecone: one {"id", "values", "metadata"} dict per stored item.
# The loop variables live inside the comprehension, so the notebook-level
# `vectors` list (the records fetched from the source index) and the built-in
# `id` are not overwritten by this cell.
pinecone_vector = [
    {"id": item_id, "values": embedding, "metadata": item_metadata}
    for item_id, embedding, item_metadata in zip(
        results["ids"], results["embeddings"], results["metadatas"]
    )
]

pinecone_vector

[{'id': '9e4dc004-251f-4902-bc55-680567ed5313',
  'values': array([-3.04036420e-02,  3.35696223e-03,  4.88552498e-03, -1.72479916e-03,
         -4.42056470e-02, -1.69095714e-02,  9.00220498e-02,  6.27321601e-02,
          1.94266755e-02,  5.04826903e-02,  9.96617824e-02,  5.29005751e-02,
          2.77781952e-02,  5.27088866e-02,  8.41665566e-02, -1.99464783e-02,
         -7.72075728e-02, -3.95590849e-02, -2.78991442e-02, -3.55799459e-02,
         -6.39365241e-02,  2.69621657e-03,  3.24044153e-02,  2.97607314e-02,
          4.29799827e-03,  4.35801893e-02,  6.79690717e-03, -8.04218203e-02,
          5.35402894e-02, -1.60491094e-02, -1.63964685e-02,  1.81514751e-02,
         -1.37970624e-02,  1.22390524e-03, -5.85925430e-02, -1.05313184e-02,
          4.73885462e-02, -1.13499456e-03, -5.03253303e-02, -2.09372770e-02,
         -7.97543898e-02, -6.49985075e-02, -8.93695559e-03, -6.90221190e-02,
         -1.31091652e-02, -6.17367914e-03, -5.18925190e-02, -7.30984211e-02,
         -5.996805

In [26]:
# Sanity check: number of records prepared for the upsert.
len(pinecone_vector)

100

In [27]:
index_name = "chroma"

# Create the target serverless index only when it does not exist yet. The
# index lives on the Pinecone side, so an unconditional create_index fails
# with an "already exists" error the second time the notebook is run.
# dimension is hard-coded to 384 and has to equal the length of the embeddings
# in `pinecone_vector`.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Show the index description as the cell output, whether or not the index was
# created by this run.
pc.describe_index(index_name)

{
    "name": "chroma",
    "metric": "cosine",
    "host": "chroma-vv4b40r.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [38]:
# Spot-check one prepared record (the one at index 1).
pinecone_vector[1]

{'id': '0aebd14b-1b58-4d4f-9d35-12c9676201ba',
 'values': array([ 5.18630296e-02,  3.04125082e-02,  6.96104318e-02, -5.08082621e-02,
         1.36551308e-02, -1.92227922e-02,  1.09510802e-01,  5.70916459e-02,
         5.56579866e-02,  1.95788704e-02,  1.31710395e-01,  6.60581514e-02,
         2.36065015e-02,  1.41165471e-02,  1.05046794e-01, -4.87512723e-02,
         2.93757226e-02, -4.85784076e-02, -7.16804713e-02,  2.94048637e-02,
        -2.61971224e-02,  2.09825346e-03,  1.12140141e-01,  6.15134789e-03,
         7.43681416e-02,  1.86187215e-03, -3.19175161e-02, -5.06326929e-02,
         6.05850182e-02,  3.93270664e-02,  8.79248232e-03,  1.77923381e-01,
         2.99053621e-02,  2.28264052e-02, -7.49917403e-02, -5.13197221e-02,
        -1.72376465e-02, -1.95894623e-03, -2.75484063e-02, -3.07952315e-02,
        -5.29933050e-02, -3.09005603e-02,  5.39731644e-02, -3.61252413e-03,
        -2.63574570e-02, -6.81506172e-02, -8.23366195e-02, -3.24616162e-03,
         8.64001177e-03,  5.165

In [28]:
# Open a handle to the target index. This rebinds `index`, which until this
# cell referred to the source index "db".
index = pc.Index(index_name)  

In [44]:
# Display the index handle object.
index

<pinecone.db_data.index.Index at 0x27671044260>

In [33]:

# Expose the key under PINECONE_API_KEY in addition to PINECONE.
PINECONE_API_KEY = os.environ.get('PINECONE')
if PINECONE_API_KEY is None:
    # os.environ only accepts str values; assigning None would raise an
    # unhelpful "str expected, not NoneType" TypeError, so fail with a clear
    # message instead.
    raise RuntimeError(
        "Environment variable 'PINECONE' is not set; "
        "add it to your .env file or export it before running this cell."
    )
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY

In [39]:
# Upsert in batches of 100 (the same batch size the fetch side uses) instead
# of sending every record in one request, so the cell keeps working when the
# collection grows past what a single request can carry.
index.upsert(vectors=pinecone_vector, batch_size=100)

{'upserted_count': 100}

In [45]:

# Read the IDs back from the index that `index` currently points at, by
# flattening the pages produced by index.list() into one list.
# namespace=None is passed through as-is -- adjust it as needed.
all_ids = [
    vector_id
    for id_page in index.list(limit=100, namespace=None)
    for vector_id in id_page
]
print(f"Found {len(all_ids)} vector IDs.")

Found 100 vector IDs.
