In [1]:
import cellxgene_census as cxg
from cellxgene_census.experimental import get_embedding
import numpy as np
import pinecone
import os
import os
from pinecone import Pinecone, ServerlessSpec

In [2]:
# S3 location of the contributed embedding that is fetched further down.
embedding_uri = (
    "s3://cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-contrib-2"
)

# Open a handle on the pinned 2023-12-15 Census release.
census = cxg.open_soma(census_version="2023-12-15")

In [3]:
# Pull the human RNA measurement, restricted by an obs filter to cells whose
# `tissue_general` is 'central nervous system'.
adata = cxg.get_anndata(
    census,
    organism="homo_sapiens",
    measurement_name="RNA",
    obs_value_filter="tissue_general == 'central nervous system'",
)

In [4]:
# Request embeddings for the soma_joinids of the cells held in `adata`.
obs_joinids = adata.obs["soma_joinid"].to_numpy()
embeddings = get_embedding("2023-12-15", embedding_uri, obs_joinids)

# Keep the vectors next to the cells under the "emb" key.
adata.obsm["emb"] = embeddings

In [5]:
# Peek at the first embedding vector to sanity-check the fetch.
embeddings[0]

array([ 0.00138092,  0.02929688, -0.03686523, ..., -0.0390625 ,
       -0.04882812,  0.04174805], dtype=float32)

In [24]:
# Display the whole embeddings array (NumPy abbreviates the repr with "...").
embeddings

array([[ 0.00138092,  0.02929688, -0.03686523, ..., -0.0390625 ,
        -0.04882812,  0.04174805],
       [ 0.04956055,  0.02282715,  0.01251221, ...,  0.00279236,
        -0.07324219,  0.00604248],
       [ 0.04492188,  0.01660156,  0.01348877, ...,  0.0045166 ,
        -0.06396484,  0.00469971],
       ...,
       [-0.02648926, -0.00759888, -0.01031494, ..., -0.00421143,
        -0.00964355,  0.04956055],
       [-0.00291443,  0.00352478, -0.00982666, ..., -0.00866699,
        -0.00866699,  0.04589844],
       [ 0.015625  ,  0.01806641, -0.01507568, ..., -0.02087402,
        -0.02209473,  0.04980469]], dtype=float32)

In [17]:
def insert_first_20_embeddings_into_pinecone(
    embeddings,
    index_name='uceindex',
    cloud='gcp',
    region='us-central1',
):
    """Upsert the first 20 rows of ``embeddings`` into a Pinecone index.

    The index is created (serverless, cosine metric, dimension taken from the
    embeddings) when it does not exist yet. The API key is read from the
    ``PINECONE_API_KEY`` environment variable.

    Parameters
    ----------
    embeddings : numpy.ndarray
        2D array with one embedding per row. Only the first 20 rows are used;
        if fewer rows are present, all of them are upserted.
    index_name : str, optional
        Name of the Pinecone index to write to.
    cloud : str, optional
        Cloud provider passed to ``ServerlessSpec`` when the index has to be
        created.
    region : str, optional
        Region passed to ``ServerlessSpec`` when the index has to be created.

    Raises
    ------
    ValueError
        If ``embeddings`` is not a 2D numpy array or has no rows.

    Notes
    -----
    Vector IDs are the strings ``'1'``, ``'2'``, ... (1-based), so ID ``'1'``
    corresponds to ``embeddings[0]``.
    """
    # Validate up front so bad input fails before any network call is made.
    if not isinstance(embeddings, np.ndarray) or embeddings.ndim != 2:
        raise ValueError("embeddings must be a 2D numpy array")

    first_20_embeddings = embeddings[:20]
    if first_20_embeddings.shape[0] == 0:
        raise ValueError("embeddings must contain at least one row")

    # Set up the Pinecone client.
    pinecone_api_key = os.getenv('PINECONE_API_KEY')
    pc = Pinecone(api_key=pinecone_api_key)

    # Create the index on first use.
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=first_20_embeddings.shape[1],  # all rows share one dimension
            metric='cosine',
            # ServerlessSpec expects a cloud provider name ('aws', 'gcp',
            # 'azure'); 'gcp-starter' is a legacy pod environment name, not a
            # cloud provider. Adjust cloud/region to match your project.
            spec=ServerlessSpec(cloud=cloud, region=region),
        )

    # Connect to the index.
    index = pc.Index(name=index_name)

    # Pair each row with a 1-based string ID; tolist() yields plain Python floats.
    vectors = [
        (str(i), embedding.tolist())
        for i, embedding in enumerate(first_20_embeddings, start=1)
    ]

    index.upsert(vectors=vectors)

    # Report the number actually written (20 unless fewer rows were supplied).
    print(f"First {len(vectors)} embeddings inserted into Pinecone successfully.")

# Example usage
# Assuming `embeddings` is your numpy array as described
# insert_first_20_embeddings_into_pinecone(embeddings)


In [18]:
# Upsert the first 20 embedding rows into Pinecone (needs PINECONE_API_KEY in the environment).
insert_first_20_embeddings_into_pinecone(embeddings)

First 20 embeddings inserted into Pinecone successfully.


In [13]:
def find_similar_embeddings(embedding, top_k=5, index_name='uceindex'):
    """Return the IDs of the stored vectors most similar to ``embedding``.

    The API key is read from the ``PINECONE_API_KEY`` environment variable.

    Parameters
    ----------
    embedding : numpy.ndarray
        A single query vector: either 1D, or 2D with exactly one row.
    top_k : int, optional
        Number of matches to request from Pinecone.
    index_name : str, optional
        Name of the Pinecone index to query.

    Returns
    -------
    list of str
        The ``id`` of each match, in the order Pinecone returns them.

    Raises
    ------
    ValueError
        If ``embedding`` is not a single non-empty vector, or if the index
        does not exist.
    """
    # Normalise to one flat vector before any network call is made.
    query_vector = np.asarray(embedding, dtype=float)
    if query_vector.ndim == 2 and query_vector.shape[0] == 1:
        query_vector = query_vector[0]
    if query_vector.ndim != 1 or query_vector.size == 0:
        raise ValueError("embedding must be a single non-empty 1D vector")

    # Set up the Pinecone client.
    pinecone_api_key = os.getenv('PINECONE_API_KEY')
    pc = Pinecone(api_key=pinecone_api_key)

    # Querying a missing index is a caller error; fail with a clear message.
    if index_name not in pc.list_indexes().names():
        raise ValueError(f"Index {index_name} does not exist. Please create the index before querying.")

    index = pc.Index(name=index_name)

    # `vector` takes a flat list of floats, not a list wrapping the vector.
    query_result = index.query(vector=query_vector.tolist(), top_k=top_k)

    return [match["id"] for match in query_result["matches"]]

# Example usage
# Assuming `embedding` is a numpy array representing the embedding you want to query
# similar_ids = find_similar_embeddings(embedding)
# print(similar_ids)


In [14]:
# Query with the first embedding. If the index was filled by
# insert_first_20_embeddings_into_pinecone, IDs are 1-based strings, so '1' is embeddings[0].
find_similar_embeddings(embeddings[0])

['1', '10', '11', '4', '3']

In [44]:
# Show the first embedding again, i.e. the vector that was used as the query.
embeddings[0]

array([ 0.00138092,  0.02929688, -0.03686523, ..., -0.0390625 ,
       -0.04882812,  0.04174805], dtype=float32)