In [1]:
import cellxgene_census as cxg
from cellxgene_census.experimental import get_embedding
import numpy as np
import pinecone
import os
import os
from pinecone import Pinecone, ServerlessSpec

In [2]:
# URI of the contributed embedding; it is handed to get_embedding() further down.
embedding_uri = (
    "s3://cellxgene-contrib-public/contrib/cell-census/soma/2023-12-15/CxG-contrib-2"
)

# Open the Census release with the same date stamp that appears in the URI above.
census = cxg.open_soma(census_version="2023-12-15")

In [3]:
# Restrict the query to observations whose `tissue_general` is the central nervous system.
cns_obs_filter = "tissue_general == 'central nervous system'"

# Fetch the human RNA measurement for the matching cells as an AnnData object.
adata = cxg.get_anndata(
    census,
    organism="homo_sapiens",
    measurement_name="RNA",
    obs_value_filter=cns_obs_filter,
)

In [4]:
# Request the embedding rows for exactly the cells held in `adata`, identified by soma_joinid.
obs_joinids = adata.obs["soma_joinid"].to_numpy()
embeddings = get_embedding("2023-12-15", embedding_uri, obs_joinids)

# Keep the embedding next to the expression data under the "emb" key.
adata.obsm["emb"] = embeddings

In [23]:
# Distinct cell types in order of first appearance.
# Series.unique() preserves that order and is vectorised, so it avoids a
# Python-level loop over every cell with a linear `in` test against a list.
lst = adata.obs.cell_type.unique().tolist()

In [24]:
# Map the obs labels of the first five cells of every cell type to that cell type.
# Types are visited in the order stored in `lst`, so the dict is grouped by type.
obs_cell_types = adata.obs.cell_type
cell_dict = {
    obs_label: cell_type
    for cell_type in lst
    for obs_label in adata.obs.index[(obs_cell_types == cell_type).to_numpy()][:5]
}

In [39]:
# Display the sampled obs-label -> cell-type mapping built in the previous cell.
cell_dict

{'0': 'ependymal cell',
 '9': 'ependymal cell',
 '10': 'ependymal cell',
 '271': 'ependymal cell',
 '295': 'ependymal cell',
 '1': 'astrocyte',
 '2': 'astrocyte',
 '3': 'astrocyte',
 '4': 'astrocyte',
 '5': 'astrocyte',
 '792': 'oligodendrocyte',
 '793': 'oligodendrocyte',
 '795': 'oligodendrocyte',
 '796': 'oligodendrocyte',
 '799': 'oligodendrocyte',
 '794': 'GABAergic neuron',
 '798': 'GABAergic neuron',
 '805': 'GABAergic neuron',
 '809': 'GABAergic neuron',
 '810': 'GABAergic neuron',
 '797': 'mural cell',
 '845': 'mural cell',
 '849': 'mural cell',
 '861': 'mural cell',
 '885': 'mural cell',
 '802': 'oligodendrocyte precursor cell',
 '827': 'oligodendrocyte precursor cell',
 '832': 'oligodendrocyte precursor cell',
 '850': 'oligodendrocyte precursor cell',
 '927': 'oligodendrocyte precursor cell',
 '803': 'microglial cell',
 '804': 'microglial cell',
 '817': 'microglial cell',
 '822': 'microglial cell',
 '824': 'microglial cell',
 '806': 'cerebellar granule cell',
 '818': 'cerebe

In [34]:
def insert_cells_embeddings_into_pinecone(
    embeddings,
    cell_dict,
    index_name='uceindex',
    cloud='gcp-starter',
    region='us-central1',
    batch_size=100,
):
    """Upsert the embeddings of the cells listed in ``cell_dict`` into Pinecone.

    Each key of ``cell_dict`` is used both as the Pinecone vector id (as a
    string) and, converted with ``int()``, as the row of ``embeddings`` that
    holds that cell's vector. The values of ``cell_dict`` are not sent.

    Args:
        embeddings: 2D numpy array, one row per cell.
        cell_dict: Mapping whose keys are integer-like row positions into
            ``embeddings``.
        index_name: Name of the Pinecone index to write to. It is created
            with the cosine metric if it does not exist yet.
        cloud: ``cloud`` value passed to ``ServerlessSpec`` when the index
            has to be created.
        region: ``region`` value passed to ``ServerlessSpec`` when the index
            has to be created.
        batch_size: Maximum number of vectors sent per upsert request.

    Raises:
        ValueError: If ``embeddings`` is not a 2D numpy array, or
            ``batch_size`` is smaller than 1.
        IndexError: If a key of ``cell_dict`` does not address a row of
            ``embeddings``.
    """
    if not isinstance(embeddings, np.ndarray) or embeddings.ndim != 2:
        raise ValueError("embeddings must be a 2D numpy array")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Resolve every id before touching the network so that a bad key cannot
    # leave a half-populated (or freshly created but empty) index behind.
    n_rows = embeddings.shape[0]
    vectors = []
    for idx in cell_dict.keys():
        row = int(idx)
        # Reject negative positions explicitly: numpy would silently wrap
        # them around and upload the wrong cell's vector.
        if not 0 <= row < n_rows:
            raise IndexError(
                f"cell id {idx!r} is out of range for embeddings with {n_rows} rows"
            )
        # Send plain Python floats rather than a numpy array.
        vectors.append((str(idx), embeddings[row].tolist()))

    if not vectors:
        print("No cell embeddings to insert.")
        return

    # Set up Pinecone
    pinecone_api_key = os.getenv('PINECONE_API_KEY')
    pc = Pinecone(api_key=pinecone_api_key)

    # Create the index on first use.
    if index_name not in pc.list_indexes().names():
        # NOTE(review): 'gcp-starter' looks like a pod environment name rather
        # than a serverless cloud ('aws' / 'gcp' / 'azure') - confirm against
        # your Pinecone project and pass `cloud` / `region` accordingly.
        pc.create_index(
            name=index_name,
            dimension=embeddings.shape[1],  # all rows share the same width
            metric='cosine',
            spec=ServerlessSpec(cloud=cloud, region=region),
        )

    # Connect to the index
    index = pc.Index(name=index_name)

    # Upload in bounded batches so no single request grows with the number
    # of cells.
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])

    print("Cell embeddings inserted into Pinecone successfully.")


In [35]:
# Upload the sampled cells' vectors; the keys of cell_dict select rows of `embeddings`.
insert_cells_embeddings_into_pinecone(embeddings=embeddings, cell_dict=cell_dict)

Cell embeddings inserted into Pinecone successfully.


In [36]:
def find_similar_embeddings(embedding, top_k=5, index_name='uceindex'):
    """Return the ids of the vectors in Pinecone that are closest to ``embedding``.

    Args:
        embedding: The query vector: a 1D numpy array or any sequence of
            numbers. A single-row 2D array is flattened.
        top_k: Number of matches to request.
        index_name: Name of the Pinecone index to query.

    Returns:
        List of the matched vector ids, in the order Pinecone returned them.

    Raises:
        ValueError: If ``embedding`` is empty, or the index does not exist.
    """
    # Pinecone's `vector` argument is a flat list of floats, so flatten the
    # input instead of wrapping it in an extra list.
    query_vector = np.asarray(embedding, dtype=float).ravel().tolist()
    if not query_vector:
        raise ValueError("embedding must contain at least one value")

    # Set up Pinecone
    pinecone_api_key = os.getenv('PINECONE_API_KEY')
    pc = Pinecone(api_key=pinecone_api_key)

    # Ensure the index exists
    if index_name not in pc.list_indexes().names():
        raise ValueError(f"Index {index_name} does not exist. Please create the index before querying.")

    # Connect to the index
    index = pc.Index(name=index_name)

    # Query the index for the most similar embeddings
    query_result = index.query(vector=query_vector, top_k=top_k)

    # Extract the ids of the matches
    return [match["id"] for match in query_result["matches"]]

In [42]:
# Use the cell at row 1 of the embedding matrix as the query.
query_embedding = embeddings[1]
similar_cells = find_similar_embeddings(query_embedding)
print(similar_cells)

# Translate each returned id back to its cell type via the sampled mapping.
for matched_id in similar_cells:
    print(cell_dict[matched_id])

['1', '2', '4', '3', '1173']
astrocyte
astrocyte
astrocyte
astrocyte
differentiation-committed oligodendrocyte precursor
