# CellVote Tutorial on PBMC3k

This tutorial demonstrates CellVote using the PBMC3k dataset. It:
- Loads PBMC3k, preprocesses, clusters, and computes markers.
- Simulates multiple annotation methods.
- Runs CellVote offline (local majority) and optionally online (LLM).


## Requirements
- `anndata`, `scanpy`, `omicverse` (the code below also imports `numpy` and `pandas`)
- Place PBMC3k at `./data/pbmc3k.h5ad` (or `./pbmc3k.h5ad`), or set the `CELLVOTE_PBMC3K` environment variable to the file's path.
- For LLM (optional), set your provider API key.


In [None]:
import os
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
import omicverse as ov
from omicverse.single import CellVote

# Report the installed versions of the core packages alongside the results.
print('Versions:', 'anndata', ad.__version__, '| scanpy', sc.__version__, '| omicverse', ov.__version__)


## 1) Load PBMC3k
The loader tries, in order: the path in the `CELLVOTE_PBMC3K` environment variable, `./data/pbmc3k.h5ad`, and `./pbmc3k.h5ad`.


In [None]:
def load_pbmc3k():
    """Read PBMC3k from the first candidate location that exists on disk.

    Candidates are tried in this order: the path stored in the
    ``CELLVOTE_PBMC3K`` environment variable, ``data/pbmc3k.h5ad``, and
    ``./pbmc3k.h5ad``.

    Returns:
        Whatever ``ad.read_h5ad`` returns for the first existing path.

    Raises:
        FileNotFoundError: If none of the candidate paths exists.
    """
    candidates = (
        os.getenv('CELLVOTE_PBMC3K'),
        os.path.join('data', 'pbmc3k.h5ad'),
        os.path.join('.', 'pbmc3k.h5ad'),
    )
    # An unset or empty environment variable is falsy and is skipped before
    # the existence check.
    found = next((path for path in candidates if path and os.path.exists(path)), None)
    if found is None:
        raise FileNotFoundError('PBMC3k not found. Set CELLVOTE_PBMC3K or place ./data/pbmc3k.h5ad')
    return ad.read_h5ad(found)

# Load the dataset once; the bare `adata` expression makes the notebook
# display the object's summary as the cell output.
adata = load_pbmc3k()
adata


## 2) Preprocess, neighbors/UMAP, and Leiden clustering


In [None]:
# NOTE(review): this pipeline assumes the loaded matrix holds raw counts —
# confirm for the pbmc3k.h5ad file you are using.
# Scale every cell to a total of 1e4 counts, then apply log1p.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Keep only the 2,000 most variable genes; subset=True drops all other genes
# from `adata`, so every later step (including marker ranking) sees HVGs only.
sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)
# 30-component PCA, then a 12-neighbor graph built on those 30 PCs.
sc.pp.pca(adata, n_comps=30)
sc.pp.neighbors(adata, n_neighbors=12, n_pcs=30)
sc.tl.umap(adata)
# Cluster labels are written to adata.obs['leiden'].
sc.tl.leiden(adata, resolution=0.6, key_added='leiden')
# Make sure the cluster column is categorical; later cells group by it.
adata.obs['leiden'] = adata.obs['leiden'].astype('category')
# show=False leaves rendering to the notebook instead of calling show().
sc.pl.umap(adata, color=['leiden'], wspace=0.4, show=False)


## 3) Marker genes per cluster
Compute `rank_genes_groups` and assemble top-n markers per cluster.


In [None]:
# Rank genes for every Leiden cluster with a Wilcoxon test; the result is
# stored under adata.uns['rank_genes_groups'] and read back further down.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')
# Plot the top 5 genes per cluster, each panel with its own y-axis.
sc.pl.rank_genes_groups(adata, n_genes=5, sharey=False, show=False)

def top_markers_from_rgg(adata, group_key='leiden', topn=10):
    """Collect the top-ranked marker genes of every group from ``rank_genes_groups``.

    Args:
        adata: Object whose ``uns['rank_genes_groups']['names']`` is a NumPy
            structured array with one field per group.
        group_key: Currently not consulted. The groups are whatever the stored
            ranking was computed on; the parameter is kept so existing calls
            keep working.
        topn: Number of leading genes to keep per group.

    Returns:
        Dict mapping each group name to a list of at most ``topn`` gene names
        in ranked order, as plain Python ``str`` values.

    Raises:
        KeyError: If ``adata.uns`` has no ``'rank_genes_groups'`` entry.
    """
    rgg = adata.uns.get('rank_genes_groups')
    if rgg is None:
        # Fail with an actionable message instead of a "'NoneType' object is
        # not subscriptable" TypeError further down.
        raise KeyError(
            "adata.uns['rank_genes_groups'] is missing; "
            "run sc.tl.rank_genes_groups first"
        )
    names = rgg['names']
    # Convert NumPy scalar strings to plain str so the result prints cleanly
    # and serializes without NumPy-specific types.
    return {
        group: [str(gene) for gene in names[group][:topn]]
        for group in names.dtype.names
    }

# Top 10 marker genes per Leiden cluster; this dict is reused as the
# `cluster_markers` argument of the voting calls.
marker_dict = top_markers_from_rgg(adata, 'leiden', topn=10)
marker_dict


## 4) Simulate multiple annotation methods
Use a simple canonical-marker heuristic plus noise to create three annotation columns.


In [None]:
# Reference marker sets (upper-case gene symbols) for the coarse labels that
# label_by_markers can assign.
CANONICAL_MARKERS = {
    'T cell': {'CD3D','CD3E','IL7R'},
    'B cell': {'MS4A1','CD79A'},
    'NK': {'NKG7','GNLY'},
    'Myeloid': {'LYZ','S100A8','LST1'},
    'Dendritic': {'FCER1A','FCGR3A'},
    'Platelet': {'PPBP','PF4'},
}

def label_by_markers(marker_genes):
    """Return the canonical label whose marker set overlaps ``marker_genes`` most.

    Gene symbols are compared case-insensitively. When two labels share the
    same overlap, the one listed first in ``CANONICAL_MARKERS`` wins.

    Args:
        marker_genes: Iterable of gene symbols for one cluster.

    Returns:
        The best-matching key of ``CANONICAL_MARKERS``, or ``'Unknown'`` when
        no canonical marker is present at all.
    """
    observed = {str(gene).upper() for gene in marker_genes}
    # Start from 0, not -1: a label must share at least one gene to be chosen.
    # With -1 the first label would win even with zero overlap, so 'Unknown'
    # could never be returned.
    best, score = 'Unknown', 0
    for lbl, ref in CANONICAL_MARKERS.items():
        overlap = len(observed & ref)
        if overlap > score:
            best, score = lbl, overlap
    return best

# One heuristic label per cluster, derived from that cluster's marker genes.
cluster_labels = {cl: label_by_markers(genes) for cl, genes in marker_dict.items()}
# Fixed seed; this single generator is shared by every noisy_assign call, so
# the order of those calls affects the simulated columns.
rng = np.random.default_rng(0)
# Per-cell cluster ids as strings, matching the string keys of cluster_labels.
clusters = adata.obs['leiden'].astype(str)

def noisy_assign(p_noise=0.15):
    """Simulate one annotation method as cluster labels plus random noise.

    Every cell normally receives the label of its cluster (looked up in the
    module-level ``cluster_labels``, falling back to ``'Unknown'``). With
    probability ``p_noise`` it instead receives a label drawn uniformly from
    the pool of distinct cluster labels plus ``'Unknown'``.

    Reads the module-level ``clusters`` and ``cluster_labels`` and draws from
    (and therefore advances) the module-level ``rng``.

    Args:
        p_noise: Per-cell probability of replacing the cluster label with a
            random one.

    Returns:
        ``pd.Categorical`` with one label per entry of ``clusters``.
    """
    # Sort the de-duplicated pool. Iteration order of a set of strings changes
    # between interpreter runs (hash randomization), which would make the
    # seeded draws irreproducible. The union also keeps 'Unknown' from being
    # listed twice, and so drawn twice as often, when a cluster already
    # carries that label.
    pool = sorted(set(cluster_labels.values()) | {'Unknown'})
    vals = [
        rng.choice(pool) if rng.random() < p_noise else cluster_labels.get(cl, 'Unknown')
        for cl in clusters
    ]
    return pd.Categorical(vals)

# Three simulated annotation columns with different noise levels (10%, 18%, 12%).
# The column names are the ones passed as `celltype_keys` to the voting calls.
adata.obs['scsa_annotation'] = noisy_assign(0.10)
adata.obs['gpt_celltype']   = noisy_assign(0.18)
adata.obs['gbi_celltype']   = noisy_assign(0.12)
adata.obs[['leiden','scsa_annotation','gpt_celltype','gbi_celltype']].head()


## 5) CellVote (offline majority)
Monkey-patch the arbitration step with a local majority vote, so that no network calls are made and no API cost is incurred.


In [None]:
import omicverse.single._cellvote as cvmod

def local_majority_arbitration(cluster_celltypes, cluster_markers, species, organization, model, base_url, provider, api_key=None, **kwargs):
    """Pick, for every cluster, the most frequent candidate label (case-insensitive).

    Args:
        cluster_celltypes: Mapping of cluster id to a list-like of candidate
            label strings.
        cluster_markers, species, organization, model, base_url, provider,
        api_key, **kwargs: Not used; accepted so the function can be called
            with the same arguments as the arbitration function it stands in
            for.

    Returns:
        Dict mapping each cluster id to its lower-cased majority label, or to
        ``'unknown'`` when the cluster has no candidates.
    """
    def _majority(candidates):
        # No candidates at all: nothing to vote on.
        if not candidates:
            return 'unknown'
        lowered = pd.Series(candidates).str.lower()
        tally = lowered.value_counts()
        return tally.idxmax()

    return {
        cluster: _majority(candidates)
        for cluster, candidates in cluster_celltypes.items()
    }

# Replace the module-level arbitration function with the offline majority vote.
# NOTE(review): this only takes effect if CellVote.vote resolves
# `get_cluster_celltype` on `omicverse.single._cellvote` at call time —
# confirm against the installed omicverse version.
cvmod.get_cluster_celltype = local_majority_arbitration

cv = CellVote(adata)
# provider/model are still passed, but local_majority_arbitration does not use them.
final_map_offline = cv.vote(
    clusters_key='leiden',
    cluster_markers=marker_dict,
    celltype_keys=['scsa_annotation','gpt_celltype','gbi_celltype'],
    species='human', organization='PBMC',
    provider='openai', model='gpt-4o-mini'
)
final_map_offline


In [None]:
cols = ['leiden','scsa_annotation','gpt_celltype','gbi_celltype','CellVote_celltype']
display(adata.obs[cols].head())
# Most frequent label per cluster for every annotation column.
# observed=True groups only by cluster categories that actually occur. That
# avoids aggregating empty groups for unused categories and silences the
# pandas FutureWarning about the changing default for categorical keys.
summary = (
    adata.obs
    .groupby('leiden', observed=True)[cols[1:]]
    .agg(lambda s: s.value_counts().index[0])
)
summary


## 6) CellVote (online LLM, optional)
Set `RUN_ONLINE=True` and ensure your API key is configured. The underlying function includes timeouts/retries.


In [None]:
# Off by default so that running the notebook never triggers external API calls.
RUN_ONLINE = False
if RUN_ONLINE:
    import importlib, omicverse.single._cellvote as cvmod
    # Reload the module so `cvmod.get_cluster_celltype` is the library's own
    # implementation again rather than the offline replacement assigned earlier.
    importlib.reload(cvmod)
    # The online run gets its own copy of `adata`.
    # NOTE(review): presumably this keeps the offline labels stored on `adata`
    # from being overwritten — confirm where CellVote.vote writes its result.
    cv2 = CellVote(adata.copy())
    result_online = cv2.vote(
        clusters_key='leiden',
        cluster_markers=marker_dict,
        celltype_keys=['scsa_annotation','gpt_celltype','gbi_celltype'],
        species='human', organization='PBMC',
        provider='openai', model='gpt-4o-mini'
    )
    print('Online voting complete:', result_online)
else:
    print('RUN_ONLINE is False — skipped external API calls.')


## 7) Notes
- Replace the simulated annotations with real methods (`scsa_anno`, `gpt_anno`, `gbi_anno`, `scMulan_anno`) when available.
- Tune marker computation as needed.
- Final labels are stored in `adata.obs['CellVote_celltype']`.
