In [40]:

def weighted_knn_trainer(train_adata, train_adata_emb, label_key, n_neighbors=50):
    """Fit the nearest-neighbour index used for weighted KNN label transfer.

    Parameters
    ----------
    train_adata: :class:`~anndata.AnnData`
        Reference dataset whose cells are indexed.
    train_adata_emb: str
        Which representation to index: ``"X"`` uses ``train_adata.X``, any
        other value must be a key of ``train_adata.obsm``.
    label_key: str
        Name of the target column (e.g. cell_type). It is accepted for
        interface compatibility; this function does not read it.
    n_neighbors: int
        Number of nearest neighbours the index is configured with.

    Returns
    -------
    The fitted ``KNeighborsTransformer`` (brute-force search, euclidean
    metric, ``mode="distance"``, ``n_jobs=-1``).

    Raises
    ------
    ValueError
        If ``train_adata_emb`` is neither ``"X"`` nor a key of
        ``train_adata.obsm``.
    """
    # No trailing newline: the transfer step prints "finished!" on the same line.
    print(f"Weighted KNN with n_neighbors = {n_neighbors} ... ", end="")

    knn_index = KNeighborsTransformer(
        metric="euclidean",
        algorithm="brute",
        mode="distance",
        n_neighbors=n_neighbors,
        n_jobs=-1,
    )

    use_main_matrix = train_adata_emb == "X"
    if not use_main_matrix and train_adata_emb not in train_adata.obsm.keys():
        raise ValueError(
            "train_adata_emb should be set to either 'X' or the name of the obsm layer to be used!"
        )
    reference_embedding = (
        train_adata.X if use_main_matrix else train_adata.obsm[train_adata_emb]
    )

    knn_index.fit(reference_embedding)
    return knn_index


def _most_probable_label(neighbor_labels, neighbor_weights):
    """Return the label with the largest summed neighbour weight, and that sum.

    Candidates are visited in ``np.unique`` order and only a strictly larger
    total replaces the current best, so ties resolve to the first label in
    that order. ``(None, 0.0)`` is returned when no candidate has a positive
    total weight.
    """
    best_label, best_prob = None, 0.0
    for candidate_label in np.unique(neighbor_labels):
        candidate_prob = neighbor_weights[neighbor_labels == candidate_label].sum()
        if best_prob < candidate_prob:
            best_label, best_prob = candidate_label, candidate_prob
    return best_label, best_prob


def weighted_knn_transfer(
    query_adata,
    query_adata_emb,
    ref_adata_obs,
    label_keys,
    knn_model,
    threshold=1,
    pred_unknown=False,
    mode="package",
):
    """Annotates ``query_adata`` cells with an input trained weighted KNN classifier.

    For every query cell the neighbours returned by ``knn_model`` are weighted
    with a Gaussian-style kernel of their distances, the weights are
    normalised to sum to one, and for each selected label column the label
    with the largest summed weight is reported together with
    ``1 - P(pred_label)`` as its uncertainty.

    Parameters
    ----------
    query_adata: :class:`~anndata.AnnData`
        Annotated dataset to be labelled by the KNN classifier.
    query_adata_emb: str
        Name of the obsm layer to be used for label transfer. If set to "X",
        query_adata.X will be used
    ref_adata_obs: :class:`pd.DataFrame`
        obs of the reference AnnData the ``knn_model`` was fitted on; rows are
        looked up by the positional indices returned by the model.
    label_keys: str
        Suffix selecting the target columns: every column of ``ref_adata_obs``
        whose name ends with ``label_keys`` is transferred.
    knn_model: :class:`~sklearn.neighbors._graph.KNeighborsTransformer`
        knn model trained on reference adata with weighted_knn_trainer function
    threshold: float
        Threshold of uncertainty used to annotating cells as "Unknown". cells with
        uncertainties higher than this value will be annotated as "Unknown".
        Set to 1 to keep all predictions. This enables one to later on play
        with thresholds.
    pred_unknown: bool
        ``False`` by default. Whether to annotate any cell as "unknown" or not.
        If `False`, ``threshold`` will not be used and each cell will be annotated
        with the label which has the largest weight among its nearest cells.
    mode: str
        Only "package" is implemented: uncertainties are 1 - P(pred_label).
        Any other value is rejected.

    Returns
    -------
    pred_labels, uncertainties: :class:`pd.DataFrame`
        Two object-dtype frames indexed by ``query_adata.obs_names`` with one
        column per transferred label column.

    Raises
    ------
    ValueError
        If ``knn_model`` is not a ``KNeighborsTransformer``, if ``mode`` is not
        "package", or if ``query_adata_emb`` is neither "X" nor a key of
        ``query_adata.obsm``.
    """
    # isinstance (rather than an exact type comparison) also accepts subclasses.
    if not isinstance(knn_model, KNeighborsTransformer):
        raise ValueError(
            "knn_model should be of type sklearn.neighbors._graph.KNeighborsTransformer!"
        )

    # Fail before the (expensive) neighbour search instead of inside the loop.
    if mode != "package":
        raise ValueError(
            f"Unsupported mode {mode!r}: only mode='package' is implemented."
        )

    if query_adata_emb == "X":
        query_emb = query_adata.X
    elif query_adata_emb in query_adata.obsm.keys():
        query_emb = query_adata.obsm[query_adata_emb]
    else:
        raise ValueError(
            "query_adata_emb should be set to either 'X' or the name of the obsm layer to be used!"
        )
    top_k_distances, top_k_indices = knn_model.kneighbors(X=query_emb)

    # Per-cell kernel bandwidth derived from the spread of its neighbour
    # distances. A zero spread yields an infinite bandwidth, i.e. uniform
    # weights, so the division warning is expected and silenced.
    stds = np.std(top_k_distances, axis=1)
    with np.errstate(divide="ignore"):
        stds = (2.0 / stds) ** 2
    stds = stds.reshape(-1, 1)

    scaled_distances = np.true_divide(top_k_distances, stds)
    # Shift each row so its smallest exponent is 0 before exponentiating. The
    # normalised weights are mathematically unchanged, but exp() can no longer
    # underflow to 0 for every neighbour (which produced 0/0 = NaN weights).
    scaled_distances = scaled_distances - scaled_distances.min(axis=1, keepdims=True)
    top_k_distances_tilda = np.exp(-scaled_distances)

    weights = top_k_distances_tilda / np.sum(
        top_k_distances_tilda, axis=1, keepdims=True
    )

    cols = ref_adata_obs.columns[ref_adata_obs.columns.str.endswith(label_keys)]
    n_query = len(weights)

    # Collect results in plain arrays and build the frames once at the end.
    # Writing via ``df.iloc[i][j] = ...`` is chained assignment: it is very
    # slow per cell and is not guaranteed to modify ``df`` at all.
    pred_values = np.empty((n_query, len(cols)), dtype=object)
    uncert_values = np.empty((n_query, len(cols)), dtype=object)

    for col_pos, col in enumerate(cols):
        # Loop-invariant: fetch the reference labels of this column once.
        y_train_labels = ref_adata_obs[col].values
        for i in range(n_query):
            best_label, best_prob = _most_probable_label(
                y_train_labels[top_k_indices[i]], weights[i]
            )
            uncertainty = max(1 - best_prob, 0)

            # ``threshold`` is an uncertainty cut-off: only cells that are
            # MORE uncertain than it become "Unknown" (threshold=1 keeps all).
            if pred_unknown and uncertainty > threshold:
                pred_values[i, col_pos] = "Unknown"
            else:
                pred_values[i, col_pos] = best_label
            uncert_values[i, col_pos] = uncertainty

    pred_labels = pd.DataFrame(
        pred_values, columns=cols, index=query_adata.obs_names, dtype=object
    )
    uncertainties = pd.DataFrame(
        uncert_values, columns=cols, index=query_adata.obs_names, dtype=object
    )

    print("finished!")

    return pred_labels, uncertainties

In [41]:
# Reference set: keep only the cells whose "Core_or_Extand" value is "Core".
ref_latent = full_latent[full_latent.obs["Core_or_Extand"] == "Core",]

In [42]:
# Display the reference subset (dimensions and available obs columns).
ref_latent

View of AnnData object with n_obs × n_vars = 273479 × 15
    obs: 'orig.ident', 'nCount_originalexp', 'nFeature_originalexp', 'Sample', 'Project', 'Limb.Atlas', 'Organ', 'Tissue', 'Tissue.Specific.', 'Stage', 'Gene.type', 'Treatment', 'Age', 'Age.In.Detail.', 'Machine', 'Species', 'Isolation.approach', 'Digestion', 'Enzymes', 'Cre', 'Bone.Forming.Methods', 'Data.Source', 'Related.Assay', 'Origin', 'nCount_RNA', 'nFeature_RNA', 'paper_label', 'coarse_label', 'C2_named', 'C7_named', 'C19_named', 'C36_named', 'C49_named', 'C90_named', 'C137_named', 'transf_ann_level_7_label', 'transf_ann_level_6_label', 'transf_ann_level_5_label', 'transf_ann_level_4_label', 'transf_ann_level_3_label', 'transf_ann_level_2_label', 'transf_ann_level_1_label', 'n_genes', 'most_confident_level', 'most_confident_uncert', 'most_confident_label', 'transf_ann_level_5_uncert', 'transf_ann_level_4_uncert', 'transf_ann_level_3_uncert', 'transf_ann_level_2_uncert', 'transf_ann_level_1_uncert', 'Tissue.Specific', 'Cor

In [43]:
# Fit the neighbour index on the reference cells; "X" selects ref_latent.X as
# the representation to index.
k_neighbors_transformer = weighted_knn_trainer(
    ref_latent,
    train_adata_emb="X",
    label_key="_named",
    n_neighbors=50,
)

Weighted KNN with n_neighbors = 50 ... 

In [44]:
# Display the query AnnData (dimensions and available obs columns).
adata

AnnData object with n_obs × n_vars = 781397 × 20664
    obs: 'Sample', 'Project', 'Core Dataset', 'Limb Atlas', 'Cranial Atlas', 'Organ', 'Tissue', 'Tissue(Specific)', 'Stage', 'Gene type', 'Treatment', 'Age', 'Age(In Detail)', 'Machine', 'Species', 'Isolation approach', 'Digestion', 'Enzymes', 'Cre', 'Bone Forming Methods', 'Data Source', 'Related Assay', 'Cell Count After QC', 'Origin', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'paper_label', 'coarse_label', 'scDblFinder_class', 'short_id', 'nCount_originalexp', 'nFeature_originalexp', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'new_totals_log', 'batch', 'log10_total_counts', 'n_genes_detected', 'mito_frac', 'ribo_frac', 'compl', 'transf_ann_level_7_label', 'transf_ann_level_6_label', 'transf_ann_level_5_label', 'transf_ann_level_4_label', 'transf_ann_level_

In [None]:
# Transfer every reference obs column whose name ends in "_named" onto the
# query cells, using the "scANVI_2" representation stored in adata.obsm.
labels, uncert = weighted_knn_transfer(
    query_adata=adata,
    query_adata_emb="scANVI_2",
    ref_adata_obs=ref_latent.obs,
    label_keys="_named",
    knn_model=k_neighbors_transformer,
)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fac172dddc0>
Traceback (most recent call last):
  File "/home/zhanglab/mambaforge/envs/celloracle/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/zhanglab/mambaforge/envs/celloracle/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/zhanglab/mambaforge/envs/celloracle/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/zhanglab/mambaforge/envs/celloracle/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


In [48]:
# Shape of the "scANVI_2" representation that is passed as the query embedding.
adata.obsm["scANVI_2"].shape

(781397, 15)

In [49]:
# Shape of the "scANVI" representation — presumably checked for comparison with
# "scANVI_2"; confirm which one is intended for the transfer.
adata.obsm["scANVI"].shape

(781397, 15)

In [51]:
# Shape of ref_latent.X, the matrix the KNN index was fitted on.
ref_latent.X.shape

(273479, 15)

In [54]:
# Write the transferred labels to CSV.
labels.to_csv("../result/9.18_novel_celltype/new_label_trans.csv")

In [55]:
# Write the matching per-cell uncertainties to CSV.
uncert.to_csv("../result/9.18_novel_celltype/new_uncert_trans.csv")

In [133]:
# Display the transferred labels (one column per "_named" annotation level).
labels

Unnamed: 0,C2_named,C7_named,C19_named,C36_named,C49_named,C90_named,C137_named
Suture2021_Farmer_E17_AAACCTGCAGACTCGC-1_1-0_1_1,Non-osteo,MSC,Late.MSC,Alcam.Late.MSC,Eln.Alcam.Late.MSC,Eln.Alcam.Late.MSC,Tnn.Eln.Alcam.Late.MSC
Suture2021_Farmer_E17_AAACCTGGTGCGGTAA-1_1-0_1_1,Non-osteo,Ly6a+ MSC,Ly6a+ MSC,Col6a6.Ly6a+ MSC,Pi16.Col6a6.Ly6a+ MSC,Pi16.Col6a6.Ly6a+ MSC,Krtdap.Pi16.Col6a6.Ly6a+ MSC
Suture2021_Farmer_E17_AAACCTGGTGGTCTCG-1_1-0_1_1,Ob,Ob,Ob,Col22a1.Ob,Ifitm5.Col22a1.Ob,Ifitm5.Col22a1.Ob,Ifitm5.Col22a1.Ob
Suture2021_Farmer_E17_AAACCTGTCACCACCT-1_1-0_1_1,Ob,Ob,Ob,Col22a1.Ob,Ifitm5.Col22a1.Ob,Ifitm5.Col22a1.Ob,Ifitm5.Col22a1.Ob
Suture2021_Farmer_E17_AAACCTGTCGTTACGA-1_1-0_1_1,Non-osteo,Lepr+ BMSC,Lepr+ BMSC,Col6a5.Lepr+ BMSC,Slc16a9.Col6a5.Lepr+ BMSC,Adipoq.Slc16a9.Col6a5.Lepr+ BMSC,Adipoq.Slc16a9.Col6a5.Lepr+ BMSC
...,...,...,...,...,...,...,...
TTTGTTGAGGTGATAT-1_8,Non-osteo,Chondro,CPC,Ucma.CPC,Crispld1.Ucma.CPC,Pcp4.Crispld1.Ucma.CPC,Dlk1.Pcp4.Crispld1.Ucma.CPC
TTTGTTGAGTCTTCCC-1_8,Non-osteo,MSC,Pre-ob,Dapk2.Pre-ob,Tnn.Dapk2.Pre-ob,Tnn+ Aspn+ BMSC,Tnn+ Aspn+ BMSC
TTTGTTGGTCCACACG-1_8,Non-osteo,Chondro,CPC,Ucma.CPC,Crispld1.Ucma.CPC,Pcp4.Crispld1.Ucma.CPC,Dlk1.Pcp4.Crispld1.Ucma.CPC
TTTGTTGTCACTACGA-1_8,Non-osteo,Chondro,CPC,Ucma.CPC,C1qtnf3.Ucma.CPC,C1qtnf3.Ucma.CPC,C1qtnf3.Ucma.CPC
