## Benchmarking mapping accuracy for Tangram2
<br>
<b>Description</b> : In this notebook we benchmark Tangram2 mapping accuracy against publicly available tools such as Tangram, SpaOTsc and MOSCOT. Only one example is shown here: creating 100 spots for one specific patient. The final plot is based on the aggregated results from six patients with varying numbers of spots.<br>
<b>Author</b> : Hejin Huang (huang.hejin@gene.com)<br>

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import tangram2 as tg2
import tangram as tg
from sklearn.metrics import jaccard_score

  from pkg_resources import get_distribution, DistributionNotFound


In [2]:
# Read the single-cell reference AnnData from disk
path = '../../data/tangram2_paper_data/original/scc/'
ad_sc = sc.read_h5ad(path + 'scc_new.h5ad')

# Restrict the reference to a single patient (the first one in the list);
# the synthetic spots below are generated from that subset only
patient_group = ['P2', 'P4', 'P5', 'P6', 'P9', 'P10']
patient = patient_group[0]
is_selected_patient = ad_sc.obs['patient'] == patient
ad_sc_sample = ad_sc[is_selected_patient]

# `.obs` column passed to cellmix as `label_col` and reused as the categorical label when mapping
# NOTE(review): assumed to be the intended label column - confirm
label_used = 'level2_celltype_mod'

# Settings forwarded verbatim to the cellmix generator
cellmix_params = dict(
    n_spots=100,
    n_cells_per_spot=10,
    n_types_per_spot=3,
    label_col=label_used,
    encode_spatial=True,
    resample=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
)

# Returns the generated spatial AnnData together with its paired single-cell AnnData
ad_sp, ad_sc_paired = tg2.evalkit.datagen.cellmix.cellmix.cellmix(ad_sc_sample, **cellmix_params)

In [3]:
# Get noise levels: 'no_noise' stands for the matrix already in `ad_sp.X`,
# followed by one entry per layer stored on `ad_sp`
noise_level = ['no_noise', *ad_sp.layers]

# Create ground truth mapping: a (spots x cells) 0/1 matrix with a 1 wherever
# the cellmix cell map pairs a spot position with a cell position.
# The matrix is allocated as integers up front instead of relying on
# `DataFrame(...).fillna(0)` to downcast an all-NaN object frame (deprecated
# pandas behaviour), and filled in one indexing step instead of a per-element
# `.iloc` loop.
cell_map = ad_sp.uns['cellmix_cell_map_mp']
spot_positions = np.asarray(cell_map['row_self'], dtype=np.intp)
cell_positions = np.asarray(cell_map['row_target'], dtype=np.intp)

truth_matrix = np.zeros((len(ad_sp.obs.index), len(ad_sc_paired.obs.index)), dtype=int)
truth_matrix[spot_positions, cell_positions] = 1

ground_truth = pd.DataFrame(
    truth_matrix,
    index=ad_sp.obs.index,
    columns=ad_sc_paired.obs.index,
)

In [4]:
# Save generated data
# ad_sp.write_h5ad('/gstore/data/resbioai/tangram2_data/telegraph/res/mapping/100_spot/ad_sp_' + patient + '.h5ad')
# ad_sc_paired.write_h5ad('/gstore/data/resbioai/tangram2_data/telegraph/res/mapping/100_spot/ad_sc_' + patient + '.h5ad')

In [5]:
# One independent, initially empty result store per benchmarked method.
# The generator yields a fresh dict for every target, so no two names share an object.
(
    result_tg2integrate,
    result_tg2cell,
    result_tg1cell,
    result_spaotsc,
    result_moscot,
    result_argmax,
    result_random,
) = ({} for _ in range(7))

In [37]:
# Run every mapping method once per noise level and store the resulting
# mapping matrix in the per-method result dictionaries, keyed by noise level.

# Baseline mapping methods that share the same single-step workflow
mapping_methods = {
    'argmax': {'method': tg2.evalkit.met.map_methods.ArgMaxCorrMap},
    'random': {'method': tg2.evalkit.met.map_methods.RandomMap},
}
# Result store that belongs to each baseline method
baseline_results = {
    'argmax': result_argmax,
    'random': result_random,
}

for noise in noise_level:
    # Prepare AnnData objects for mapping; work on copies so the generated data stay untouched.
    # For every level except 'no_noise' the spatial matrix is replaced by the matching layer.
    ad_sc_mapping = ad_sc_paired.copy()
    ad_sp_mapping = ad_sp.copy()
    if noise != 'no_noise':
        ad_sp_mapping.X = ad_sp_mapping.layers[noise]

    # Run ArgMaxCorrMap and RandomMap
    for method_name, method_dict in mapping_methods.items():
        # A fresh input dict per method: the workflow writes its output ('T') into it
        input_dict = tg2.evalkit.met.utils.adatas_to_input(
            {'from': ad_sc_mapping, 'to': ad_sp_mapping},
            categorical_labels={'from': [label_used]}
        )
        tg2.evalkit.met.workflows.Workflow({'map': method_dict}).run(input_dict)
        baseline_results[method_name][noise] = input_dict['T'].copy()

    # Run Tangram2Map (tg2_integrate)
    input_dict_tg2integrate = tg2.evalkit.met.utils.adatas_to_input(
        {'from': ad_sc_mapping, 'to': ad_sp_mapping},
        categorical_labels={'from': [label_used]}
    )
    tg2.evalkit.met.pp.StandardTangram2.run(input_dict_tg2integrate)
    map_res_tg2integrate = tg2.evalkit.met.map_methods.Tangram2Map.run(
        input_dict_tg2integrate,
        num_epochs=1000,
        density_prior='uniform',
    )
    input_dict_tg2integrate.update(map_res_tg2integrate)
    tg2.evalkit.met.pp.StandardScanpy.run(input_dict_tg2integrate, target_objs=['X_from'])
    input_dict_tg2integrate['w'].index = input_dict_tg2integrate['w']['cell_type']
    result_tg2integrate[noise] = input_dict_tg2integrate['T'].copy()

    # Run Tangram2 (tg2.mapping.map_cells_to_space, tg2_cell)
    # Fresh copies are used because pp_adatas is called on the objects directly
    ad_sc_mapping_tg2 = ad_sc_paired.copy()
    ad_sp_mapping_tg2 = ad_sp.copy()
    if noise != 'no_noise':
        ad_sp_mapping_tg2.X = ad_sp_mapping_tg2.layers[noise]
    tg2.mapping.pp_adatas(ad_sc_mapping_tg2, ad_sp_mapping_tg2)
    ad_map_tg2 = tg2.mapping.map_cells_to_space(
        ad_sc_mapping_tg2, ad_sp_mapping_tg2,
        mode="cells",
        device='cuda:0',
        density_prior='uniform',
    )
    # Transposed so the stored frame is oriented like `ground_truth`
    # NOTE(review): assumes the mapping AnnData is (cells x spots) - confirm
    result_tg2cell[noise] = ad_map_tg2.to_df().T

    # Run Tangram1 (tg.map_cells_to_space, tg1_cell)
    ad_sc_mapping_tg = ad_sc_paired.copy()
    ad_sp_mapping_tg = ad_sp.copy()
    if noise != 'no_noise':
        ad_sp_mapping_tg.X = ad_sp_mapping_tg.layers[noise]
    tg.pp_adatas(ad_sc_mapping_tg, ad_sp_mapping_tg)
    ad_map_tg = tg.map_cells_to_space(
        ad_sc_mapping_tg, ad_sp_mapping_tg,
        mode="cells",
        device='cuda:0',
        density_prior='uniform',
    )
    result_tg1cell[noise] = ad_map_tg.to_df().T

    # Run spaOTsc
    # NOTE(review): this workflow and the MOSCOT one reference `tg2.evalkit.met.map`, while the
    # other methods use `tg2.evalkit.met.map_methods` - confirm both module paths exist.
    # NOTE(review): the 'pred' step uses MoscotPred for spaOTsc as well - confirm this is intended.
    input_dict_spaotsc = tg2.evalkit.met.utils.adatas_to_input(
        {'from': ad_sc_mapping, 'to': ad_sp_mapping},
        categorical_labels={'from': [label_used]}
    )
    wf_spaOT_setup = {
        'pp': {'method': tg2.evalkit.met.pp.StandardSpaOTsc},
        'map': {'method': tg2.evalkit.met.map.SpaOTscMap, 'params': {'num_epochs': 1000, 'genes': None}},
        'pred': {'method': tg2.evalkit.met.pred_methods.MoscotPred},
    }
    wf_spaOT = tg2.evalkit.met.workflows.Workflow(wf_spaOT_setup)
    wf_spaOT.run(input_dict_spaotsc)
    result_spaotsc[noise] = input_dict_spaotsc['T']

    # Run MOSCOT
    input_dict_moscot = tg2.evalkit.met.utils.adatas_to_input(
        {'from': ad_sc_mapping, 'to': ad_sp_mapping},
        categorical_labels={'from': [label_used]}
    )
    wf_moscot_setup = {
        'pp': {'method': tg2.evalkit.met.pp.StandardMoscot},
        'map': {'method': tg2.evalkit.met.map.MoscotMap, 'params': {'num_epochs': 1000, 'genes': None}},
        'pred': {'method': tg2.evalkit.met.pred_methods.MoscotPred},
    }
    wf_moscot = tg2.evalkit.met.workflows.Workflow(wf_moscot_setup)
    wf_moscot.run(input_dict_moscot)
    result_moscot[noise] = input_dict_moscot['T']

INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 20068 genes and uniform density_prior in clusters mode...
INFO:root:Printing scores every 100 epochs.


Set Solid Seed
Set Solid Seed
Score: 0.927, KL reg: 3.205, Entropy reg: -7.301
Score: 0.974, KL reg: 3.178, Entropy reg: -5.796
Score: 0.975, KL reg: 3.178, Entropy reg: -5.634
Score: 0.975, KL reg: 3.178, Entropy reg: -5.580
Score: 0.975, KL reg: 3.178, Entropy reg: -5.555
Score: 0.976, KL reg: 3.178, Entropy reg: -5.543
Score: 0.976, KL reg: 3.178, Entropy reg: -5.540
Score: 0.976, KL reg: 3.178, Entropy reg: -5.532
Score: 0.976, KL reg: 3.178, Entropy reg: -5.528
Score: 0.976, KL reg: 3.178, Entropy reg: -5.525


INFO:root:Renormalizing Single cell data
INFO:root:Begin training with 20068 genes and uniform density_prior in cells mode after renormalization
INFO:root:Printing scores every 100 epochs.


Set Solid Seed
Set Solid Seed
Score: 0.936, KL reg: 0.001, Entropy reg: -4132.243
Score: 0.997, KL reg: 0.000, Entropy reg: -521.425
Score: 0.998, KL reg: 0.000, Entropy reg: -411.384
Score: 0.998, KL reg: 0.000, Entropy reg: -382.132
Score: 0.998, KL reg: 0.000, Entropy reg: -369.306
Score: 0.998, KL reg: 0.000, Entropy reg: -361.381
Score: 0.998, KL reg: 0.000, Entropy reg: -358.768
Score: 0.998, KL reg: 0.000, Entropy reg: -355.380
Score: 0.998, KL reg: 0.000, Entropy reg: -353.432


INFO:root:Saving results..


Score: 0.998, KL reg: 0.000, Entropy reg: -351.914


INFO:root:20069 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:20069 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 20069 genes and uniform density_prior in cells mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.935, KL reg: 0.001, Entropy reg: -4132.243
Score: 0.998, KL reg: 0.000, Entropy reg: -480.119
Score: 0.998, KL reg: 0.000, Entropy reg: -383.662
Score: 0.998, KL reg: 0.000, Entropy reg: -354.646
Score: 0.998, KL reg: 0.000, Entropy reg: -342.843
Score: 0.998, KL reg: 0.000, Entropy reg: -336.384
Score: 0.998, KL reg: 0.000, Entropy reg: -331.145
Score: 0.998, KL reg: 0.000, Entropy reg: -329.991
Score: 0.998, KL reg: 0.000, Entropy reg: -327.414
Score: 0.998, KL reg: 0.000, Entropy reg: -326.598


INFO:root:Saving results..
INFO:root:20068 training genes are saved in `uns``training_genes` of both single cell and spatial Anndatas.
INFO:root:20069 overlapped genes are saved in `uns``overlap_genes` of both single cell and spatial Anndatas.
INFO:root:uniform based density prior is calculated and saved in `obs``uniform_density` of the spatial Anndata.
INFO:root:rna count based density prior is calculated and saved in `obs``rna_count_based_density` of the spatial Anndata.
INFO:root:Allocate tensors for mapping.
INFO:root:Begin training with 20068 genes and uniform density_prior in cells mode...
INFO:root:Printing scores every 100 epochs.


Score: 0.478, KL reg: 0.001
Score: 0.994, KL reg: 0.002
Score: 0.994, KL reg: 0.002
Score: 0.995, KL reg: 0.002
Score: 0.995, KL reg: 0.002
Score: 0.995, KL reg: 0.002
Score: 0.995, KL reg: 0.002
Score: 0.995, KL reg: 0.002
Score: 0.995, KL reg: 0.002
Score: 0.995, KL reg: 0.002


INFO:root:Saving results..


[34mINFO    [0m Computing pca with `[33mn_comps[0m=[1;36m30[0m` for `xy` using `adata.X`                                                  
[34mINFO    [0m Normalizing spatial coordinates of `x`.                                                                   
[34mINFO    [0m Solving `[1;36m1[0m` problems                                                                                      
[34mINFO    [0m Solving problem OTProblem[1m[[0m[33mstage[0m=[32m'prepared'[0m, [33mshape[0m=[1m([0m[1;36m100[0m, [1;36m1002[0m[1m)[0m[1m][0m.                                           


In [None]:
# Calculate Jaccard Scores
# For each method and noise level the mapping is binarised by marking every entry
# that equals the result of `.max()` on that mapping, then flattened and compared
# against the flattened ground truth.
# Output column name -> per-noise result store (order fixes the CSV column order)
method_results = {
    'tg1_cell': result_tg1cell,
    'tg2_cell': result_tg2cell,
    'tg2_integrate': result_tg2integrate,
    'moscot': result_moscot,
    'spaotsc': result_spaotsc,
    'argmax': result_argmax,
    'random': result_random,
}

# The ground truth does not depend on method or noise level, so flatten it once
truth_flat = ground_truth.values.flatten()

result_df = pd.DataFrame(index=noise_level)
for noise in noise_level:
    for column, per_noise in method_results.items():
        mapping = per_noise[noise]
        is_max_entry = (mapping == mapping.max()).astype(int)
        result_df.loc[noise, column] = jaccard_score(is_max_entry.values.flatten(), truth_flat)

# One CSV per patient: rows are noise levels, columns are methods
output_dir = '../../data/tangram2_paper_data/analysis/mapping_benchmark/100_spot/'
result_df.to_csv(output_dir + patient + '_result.csv')