## B cell activation experiment
<br>
<b>Description</b> : In this notebook we run Tangram2 and Intergram on human lymph node data.<br>
<b>Author</b> : Alma Andersson (andera29@gene.com)<br>
<b>Date</b> : 08/14/2024

Import necessary packages

In [18]:
# Development conveniences: with autoreload mode 2, edited modules are
# re-imported automatically, so local changes are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Code-formatting extension; only needed while editing the notebook.
%load_ext blackcellmagic

In [1]:
import tangram2 as tg2
import scanpy as sc
import matplotlib.pyplot as plt
from scipy.stats import hypergeom

import pandas as pd
import numpy as np
import copy
import tqdm
import os.path as osp
import os

  from pkg_resources import get_distribution, DistributionNotFound


## Define Helper Functions

In [2]:
import re

def get_isoforms(s):
    """Tell whether the name `s` ends with a dot followed by one or more digits.

    Despite its name, this function returns a boolean flag rather than a list
    of isoforms. The entire string has to match: any characters (except
    newlines), then a literal dot, then digits only up to the end.
    For example ``"AC004556.1"`` gives True, ``"GAPDH"`` gives False.
    """
    # fullmatch yields a match object or None; convert that to a plain bool.
    return re.fullmatch(r'.*\.\d+', s) is not None


In [3]:
def pp_adata(ad_sc,ad_sp = None):
    """Filter and normalize single-cell data and, optionally, filter spatial data.

    Steps applied to the single-cell data, in order:

    1. Drop genes whose name starts with "MT-", "MT", "RP", "RPS" or "LINC",
       ends with "VDJsum", or is flagged by `get_isoforms` (name ending in a
       dot followed by digits).
    2. Drop cells with fewer than 300 counts, then genes with fewer than 10 counts.
    3. Save the filtered counts in ``layers['raw']``; normalize each cell to a
       total of 1e4, apply log1p, flag the top 5000 highly variable genes
       (``var['highly_variable']``) and save the result in ``layers['norm']``.
    4. Put the raw counts back into ``X``.

    Steps applied to the spatial data (only when given): drop observations with
    fewer than 100 counts, then genes with fewer than 10 counts.

    Neither input object is modified; both returned objects are copies.

    Parameters
    ----------
    ad_sc
        Single-cell AnnData object.
    ad_sp
        Optional spatial AnnData object.

    Returns
    -------
    The processed single-cell object, or the tuple
    ``(single_cell, spatial)`` when `ad_sp` is provided.
    """
    gene_names = ad_sc.var_names

    # NOTE(review): "MT" already covers "MT-" and "RP" already covers "RPS".
    # These broad prefixes also remove genes that are neither mitochondrial nor
    # ribosomal (e.g. MTOR, RPA1) - confirm this is intended before narrowing.
    drop_prefix = gene_names.str.startswith(("MT-","MT","RP","RPS","LINC"))
    drop_suffix = gene_names.str.endswith('VDJsum')
    # Explicit bool dtype keeps the mask invertible even for an empty gene list.
    drop_isoform = np.array([get_isoforms(name) for name in gene_names], dtype=bool)

    keep_gene = ~(drop_prefix | drop_suffix | drop_isoform)
    ad_sc = ad_sc[:, keep_gene].copy()

    sc.pp.filter_cells(ad_sc,min_counts=300)
    sc.pp.filter_genes(ad_sc,min_counts=10)

    if ad_sp is not None:
        # scanpy filters in place; work on a copy so the caller's object (which
        # may be a view of another AnnData) is left untouched, like ad_sc.
        ad_sp = ad_sp.copy()
        sc.pp.filter_cells(ad_sp,min_counts=100)
        sc.pp.filter_genes(ad_sp,min_counts=10)

    # Normalization happens on X; raw counts are parked in a layer and restored
    # afterwards so that X holds counts and layers['norm'] the log-normalized data.
    ad_sc.layers['raw'] = ad_sc.X.copy()
    sc.pp.normalize_total(ad_sc,target_sum=1e4)
    sc.pp.log1p(ad_sc)
    sc.pp.highly_variable_genes(ad_sc,n_top_genes=5000)
    ad_sc.layers['norm'] = ad_sc.X.copy()
    ad_sc.X = ad_sc.layers['raw'].copy()

    if ad_sp is not None:
        return ad_sc,ad_sp
    return ad_sc
    

In [4]:
# The root directory for all outputs is stored on the first line of OUTPUT_DIR.txt.
# Open read-only (nothing is written back) and strip the trailing newline, which
# would otherwise end up inside the directory name that is created below.
with open('OUTPUT_DIR.txt','r') as f:
    OUTPUT_ROOT = f.readline().strip()

if not OUTPUT_ROOT:
    raise ValueError("OUTPUT_DIR.txt is empty; expected the output root path on its first line")

# Results of this notebook go into a dedicated sub-directory.
OUTPUT_DIR = osp.join(OUTPUT_ROOT, 'bcell')
os.makedirs(OUTPUT_DIR, exist_ok = True)

### Data

Get data and process <br>
From: https://github.com/BayraktarLab/cell2location/blob/master/docs/notebooks/cell2location_tutorial.ipynb

Spatial Visium data from the human lymph node; all processing follows the author-provided tutorial.

In [5]:
# Fetch the 10x Genomics Visium human lymph node dataset through scanpy.
adata_vis = sc.datasets.visium_sge(sample_id="V1_Human_Lymph_Node")

# Tag every observation with the first key found under uns['spatial'].
adata_vis.obs['sample'] = list(adata_vis.uns['spatial'])[0]

In [6]:
# Preserve the current variable names in a 'SYMBOL' column before re-indexing
# (presumably gene symbols - TODO confirm).
adata_vis.var['SYMBOL'] = adata_vis.var_names
# Re-index var by the 'gene_ids' column. drop=True removes that column, so
# running this cell a second time fails unless adata_vis is reloaded first.
adata_vis.var.set_index('gene_ids', drop=True, inplace=True)

In [7]:
# Use the names stored in var['SYMBOL'] as the var index again.
adata_vis.var.index = adata_vis.var['SYMBOL']
# De-duplicate repeated names in the index; the 'SYMBOL' column itself keeps
# the original (possibly repeated) names.
adata_vis.var_names_make_unique()

In [8]:
# find mitochondria-encoded (MT) genes
adata_vis.var['MT_gene'] = [gene.startswith('MT-') for gene in adata_vis.var['SYMBOL']]

# remove MT genes for spatial mapping (keeping their counts in the object)
adata_vis.obsm['MT'] = adata_vis[:, adata_vis.var['MT_gene'].values].X.toarray()
# .copy() turns the subset into a standalone object. Without it adata_vis stays
# a view of the full dataset, and any later in-place modification (e.g. scanpy
# filtering) would have to convert it implicitly.
adata_vis = adata_vis[:, ~adata_vis.var['MT_gene'].values].copy()

Get paired single cell data (originally downloaded from: https://cell2location.cog.sanger.ac.uk/paper/integrated_lymphoid_organ_scrna/RegressionNBV4Torch_57covariates_73260cells_10237genes/sc.h5ad)

In [9]:
# Read data
# Load the single-cell reference from the local data folder; backup_url is the
# location the file was originally obtained from.
adata_ref = sc.read(
    '../../data/lymph_node/lymph_sc.h5ad',
    backup_url=(
        'https://cell2location.cog.sanger.ac.uk/paper/integrated_lymphoid_organ_scrna/RegressionNBV4Torch_57covariates_73260cells_10237genes/sc.h5ad'
    ),
)

In [10]:
# Preserve the current var index in a 'SYMBOL' column before re-indexing.
adata_ref.var['SYMBOL'] = adata_ref.var.index 
# rename 'GeneID-2' as necessary for your data
# (drop=True removes the 'GeneID-2' column, so this cell cannot be re-run
# without reloading adata_ref)
adata_ref.var.set_index('GeneID-2', drop=True, inplace=True)

# delete unnecessary raw slot (to be removed in a future version of the tutorial)
del adata_ref.raw

In [11]:
# Use the names stored in var['SYMBOL'] as the var index again, mirroring
# what is done for adata_vis so both objects are indexed the same way.
adata_ref.var.index = adata_ref.var['SYMBOL']
# De-duplicate repeated names in the index.
adata_ref.var_names_make_unique()

We'll use only one donor from the scRNA-seq data for this demonstration (this also reduces the computational cost).

In [12]:
# Keep only the cells of donor 'A16'; copy so the result is not a view.
adata_ref = adata_ref[adata_ref.obs['Donor'].values == 'A16', :].copy()

## Process data and run workflow

In [13]:
# Preprocess both datasets with the helper defined above. The names are
# rebound to the processed objects, so the unprocessed versions are no longer
# reachable after this cell.
adata_ref,adata_vis = pp_adata(adata_ref,adata_vis)

Define column with cell type labels

In [14]:
# Name of the obs column used as categorical cell type label in the steps below
# (assumed to exist in adata_ref.obs - TODO confirm).
label_col = 'Subset_Broad'

Initialize state dict

In [16]:
# Names of the genes flagged as highly variable in the single-cell reference.
hvg_genes = adata_ref.var_names[adata_ref.var['highly_variable'].values].tolist()

# Bundle both datasets into the input/state dict consumed by the tangram2 methods.
input_dict_1 = tg2.evalkit.met.utils.adatas_to_input(
    {'from': adata_ref, 'to': adata_vis},  # provide the data to be used
    categorical_labels={'from': [label_col]},  # include cluster labels in the design matrix
)

Run mapping method

In [19]:
# Preprocessing step for the mapping. The return value is not used, so this
# presumably updates input_dict_1 in place - TODO confirm.
tg2.evalkit.met.pp.StandardTangram2.run(input_dict_1)

# Run the Tangram2 mapping for 1000 epochs, passing the highly variable genes
# as the gene set.
# NOTE(review): no random seed is set in the visible cells; if the optimization
# is stochastic, results may differ between runs.
map_res_1 = tg2.evalkit.met.map_methods.Tangram2Map.run(
    input_dict_1,
    num_epochs=1000,
    genes=hvg_genes,
)

# Merge the mapping outputs into the state dict used by the following steps.
input_dict_1.update(map_res_1)

Run interaction model

In [20]:
# Scanpy-style preprocessing restricted to the "X_from" entry of the state dict
# (return value unused, so presumably applied in place - TODO confirm).
tg2.evalkit.met.pp.StandardScanpy.run(input_dict_1, target_objs=["X_from"])

# Fit the interaction model on the mapped data (1000 epochs, learning rate 0.01).
inter_res = tg2.ccc.TangramCCC.run(input_dict_1, n_epochs=1000, learning_rate=0.01)

In [22]:
# Persist the interaction results in OUTPUT_DIR (inter_res is presumably an
# xarray object, given the to_netcdf method - TODO confirm).
inter_res.to_netcdf(osp.join(OUTPUT_DIR,'inter_res.netcdf'))

Calculate highly variable genes

In [21]:
# Run the HVG-based feature analysis on the "X_from" data, restricted to the
# "B_GC_prePB" label of the cell type column.
# NOTE(review): this cell uses `tg2.evalkit.methods`, while the other cells use
# `tg2.evalkit.met` - confirm both paths resolve to the same module.
hvg = tg2.evalkit.methods.dea.HVGFeatureDEA.run_with_adata(
    input_dict_1["X_from"],
    subset_col=label_col,
    subset_labels="B_GC_prePB",
)

In [37]:
# Write the result for the B_GC_prePB subset to a CSV file in OUTPUT_DIR.
hvg.to_csv(osp.join(OUTPUT_DIR,'hvg_B_GC_prePB.csv'))