# Run pycisTopic on `pbmc-granulocyte-sorted-3k_10x-Multiome`
Adam Klie (last updated: *09/20/2023*)
***
This notebook shows how to run pycisTopic.

# Set-up

In [None]:
# Load necessary packages
import os
import scanpy as sc
import pickle
from pycisTopic.cistopic_class import create_cistopic_object_from_fragments, run_cgs_models
from pycisTopic.lda_models import evaluate_models

In [None]:
# Set-up the paths to data (TODO: change to your own paths)
input_dir = '/cellar/users/aklie/projects/ML4GLand/use_cases/scBasset/pbmc-granulocyte-sorted-3k_10x-Multiome/processed'
tmp_dir = os.path.join(input_dir, 'tmp')
# Create the scratch directory used later for temporary files. The previous
# guard only checked/created input_dir, so tmp_dir itself was never made.
# makedirs also creates any missing parents (including input_dir), and
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(tmp_dir, exist_ok=True)

In [None]:
# Pull the cell barcodes and per-cell annotations out of the processed scRNA-seq AnnData
rna_h5ad_path = os.path.join(input_dir, 'scRNA/adata.h5ad')
adata = sc.read_h5ad(rna_h5ad_path)
scRNA_bc, cell_data = adata.obs_names, adata.obs

# Tag every cell with the sample name used as the key throughout this notebook
cell_data['sample_id'] = '10x_pbmc'

# Cast celltype to plain str; otherwise the export_pseudobulk function will complain
celltype_as_str = cell_data['celltype'].astype(str)
cell_data['celltype'] = celltype_as_str

# Only the barcodes and the obs table are needed from here on
del adata

In [None]:
# Other needed paths (each dict is keyed by sample name)
fragments_dict = {'10x_pbmc': os.path.join(input_dir, 'data/pbmc_granulocyte_sorted_3k_atac_fragments.tsv.gz')}
path_to_regions = {'10x_pbmc': os.path.join(input_dir, 'scATAC/consensus_peak_calling/consensus_regions.bed')}
path_to_blacklist = os.path.join(input_dir, 'hg38-blacklist.v2.bed')

# Load the pickled quality-control outputs. The context managers guarantee the
# file handles are closed as soon as loading finishes (the bare
# pickle.load(open(...)) form left them open until garbage collection).
# NOTE: unpickling can execute arbitrary code -- only load files you produced
# yourself or otherwise trust.
with open(os.path.join(input_dir, 'scATAC/quality_control/metadata_bc.pkl'), 'rb') as handle:
    metadata_bc = pickle.load(handle)
with open(os.path.join(input_dir, 'scATAC/quality_control/bc_passing_filters.pkl'), 'rb') as handle:
    bc_passing_filters = pickle.load(handle)

In [None]:
# Report how many barcodes survive both the scATAC-seq and the scRNA-seq filters
atac_passing = set(bc_passing_filters['10x_pbmc'])
n_shared_barcodes = len(atac_passing.intersection(set(scRNA_bc)))
print(f"{n_shared_barcodes} cell barcodes pass both scATAC-seq and scRNA-seq based filtering")

In [None]:
# Build the cisTopic object for the single sample in this dataset
key = '10x_pbmc'

# Collect the call arguments up front so each input is easy to inspect
cistopic_kwargs = dict(
    path_to_fragments=fragments_dict[key],
    path_to_regions=path_to_regions[key],
    path_to_blacklist=path_to_blacklist,
    metrics=metadata_bc[key],
    # keep only barcodes that passed the scATAC filters AND appear in the scRNA data
    valid_bc=list(set(bc_passing_filters[key]) & set(scRNA_bc)),
    n_cpu=1,
    project=key,
    split_pattern='-',
)
cistopic_obj = create_cistopic_object_from_fragments(**cistopic_kwargs)

# Attach the per-cell annotations taken from the scRNA-seq object, then show a summary
cistopic_obj.add_cell_data(cell_data, split_pattern='-')
print(cistopic_obj)

In [None]:
# Dump the cistopic object.
# Writing inside a context manager ensures the file is flushed and closed
# before the next cell runs (the bare open(...) handle was never closed).
with open(os.path.join(input_dir, 'scATAC/cistopic_obj.pkl'), 'wb') as handle:
    pickle.dump(cistopic_obj, handle)

In [None]:
# Run pycistopic: one model per entry in n_topics.
# Results are kept in memory only (save_path=None); they are pickled in a later cell.
models = run_cgs_models(
    cistopic_obj,
    n_topics=[2,4,10,16,32,48],
    n_cpu=5,
    n_iter=500,
    random_state=555,  # fixed seed so reruns are comparable
    alpha=50,
    alpha_by_topic=True,
    eta=0.1,
    eta_by_topic=False,
    save_path=None,
    # Scratch location inside tmp_dir. The path components must be passed to
    # os.path.join separately: the previous `tmp_dir + 'ray_spill'` produced
    # '<input_dir>/tmpray_spill', a sibling of tmp_dir rather than a subfolder.
    _temp_dir=os.path.join(tmp_dir, 'ray_spill')
)

In [None]:
# Save the fitted models
models_dir = os.path.join(input_dir, 'scATAC/models')
# exist_ok=True replaces the separate existence check and keeps the cell re-runnable
os.makedirs(models_dir, exist_ok=True)
# The context manager closes (and therefore flushes) the file once the dump is done
with open(os.path.join(models_dir, '10x_pbmc_models_500_iter_LDA.pkl'), 'wb') as handle:
    pickle.dump(models, handle)

In [None]:
# Evaluate the fitted models and pull out the one with 16 topics
# NOTE(review): 'Minmo_2011' is kept exactly as written -- presumably the key
# pycisTopic expects; confirm against the library before "correcting" it.
eval_metrics = ['Arun_2010','Cao_Juan_2009', 'Minmo_2011', 'loglikelihood']
model = evaluate_models(
    models,
    select_model=16,
    return_model=True,
    metrics=eval_metrics,
    plot_metrics=False,
)

In [None]:
# Add the selected model to the cistopic object and overwrite the earlier dump
cistopic_obj.add_LDA_model(model)
# Use a context manager so the pickle is fully written and the handle closed
# (the bare open(...) handle was never closed explicitly).
with open(os.path.join(input_dir, 'scATAC/cistopic_obj.pkl'), 'wb') as handle:
    pickle.dump(cistopic_obj, handle)

# DONE!

---