# Prep data for training a scBasset model on `pbmc-granulocyte-sorted-3k_10x-Multiome`
Adam Klie (last updated: *09/20/2023*)
***
This notebook shows how to prep data for training a scBasset model on `pbmc_granulocyte_sorted_3k` using the `scbasset` package. See https://github.com/ML4GLand/pbmc_granulocyte_sorted_3k for more details on how to download the data.


# Set-up

In [None]:
# Load necessary packages
import os
import gc
import h5py
import psutil
import pandas as pd
import scanpy as sc
from scipy import sparse

In [None]:
# Set up the paths to the input data and the output directory
# (TODO: change to your own paths)
# NOTE: the scBasset preprocessing cell further down hard-codes the same
# output directory in its --ad_file/--out_path arguments; keep them in sync.
input_dir = '/cellar/users/aklie/data/ml4gland/pbmc_granulocyte_sorted_3k/processed/21Sep23/'
h5_file = os.path.join(input_dir, 'pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5')
bed_file = os.path.join(input_dir, 'pbmc_granulocyte_sorted_3k_atac_peaks.bed')
output_dir = '/cellar/users/aklie/projects/ML4GLand/use_cases/scBasset/pbmc-granulocyte-sorted-3k_10x-Multiome/processed'

# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Load data

In [None]:
# Load the 10x multiome count matrix (gex_only=False keeps every feature type,
# not just gene expression) and the ATAC peak coordinates from the BED file.
ad = sc.read_10x_h5(h5_file, gex_only=False)
peak = pd.read_csv(
    bed_file,
    sep='\t',
    names=['chr', 'start', 'end'],
    comment='#',
)

# Split modalities and filter rarely detected genes and peaks

In [None]:
# Split the multiome object into one AnnData per modality.
# Subsetting an AnnData returns a view; .copy() materialises each subset so the
# assignments to `.var` below (and later in-place scanpy calls) operate on real
# objects instead of triggering an implicit copy-on-modify of a view.
is_rna = ad.var['feature_types'] == 'Gene Expression'
is_atac = ad.var['feature_types'] == 'Peaks'
ad_rna = ad[:, is_rna].copy()
ad_atac = ad[:, is_atac].copy()

# Peak coordinates are attached positionally, so the BED file must contain
# exactly one row per peak feature.
# NOTE(review): this also assumes the BED rows are in the same order as the
# peak features in the h5 file — TODO confirm.
if len(peak) != ad_atac.n_vars:
    raise ValueError(
        f'Number of BED rows ({len(peak)}) does not match the number of '
        f'peak features ({ad_atac.n_vars})'
    )
ad_atac.var['chr'] = peak['chr'].values
ad_atac.var['start'] = peak['start'].values
ad_atac.var['end'] = peak['end'].values

In [None]:
# Basic per-cell / per-feature statistics.
# With thresholds of 0 nothing is removed; the calls are used only to annotate
# each modality with detection counts (e.g. `var['n_cells']`, which the
# feature-filtering step below relies on).
for modality in (ad_rna, ad_atac):
    sc.pp.filter_cells(modality, min_genes=0)
    sc.pp.filter_genes(modality, min_cells=0)

In [None]:
# Keep a gene only if it is expressed in more than 5% of cells, and a peak only
# if it is accessible in more than 5% of cells.
thres = int(0.05 * ad.n_obs)
rna_keep = ad_rna.var['n_cells'] > thres
atac_keep = ad_atac.var['n_cells'] > thres
ad_rna = ad_rna[:, rna_keep]
ad_atac = ad_atac[:, atac_keep]

# Save h5ad for next step

In [None]:
# Restrict peaks to chr1-chr22, chrX and chrY, then save the ATAC AnnData
# as the input for the scBasset preprocessing step.
chrs = [f'chr{name}' for name in [*range(1, 23), 'X', 'Y']]
on_kept_chr = ad_atac.var['chr'].isin(chrs)
ad_atac = ad_atac[:, on_kept_chr]
atac_h5ad_path = os.path.join(output_dir, 'atac_ad.h5ad')
ad_atac.write(atac_h5ad_path)

# Preprocess specifically for scBasset
This step uses the scBasset package to preprocess the data for training a scBasset model.

In [None]:
%%bash
# Activate the `scbasset` environment (presumably a conda env — confirm for your
# set-up) and run scBasset's preprocessing script on the ATAC AnnData saved
# above (atac_ad.h5ad), using the hg38 genome FASTA.
# NOTE: --ad_file and --out_path are hard-coded here and point at the same
# directory as `output_dir` in the set-up cell; update them together if you
# change the paths.
source activate scbasset
python /cellar/users/aklie/opt/ml4gland/scBasset/bin/scbasset_preprocess.py \
    --ad_file /cellar/users/aklie/projects/ML4GLand/use_cases/scBasset/pbmc-granulocyte-sorted-3k_10x-Multiome/processed/atac_ad.h5ad \
    --input_fasta /cellar/users/aklie/data/ml4gland/genomes/hg38/hg38.fa \
    --out_path /cellar/users/aklie/projects/ML4GLand/use_cases/scBasset/pbmc-granulocyte-sorted-3k_10x-Multiome/processed

```bash

```

# DONE!

---