# Prep data for training a scBasset model on `Buenrostro_2018`
Adam Klie (last updated: *09/20/2023*)
***
This notebook shows how to prep data for training a scBasset model on the Buenrostro 2018 data. Details on how to get the processed data are coming soon. For now, you can download the ready-to-go h5ad file using the `setup.ipynb` notebook one directory up from this one.

# Set-up

In [None]:
# Load necessary packages
import os
import pandas as pd
import scanpy as sc

In [None]:
# Set-up the paths to data (TODO: change to your own paths)
# input_dir holds the 10x-style HDF5 matrix and the matching BED file of peak coordinates;
# output_dir receives the filtered AnnData written later in this notebook.
input_dir = '/cellar/users/aklie/data/ml4gland/Buenrostro_2018/processed/21Sep23/'
h5_file = os.path.join(input_dir, 'Buenrostro_2018_filtered_feature_bc_matrix.h5')  # TODO
bed_file = os.path.join(input_dir, 'Buenrostro_2018.bed')  # TODO
output_dir = '/cellar/users/aklie/projects/ML4GLand/use_cases/scBasset/Buenrostro_2018/processed'

# exist_ok=True makes this cell safe to re-run and avoids the check-then-create race
os.makedirs(output_dir, exist_ok=True)

# Load data

In [None]:
# Read the peak coordinates (BED) and the 10x HDF5 feature-barcode matrix
peak = pd.read_csv(
    bed_file,
    sep='\t',
    names=['chr', 'start', 'end'],  # the BED file carries no header row
    comment='#',                    # skip '#'-prefixed lines
)
# gex_only=False keeps every feature type, not just gene expression
ad = sc.read_10x_h5(h5_file, gex_only=False)

# Filter low quality genes and peaks

In [None]:
# Split the combined object into one AnnData per modality.
# `.copy()` materialises each subset: slicing an AnnData returns a view, and writing to
# `.var` on a view forces an implicit copy (with an ImplicitModificationWarning).
ad_rna = ad[:, ad.var['feature_types'] == 'Gene Expression'].copy()
ad_atac = ad[:, ad.var['feature_types'] == 'Peaks'].copy()

# The BED rows are attached to the peaks purely by position, so the two must be the same
# length (and are assumed to be in the same order -- TODO confirm against the BED file).
if len(peak) != ad_atac.n_vars:
    raise ValueError(
        f'BED file has {len(peak)} rows but the matrix has {ad_atac.n_vars} peak features'
    )
ad_atac.var['chr'] = peak['chr'].values
ad_atac.var['start'] = peak['start'].values
ad_atac.var['end'] = peak['end'].values

In [None]:
# Basic stats for both modalities.
# With thresholds of 0 these calls are used for the summary columns they record
# (the `n_cells` column in `.var` is read by the filtering step that follows).
for modality in (ad_rna, ad_atac):
    sc.pp.filter_cells(modality, min_genes=0)
    sc.pp.filter_genes(modality, min_cells=0)

In [None]:
# A gene needs to be expressed in more than 5% of cells, and a peak needs to be
# accessible in more than 5% of cells, to be kept.
thres = int(0.05 * ad.n_obs)

rna_keep = ad_rna.var['n_cells'] > thres
atac_keep = ad_atac.var['n_cells'] > thres

ad_rna = ad_rna[:, rna_keep]
ad_atac = ad_atac[:, atac_keep]

# Save h5ad for next step

In [None]:
# Keep only peaks on chr1-chr22, chrX and chrY, then write the result to output_dir
chrs = [f'chr{i}' for i in range(1, 23)] + ['chrX', 'chrY']
on_listed_chr = ad_atac.var['chr'].isin(chrs)
ad_atac = ad_atac[:, on_listed_chr]

atac_out_file = os.path.join(output_dir, 'buen_atac_ad.h5ad')
ad_atac.write(atac_out_file)

# Preprocess specifically for scBasset

In [None]:
%%bash
# Run the scBasset preprocessing script on the ATAC AnnData saved by this notebook.
# --ad_file must be the file written in the "Save h5ad for next step" cell
# (buen_atac_ad.h5ad inside the notebook's output_dir); the paths are hard-coded here
# because this bash cell does not see the Python variables.
# NOTE(review): confirm the peak coordinates are on the same genome build as the FASTA (hg38).
source activate scbasset
python /cellar/users/aklie/opt/ml4gland/scBasset/bin/scbasset_preprocess.py \
    --ad_file /cellar/users/aklie/projects/ML4GLand/use_cases/scBasset/Buenrostro_2018/processed/buen_atac_ad.h5ad \
    --input_fasta /cellar/users/aklie/data/ml4gland/genomes/hg38/hg38.fa \
    --out_path /cellar/users/aklie/projects/ML4GLand/use_cases/scBasset/Buenrostro_2018/processed

```bash

```

# DONE!

---