# Installation and data download for working with scBasset
**Authorship:**
Adam Klie (last updated: *07/19/2023*)
***
**Description:**
Set up the computational environment and download the data needed to work with scBasset models from multiple sources:

- EUGENe
- Kipoi
***

# Data

In [None]:
import subprocess
import os

# Destination directory for every tutorial asset fetched by this cell.
download_savepath = '/cellar/users/aklie/data/ml4gland/use_cases/yuan22/github_tutorial'
os.makedirs(download_savepath, exist_ok=True)

# Public bucket that hosts the scBasset tutorial data.
base_url = 'https://storage.googleapis.com/scbasset_tutorial_data'

# Files to fetch, in the original order: AnnData (.h5ad) and model (.h5)
# files, plus the motif-injection FASTA archive for CISBP-1.0 motifs.
tutorial_files = [
    'buen_ad_sc.h5ad',
    'buen_model_sc.h5',
    'pbmc_ad.h5ad',
    'pbmc_model.h5',
    'Homo_sapiens_motif_fasta.tar.gz',
]

for file_name in tutorial_files:
    # Skip files that are already present so re-running the cell is cheap.
    if os.path.exists(os.path.join(download_savepath, file_name)):
        continue
    # An argument list (no shell) avoids quoting problems in the path, and
    # check=True surfaces a failed download instead of silently continuing.
    subprocess.run(
        ['wget', '-P', download_savepath, '%s/%s' % (base_url, file_name)],
        check=True,
    )

# Unpack the motif FASTA archive into the download directory. As before, this
# runs on every execution of the cell, whether or not a download happened.
motif_archive = os.path.join(download_savepath, 'Homo_sapiens_motif_fasta.tar.gz')
subprocess.run(['tar', '-xzf', motif_archive, '-C', download_savepath], check=True)

In [None]:
import numpy as np
import pandas as pd
import h5py
import scipy
import scanpy as sc
import anndata
from scbasset.utils import *

# plotting functions
import seaborn as sns
import matplotlib.pyplot as plt

import os

### Read the example 10x multiome output

In [None]:
# Directory holding the 10x multiome PBMC example files.
data_path = '/cellar/users/aklie/data/ml4gland/use_cases/yuan22/github_tutorial/multiome_pbmc/'

# Feature-barcode matrix (.h5) and ATAC peak coordinates (.bed) live side by
# side in that directory.
h5_file = os.path.join(
    data_path, 'pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5'
)
bed_file = os.path.join(
    data_path, 'pbmc_granulocyte_sorted_3k_atac_peaks.bed'
)

# Echo both resolved paths as the cell output.
(h5_file, bed_file)

('/cellar/users/aklie/data/ml4gland/use_cases/yuan22/github_tutorial/multiome_pbmc/pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5',
 '/cellar/users/aklie/data/ml4gland/use_cases/yuan22/github_tutorial/multiome_pbmc/pbmc_granulocyte_sorted_3k_atac_peaks.bed')

In [None]:
# Peak coordinates: tab-separated file read without a header row, with the
# three columns named explicitly.
peak = pd.read_csv(
    bed_file,
    sep='\t',
    names=['chr', 'start', 'end'],
)

# Load the 10x feature-barcode matrix; gex_only=False keeps feature types
# other than gene expression (the 'Peaks' features are selected later).
ad = sc.read_10x_h5(
    h5_file,
    gex_only=False,
)

  utils.warn_names_duplicates("var")


### filtering

In [None]:
# Split the joint multiome object by modality. `.copy()` turns each slice into
# an independent AnnData object: the slices are annotated and filtered in
# place below, and doing that on views forces anndata to copy implicitly and
# emit warnings.
is_gene = ad.var['feature_types'] == 'Gene Expression'
is_peak = ad.var['feature_types'] == 'Peaks'
ad_rna = ad[:, is_gene].copy()
ad_atac = ad[:, is_peak].copy()

# Attach genomic coordinates to the peak features.
# NOTE(review): assumes the BED rows are in the same order (and number) as the
# 'Peaks' features in `ad` -- confirm against the input files.
ad_atac.var['chr'] = peak['chr'].values
ad_atac.var['start'] = peak['start'].values
ad_atac.var['end'] = peak['end'].values

# Basic stats: with thresholds of 0 nothing is removed; the calls are made for
# the per-cell / per-feature counts they record (`n_cells` is read below).
sc.pp.filter_cells(ad_rna, min_genes=0)
sc.pp.filter_genes(ad_rna, min_cells=0)
sc.pp.filter_cells(ad_atac, min_genes=0)
sc.pp.filter_genes(ad_atac, min_cells=0)

# A gene needs to be expressed in more than 5% of cells.
# A peak needs to be accessible in more than 5% of cells.
thres = int(ad.shape[0]*0.05)
ad_rna = ad_rna[:, ad_rna.var['n_cells']>thres]
ad_atac = ad_atac[:, ad_atac.var['n_cells']>thres]

  This is separate from the ipykernel package so we can avoid doing imports until
  adata.obs['n_genes'] = number


### save h5ad

In [None]:
# Chromosome names to keep: chr1-chr22 plus chrX and chrY.
chrs = ['chr'+str(i) for i in range(1,23)] + ['chrX', 'chrY']

# Drop peaks on any other contig. `.copy()` materialises the subset so that
# `write` operates on an actual AnnData object rather than on a view.
ad_atac = ad_atac[:, ad_atac.var['chr'].isin(chrs)].copy()

# Save the filtered ATAC data next to the input files.
ad_atac.write(os.path.join(data_path, 'atac_ad.h5ad'))

  df[key] = c
  df[key] = c
