# Application of design best practices to COVID-19 dataset - prepare dataset

In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
import os,sys
import scanpy as sc 
import pandas as pd
import numpy as np
import milopy
import scipy
import anndata

In [2]:
def _check_counts_in_X(adata):
    """Return True if ``adata.X`` stores only integer-valued entries (raw counts).

    Parameters
    ----------
    adata
        Object exposing an ``X`` matrix: either a NumPy array or a matrix
        whose stored values are available as ``X.data`` (e.g. a SciPy
        sparse matrix).

    Returns
    -------
    bool
        ``True`` when every inspected value is a whole number. A matrix with
        no stored values also yields ``True``.
    """
    X = adata.X
    # For a NumPy array `.data` is a raw buffer rather than the values, so
    # only non-ndarray (sparse-like) matrices are read through `.data`.
    values = np.asarray(X if isinstance(X, np.ndarray) else X.data).ravel()
    # Inspect every value instead of a random sample: the result is then
    # reproducible, cannot miss rare non-integer entries, and an empty
    # matrix no longer makes the check raise.
    return bool(np.all(values % 1 == 0))

def _clean_adata(a, min_counts=1000):
    """Prefix cell names with the dataset id, add QC metrics and drop low-count cells.

    The object is modified in place and also returned.

    Parameters
    ----------
    a
        AnnData object with a ``dataset_id`` column in ``.obs``; ``.X`` is
        expected to hold raw counts.
    min_counts
        Minimum total counts a cell needs to be kept; forwarded to
        ``sc.pp.filter_cells``. Defaults to 1000.

    Returns
    -------
    The same object ``a`` after cleaning.

    Raises
    ------
    ValueError
        If ``_check_counts_in_X`` reports that ``a.X`` does not store raw counts.
    """
    # Validate first so that a failed check leaves `a` untouched, and raise
    # explicitly because `assert` statements are skipped under `python -O`.
    if not _check_counts_in_X(a):
        raise ValueError("adata.X does not appear to store raw (integer) counts")

    ## Make obs_names unique
    a.obs_names = a.obs['dataset_id'].astype('str') + '-' + a.obs_names.astype("str")

    sc.pp.calculate_qc_metrics(a, inplace=True)
    sc.pp.filter_cells(a, min_counts=min_counts)
    return a

Filtered AnnData objects (subsampled to 500 cells per sample and split by condition) are prepared by running the scripts in `src/01_PBMC_data_processing/` and are stored in `/nfs/team205/ed6/data/PBMC_CZI_integration_filtered/tmp/`.

In [None]:
## Input locations (tmp_dir holds the subsampled, per-condition h5ad files)
data_dir = '/nfs/team205/ed6/data/PBMC_CZI_integration_filtered/'
tmp_dir = data_dir + 'tmp/'
query_batch = '10_1038_s41591_021_01329_2'

## Sample-level metadata tables
normal_sample_obs = pd.read_csv(data_dir + 'PBMC_sample_metadata.normal.csv')
covid_sample_obs = pd.read_csv(data_dir + 'PBMC_sample_metadata.COVID.csv')

## Full query dataset, opened in backed read-only mode
step_adata = sc.read_h5ad(data_dir + f'{query_batch}.h5ad', backed='r')

## Every dataset in the normal-sample metadata except the query one forms the atlas
atlas_batch = normal_sample_obs[normal_sample_obs.dataset_id != query_batch].dataset_id.unique()

h5ad_files_atlas = [f'{x}.normal.subsample500cells.h5ad' for x in atlas_batch]
h5ad_files_covid = f'{query_batch}.COVID.subsample500cells.h5ad'
h5ad_files_ctrl = f'{query_batch}.normal.subsample500cells.h5ad'

## Fail early, naming exactly which expected inputs are missing
_expected_files = [h5ad_files_covid, h5ad_files_ctrl, *h5ad_files_atlas]
_missing_files = [f for f in _expected_files if not os.path.exists(tmp_dir + f)]
if _missing_files:
    raise FileNotFoundError(f'Missing input files in {tmp_dir}: {_missing_files}')

### Prep atlas dataset

In [None]:
## Read every atlas batch and clean it (_clean_adata edits in place and returns the object)
adata_atlas_ls = [_clean_adata(sc.read_h5ad(tmp_dir + f)) for f in h5ad_files_atlas]

## Concatenate (the .var columns needed downstream are re-created just below)
adata_atlas = anndata.concat(adata_atlas_ls)

## Make var with gene names
adata_atlas.var['gene_id'] = adata_atlas.var_names.values
# Take gene names from the first dataset that provides a 'feature_name'
# column; the assignment aligns on the var index.
_gene_name_source = next(
    (ad for ad in adata_atlas_ls if 'feature_name' in ad.var.columns), None
)
if _gene_name_source is None:
    raise KeyError("None of the atlas datasets has a 'feature_name' column in .var")
adata_atlas.var['gene_name'] = _gene_name_source.var['feature_name']

## Exclude 3 donors with Smart-seq2 data
adata_atlas = adata_atlas[adata_atlas.obs['assay'] != 'Smart-seq2'].copy()

## Fix dataset naming: keep only the part of the id before "_innate" / "_adaptive"
for _suffix in ("_innate", "_adaptive"):
    adata_atlas.obs['dataset_id'] = adata_atlas.obs['dataset_id'].str.split(_suffix).str[0]

### Prep COVID dataset

In [None]:
## Load the subsampled COVID cells of the query dataset and clean them
## (_clean_adata modifies its argument in place and returns it)
adata_covid = _clean_adata(sc.read_h5ad(tmp_dir + h5ad_files_covid))
adata_covid

AnnData object with n_obs × n_vars = 48083 × 24727
    obs: 'sex', 'tissue', 'ethnicity', 'disease', 'assay', 'assay_ontology_term_id', 'sample_id', 'donor_id', 'dataset_id', 'development_stage', 'cell_type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'n_counts'
    var: 'feature_biotype', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

### Prep control dataset

In [None]:
## Load the subsampled normal (control) cells of the query dataset and clean them
## (_clean_adata modifies its argument in place and returns it)
adata_ctrl = _clean_adata(sc.read_h5ad(tmp_dir + h5ad_files_ctrl))
adata_ctrl

AnnData object with n_obs × n_vars = 14426 × 24727
    obs: 'sex', 'tissue', 'ethnicity', 'disease', 'assay', 'assay_ontology_term_id', 'sample_id', 'donor_id', 'dataset_id', 'development_stage', 'cell_type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'n_counts'
    var: 'feature_biotype', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'

In [None]:
## Save datasets
outdir = '/lustre/scratch117/cellgen/team205/ed6/PBMC_COVID/'
# makedirs also creates missing parent directories, and exist_ok avoids the
# check-then-create race of a separate os.path.exists test.
os.makedirs(outdir, exist_ok=True)

In [None]:
## Store donor ids as string-valued categoricals in all three objects
## (presumably so the columns serialise cleanly to h5ad - TODO confirm)
for _adata in (adata_atlas, adata_covid, adata_ctrl):
    _donor_as_str = _adata.obs['donor_id'].astype("str")
    _adata.obs['donor_id'] = _donor_as_str.astype("category")

## Gene ids of the atlas are stored as a categorical as well
_atlas_gene_ids = adata_atlas.var['gene_id']
adata_atlas.var['gene_id'] = _atlas_gene_ids.astype("category")

In [None]:
## Write atlas, COVID and control objects to outdir, in that order
_outputs = (
    (adata_atlas, 'PBMC_COVID.subsample500cells.atlas.h5ad'),
    (adata_covid, 'PBMC_COVID.subsample500cells.covid.h5ad'),
    (adata_ctrl, 'PBMC_COVID.subsample500cells.ctrl.h5ad'),
)
for _adata, _filename in _outputs:
    _adata.write_h5ad(outdir + _filename)