# Application of design best practices to COVID-19 dataset - prepare dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os,sys
import scanpy as sc 
import pandas as pd
import numpy as np
import milopy
import scipy
import anndata

In [3]:
# check X stores raw counts 
def _check_counts_in_X(adata, n_samples=100):
    """Heuristically check that ``adata.X`` holds raw (whole-number) counts.

    A random sample of the stored values is drawn with replacement and every
    sampled value must be a whole number. The check is therefore cheap on
    large matrices, but probabilistic rather than exhaustive.

    Parameters
    ----------
    adata
        Object exposing the expression matrix as ``adata.X``; either a numpy
        array or an object whose stored values are available as ``X.data``
        (as the original sparse-only implementation assumed).
    n_samples
        Number of values to sample (default 100, the previous fixed value).

    Returns
    -------
    bool
        ``True`` if all sampled values are whole numbers, or if there are no
        stored values to sample from; ``False`` otherwise.
    """
    X = adata.X
    if isinstance(X, np.ndarray):
        # For a dense array ``.data`` is a raw memory buffer, not the values.
        values = X.ravel()
    else:
        values = np.asarray(X.data).ravel()

    # np.random.choice raises on an empty population; nothing to contradict
    # the "raw counts" assumption in that case.
    if values.size == 0:
        return True

    sampled = np.random.choice(values, n_samples)
    return bool(np.all(sampled % 1 == 0))

def _clean_adata(a):
    """Clean one AnnData object in place and return it.

    The cell names are prefixed with the value of ``obs['dataset_id']``, the
    matrix in ``X`` is asserted to look like raw counts, scanpy QC metrics are
    added, and cells are filtered with ``min_counts=1000``.
    """
    # Prefix every cell name with its dataset id so names stay unique
    # once several datasets are put together.
    dataset_prefix = a.obs['dataset_id'].astype('str')
    a.obs_names = dataset_prefix + '-' + a.obs_names.astype("str")

    assert _check_counts_in_X(a)

    sc.pp.calculate_qc_metrics(a, inplace=True)
    sc.pp.filter_cells(a, min_counts=1000)
    return a

Filtered AnnData objects (subsampled to 500 cells per sample and split by condition) are prepared by running the scripts in `src/01_PBMC_data_processing/` and are stored in `/nfs/team205/ed6/data/PBMC_CZI_integration_filtered/tmp/`.

In [4]:
## Input locations: full per-dataset objects and the subsampled, condition-split files
data_dir = '/nfs/team205/ed6/data/PBMC_CZI_integration_filtered/'
tmp_dir = '/nfs/team205/ed6/data/PBMC_CZI_integration_filtered/tmp/'

## Dataset providing the COVID and control cells
query_batch = '10_1038_s41591_021_01329_2'

## Sample-level metadata
normal_sample_obs = pd.read_csv(data_dir + 'PBMC_sample_metadata.normal.csv')
covid_sample_obs = pd.read_csv(data_dir + 'PBMC_sample_metadata.COVID.csv')

## Atlas = every dataset listed in the normal-sample metadata except the query batch
atlas_batch = normal_sample_obs.loc[
    normal_sample_obs.dataset_id != query_batch, 'dataset_id'
].unique()

h5ad_files_covid = f'{query_batch}.COVID.subsample500cells.h5ad'
h5ad_files_ctrl = f'{query_batch}.normal.subsample500cells.h5ad'
h5ad_files_atlas = [f'{x}.normal.subsample500cells.h5ad' for x in atlas_batch]

## All expected input files must already exist
assert all(
    os.path.exists(tmp_dir + x)
    for x in [h5ad_files_covid, h5ad_files_ctrl, *h5ad_files_atlas]
)

### Prep atlas dataset

In [5]:
## Read every atlas dataset, then clean each one in place
atlas_paths = [tmp_dir + fname for fname in h5ad_files_atlas]
adata_atlas_ls = list(map(sc.read_h5ad, atlas_paths))
for atlas_part in adata_atlas_ls:
    _clean_adata(atlas_part)

## Concatenate
adata_atlas = anndata.concat(adata_atlas_ls)

## Make var with gene names
## (names are taken from the first dataset that carries a 'feature_name' column)
adata_atlas.var['gene_id'] = adata_atlas.var_names.values
with_feature_name = [ad for ad in adata_atlas_ls if 'feature_name' in ad.var.columns]
adata_atlas.var['gene_name'] = with_feature_name[0].var['feature_name']

## Exclude 3 donors with Smart-seq2 data
not_smartseq2 = adata_atlas.obs['assay'] != 'Smart-seq2'
adata_atlas = adata_atlas[not_smartseq2].copy()

## Fix dataset naming: keep only the part before the "_innate" / "_adaptive" suffix
for suffix in ("_innate", "_adaptive"):
    adata_atlas.obs['dataset_id'] = [
        parts[0] for parts in adata_atlas.obs['dataset_id'].str.split(suffix)
    ]

### Prep COVID dataset

In [5]:
## Open the full query dataset in backed mode (not fully loaded into memory).
## An explicit read-only mode is used: anndata's read_h5ad maps `backed=True`
## to 'r+' (read/write), which is not needed just to read the source file.
step_adata = sc.read_h5ad(data_dir + f'{query_batch}.h5ad', backed='r')

In [6]:
## Load the COVID-19 cells into memory and rebuild the object from its `.raw` slot
adata_covid = (
    step_adata[step_adata.obs['disease'] == 'COVID-19']
    .to_memory()
    .raw.to_adata()
)
## Tag every cell with the query dataset id, then clean (result is displayed)
adata_covid.obs['dataset_id'] = query_batch
_clean_adata(adata_covid)

AnnData object with n_obs × n_vars = 497092 × 24727
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'initial_clustering', 'Resample', 'Collection_Day', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'author_cell_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'dataset_id', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'n_counts'
    var: 'feature_biotype', 'n_c

### Prep control dataset

In [7]:
## Load the 'normal' (control) cells into memory and rebuild the object from its `.raw` slot
adata_ctrl = (
    step_adata[step_adata.obs['disease'] == 'normal']
    .to_memory()
    .raw.to_adata()
)
## Tag every cell with the query dataset id, then clean (result is displayed)
adata_ctrl.obs['dataset_id'] = query_batch
_clean_adata(adata_ctrl)

AnnData object with n_obs × n_vars = 101953 × 24727
    obs: 'sample_id', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'initial_clustering', 'Resample', 'Collection_Day', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'author_cell_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'dataset_id', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'n_counts'
    var: 'feature_biotype', 'n_c

In [8]:
## Save datasets
outdir = '/lustre/scratch117/cellgen/team205/ed6/PBMC_COVID_full/'
## Create the output directory (and any missing parents); a no-op if it
## already exists, without the race of a separate exists-then-mkdir check.
os.makedirs(outdir, exist_ok=True)

In [13]:
## Disabled atlas equivalents, kept for reference:
# adata_atlas.obs['donor_id'] = adata_atlas.obs['donor_id'].astype("str").astype("category")
# adata_atlas.var['gene_id'] = adata_atlas.var['gene_id'].astype("category")

## Store patient ids as a categorical string column named 'donor_id'
for query_adata in (adata_covid, adata_ctrl):
    patient_ids = query_adata.obs['patient_id'].astype("str")
    query_adata.obs['donor_id'] = patient_ids.astype("category")

In [14]:
## Disabled atlas write, kept for reference:
# adata_atlas.write_h5ad(outdir + 'PBMC_COVID.subsample500cells.atlas.h5ad')

## Write the COVID and control objects to the output directory
objects_to_save = {
    'PBMC_COVID.full.covid.h5ad': adata_covid,
    'PBMC_COVID.full.ctrl.h5ad': adata_ctrl,
}
for out_name, out_adata in objects_to_save.items():
    out_adata.write_h5ad(outdir + out_name)

In [5]:
## Reload the three saved objects from the output directory
outdir = '/lustre/scratch117/cellgen/team205/ed6/PBMC_COVID_full/'
adata_atlas, adata_ctrl, adata_covid = [
    sc.read_h5ad(outdir + saved_name)
    for saved_name in (
        'PBMC_COVID.subsample500cells.atlas.h5ad',
        'PBMC_COVID.full.ctrl.h5ad',
        'PBMC_COVID.full.covid.h5ad',
    )
]

In [6]:
## Display the per-cell metadata table of the atlas
adata_atlas.obs

Unnamed: 0,sex,tissue,ethnicity,disease,assay,assay_ontology_term_id,sample_id,donor_id,dataset_id,development_stage,cell_type,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,n_counts
ye_lupus_normal-TTTCCTCCATGCAACT-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0,female,blood,European,normal,10x 3' v2,EFO:0009899,02404cf5-4210-43b8-b998-797422f7b1a8,6ad107c4-317a-4b5b-b266-868c327af160,ye_lupus_normal,24-year-old human stage,"CD8-positive, alpha-beta T cell",842,6.736967,2442.0,7.800982,49.140049,62.162162,72.563473,85.995086,2442.0
ye_lupus_normal-GCCTCTAAGTAGATGT-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0,female,blood,European,normal,10x 3' v2,EFO:0009899,02404cf5-4210-43b8-b998-797422f7b1a8,6ad107c4-317a-4b5b-b266-868c327af160,ye_lupus_normal,24-year-old human stage,"CD8-positive, alpha-beta T cell",583,6.369901,2319.0,7.749322,60.025873,74.299267,83.484260,96.420871,2319.0
ye_lupus_normal-CGTAGCGCAGACGTAG-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0,female,blood,European,normal,10x 3' v2,EFO:0009899,02404cf5-4210-43b8-b998-797422f7b1a8,6ad107c4-317a-4b5b-b266-868c327af160,ye_lupus_normal,24-year-old human stage,"CD8-positive, alpha-beta T cell",556,6.322565,1788.0,7.489412,52.852349,69.015660,80.089485,96.868009,1788.0
ye_lupus_normal-GTGAAGGTCTGTCTAT-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0,female,blood,European,normal,10x 3' v2,EFO:0009899,02404cf5-4210-43b8-b998-797422f7b1a8,6ad107c4-317a-4b5b-b266-868c327af160,ye_lupus_normal,24-year-old human stage,natural killer cell,614,6.421622,1403.0,7.247081,44.119743,57.020670,70.491803,91.874555,1403.0
ye_lupus_normal-AATCCAGGTCCTCCAT-1-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0,female,blood,European,normal,10x 3' v2,EFO:0009899,02404cf5-4210-43b8-b998-797422f7b1a8,6ad107c4-317a-4b5b-b266-868c327af160,ye_lupus_normal,24-year-old human stage,classical monocyte,969,6.877296,2770.0,7.926963,40.324910,54.440433,67.400722,83.068592,2770.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10_1038_s41586_021_04345_x-S28_CACAAACGTGCCTGCA-1,male,blood,European,normal,10x 5' v1,EFO:0011025,PC9,PC9,10_1038_s41586_021_04345_x,human adult stage,"naive thymus-derived CD8-positive, alpha-beta ...",447,6.104793,1210.0,7.099202,52.314050,67.768595,79.586777,100.000000,1210.0
10_1038_s41586_021_04345_x-S28_AAGCCGCGTTGAGGTG-1,male,blood,European,normal,10x 5' v1,EFO:0011025,PC9,PC9,10_1038_s41586_021_04345_x,human adult stage,conventional dendritic cell,555,6.320768,1285.0,7.159292,43.813230,59.455253,72.373541,95.719844,1285.0
10_1038_s41586_021_04345_x-S27_GATGAAAAGCCCAACC-1,male,blood,European,normal,10x 5' v1,EFO:0011025,PC9,PC9,10_1038_s41586_021_04345_x,human adult stage,native cell,1442,7.274480,4048.0,8.306225,31.645257,44.787549,58.448617,76.284585,4048.0
10_1038_s41586_021_04345_x-S27_CCGGGATTCTGTCCGT-1,male,blood,European,normal,10x 5' v1,EFO:0011025,PC9,PC9,10_1038_s41586_021_04345_x,human adult stage,classical monocyte,566,6.340359,1079.0,6.984716,38.739574,51.529194,66.079703,93.883225,1079.0


In [18]:
## Merge atlas and control cells; join='outer' keeps the union of the variables
adata = anndata.concat([adata_atlas, adata_ctrl], join='outer')

In [19]:
## Sanity check: the merged object has as many cells as the two inputs together
(adata_atlas.n_obs + adata_ctrl.n_obs) == adata.n_obs

True

In [20]:
## Display a summary of the merged object
adata

AnnData object with n_obs × n_vars = 703009 × 25185
    obs: 'sex', 'tissue', 'ethnicity', 'disease', 'assay', 'assay_ontology_term_id', 'sample_id', 'donor_id', 'dataset_id', 'development_stage', 'cell_type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'n_counts', 'n_genes', 'total_counts_mt', 'pct_counts_mt', 'initial_clustering', 'Resample', 'Collection_Day', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'author_cell_type', 'organism'
    obsm: 'X_pca', '

In [15]:
## Display a summary of `adata`
## NOTE(review): this cell's execution count (15) is lower than the surrounding cells,
## so its stored output presumably comes from an earlier run — confirm before relying on it
adata

AnnData object with n_obs × n_vars = 703009 × 16299
    obs: 'sex', 'tissue', 'ethnicity', 'disease', 'assay', 'assay_ontology_term_id', 'sample_id', 'donor_id', 'dataset_id', 'development_stage', 'cell_type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'n_counts'

In [29]:
## Collect the three datasets under short labels
adata_dict = {
    'atlas': adata_atlas,
    'covid': adata_covid,
    'ctrl': adata_ctrl,
}

In [34]:
## When an atlas is present, keep only the variables found in both the atlas and the COVID dataset
if 'atlas' in adata_dict:
    atlas_genes = adata_dict['atlas'].var_names
    covid_genes = adata_dict['covid'].var_names
    keep_vars = np.intersect1d(atlas_genes, covid_genes)
    adata = adata[:, keep_vars].copy()

In [35]:
## Display a summary of `adata` after restricting it to the shared variables
adata

AnnData object with n_obs × n_vars = 703009 × 16299
    obs: 'sex', 'tissue', 'ethnicity', 'disease', 'assay', 'assay_ontology_term_id', 'sample_id', 'donor_id', 'dataset_id', 'development_stage', 'cell_type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'n_counts', 'n_genes', 'total_counts_mt', 'pct_counts_mt', 'initial_clustering', 'Resample', 'Collection_Day', 'Swab_result', 'Status', 'Smoker', 'Status_on_day_collection', 'Status_on_day_collection_summary', 'Days_from_onset', 'Site', 'time_after_LPS', 'Worst_Clinical_Status', 'Outcome', 'patient_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'author_cell_type', 'organism'
    obsm: 'X_pca', '