# Prepare PBMC dataset from cellxgene portal

In this notebook we download and prepare PBMC datasets from published PBMC studies, available from the [cellxgene portal](https://cellxgene.cziscience.com/collections).


In [2]:
pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=8ed3fa3a3818113f5ae077e206e4b8a29ae2163372659b6a8f4d05f0e50b3f64
  Stored in directory: /nfs/users/nfs_e/ed6/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os,sys
import scanpy as sc
import pandas as pd
import numpy as np
import wget
import anndata

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
## r2py setup
import rpy2.rinterface_lib.callbacks
import logging
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# anndata2ri.activate()

In [3]:
%load_ext rpy2.ipython

In [4]:
%%R
library(tidyverse)
library(reshape2)
library(patchwork)

remove_x_axis <- function(){
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.title.x = element_blank())  
}

remove_y_axis <- function(){
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.title.y = element_blank())  
}

[0;1;31mSystem has not been booted with systemd as init system (PID 1). Can't operate.[0m
[0;1;31mFailed to create bus connection: Host is down[0m


In [5]:
data_dir = '/nfs/team205/ed6/data/PBMC_CZI_integration_filtered/'
if not os.path.exists(data_dir):
    os.mkdir(data_dir)

### Download data from studies

In [6]:
metadata_df = pd.read_csv('../../metadata/PBMC_study_metadata.csv')

In [7]:
metadata_df['file_path'] = np.nan
for i in np.arange(metadata_df.shape[0]):
    url = metadata_df['h5ad url'][i]
    file_name = url.split("/")[-1]
    file_path = data_dir + file_name
    if not os.path.exists(file_path):
        print(f"Downloading dataset {i+1}: {file_name}\n")
        os.system(f'wget {url} -P {data_dir}')
    if os.path.exists(file_path):
        metadata_df['file_path'].iloc[i] = file_path

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_df['file_path'].iloc[i] = file_path


In [44]:
metadata_df['file_path'] = np.nan
for i in np.arange(metadata_df.shape[0]):
    url = metadata_df['h5ad url'][i]
    file_name = url.split("/")[-1]
    file_path = data_dir + file_name
    if not os.path.exists(file_path):
        print(f"Downloading dataset {i+1}: {file_name}\n")
        os.system(f'wget {url} -P {data_dir}')
    if os.path.exists(file_path):
        metadata_df['file_path'].iloc[i] = file_path

Downloading dataset 1: 10_1038_s41591_021_01329_2.h5ad



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Downloading dataset 2: ye_lupus_lupus.h5ad

Downloading dataset 3: ye_lupus_normal.h5ad

Downloading dataset 4: 10_1126_scitranslmed_abh2624.h5ad

Downloading dataset 5: 10_1038_s41467_019_12464_3.h5ad

Downloading dataset 6: 10_1101_2021_07_19_452956_blood.h5ad

Downloading dataset 7: 10_1038_s41586_020_2157_4_blood.h5ad

Downloading dataset 8: 10_1016_j_cell_2021_01_053_convalescence.h5ad

Downloading dataset 9: 10_1016_j_cell_2021_01_053_nonconvalescence.h5ad

Downloading dataset 10: 10_1038_s41591_020_0944_y.h5ad

Downloading dataset 11: 10_1126_sciimmunol_abd1554.h5ad

Downloading dataset 12: 10_1016_j_cell_2021_02_018_adaptive.h5ad

Downloading dataset 13: 10_1016_j_cell_2021_02_018_innate.h5ad

Downloading dataset 14: 10_1038_s41586_020_2922_4_10x.h5ad

Downloading dataset 15: 10_1038_s41586_020_2922_4_smartseq2.h5ad

Downloading dataset 16: 10_1016_j_cell_2022_01_012.h5ad

Downloading dataset 17: 10_1016_j_cell_2020_08_001.h5ad

Downloading dataset 18: 10_1126_science_abc6261.h

### Collect sample info

In [157]:
sample_obs_columns = ['sex', 'tissue', 'ethnicity', 'disease', 'assay', 'assay_ontology_term_id','sample_id', 'donor_id', 'dataset_id', 'development_stage']
cell_obs_columns = ['cell_type']
sample_obs_all = pd.DataFrame()
for i in np.arange(metadata_df.shape[0]):
    print(f"Loading dataset {i+1}")
    adata = sc.read_h5ad(metadata_df['file_path'][i], backed='r')
    sample_id_col = metadata_df['sample identifier column'].iloc[i]
    if ' + ' in sample_id_col:
        adata.obs['sample_id'] = adata.obs[sample_id_col.split(" + ")].astype("str").agg('-'.join, axis=1)
    else:
        adata.obs['sample_id'] = adata.obs[sample_id_col].values
    adata.obs['donor_id'] = adata.obs[metadata_df['donor identifier column'].iloc[i]]
    adata.obs['dataset_id'] = metadata_df['Dataset ID'].iloc[i]
    sample_obs = adata.obs[sample_obs_columns].groupby('sample_id').first()
    sample_obs['n_cells'] = adata.obs[sample_obs_columns].groupby('sample_id').size()
    sample_obs.index = sample_obs.index.astype('str')
    sample_obs_all = pd.concat([sample_obs_all, sample_obs])

Loading dataset 1
Loading dataset 2
Loading dataset 3
Loading dataset 4
Loading dataset 5
Loading dataset 6
Loading dataset 7
Loading dataset 8
Loading dataset 9
Loading dataset 10
Loading dataset 11
Loading dataset 12
Loading dataset 13
Loading dataset 14
Loading dataset 15
Loading dataset 16
Loading dataset 17
Loading dataset 18
Loading dataset 19
Loading dataset 20
Loading dataset 21
Loading dataset 22
Loading dataset 23
Loading dataset 24
Loading dataset 25
Loading dataset 26
Loading dataset 27
Loading dataset 28
Loading dataset 29
Loading dataset 30
Loading dataset 31
Loading dataset 32
Loading dataset 33


In [22]:
sample_obs_all.to_csv('../../metadata/PBMC_sample_metadata.csv')
# metadata_df.to_csv('../../metadata/PBMC_study_metadata.csv')

In [5]:
sample_obs_all = pd.read_csv('../../metadata/PBMC_sample_metadata.csv', index_col=0)
metadata_df = pd.read_csv('../../metadata/PBMC_study_metadata.csv')

### Filter data for merging

In [6]:
## Keep 10X genomics data
keep_sample_obs = sample_obs_all[sample_obs_all.assay.str.startswith('10x')]

## Keep samples with at least 500 cells
keep_sample_obs = keep_sample_obs[keep_sample_obs.n_cells > 500]

## Split by disease
normal_sample_obs = keep_sample_obs[keep_sample_obs.disease == 'normal']
covid_sample_obs = keep_sample_obs[keep_sample_obs.disease == 'COVID-19']
lupus_sample_obs = keep_sample_obs[keep_sample_obs.disease == 'systemic lupus erythematosus']


In [32]:
normal_sample_obs.donor_id.unique().shape


(1248,)

In [None]:

normal_sample_obs.to_csv(data_dir + 'PBMC_sample_metadata.normal.csv')
covid_sample_obs.to_csv(data_dir + 'PBMC_sample_metadata.COVID.csv')
lupus_sample_obs.to_csv(data_dir + 'PBMC_sample_metadata.lupus.csv')

In [9]:
normal_sample_obs = pd.read_csv(data_dir + 'PBMC_sample_metadata.normal.csv')

### Make merged objects

Anndata objects are filtered (subsampling to 500 cells per sample) and split by condition running:
```bash
dataset_ids=$(cat /nfs/team205/ed6/data/PBMC_CZI_integration_filtered/PBMC_sample_metadata.normal.csv | tail -n +2 | cut -f 9 -d ','| sort | uniq)
for d in $dataset_ids; do
    python split_PBMC_dataset.py ${d} normal
    done

dataset_ids=$(cat /nfs/team205/ed6/data/PBMC_CZI_integration_filtered/PBMC_sample_metadata.COVID.csv | tail -n +2 | cut -f 9 -d ','| sort | uniq)
for d in $dataset_ids; do
    python split_PBMC_dataset.py ${d} COVID
    done

dataset_ids=$(cat /nfs/team205/ed6/data/PBMC_CZI_integration_filtered/PBMC_sample_metadata.lupus.csv | tail -n +2 | cut -f 9 -d ','| sort | uniq)
for d in $dataset_ids; do
    python split_PBMC_dataset.py ${d} lupus
    done
```

Filtered AnnData objects are stored in `/nfs/team205/ed6/data/PBMC_CZI_integration_filtered/tmp/`

In [128]:
data_dir = '/nfs/team205/ed6/data/PBMC_CZI_integration_filtered/'
tmp_dir = '/nfs/team205/ed6/data/PBMC_CZI_integration_filtered/tmp/'
h5ad_files_normal = [x for x in os.listdir(tmp_dir) if x.endswith('.normal.subsample500cells.h5ad')]

adata_ls = [sc.read_h5ad(tmp_dir + f) for f in h5ad_files_normal]

## Make obs_names unique
for a in adata_ls:
    a.obs_names = a.obs['dataset_id'].astype('str') + '-' + a.obs_names.astype("str")

# check X stores raw counts 
def _check_counts_in_X(adata):
    return(all(np.random.choice(adata.X.data, 100) % 1 == 0))

if not all([_check_counts_in_X(a) for a in adata_ls]):
    raise ValueError("Some matrix is not storing raw counts")

## Filter genes not expressed anywhere
for a in adata_ls:
    sc.pp.filter_genes(a, min_cells=1)

## Concatenate
adata_normal = anndata.concat(adata_ls)

## Make var with gene names
adata_normal.var['gene_id'] = adata_normal.var_names.values
adata_normal.var['gene_name'] = [a for a in adata_ls if 'feature_name' in a.var.columns][0].var['feature_name']

In [142]:
## Save
adata_normal.obs['donor_id'] = adata_normal.obs['donor_id'].astype('category')
adata_normal.write_h5ad(data_dir + 'PBMC_merged.normal.subsample500cells.h5ad')

## Prep supplementary tables

In [98]:
outdir = '/lustre/scratch117/cellgen/team205/ed6/PBMC_CZI_integration_filtered/'
adata_full = sc.read_h5ad(outdir + 'PBMC_merged.normal.subsample500cells.clean_celltypes.h5ad', backed=True)

In [177]:
study_metadata = pd.read_csv('../../metadata/PBMC_study_metadata.csv', index_col=0)
study_metadata = study_metadata[study_metadata['Dataset ID'].isin(sample_metadata.dataset_id)][['Dataset ID', 'DOI', 'assay', 'disease']].drop_duplicates()

sample_metadata = pd.read_csv(data_dir + 'PBMC_sample_metadata.normal.csv', index_col=0)
sample_metadata = sample_metadata.reset_index()
sample_metadata = sample_metadata[sample_metadata.sample_id.isin(adata_full.obs['sample_id'])]

In [178]:
study_metadata.columns = ['dataset_id', 'DOI', 'assay', 'disease']

In [179]:
## Fix dataset naming
study_metadata['dataset_id'] = [x[0] for x in study_metadata['dataset_id'].str.split("_innate")]
study_metadata['dataset_id'] = [x[0] for x in study_metadata['dataset_id'].str.split("_adaptive")]
study_metadata = study_metadata.drop_duplicates()

In [180]:
n_cells = pd.merge(adata_full.obs.reset_index(), study_metadata, on='dataset_id').groupby('DOI').size().reset_index()

In [181]:
n_cells.columns = ['DOI', 'n_cells']

In [182]:
n_samples = pd.merge(sample_metadata, study_metadata, on='dataset_id').groupby('DOI').size().reset_index()
n_samples.columns = ['DOI', 'n_samples']
study_table = study_metadata[['DOI', 'assay', 'disease']].drop_duplicates()
study_table = pd.merge(pd.merge(study_table, n_samples), n_cells).sort_values('n_samples', ascending=False)

In [183]:
study_table.to_csv('/home/jovyan/mount/gdrive/diff2atlas/suppl_table_studies.csv')

In [184]:
pd.merge(sample_metadata, study_metadata[['DOI', 'dataset_id']], on='dataset_id').to_csv('/home/jovyan/mount/gdrive/diff2atlas/suppl_table_samples.csv')