### Notebook to preprocess the data from Xu_2020/Guo

### Import required modules

In [1]:
import anndata
import numpy as np
import pandas as pd
import scanpy as sc

- Set up working environment

In [2]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 200, color_map = 'RdPu', dpi_save = 180, vector_friendly = True, format = 'svg')



-----
anndata     0.7.5
scanpy      1.7.2
sinfo       0.3.1
-----
PIL                 8.1.0
anndata             0.7.5
anyio               NA
attr                20.3.0
babel               2.9.0
backcall            0.2.0
brotli              NA
cached_property     1.5.2
cairo               1.20.0
certifi             2021.05.30
cffi                1.14.4
chardet             4.0.0
cloudpickle         1.6.0
colorama            0.4.4
cycler              0.10.0
cython_runtime      NA
dask                2.30.0
dateutil            2.8.1
decorator           4.4.2
future_fstrings     NA
get_version         2.1
google              NA
h5py                3.1.0
idna                2.10
igraph              0.8.2
ipykernel           5.4.2
ipython_genutils    0.2.0
jedi                0.18.0
jinja2              2.11.2
joblib              0.16.0
json5               NA
jsonschema          3.2.0
jupyter_server      1.1.3
jupyterlab_server   2.0.0
kiwisolver          1.3.1
legacy_api_wrap     1.2
leidenal

### Read in dataset

In [3]:
# adata_1 = sc.read_10x_mtx('./GSE135851_RAW/LAM1/', cache = True)
adata_1 = sc.read_10x_mtx('./LAM1/', cache = True)
adata_1.obs['subject_ID'] = 'SAMN12574693'
adata_1.obs['protocol'] = 'scRNA-Seq'
adata_1.obs['age'] = 72
adata_1

... writing an h5ad cache file to speedup reading next time


AnnData object with n_obs × n_vars = 737280 × 32738
    obs: 'subject_ID', 'protocol', 'age'
    var: 'gene_ids'

In [4]:
adata_2 = sc.read_10x_mtx('./LAM2/', cache = True)
adata_2.obs['subject_ID'] = 'SAMN12574700'
adata_2.obs['protocol'] = 'scRNA-Seq'
adata_2.obs['age'] = 65
adata_2

... writing an h5ad cache file to speedup reading next time


AnnData object with n_obs × n_vars = 737280 × 32738
    obs: 'subject_ID', 'protocol', 'age'
    var: 'gene_ids'

In [5]:
adata_3 = sc.read_10x_mtx('./LAM3/', cache = True)
adata_3.obs['subject_ID'] = 'SAMN12574699'
adata_3.obs['protocol'] = 'scRNA-Seq'
adata_3.obs['age'] = 50
adata_3

... writing an h5ad cache file to speedup reading next time


AnnData object with n_obs × n_vars = 737280 × 32738
    obs: 'subject_ID', 'protocol', 'age'
    var: 'gene_ids'

In [6]:
adata_4 = sc.read_10x_mtx('./LAM4/', cache = True)
adata_4.obs['subject_ID'] = 'SAMN12574698'
adata_4.obs['protocol'] = 'snRNA-Seq'
adata_4.obs['age'] = 52
adata_4

... writing an h5ad cache file to speedup reading next time
  + str(example_colliding_values)


AnnData object with n_obs × n_vars = 737280 × 63677
    obs: 'subject_ID', 'protocol', 'age'
    var: 'gene_ids'

### Merge samples

In [7]:
GSE135851 = adata_1.concatenate(adata_2, adata_3, adata_4, batch_key = 'sample', batch_categories = ['LAM1', 'LAM2', 'LAM3', 'LAM4'], join = 'inner')
GSE135851.obs['genome'] = 'hg19'
GSE135851

AnnData object with n_obs × n_vars = 2949120 × 32738
    obs: 'subject_ID', 'protocol', 'age', 'sample', 'genome'
    var: 'gene_ids-LAM1', 'gene_ids-LAM2', 'gene_ids-LAM3', 'gene_ids-LAM4'

### Is `adata.X` raw?

In [8]:
ad = GSE135851
GSE135851.X[:5,:5].todense()

matrix([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], dtype=float32)

In [9]:
ad.shape

(2949120, 32738)

In [10]:
import pandas as pd
pd.Series(GSE135851.X[:,1:100].A.flatten()).value_counts()

0.0      291791180
1.0         121806
2.0          24538
3.0          11027
4.0           5839
           ...    
185.0            1
71.0             1
125.0            1
152.0            1
94.0             1
Length: 94, dtype: int64

**It looks like to be integers**

### Add basic labels

In [11]:
ad.obs['dataset'] = 'GSE135851'
ad.obs['study'] = 'Guo2020'
ad.obs['original_celltype_ann'] = 'N/A'
ad.obs['condition'] = 'disease'
ad

AnnData object with n_obs × n_vars = 2949120 × 32738
    obs: 'subject_ID', 'protocol', 'age', 'sample', 'genome', 'dataset', 'study', 'original_celltype_ann', 'condition'
    var: 'gene_ids-LAM1', 'gene_ids-LAM2', 'gene_ids-LAM3', 'gene_ids-LAM4'

In [12]:
ad.obs.head()

Unnamed: 0,subject_ID,protocol,age,sample,genome,dataset,study,original_celltype_ann,condition
AAACCTGAGAAACCAT-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,,disease
AAACCTGAGAAACCGC-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,,disease
AAACCTGAGAAACCTA-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,,disease
AAACCTGAGAAACGAG-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,,disease
AAACCTGAGAAACGCC-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,,disease


In [13]:
ad.var.head()

Unnamed: 0,gene_ids-LAM1,gene_ids-LAM2,gene_ids-LAM3,gene_ids-LAM4
MIR1302-10,ENSG00000243485,ENSG00000243485,ENSG00000243485,ENSG00000243485
FAM138A,ENSG00000237613,ENSG00000237613,ENSG00000237613,ENSG00000237613
OR4F5,ENSG00000186092,ENSG00000186092,ENSG00000186092,ENSG00000186092
RP11-34P13.7,ENSG00000238009,ENSG00000238009,ENSG00000238009,ENSG00000238009
RP11-34P13.8,ENSG00000239945,ENSG00000239945,ENSG00000239945,ENSG00000239945


In [14]:
ad.shape

(2949120, 32738)

In [15]:
ad.write('GSE135851_preprocessed.raw.ctl210706.h5ad', compression='lzf')

... storing 'subject_ID' as categorical
... storing 'protocol' as categorical
... storing 'genome' as categorical
... storing 'dataset' as categorical
... storing 'study' as categorical
... storing 'original_celltype_ann' as categorical
... storing 'condition' as categorical


### Subset datasets to Sikkema's HVGs 

In [16]:
def subset_and_pad_adata(adata, gene_set):
    """
    This function uses a gene list provided as a Pandas dataframe with gene symbols and
    Ensembl IDs and subsets a larger Anndata object to only the genes in this list. If
    Not all genes are found in the AnnData object, then zero-padding is performed.
    """
    # Example inputs:
    # genes_filename = '/storage/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/genes_for_mapping.csv'
    # data_filename = '/storage/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/ready/adams.h5ad'
    # gene_set = pd.read_csv(genes_filename)
    # adata = sc.read(data_filename)

    # Prep objects
    if 'gene_symbols' in gene_set.columns:
        gene_set.index = gene_set['gene_symbols']

    else:
        raise ValueError('The input gene list was not of the expected type!\n'
                         'Gene symbols and ensembl IDs are expected in column names:\n'
                         '\t`gene_symbols` and `Unnamed: 0`')

    # Subset adata object
    common_genes = [gene for gene in gene_set['gene_symbols'].values if gene in adata.var_names]
    if len(common_genes) == 0:
        print("WARNING: YOU SHOULD PROBABLY SWITCH YOUR ADATA.VAR INDEX COLUMN TO GENE NAMES"
                  " RATHER THAN IDS! No genes were recovered.")
        return

    adata_sub = adata[:,common_genes].copy()

    # Pad object with 0 genes if needed
    if len(common_genes) < len(gene_set):
        diff = len(gene_set) - len(common_genes)
        print(f'not all genes were recovered, filling in 0 counts for {diff} missing genes...')
        
        # Genes to pad with
        genes_to_add = set(gene_set['gene_symbols'].values).difference(set(adata_sub.var_names))
        new_var = gene_set.loc[genes_to_add]
        
        if 'Unnamed: 0' in new_var.columns:
            # Assumes the unnamed column are ensembl values
            new_var['ensembl'] = new_var['Unnamed: 0']
            del new_var['Unnamed: 0']

        df_padding = pd.DataFrame(data=np.zeros((adata_sub.shape[0],len(genes_to_add))), index=adata_sub.obs_names, columns=new_var.index)
        adata_padding = sc.AnnData(df_padding, var=new_var)
        
        # Concatenate object
        #adata_sub = concat([adata_sub, adata_padding], axis=1, join='outer', index_unique=None, merge='unique')
        adata_sub2 = adata_sub.concatenate(adata_padding, batch_key = 'process', batch_categories = ['sub', 'padded'], join = 'outer')

    # Ensure ensembl IDs are available
    adata_sub2.var['ensembl'] = gene_set['Unnamed: 0']

    return adata_sub2

In [17]:
HVG = pd.read_csv('genes_for_mapping.csv')
HVG.head()

Unnamed: 0.1,Unnamed: 0,gene_symbols
0,ENSG00000000938,FGR
1,ENSG00000000971,CFH
2,ENSG00000002587,HS3ST1
3,ENSG00000002933,TMEM176A
4,ENSG00000003436,TFPI


### Include the cell type information.

In [18]:
df = pd.read_excel('LAM_scRNAseq_cellmeta.xlsx')

In [19]:
df.shape

(12374, 7)

In [20]:
ad.obs

Unnamed: 0,subject_ID,protocol,age,sample,genome,dataset,study,original_celltype_ann,condition
AAACCTGAGAAACCAT-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,,disease
AAACCTGAGAAACCGC-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,,disease
AAACCTGAGAAACCTA-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,,disease
AAACCTGAGAAACGAG-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,,disease
AAACCTGAGAAACGCC-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,,disease
...,...,...,...,...,...,...,...,...,...
TTTGTCATCTTTACAC-1-LAM4,SAMN12574698,snRNA-Seq,52,LAM4,hg19,GSE135851,Guo2020,,disease
TTTGTCATCTTTACGT-1-LAM4,SAMN12574698,snRNA-Seq,52,LAM4,hg19,GSE135851,Guo2020,,disease
TTTGTCATCTTTAGGG-1-LAM4,SAMN12574698,snRNA-Seq,52,LAM4,hg19,GSE135851,Guo2020,,disease
TTTGTCATCTTTAGTC-1-LAM4,SAMN12574698,snRNA-Seq,52,LAM4,hg19,GSE135851,Guo2020,,disease


In [21]:
df['code'] = df['barcode'] + '-1-' + df['donor']
celltype_by_code = df.set_index('code')['celltype'].to_dict()

In [22]:
ad.obs.original_celltype_ann.value_counts()

N/A    2949120
Name: original_celltype_ann, dtype: int64

In [23]:
ad.obs.original_celltype_ann = ad.obs.index.map(celltype_by_code)
ad.obs.original_celltype_ann.value_counts().sum()

12374

In [24]:
ad = ad[~pd.isnull(ad.obs['original_celltype_ann']),:]
ad.shape

  res = method(*args, **kwargs)


(12374, 32738)

In [25]:
ad.obs['sample'].value_counts()

LAM1    6530
LAM3    5844
Name: sample, dtype: int64

In [26]:
for c in ad.var:
    del ad.var[c]

In [None]:
ad.obs['dataset'] = 'Guo2020_LAM1/3'

In [27]:
ad_sub = subset_and_pad_adata(ad, HVG)
# del(ad.obs['process'])

ad_sub = ad_sub[ad_sub.obs.index.str.endswith('-sub'),:]
del ad_sub.obs['process']
for c in ad_sub.var:
    del ad_sub.var[c]

ad.shape, ad_sub.shape

not all genes were recovered, filling in 0 counts for 172 missing genes...


((12374, 32738), (12374, 2000))

In [28]:
ad.obs.condition.value_counts()

disease    12374
Name: condition, dtype: int64

In [29]:
ad_sub.write('GSE135851_preprocessed-HVGs.raw.ctl210706.h5ad', compression='lzf')

Trying to set attribute `.obs` of view, copying.
... storing 'subject_ID' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'protocol' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'sample' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'genome' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'dataset' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'study' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'original_celltype_ann' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'condition' as categorical
Trying to set attribute `.var` of view, copying.
... storing 'gene_symbols-padded' as categorical
Trying to set attribute `.var` of view, copying.
... storing 'ensembl-padded' as categorical
Trying to set attribute `.var` of view, copying.
... storing 'gene_ids-LAM1-sub' as categorical
Trying to

In [33]:
ad.obs

Unnamed: 0,subject_ID,protocol,age,sample,genome,dataset,study,original_celltype_ann,condition
AAACCTGAGCTGAAAT-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,AT2,disease
AAACCTGAGGCTCTTA-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,Macrophage,disease
AAACCTGAGTCGTACT-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,Macrophage,disease
AAACCTGCACCCTATC-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,Macrophage,disease
AAACCTGCAGTTAACC-1-LAM1,SAMN12574693,scRNA-Seq,72,LAM1,hg19,GSE135851,Guo2020,Macrophage,disease
...,...,...,...,...,...,...,...,...,...
TTTGTCATCATCTGTT-1-LAM3,SAMN12574699,scRNA-Seq,50,LAM3,hg19,GSE135851,Guo2020,Monocyte,disease
TTTGTCATCCGCATCT-1-LAM3,SAMN12574699,scRNA-Seq,50,LAM3,hg19,GSE135851,Guo2020,AT2,disease
TTTGTCATCCTCCTAG-1-LAM3,SAMN12574699,scRNA-Seq,50,LAM3,hg19,GSE135851,Guo2020,NK,disease
TTTGTCATCGCAAACT-1-LAM3,SAMN12574699,scRNA-Seq,50,LAM3,hg19,GSE135851,Guo2020,Macrophage,disease


In [30]:
# Clean adata.var
full_path = '../../../data/HLCA_extended/extension_datasets/ready/full/guo.h5ad'
ad.write(full_path, compression='lzf')


Trying to set attribute `.obs` of view, copying.
... storing 'original_celltype_ann' as categorical


In [31]:
subsetted_path = '../../../data/HLCA_extended/extension_datasets/ready/subsetted/guo_sub.h5ad'
ad_sub.write(subsetted_path, compression='lzf')

### Modify access to be checked

In [32]:
!chmod 777 $full_path
!chmod 777 $subsetted_path