## Data import for Duong_lungMAP_unpubl

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import os

In [2]:
dir_data_in = "../../../data/HLCA_extended/extension_datasets/raw/Duong/"
dir_data_out = "../../../data/HLCA_extended/extension_datasets/ready/"

In [2]:
sc.logging.print_header()

scanpy==1.7.2 anndata==0.7.4 umap==0.5.1 numpy==1.17.4 scipy==1.5.4 pandas==1.0.5 scikit-learn==0.24.1 statsmodels==0.13.0.dev0+87.g52b142c python-igraph==0.8.3 louvain==0.6.1 leidenalg==0.8.3


# Load the data

In [3]:
adata = sc.read(os.path.join(dir_data_in, 'LungHubmap.h5ad'))

In [4]:
adata

AnnData object with n_obs × n_vars = 53904 × 27678
    obs: 'sample.ID', 'nCount_RNA', 'nFeature_RNA', 'subject.ID', 'donor.id.loc', 'age', 'race', 'sex', 'class', 'subclass', 'subclass.l2', 'condition', 'smoking.status'
    var: 'features'

# Check data formats

### Data

In [5]:
adata.X[:10,:10].A

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 5., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

### var

In [6]:
adata.var.head()

Unnamed: 0,features
A1BG,A1BG
A1BG-AS1,A1BG-AS1
A1CF,A1CF
A2M,A2M
A2M-AS1,A2M-AS1


### obs

In [7]:
adata.obs.head()

Unnamed: 0,sample.ID,nCount_RNA,nFeature_RNA,subject.ID,donor.id.loc,age,race,sex,class,subclass,subclass.l2,condition,smoking.status
LAP40_AAACCCAAGATGCTAA,0,3962.0,1859,D239,D239-RML-9B3,37yo,black,M,0,0,8,healthy,h/o mj
LAP40_AAACCCAAGATTGACA,0,5413.0,2587,D239,D239-RML-9B3,37yo,black,M,1,1,14,healthy,h/o mj
LAP40_AAACCCAGTTTCTATC,0,2113.0,1140,D239,D239-RML-9B3,37yo,black,M,2,2,36,healthy,h/o mj
LAP40_AAACCCATCCAACTAG,0,645.0,480,D239,D239-RML-9B3,37yo,black,M,0,3,6,healthy,h/o mj
LAP40_AAACCCATCGCGGTAC,0,7475.0,2958,D239,D239-RML-9B3,37yo,black,M,2,2,36,healthy,h/o mj


In [8]:
pd.crosstab(adata.obs['sample.ID'], adata.obs['donor.id.loc'])

donor.id.loc,D239-RML-12A3,D239-RML-12A4,D239-RML-3A4,D239-RML-7A2,D239-RML-7A3,D239-RML-9B3,D239-RML-9B5
sample.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,0,0,0,0,6016,0
1,0,0,0,0,0,9177,0
2,0,0,3612,0,0,0,0
3,0,0,0,3720,0,0,0
4,0,0,0,0,5290,0,0
5,0,0,0,0,0,6967,0
6,0,0,0,0,0,0,6351
7,6030,0,0,0,0,0,0
8,0,6741,0,0,0,0,0


In [9]:
adata.obs['smoking.status'].value_counts()

h/o mj    53904
Name: smoking.status, dtype: int64

In [10]:
adata.obs['class'].value_counts()
adata.obs['subclass'].value_counts()
adata.obs['subclass.l2'].value_counts()

0    19769
3    13667
2    12704
1     7764
Name: class, dtype: int64

0     10974
5     10859
2      8746
1      5511
4      4766
8      3958
6      1348
7      1172
12     1124
13     1081
16      942
3       936
11      742
10      662
9       541
17      232
14      218
18       48
15       44
Name: subclass, dtype: int64

8     10948
22     7648
11     4645
14     4384
36     3638
24     2758
32     2436
30     1405
15     1127
21     1124
31     1093
20     1081
37      994
1       988
29      980
25      942
6       936
34      749
26      742
10      662
18      612
0       541
33      477
23      453
28      335
35      320
2       301
7       232
5       218
16      213
17      191
19      156
27      145
38      132
12      121
3        59
4        48
13       44
9        26
Name: subclass.l2, dtype: int64

In [11]:
adata.obs['race'].value_counts()

black    53904
Name: race, dtype: int64

### Obs renaming

#### Assign labels

In [12]:
c1 = pd.read_csv(os.path.join(dir_data_in, 'class_forHLCAv1.txt'), sep='\t', header=None, index_col=0)
c2 = pd.read_csv(os.path.join(dir_data_in, 'subclass_forHLCAv1.txt'), sep='\t', header=None, index_col=0)
c3 = pd.read_csv(os.path.join(dir_data_in,'raw/Duong/subclass.l2_forHLCAv1.txt'), sep='\t', header=None, index_col=0)

In [13]:
c1.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
LAP40_AAACCCAAGATGCTAA,epithelial
LAP40_AAACCCAAGATTGACA,mesenchymal
LAP40_AAACCCAGTTTCTATC,immune
LAP40_AAACCCATCCAACTAG,epithelial
LAP40_AAACCCATCGCGGTAC,immune


In [14]:
adata.obs['class_test'] = c1[1]
adata.obs['subclass_test'] = c2[1]

In [15]:
#test the mappings
pd.crosstab(adata.obs['subclass'], adata.obs['subclass_test'])

subclass_test,AT1,AT1.AT2,AT2,Art,BASC,Bas,Cap,Cil,Club,Gob,Lym,Lymphoid,MFB,Myeloid,MyoFB.SM,NE,Peri,Ser,Vein
subclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,0,0,10974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,5511,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,8746,0,0,0,0,0
3,0,0,0,0,0,0,0,0,936,0,0,0,0,0,0,0,0,0,0
4,4766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,10859,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1348,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1172,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,3958,0,0,0,0,0,0,0
9,0,0,0,0,0,541,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
adata.obs['class'] = c1[1]
adata.obs['subclass'] = c2[1]
adata.obs['subclass.l2'] = c3[1]

In [17]:
del adata.obs['class_test']
del adata.obs['subclass_test']

In [18]:
adata.obs['subclass'].value_counts()
adata.obs['subclass.l2'].value_counts()

AT2         10974
Cap         10859
Myeloid      8746
MFB          5511
AT1          4766
Lymphoid     3958
Cil          1348
MyoFB.SM     1172
Art          1124
Peri         1081
Vein          942
Club          936
Lym           742
AT1.AT2       662
Bas           541
BASC          232
Gob           218
Ser            48
NE             44
Name: subclass, dtype: int64

AT2        10948
gCap        7648
AT1         4645
MFB.1       4384
MP.1        3638
aCap        2758
Neu         2436
CD4T        1405
MFB.2       1127
Art         1124
NK          1093
Peri        1081
MP.2         994
Cil.1        988
CD8T         980
Vein         942
Club         936
Mono         749
Lym          742
AT1.AT2      662
MyoFB        612
Bas          541
Mast.Ba      477
Cap.I        453
Plasma       335
DC           320
Cil.2        301
BASC         232
Gob          218
SCMF         213
ASM          191
VSM          156
B            145
Imm.P        132
AT1.D        121
Cil.D         59
Ser           48
NE            44
AT2.P         26
Name: subclass.l2, dtype: int64

#### Rename other obs

In [19]:
# Rename obs categories
adata.obs = adata.obs.rename(columns={'sample.ID':'sample', 'subject.ID':'subject_ID','smoking.status':'smoking_status', 
                                      'race':'ethnicity', 'subclass.l2':'orig_celltype_ann'})

In [20]:
adata.obs['smoking_status'] = 'hist of marijuana use'

In [21]:
adata.obs.sex.value_counts()

M    53904
Name: sex, dtype: int64

In [22]:
adata.obs['sex'] = 'male'

In [23]:
ct_map = {'MFB.1': 'Matrix fibroblast 1',
          'MFB.2': 'Matrix fibroblast 2',
          'MFB': 'Matrix fibroblast',
          'MP.1': 'Macrophage 1',
          'MP.2': 'Macrophage 2',
          'Art': 'Arterial EC',
          'Lym': 'Lymphatic EC',
          'Mast.Ba': 'Mast/Basal',
          'Cap.I': 'Capillary intermediate',
          'Cap': 'Capillary EC',
          'BASC': 'Bronchoalveolar stem cell',
          'SCMF': 'Secondary crest myofib',
          'ASM': 'Arterial smooth muscle',
          'VSM': 'Venous smooth muscle',
          'Imm.P': 'Immune prog',
          'AT1.D': 'AT1 diff',
          'Cil.D': 'Multiciliated diff',
          'Cil.1': 'Multiciliated 1',
          'Cil.2': 'Multiciliated 2',
          'Ser': 'Serous',
          'Neu': 'Neutrophil',
          'Mono': 'Monocyte',
          'Gob': 'Goblet',
          'Bas': 'Basal',
          'Peri': 'Pericyte',
          'Vein': 'Venous EC',
          'MyoFB': 'Myofibroblast',
          'NE': 'Neuroendocrine',
          'AT2.P':  'AT2 prolif'}

adata.obs['orig_celltype_ann'] = [ct_map[ct] if ct in ct_map else ct for ct in adata.obs['orig_celltype_ann']]
adata.obs['subclass'] = [ct_map[ct] if ct in ct_map else ct for ct in adata.obs['subclass']]

adata.obs['orig_celltype_ann'].value_counts()
adata.obs['subclass'].value_counts()

AT2                          10948
gCap                          7648
AT1                           4645
Matrix fibroblast 1           4384
Macrophage 1                  3638
aCap                          2758
Neutrophil                    2436
CD4T                          1405
Matrix fibroblast 2           1127
Arterial EC                   1124
NK                            1093
Pericyte                      1081
Macrophage 2                   994
Multiciliated 1                988
CD8T                           980
Venous EC                      942
Club                           936
Monocyte                       749
Lymphatic EC                   742
AT1.AT2                        662
Myofibroblast                  612
Basal                          541
Mast/Basal                     477
Capillary intermediate         453
Plasma                         335
DC                             320
Multiciliated 2                301
Bronchoalveolar stem cell      232
Goblet              

AT2                          10974
Capillary EC                 10859
Myeloid                       8746
Matrix fibroblast             5511
AT1                           4766
Lymphoid                      3958
Cil                           1348
MyoFB.SM                      1172
Arterial EC                   1124
Pericyte                      1081
Venous EC                      942
Club                           936
Lymphatic EC                   742
AT1.AT2                        662
Basal                          541
Bronchoalveolar stem cell      232
Goblet                         218
Serous                          48
Neuroendocrine                  44
Name: subclass, dtype: int64

In [24]:
adata

AnnData object with n_obs × n_vars = 53904 × 27678
    obs: 'sample', 'nCount_RNA', 'nFeature_RNA', 'subject_ID', 'donor.id.loc', 'age', 'ethnicity', 'sex', 'class', 'subclass', 'orig_celltype_ann', 'condition', 'smoking_status'
    var: 'features'

In [25]:
adata.obs = adata.obs.drop(columns=['nCount_RNA', 'nFeature_RNA'])

In [26]:
adata.obs.head()

Unnamed: 0,sample,subject_ID,donor.id.loc,age,ethnicity,sex,class,subclass,orig_celltype_ann,condition,smoking_status
LAP40_AAACCCAAGATGCTAA,0,D239,D239-RML-9B3,37yo,black,male,epithelial,AT2,AT2,healthy,hist of marijuana use
LAP40_AAACCCAAGATTGACA,0,D239,D239-RML-9B3,37yo,black,male,mesenchymal,Matrix fibroblast,Matrix fibroblast 1,healthy,hist of marijuana use
LAP40_AAACCCAGTTTCTATC,0,D239,D239-RML-9B3,37yo,black,male,immune,Myeloid,Macrophage 1,healthy,hist of marijuana use
LAP40_AAACCCATCCAACTAG,0,D239,D239-RML-9B3,37yo,black,male,epithelial,Club,Club,healthy,hist of marijuana use
LAP40_AAACCCATCGCGGTAC,0,D239,D239-RML-9B3,37yo,black,male,immune,Myeloid,Macrophage 1,healthy,hist of marijuana use


- what is donor.id.loc? -> a location of sampling

In [27]:
adata.obs['donor.id.loc'].value_counts()

D239-RML-9B3     22160
D239-RML-12A4     6741
D239-RML-9B5      6351
D239-RML-12A3     6030
D239-RML-7A3      5290
D239-RML-7A2      3720
D239-RML-3A4      3612
Name: donor.id.loc, dtype: int64

Sample to location mapping:
- Mid: 9B3 -> anatomical location YY (Lobar bronchi with surrounding parenchyma)
- Proximal: 7A2, 7A3 -> anatomical location XX (segmental bronchi)
- Peripheral: 9B5, 3A4, 12A3, 12A4 -> anatomical location 0.97 (parenchyma)

In [28]:
# Assign some airway locations
adata.obs['anatomical_region_level_1'] = ['airway' if samp in ['D239-RML-9B3', 'D239-RML-7A3', 'D239-RML-7A2'] 
                                          else 'parenchyma' for samp in adata.obs['donor.id.loc']]

In [29]:
adata.obs['anatomical_region_coarse'] = ['airway' if samp in ['D239-RML-9B3'] else 'segmental_bronchi' if samp in ['D239-RML-7A3', 'D239-RML-7A2']
                                          else 'parenchyma' for samp in adata.obs['donor.id.loc']]

In [30]:
adata.obs['anatomical_region_fine'] = ['lobar_bronchi' if samp in ['D239-RML-9B3'] else 'segmental_bronchi' if samp in ['D239-RML-7A3', 'D239-RML-7A2']
                                          else 'parenchyma' for samp in adata.obs['donor.id.loc']]

In [31]:
adata.obs['anatomical_region_level_1'].value_counts()
adata.obs['anatomical_region_coarse'].value_counts()
adata.obs['anatomical_region_fine'].value_counts()

airway        31170
parenchyma    22734
Name: anatomical_region_level_1, dtype: int64

parenchyma           22734
airway               22160
segmental_bronchi     9010
Name: anatomical_region_coarse, dtype: int64

parenchyma           22734
lobar_bronchi        22160
segmental_bronchi     9010
Name: anatomical_region_fine, dtype: int64

In [32]:
# Subject to change in post-processing
#adata.obs['anatomical_region_ccf_score'] = [0.72 if samp in ['D239-RML-9B3'] else 0.5 if samp in ['D239-RML-7A3', 'D239-RML-7A2']
#                                          else 0.97 for samp in adata.obs['donor.id.loc']]

In [33]:
adata

AnnData object with n_obs × n_vars = 53904 × 27678
    obs: 'sample', 'subject_ID', 'donor.id.loc', 'age', 'ethnicity', 'sex', 'class', 'subclass', 'orig_celltype_ann', 'condition', 'smoking_status', 'anatomical_region_level_1', 'anatomical_region_coarse', 'anatomical_region_fine'
    var: 'features'

# Gene selection

In [34]:
gene_set = pd.read_csv('genes_for_mapping.csv')

In [35]:
cd ../scripts/

/mnt/znas/icb_zstore01/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/scripts


In [36]:
import preprocessing as pp


In [37]:
cd ../query_datasets/

/mnt/znas/icb_zstore01/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/query_datasets


In [38]:
adata_sub = pp.subset_and_pad_adata(gene_set, adata)

not all genes were recovered, filling in 0 counts for 181 missing genes...




# Writing

In [39]:
del adata.raw

In [40]:
adata.write(os.path.join(dir_data_out, 'full/duong.h5ad'))
adata_sub.write(os.path.join(dir_data_out, 'subsetted/duong_sub.h5ad'))

... storing 'subject_ID' as categorical
... storing 'donor.id.loc' as categorical
... storing 'age' as categorical
... storing 'ethnicity' as categorical
... storing 'sex' as categorical
... storing 'class' as categorical
... storing 'subclass' as categorical
... storing 'orig_celltype_ann' as categorical
... storing 'condition' as categorical
... storing 'smoking_status' as categorical
... storing 'anatomical_region_level_1' as categorical
... storing 'anatomical_region_coarse' as categorical
... storing 'anatomical_region_fine' as categorical
... storing 'subject_ID' as categorical
... storing 'donor.id.loc' as categorical
... storing 'age' as categorical
... storing 'ethnicity' as categorical
... storing 'sex' as categorical
... storing 'class' as categorical
... storing 'subclass' as categorical
... storing 'orig_celltype_ann' as categorical
... storing 'condition' as categorical
... storing 'smoking_status' as categorical
... storing 'anatomical_region_level_1' as categorical
... 

In [41]:
adata

AnnData object with n_obs × n_vars = 53904 × 27678
    obs: 'sample', 'subject_ID', 'donor.id.loc', 'age', 'ethnicity', 'sex', 'class', 'subclass', 'orig_celltype_ann', 'condition', 'smoking_status', 'anatomical_region_level_1', 'anatomical_region_coarse', 'anatomical_region_fine'
    var: 'features'