# Data import for Lafyatis_2019/valenzi dataset:

In [1]:
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
annotation = pd.read_csv('Valenzi-annotation-export.csv', index_col=[0])

In [3]:
data_ids = [
    'GSM3666096_SC45NOR', #C1
    'GSM3666097_SC56NOR', #C2
    'GSM3666098_SC59NOR', #C3
    'GSM3666100_SC156NORUP', #C4
    'GSM3666099_SC155NORLOW', #C5
    'GSM3909673_SC277raw_feature_bc_matrix', #C6
    'GSM3666102_SC52SSCUP', #ssc1 
    'GSM3666101_SC51SSCLOW', #ssc2
    'GSM3666104_SC64SSCUP', #ssc3
    'GSM3666103_SC63SSCLOW', #ssc4
    'GSM3666106_SC109SSCUP', #ssc5
    'GSM3666105_SC108SSCLOW', #ssc6
    'GSM3666108_SC136SSCUP', #ssc7
    'GSM3666107_SC135SSCLOW', #ssc8
    'GSM3909674_SC281raw_feature_bc_matrix', #ssc9/10
    'GSM3909675_SC284raw_feature_bc_matrix', #ssc11/12
]

annot_id = [
    'Valenzi_C1',
    'Valenzi_C2',
    'Valenzi_C3',
    'Valenzi_C4',
    'Valenzi_C5',
    'Valenzi_C6',
    'Valenzi_SSc1',
    'Valenzi_SSc2',
    'Valenzi_SSc3',
    'Valenzi_SSc4',
    'Valenzi_SSc5',
    'Valenzi_SSc6',
    'Valenzi_SSc7',
    'Valenzi_SSc8',
    'Valenzi_SSc_9',
    'Valenzi_SSc_10',
    'Valenzi_SSc_11',
    'Valenzi_SSc_12',
]

In [4]:
adatas = []
i = 0
for data_id in data_ids:
    print(data_id, i)
    if 'raw' not in data_id:
        adata_tmp = sc.read_mtx(f'data/{data_id}_matrix.mtx').transpose()
        barcodes_tmp = pd.read_csv(f'data/{data_id}_barcodes.tsv', header=None, sep='\t')
        genes_tmp = pd.read_csv(f'data/{data_id}_genes.tsv', header=None, sep='\t')

        #Annotate data
        barcodes_tmp.rename(columns={0:'barcode'}, inplace=True)
        barcodes_tmp.set_index('barcode', inplace=True)
        adata_tmp.obs = barcodes_tmp
        adata_tmp.obs_names = [
            '_'.join([annot_id[i], adata_tmp.obs_names[j]]) 
            for j in range(len(adata_tmp))
        ]
        adata_tmp.obs['sample'] = [data_id.split("_")[1]]*adata_tmp.n_obs

        genes_tmp.rename(columns={0:'gene_id', 1:'gene_symbol'}, inplace=True)
        genes_tmp.set_index('gene_symbol', inplace=True)
        adata_tmp.var = genes_tmp
        adata_tmp.var_names_make_unique()
        adatas.append(adata_tmp)
        i += 1
    else:
        adata_tmp = sc.read_10x_h5(f'data/{data_id}.h5')
        adata_tmp.var_names_make_unique()
        if '281' not in data_id and '284' not in data_id:
            adata_tmp.obs_names = [
                    '_'.join([annot_id[i], adata_tmp.obs_names[j]]) 
                    for j in range(len(adata_tmp))
                ]
            adata_tmp.obs['sample'] = [data_id.split("_")[1]]*adata_tmp.n_obs
            adatas.append(adata_tmp)
            i += 1
        elif '281' in data_id:
            annot_sc281 = pd.read_csv('SC281.csv', index_col=[0])
            adata_tmp.obs = annot_sc281.reindex(adata_tmp.obs_names)
            #adata_tmp = adata_tmp.var_names_make_unique()
            adata_tmp = adata_tmp[~np.any(adata_tmp.obs.isna(), axis=1)]
            adata_tmp = adata_tmp[~adata_tmp.obs['x'].isin(['Negative', 'Doublet'])]
            adata_tmp_hto3 = adata_tmp[adata_tmp.obs['x'].str.contains('HTO3')]
            adata_tmp_hto4 = adata_tmp[adata_tmp.obs['x'].str.contains('HTO4')]
            adata_tmp_hto3.obs_names = [
                '_'.join([annot_id[i], adata_tmp_hto3.obs_names[j]]) 
                for j in range(len(adata_tmp_hto3))
            ]
            adata_tmp_hto4.obs_names = [
                '_'.join([annot_id[i+1], adata_tmp_hto4.obs_names[j]]) 
                for j in range(len(adata_tmp_hto4))
            ]
            adata_tmp_hto3.obs['sample'] = [data_id.split("_")[1] + '_HTO3']*adata_tmp_hto3.n_obs
            adata_tmp_hto4.obs['sample'] = [data_id.split("_")[1] + '_HTO4']*adata_tmp_hto4.n_obs
            adatas.append(adata_tmp_hto3)
            adatas.append(adata_tmp_hto4)
            i += 2
        elif '284' in data_id:
            annot_sc284 = pd.read_csv('SC284.csv', index_col=[0])
            adata_tmp.obs = annot_sc284.reindex(adata_tmp.obs_names)
            #adata_tmp = adata_tmp.var_names_make_unique()
            adata_tmp = adata_tmp[~np.any(adata_tmp.obs.isna(), axis=1)]
            adata_tmp = adata_tmp[~adata_tmp.obs['x'].isin(['Negative', 'Doublet'])]
            adata_tmp_hto5 = adata_tmp[adata_tmp.obs['x'].str.contains('HTO5')]
            adata_tmp_hto6 = adata_tmp[adata_tmp.obs['x'].str.contains('HTO5')]
            adata_tmp_hto5.obs_names = [
                '_'.join([annot_id[i], adata_tmp_hto5.obs_names[j]]) 
                for j in range(len(adata_tmp_hto5))
            ]
            adata_tmp_hto6.obs_names = [
                '_'.join([annot_id[i+1], adata_tmp_hto6.obs_names[j]]) 
                for j in range(len(adata_tmp_hto6))
            ]
            adata_tmp_hto5.obs['sample'] = [data_id.split("_")[1] + '_HTO5']*adata_tmp_hto5.n_obs
            adata_tmp_hto6.obs['sample'] = [data_id.split("_")[1] + '_HTO6']*adata_tmp_hto6.n_obs
            adatas.append(adata_tmp_hto5)
            adatas.append(adata_tmp_hto6)
            i += 2

GSM3666096_SC45NOR 0
GSM3666097_SC56NOR 1
GSM3666098_SC59NOR 2
GSM3666100_SC156NORUP 3
GSM3666099_SC155NORLOW 4
GSM3909673_SC277raw_feature_bc_matrix 5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GSM3666102_SC52SSCUP 6
GSM3666101_SC51SSCLOW 7
GSM3666104_SC64SSCUP 8
GSM3666103_SC63SSCLOW 9
GSM3666106_SC109SSCUP 10
GSM3666105_SC108SSCLOW 11
GSM3666108_SC136SSCUP 12
GSM3666107_SC135SSCLOW 13
GSM3909674_SC281raw_feature_bc_matrix 14


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GSM3909675_SC284raw_feature_bc_matrix 16


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [5]:
adata = ad.concat(adatas)

In [6]:
adata.obs_names = [x[:-2] for x in adata.obs_names]

In [7]:
adata = adata[adata.obs_names.isin(annotation.index)]
adata.obs = annotation.reindex(adata.obs_names)

In [8]:
adata.obs['original_celltype_ann'] = adata.obs['cell_type_3'].map(
    {
        'AT1' : 'AT1',
        'AT1,AT2' : 'Alveolar epithelium',
        'AT1,AT2,HHIP+ AT2' : 'Alveolar epithelium',
        'AT2' : 'AT2',
        'AT2,Basal' : 'REMOVE',
        'AT2,Club' : 'REMOVE',
        'AT2,HHIP+ AT2' : 'AT2',
        'AT2,Secr' : 'REMOVE',
        'Aerocyte' : 'Aerocyte',
        'Aerocyte,Artery' : 'Arterial aerocyte',
        'Aerocyte,Vein' : 'Venous aerocyte',
        'Artery' : 'Arterial EC',
        'B cell lineage' : 'B cell',
        'B cell lineage,DC,Mac' : 'REMOVE',
        'Basal' : 'Basal',
        'Basal,Secr' : 'Airway epithelium',
        'Ciliated' : 'Ciliated',
        'Ciliated,Low quality' : 'REMOVE',
        'Ciliated,Low quality,Mac' : 'REMOVE',
        'Cycling cells' : 'Cycling cell',
        'DASC' : 'Disease-associated stromal cell',
        'DC' : 'DC',
        'DC,Gnlc,Mac' : 'REMOVE',
        'DC,Mac' : 'Myeloid',
        'Doublets' : 'REMOVE',
        'Endothelial' : 'Endothelial cell',
        'Endothelial,Lymphatics' : 'Lymphatic EC',
        'Endothelial,Lymphatics,Stromal' : 'REMOVE',
        'Erythrocytes' : 'Erythrocytes',
        'Fibroblast' : 'Fibroblast',
        'Fibroblast,SMC' : 'Stromal cell',
        'Gnlc' : 'Granulocytes',
        'HHIP+ AT2' : 'HHIP+ AT2',
        'Immune' : 'REMOVE',
        'Low quality' : 'REMOVE',
        'Low quality,Mac' : 'REMOVE',
        'Low quality,T cells' : 'REMOVE',
        'Lymphatics' : 'Lymphatics',
        'Mac' : 'Macrophage',
        'Mac,Mo' : 'Myeloid',
        'Mast' : 'Mast',
        'Mo' : 'Monocyte',
        'MyoF' : 'Myofibroblast',
        'NK cells' : 'NK cell',
        'Pericyte' : 'Pericyte',
        'SMC' : 'Smooth muscle cell',
        'Secr' : 'Secretory cell',
        'Stromal' : 'Stromal cell',
        'T cells' : 'T cell',
        'Vein' : 'Venous endothelial cell',
    }
)

In [9]:
adata.obs.groupby('cell_type_2')['original_celltype_ann'].value_counts()

cell_type_2                     original_celltype_ann          
AT1                             AT1                                  711
AT1,AT2                         Alveolar epithelium                 1500
AT2                             AT2                                 1902
                                HHIP+ AT2                            317
AT2,Basal                       REMOVE                               184
AT2,Club                        REMOVE                               343
AT2,Secr                        REMOVE                                83
Basal                           Basal                                232
Basal,Secr                      Airway epithelium                    474
Ciliated                        Ciliated                            3149
Ciliated,Low quality            REMOVE                               101
Ciliated,Low quality,Myeloid    REMOVE                                70
Cycling cells                   Cycling cell                

In [10]:
adata.obs['study'] = 'Lafyatis2019'
adata.obs['dataset'] = 'Lafyatis2019'
adata.obs['condition'] = adata.obs['Status']
adata.obs['subject_ID'] = adata.obs['Patient']
adata.obs['sample'] = adata.obs['Sample']
adata.obs['sex'] = adata.obs['Sex']
adata.obs['age'] = adata.obs['Age']

for col in adata.obs.columns:
    if col not in ['study', 
                   'dataset', 
                   'condition',
                   'original_celltype_ann',
                   'subject_ID',
                   'sex',
                   'smoking_status',
                   'age',
                   'sample'
                  ]:
        del adata.obs[col]

In [11]:
adata = adata[adata.obs['condition'] == 'SSc']
adata = adata[adata.obs['original_celltype_ann'] != 'REMOVE']

In [12]:
genes_subset = pd.read_csv('genes_for_mapping.csv')

In [13]:
import preprocessing as pp
adata_sub = pp.subset_and_pad_adata(genes_subset, adata)

not all genes were recovered, filling in 0 counts for 92 missing genes...


In [14]:
adata.write('valenzi.h5ad')
adata_sub.write('valenzi_sub.h5ad')

Trying to set attribute `.obs` of view, copying.
... storing 'original_celltype_ann' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'study' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'dataset' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'condition' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'subject_ID' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'sample' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'sex' as categorical
... storing 'original_celltype_ann' as categorical
... storing 'study' as categorical
... storing 'dataset' as categorical
... storing 'condition' as categorical
... storing 'subject_ID' as categorical
... storing 'sample' as categorical
... storing 'sex' as categorical
... storing 'gene_symbols' as categorical


In [15]:
adata

AnnData object with n_obs × n_vars = 27297 × 22164
    obs: 'original_celltype_ann', 'study', 'dataset', 'condition', 'subject_ID', 'sample', 'sex', 'age'