## Prepare Aging FCA data for use as scArches reference
Get raw counts from raw 10x cell ranger files and get annotations from processed AFCA data portal. 
Create a new h5 file with raw counts and annotations. 

In [117]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
import scvi
from pathlib import Path
import glob

In [118]:
## Read cellranger raw h5 files
unannotated_h5_path = "/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/*.h5"
h5files = glob.glob(unannotated_h5_path)
print(h5files)

['/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/Male_head_adult_30dWT_S2.h5', '/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/Female_head_adult_30dWT_S1.h5', '/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/Male_head_adult_30dWT_S6.h5', '/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/Male_head_adult_50dWT_S3.h5', '/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/Male_head_adult_70dWT_S1.h5', '/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/Male_body_adult_70dWT_S1.h5', '/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/Male_head_adult_50dWT_S5.h5', '/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/Female_head_adult_50dWT_S6.h5', '/projectnb

In [119]:
## Create a combined anndata object of raw h5 files
adatas = []
for h5f in h5files:
    print(h5f)
    filename = h5f.split("/")[-1].split(".h5")[0]
    adata = sc.read_10x_h5(h5f)
    adata.var_names_make_unique()
    adata.obs['filename'] = filename
    print(adata)
    adatas.append(adata)
    #break

/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/Male_head_adult_30dWT_S2.h5
AnnData object with n_obs × n_vars = 5088 × 17562
    obs: 'filename'
    var: 'gene_ids', 'feature_types', 'genome'
/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/Female_head_adult_30dWT_S1.h5
AnnData object with n_obs × n_vars = 5080 × 17562
    obs: 'filename'
    var: 'gene_ids', 'feature_types', 'genome'
/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/Male_head_adult_30dWT_S6.h5
AnnData object with n_obs × n_vars = 5453 × 17562
    obs: 'filename'
    var: 'gene_ids', 'feature_types', 'genome'
/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/cellrangerSummary/allH5_filter/Male_head_adult_50dWT_S3.h5
AnnData object with n_obs × n_vars = 11010 × 17562
    obs: 'filename'
    var: 'gene_ids', 'feature_types', 'genome'
/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/ref

In [120]:
len(adatas)

72

In [121]:
afca_combined = sc.concat(adatas,join="outer")
afca_combined

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 479612 × 17562
    obs: 'filename'

In [122]:
afca_combined.obs[['sex','sample']] = afca_combined.obs['filename'].str.split("dWT",expand=True)
afca_combined.obs[['sex','day']] = afca_combined.obs['sex'].str.split("adult_",expand=True)
afca_combined.obs['oldindex'] = afca_combined.obs.index
afca_combined.obs['newindex'] = afca_combined.obs['oldindex'] + "_AFCA_" +  afca_combined.obs['sex'] + afca_combined.obs['day'] + afca_combined.obs['sample']
afca_combined.obs['newindex'] = afca_combined.obs['newindex'].str.lower()
afca_combined.obs


Unnamed: 0,filename,sex,sample,day,oldindex,newindex
AAACCCACAACGGCTC-1,Male_head_adult_30dWT_S2,Male_head_,_S2,30,AAACCCACAACGGCTC-1,aaacccacaacggctc-1_afca_male_head_30_s2
AAACCCACACTGCGAC-1,Male_head_adult_30dWT_S2,Male_head_,_S2,30,AAACCCACACTGCGAC-1,aaacccacactgcgac-1_afca_male_head_30_s2
AAACCCAGTGACACAG-1,Male_head_adult_30dWT_S2,Male_head_,_S2,30,AAACCCAGTGACACAG-1,aaacccagtgacacag-1_afca_male_head_30_s2
AAACCCAGTGCATACT-1,Male_head_adult_30dWT_S2,Male_head_,_S2,30,AAACCCAGTGCATACT-1,aaacccagtgcatact-1_afca_male_head_30_s2
AAACGAAAGCAGGTCA-1,Male_head_adult_30dWT_S2,Male_head_,_S2,30,AAACGAAAGCAGGTCA-1,aaacgaaagcaggtca-1_afca_male_head_30_s2
...,...,...,...,...,...,...
TTTGTTGGTGGCATCC-1,Male_head_adult_50dWT_S1,Male_head_,_S1,50,TTTGTTGGTGGCATCC-1,tttgttggtggcatcc-1_afca_male_head_50_s1
TTTGTTGGTGTAACGG-1,Male_head_adult_50dWT_S1,Male_head_,_S1,50,TTTGTTGGTGTAACGG-1,tttgttggtgtaacgg-1_afca_male_head_50_s1
TTTGTTGGTTTGTTCT-1,Male_head_adult_50dWT_S1,Male_head_,_S1,50,TTTGTTGGTTTGTTCT-1,tttgttggtttgttct-1_afca_male_head_50_s1
TTTGTTGTCCACTGAA-1,Male_head_adult_50dWT_S1,Male_head_,_S1,50,TTTGTTGTCCACTGAA-1,tttgttgtccactgaa-1_afca_male_head_50_s1


In [123]:
## read annotated adata file from FCA data portal and subset to AFCA nuclei
referencepath = "/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/"
annotated_h5 = sc.read_h5ad(referencepath+"adata_head_S_v1.0.h5ad")
#annotated_h5 = sc.read_h5ad(referencepath+"GSE218661_adata_head_S_v1.0.h5ad")
annotated_h5 = annotated_h5[annotated_h5.obs.dataset=="AFCA"]
annotated_h5

View of AnnData object with n_obs × n_vars = 189670 × 15992
    obs: 'tissue', 'sex', 'age', 'sex_age', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'log1p_total_counts_mt', 'dataset', 'fca_annotation', 'afca_annotation', 'afca_annotation_broad'
    uns: 'afca_annotation_colors'
    obsm: 'X_pca', 'X_tsne', 'X_umap'

In [124]:
annotated_h5.raw = annotated_h5 # Save normalized counts in raw 

In [125]:
#annotated_h5.obs['nuclei_filename'] =annotated_h5.obs.index
#annotated_h5.obs[['nuclei','file']] = annotated_h5.obs["nuclei_filename"].str.split("_AFCA_",expand=True)
#annotated_h5.obs.index = annotated_h5.obs.nuclei
annotated_h5.obs['oldindex'] = annotated_h5.obs.index
annotated_h5.obs['newindex'] = annotated_h5.obs['oldindex'].str.lower()
annotated_h5.obs.index = annotated_h5.obs['newindex']

In [126]:
print(annotated_h5.obs['newindex'].tolist()[0])
print(afca_combined.obs['newindex'].tolist()[0])

aaacccacagtgagca-1_afca_female_head_30_s1
aaacccacaacggctc-1_afca_male_head_30_s2


In [127]:
## Get nuclei common to both adata objects
common_cells = list(set(annotated_h5.obs['newindex'].tolist()).intersection(set(afca_combined.obs['newindex'].tolist())))
print(len(common_cells)) ## 189670 - same as total cells in annotated_h5

common_genes = list(set(annotated_h5.var_names).intersection(set(afca_combined.var_names)))
print(len(common_genes))

189670
15992


In [128]:
## check if there are duplicate indexes
import collections
print([item for item, count in collections.Counter(common_cells).items() if count > 1])

[]


In [129]:
## Subset raw counts adata for cells and genes present in annotated adata
afca_combined.obs.index = afca_combined.obs['newindex']
afca_combined_subset = afca_combined[afca_combined.obs['newindex'].isin(common_cells)]
afca_combined_subset = afca_combined_subset[:,common_genes]
afca_combined_subset

View of AnnData object with n_obs × n_vars = 189670 × 15992
    obs: 'filename', 'sex', 'sample', 'day', 'oldindex', 'newindex'

In [130]:
## Save raw counts from 10x adata as a layer in the annotated adata
annotated_h5.layers['counts'] = afca_combined_subset.X.copy()

In [131]:
annotated_h5

AnnData object with n_obs × n_vars = 189670 × 15992
    obs: 'tissue', 'sex', 'age', 'sex_age', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'log1p_total_counts_mt', 'dataset', 'fca_annotation', 'afca_annotation', 'afca_annotation_broad', 'oldindex', 'newindex'
    uns: 'afca_annotation_colors'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    layers: 'counts'

In [132]:
annotated_h5.X = annotated_h5.layers['counts'].copy()
print(annotated_h5.X.min(),annotated_h5.X.max())

0.0 2227.0


In [136]:
annotated_h5.write("/projectnb/mccall/sbandyadka/drpr42d_snrnaseq/reference/AFCA/afca_head_raw_v1.0.h5ad")