**This notebook provides code samples for manipulating AnnData objects in preparation for CELLxGENE curation.\
It is not intended to be run as a single coherent workflow.**

# data layers

**move a layer to the raw slot**

In [None]:
# Wrap the 'counts' layer in its own AnnData (reusing obs and var), then attach it as adata.raw.
raw_adata = ad.AnnData(X=adata.layers['counts'], obs=adata.obs, var=adata.var)
adata.raw = raw_adata

**delete a layer**

In [None]:
# Drop the 'counts' entry from adata.layers.
del adata.layers['counts']

# obsm

**update uns.default_embedding**\
Ideally, the default_embedding matches the figures in the paper

In [None]:
# Record 'X_umap' as the default embedding name in uns.
# NOTE(review): presumably this must match a key in adata.obsm - confirm before saving.
adata.uns['default_embedding'] = 'X_umap'

**add spatial embeddings based on two columns in obs**

In [None]:
# Store the obs columns 'xcoord' and 'ycoord' (in that order) as an array under obsm['X_spatial'].
adata.obsm['X_spatial'] = adata.obs[['xcoord','ycoord']].to_numpy()

# uns

**define a field in uns**

In [None]:
# Create (or overwrite) the 'schema_version' entry in uns.
adata.uns['schema_version'] = '3.0.0'

**remove a field from uns**

In [None]:
# Remove the 'X_normalization' entry from uns.
del adata.uns['X_normalization']

# obs / var

**set a column with all the same values**

In [None]:
# Assigning a scalar sets the same value on every row of the obs column.
adata.obs['is_primary_data'] = True
adata.obs['suspension_type'] = 'nucleus'
####################
# Same pattern for a var column.
adata.var['feature_is_filtered'] = False

**Remove columns**

In [None]:
# var: drop the listed columns (no existence check is done here, unlike obs below)
var_remove = ['gene_symbols']
adata.var.drop(columns=var_remove, inplace=True)
####################
# obs: candidate columns to drop
obs_remove = [
    'tissue', 'organism', 'self_reported_ethnicity', 'assay',
    'disease', 'sex', 'cell_type', 'development_stage',
]

# Narrow the candidates to the columns that are actually present,
# so drop() is never handed a missing label.
obs_remove = [col for col in obs_remove if col in adata.obs.columns]
adata.obs.drop(columns=obs_remove, inplace=True)

# Report what was removed; stay silent when nothing matched.
if obs_remove:
    print('removed: ' + ','.join(obs_remove))

**change column names**

In [None]:
# Mapping of current obs column name -> new obs column name.
rename_me = {
    'cell_type': 'author_cell_type',
    'ethnicity_ontology_id': 'self_reported_ethnicity_ontology_term_id',
    'disease_ontology_id': 'disease_ontology_term_id'
}

# Rename in place on adata.obs.
adata.obs.rename(columns=rename_me, inplace=True)

# obs

**fill null values of a specific column with a specified value**

In [None]:
# Fill missing sex_ontology_term_id values with 'unknown'.
# The column is handled as a categorical (it is accessed through .cat), so 'unknown'
# has to be registered as a category before fillna can insert it.
sex_col = adata.obs['sex_ontology_term_id']
if 'unknown' not in sex_col.cat.categories:
    # add_categories returns a new Series; its inplace= argument was removed in pandas 2.0.
    # The membership guard keeps the cell re-runnable once the category exists.
    adata.obs['sex_ontology_term_id'] = sex_col.cat.add_categories('unknown')
adata.obs.fillna({'sex_ontology_term_id': 'unknown'}, inplace=True)

**adjust the values in a specific column in a standard way with a function**

In [None]:
def fix_typo(x):
    """Return x with every underscore swapped for a colon."""
    return ':'.join(x.split('_'))


# Run fix_typo over the column's values and write the result back to the same column.
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].apply(fix_typo)

**replace specified values in specified columns**

In [None]:
# Outer keys are obs column names; each inner dict maps old value -> new value for that column.
replace_me = {
    'organism_ontology_term_id':{'human':'NCBITaxon:9606', 'mouse': 'NCBITaxon:10090'},
    'assay_ontology_term_id': {'EFO:0030003': 'EFO:0009899'}
}

# Apply all replacements in place on adata.obs.
adata.obs.replace(replace_me,inplace=True)

**add a new column with values based on values in an existing column - with DataFrame**\
**Step 1:** get the values to map from

In [None]:
# Print each distinct author_cell_type value on its own line, for copying into a mapping.
for k in adata.obs['author_cell_type'].unique():
    print(k)

**Step 2 - Option A:** set up a dataframe with the mapping from a dictionary

In [None]:
# Step 2, option A: build the mapping table from a dict
# keys are author_cell_type labels, values are the cell type ontology term IDs
celltypes = {
    'Myeloid': 'CL:0001082',
    'Endothelial': 'CL:0010008',
    'Fibroblast': 'CL:0002548',
    'Cardiomyocyte': 'CL:0000513',
    'Pericyte': 'CL:0000669',
    'Lymphoid': 'CL:0000838',
    'Cycling cells': 'CL:0000003',
    'vSMCs': 'CL:0000514',
    'Neuronal': 'CL:0000006'
}

# One row per (label, term) pair with a default integer index.
ct_df = pd.DataFrame(
    list(celltypes.items()),
    columns=['author_cell_type', 'cell_type_ontology_term_id'],
)
ct_df

**Step 2 - Option B:** set up a dataframe with the mapping from a Google Sheet\
*The Google Sheet's sharing setting must be "Anyone with the link" with the Viewer role*

In [None]:
# Identifier of the spreadsheet and the name of the tab to read.
sheet_id = '15oG8v5BS6HMPqCehYQcujMZUq9PgQNpo8osKhO7yA5o'
tab_name = 'Sheet1'
# CSV-export URL built from the two values above.
# NOTE(review): tab_name is inserted verbatim - a name containing spaces or other
# special characters would presumably need URL-encoding first; confirm.
url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={tab_name}'
# Read the sheet straight from the URL into a DataFrame and display it.
ct_df = pd.read_csv(url)
ct_df

**Step 3:** merge the dataframe into obs\
*`how='left'` is critical to ensure obs order is retained\
`set_index` is critical to ensure the index is retained*

In [None]:
# Left-merge the mapping onto obs by author_cell_type, then put the original obs index back
# (merge returns a fresh integer index).
# NOTE(review): set_index needs the merged frame to have the same number of rows as obs,
# so ct_df presumably must hold each author_cell_type at most once - confirm.
adata.obs = adata.obs.merge(ct_df, on='author_cell_type',how='left').set_index(adata.obs.index)

**add a new column with values based on values in an existing column - with Dictionary**

In [None]:
# Mapping of 'sample' value -> donor_id value.
donor_map = {
    'KL001': 'P21',
    'KL002': 'P22',
    'KL003': 'P23'
}

# Derive donor_id from the sample column via the mapping.
adata.obs['donor_id'] = adata.obs['sample'].map(donor_map)
# Tabulate the resulting (donor_id, sample) pairs; dropna=False keeps rows with missing values
# in the counts so unmapped samples are visible.
adata.obs[['donor_id','sample']].value_counts(dropna=False)

**Update a gradient field to categorical**

In [None]:
# Convert every cluster_id value to a string, then store the column as a categorical
# so the IDs are treated as discrete labels rather than a numeric gradient.
adata.obs['cluster_id'] = adata.obs['cluster_id'].map(str).astype('category')

# var

**Add a column for Gene IDs based on a column with Gene *version* IDs**

In [None]:
# Keep only the text before the first '.' of each 'ensembl' value (drops the version suffix).
adata.var['gene_ids'] = adata.var['ensembl'].apply(lambda x: x.split('.')[0])

**Fill in the mapping file to use to map symbols to Ensembl IDs**<br>
*Expecting a .tsv with columns `gene_symbols` & `gene_ids`*

In [None]:
# Path of the symbol-to-Ensembl-ID mapping .tsv that the following cells read.
var_mapping_file = 'refdata-cellranger-GRCh38-3_0_0_genes_gtf.tsv'

**View which features are not mapped by this file**<br>
*Check for typos or other alterations to the symbols that can be fixed*<br>
*Common to see many ending in `.1` or `-1` resulting from duplicated symbols in the reference*

In [None]:
# Load the mapping table (tab-separated).
var_map_df = pd.read_csv(var_mapping_file, sep='\t')
# Show the var rows whose index value has no match in the mapping's gene_symbols column.
# `~` negates the boolean mask directly instead of comparing it to True.
adata.var[~adata.var.index.isin(var_map_df['gene_symbols'])]

**Create the list of approved IDs to filter on**<br>
*For the initial run, download the 4 genes_ csv files from https://github.com/chanzuckerberg/single-cell-curation/tree/main/cellxgene_schema_cli/cellxgene_schema/ontology_files*<br>
*After that, if the `genes_approved.csv` is available locally, then the 4 genes_ files won't be necessary*

In [None]:
# Reference gene files that are combined into genes_approved.csv on the first run.
ref_files = [
    'genes_ercc.csv',
    'genes_homo_sapiens.csv',
    'genes_mus_musculus.csv',
    'genes_sars_cov_2.csv'
]

# Build genes_approved.csv only when it is not already present; later runs reuse it.
if not os.path.exists('genes_approved.csv'):
    # names= supplies the column labels; everything is read as strings.
    frames = [
        pd.read_csv(f, names=['feature_id','symb','num','length'],dtype='str',index_col=False)
        for f in ref_files
    ]
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    ids = pd.concat(frames)
    ids.to_csv('genes_approved.csv', index=False)
    # Delete the source files only after the combined file has been written, so a
    # failure part-way through (e.g. one missing file) does not lose the others.
    for f in ref_files:
        os.remove(f)

approved = pd.read_csv('genes_approved.csv',dtype='str')

**Map the Ensembl IDs**

In [None]:
# Left-merge the mapping onto var, matching var's index against the mapping's gene_symbols column,
# then restore the original var index (the merge does not keep it).
# NOTE(review): set_index needs the row count to be unchanged, so gene_symbols in the
# mapping presumably must be unique - confirm.
adata.var = adata.var.merge(var_map_df,left_index=True,right_on='gene_symbols',how='left').set_index(adata.var.index)

**Filter out genes that don't appear in the approved annotation**

In [None]:
# Index labels of every feature currently in adata.var.
var_to_keep = adata.var.index.tolist()
# Index labels of the features whose gene_ids value appears in approved['feature_id'].
var_in_approved = adata.var.index[adata.var['gene_ids'].isin(approved['feature_id'])].tolist()
# Test membership against a set: checking each label against the list made this
# filter quadratic in the number of features.
approved_names = set(var_in_approved)
var_to_keep = [e for e in var_to_keep if e in approved_names]
# Subset to the approved features, then make gene_ids the var index.
# NOTE(review): the subset is presumably an AnnData view at this point - confirm the
# in-place set_index behaves as intended on it.
adata = adata[:, var_to_keep]
adata.var.set_index('gene_ids',inplace=True)

**Repeat many of the same steps for the `raw.var`, if it exists**

In [None]:
# Columns to drop from raw.var before the mapping is merged in.
raw_var_remove = [
    'gene_symbols'
]
adata.raw.var.drop(columns=raw_var_remove, inplace=True)

# Build a standalone AnnData from the raw matrix and raw var, paired with the current obs.
raw_adata = ad.AnnData(adata.raw.X, var=adata.raw.var, obs=adata.obs)

# Left-merge the mapping onto raw var by index vs. gene_symbols, then restore the original index.
raw_adata.var = raw_adata.var.merge(var_map_df,left_index=True,right_on='gene_symbols',how='left').set_index(raw_adata.var.index)

# Subset to the labels in var_to_keep (must already be defined), then index var by gene_ids.
raw_adata = raw_adata[:, var_to_keep]
raw_adata.var.set_index('gene_ids',inplace=True)
# Attach the rebuilt object as adata.raw and display its var table.
adata.raw = raw_adata
adata.raw.var