In [1]:
import scanpy as sc
import anndata as ad
import pandas as pd

In [2]:
# Read the source AnnData object (h5ad) from disk.
source_h5ad = "jdm_obj_09-20-23.h5ad"
adata = sc.read_h5ad(source_h5ad)

In [3]:
# Dataset-level annotations, written one key at a time into adata.uns.
# NOTE(review): "well" does not appear among the obs columns printed by the
# `adata.obs.columns` cell of this notebook, and the target schema may expect
# batch_condition to be a list of obs column names — TODO confirm.
dataset_metadata = {
    "title": "CITEseq of JDM PBMCs",
    "description": "Mapping immune dysregulation in juvenile dermatomyositis at single cell resolution",
    "schema_version": "3.0.0",
    "batch_condition": "well",
    "default_embedding": "X_wnn.umap",
}
for uns_key, uns_value in dataset_metadata.items():
    adata.uns[uns_key] = uns_value

In [4]:
# Normalise obs column dtypes: object -> category, float64 -> float32 (3 decimals).

# Pass 1: every object-typed column becomes a pandas categorical.
obs_dtypes = adata.obs.dtypes
categorical_cols = list(obs_dtypes.index[obs_dtypes == "object"])
for col in categorical_cols:
    as_category = adata.obs[col].astype('category')
    adata.obs[col] = as_category

# Pass 2: every float64 column is downcast to float32 and rounded to 3 decimals.
obs_dtypes = adata.obs.dtypes  # re-read after the categorical pass
float_cols = list(obs_dtypes.index[obs_dtypes == "float64"])
for col in float_cols:
    as_float32 = adata.obs[col].astype('float32')
    adata.obs[col] = as_float32.round(3)

In [7]:
# Display the names of all obs columns currently on the object.
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_ADT',
       'nFeature_ADT', 'percent.mt', 'percent.ribo', 'S.Score', 'G2M.Score',
       'Phase', 'RNA.weight', 'cluster_number', 'cluster_label', 'visit',
       'disease_group', 'case_control', 'age', 'on_meds', 'steroids',
       'other_immune_suppressant', 'MSA', 'DA_cat', 'medpred', 'preddose',
       'medmethyl', 'methyldose', 'medmtx', 'medhql', 'medmmf', 'medivig',
       'vasglobal', 'vascutaneous', 'vasmuscle', 'cdasiact', 'menzymeyn',
       'mmt8score', 'chaqscore', 'vaspatient', 'donor_id',
       'organism_ontology_term_id', 'assay_ontology_term',
       'tissue_ontology_term_id', 'suspension_type',
       'developmental_stage_ontology_term_id', 'sex_ontology_term_id',
       'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'cell_type_ontology_term_id'],
      dtype='object')

In [23]:
# Look at the second entry among the rows where 'vascutaneous' is null.
# The Series is labelled by cell barcode, so a bare integer key only works through
# pandas' deprecated positional fallback; .iloc makes the positional lookup explicit.
adata.obs['vascutaneous'][adata.obs['vascutaneous'].isnull()].iloc[1]

nan

In [24]:
import numpy as np

In [27]:
# For the rows where 'vascutaneous' is null, test each value with np.isnan.
vascutaneous = adata.obs['vascutaneous']
np.isnan(vascutaneous[vascutaneous.isnull()])

AAACCCACAATTTCTC-1    True
AAACCCACATCGAGCC-1    True
AAACCCAGTAACGATA-1    True
AAACCCATCAAGTGTC-1    True
AAACGAAAGTTGCGAG-1    True
                      ... 
TTTGGAGTCAGTCACA-6    True
TTTGGTTCAATGTGGG-6    True
TTTGGTTCATGCCATA-6    True
TTTGTTGCAAGATGGC-6    True
TTTGTTGGTGTGTCGC-6    True
Name: vascutaneous, Length: 23161, dtype: bool

In [8]:
# menzymeyn takes the values 1, 0, or NaN (per the original note); it and the other
# columns listed here are cast to pandas categoricals.
label_like_cols = [
    "menzymeyn",
    "medpred",
    "medmethyl",
    "medmtx",
    "medmmf",
    "medhql",
    "medivig",
    "cluster_number",
]
for label_col in label_like_cols:
    adata.obs[label_col] = adata.obs[label_col].astype('category')

In [None]:
# cdasiact, mmt8score should be integer

In [None]:
#For the four VAS scores, these can be continuous variables; same with CDASIact scores;
#"menzymeyn" is binary (y/n), "mmt8score" continuous, "chaqscore" continuous; remove "p155", "nxp2", 
#"mda5" as this is all captured in "MSA" which is categorical;  "DA_cat" keep categorical
# many of the numbers have multiple decimal places--would it make sense to round to two decimals? 

In [9]:
# Re-index adata.var by Ensembl gene ID.
# The IDs from the CSV are attached to adata.var positionally (as a plain list), so
# the filtered CSV rows must list the same symbols in the same order as adata.var.
# Verify that explicitly instead of risking silently mislabelled genes.
ensembl = pd.read_csv("ensembl_ord_raw.csv")
var_symbols = adata.var['_index'].tolist()
ensembl2 = ensembl[ensembl['symbol'].isin(var_symbols)]
if ensembl2['symbol'].tolist() != var_symbols:
    raise ValueError(
        "ensembl_ord_raw.csv symbols do not line up one-to-one, in order, with "
        "adata.var; refusing to assign Ensembl IDs positionally"
    )
adata.var['ensembl'] = ensembl2['ensembl'].tolist()
# Promote the Ensembl IDs to the var index, then drop the old symbol column.
adata.var.set_index(['ensembl'], inplace=True)
adata.var.drop(columns="_index", inplace=True)

In [10]:
# Mark each gene with whether its 'features' value appears in filtered_genes.csv.
filt_genes = pd.read_csv("filtered_genes.csv")
filtered_gene_names = filt_genes['gene'].tolist()
adata.var['feature_is_filtered'] = adata.var['features'].isin(filtered_gene_names)

In [11]:
# Make the `_raw` object's `_var` attribute refer to the very same object as adata's
# `_var`, by writing directly into the instances' __dict__ (no copy is made).
# NOTE(review): this sidesteps the public accessors and presumably assumes raw and the
# main matrix hold the same genes in the same order — TODO confirm.
adata.__dict__['_raw'].__dict__['_var'] = adata.__dict__['_var']

# Add ADT to uns
adata.uns['antibody_features']: a pandas.DataFrame with each row representing one antibody, where at least one of the following is included for every antibody:
Catalog ID (e.g. 100569)
Barcode ID (e.g. 0001)
Clone ID (e.g. RM4-5)

adata.uns['antibody_raw.X']: matrix of “raw” counts
observations MUST be in the same order as the expression matrix

features MUST be in the same order as the dataframe in uns['antibody_features']

adata.uns['antibody.X']: matrix of “final” counts
observations MUST be in the same order as the expression matrix
features MUST be in the same order as the dataframe in uns['antibody_features']

In [12]:
# Load the ADT AnnData object and the per-antibody feature table.
adt_h5ad_path = "jdm_adt_09-20-23.h5ad"
adt_features_csv = "adt_feature_df.csv"
ab = sc.read_h5ad(adt_h5ad_path)
adt_feats = pd.read_csv(adt_features_csv)


In [13]:
# Mark each antibody feature with whether its 'Feature ID' appears in filtered_adts.csv.
filt_adts = pd.read_csv("filtered_adts.csv")
filtered_adt_ids = filt_adts['adt'].tolist()
adt_feats['feature_is_filtered'] = adt_feats['Feature ID'].isin(filtered_adt_ids)

In [14]:
# Use the 'Feature ID' column as the row index of the antibody feature table.
adt_feats = adt_feats.set_index(keys=['Feature ID'])

In [15]:
# Attach the antibody (ADT) data to adata.uns:
#   'antibody_features' - per-antibody feature table
#   'antibody_raw.X'    - raw counts matrix (ab.raw.X)
#   'antibody.X'        - final counts matrix (ab.X)
# The raw and final matrices need distinct keys; writing both to 'antibody.X'
# would let the second assignment overwrite the raw counts.
adata.uns['antibody_features'] = adt_feats
adata.uns['antibody_raw.X'] = ab.raw.X
adata.uns['antibody.X'] = ab.X

In [None]:
#adata.__dict__['_raw'].__dict__['_var'] = adata.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
#adata.__dict__['_var'] = adata.__dict__['_var'].rename(columns={'_index': 'features'})


In [16]:
# convert formats for matrices to float32
# Both the raw matrix and the main matrix are cast with .astype("float32") and written
# back through the objects' __dict__ (the private `_X` attributes) rather than through
# public setters.
# NOTE(review): presumably done because raw.X cannot be assigned directly — TODO confirm.
adata.__dict__['_raw'].__dict__['_X'] = adata.raw.X.astype("float32")
adata.__dict__['_X'] = adata.X.astype("float32")

In [17]:
# Save the curated object as a gzip-compressed h5ad file.
fixed_v2_h5ad = "jdm_obj_09-20-23_fixed_v2.h5ad"
adata.write_h5ad(fixed_v2_h5ad, compression="gzip")

In [None]:
## fix vars

In [30]:
# Reload the curated object from the fixed_v2 h5ad file.
fixed_v2_h5ad = "jdm_obj_09-20-23_fixed_v2.h5ad"
adata = sc.read_h5ad(fixed_v2_h5ad)

In [42]:
# Remove the DA_cat column from obs.
# NOTE(review): a planning comment elsewhere in this notebook says to keep DA_cat as a
# categorical — confirm that dropping it is intended.
adata.obs.drop(columns=["DA_cat"], inplace=True)

In [48]:
# Replace the terse clinical column codes in obs with descriptive names.
# The dose column for methylprednisolone is spelled to match the "methylprednisolone"
# flag column (the earlier spelling "methyprednisolone_dose(mg)" dropped an "l").
obs_column_names = {
    "medpred": "prednisone",
    "preddose": "prednisone_dose(mg)",
    "medmethyl": "methylprednisolone",
    "methyldose": "methylprednisolone_dose(mg)",
    "medmtx": "methotrexate",
    "medhql": "hydroxychloroquine",
    "medmmf": "mycophenolate_mofetil",
    "medivig": "intravenous_immune_globulin",
    "vasglobal": "physician_global_VAS",
    "vascutaneous": "cutaneous_VAS",
    "vasmuscle": "muscle_VAS",
    "cdasiact": "CDASI_activity_score",
    "menzymeyn": "muscle_enzyme_elevation",
    "mmt8score": "MMT8_score",
    "chaqscore": "CHAQ_score",
    "vaspatient": "patient_global_VAS",
    "MSA": "myositis_specific_antibody",
}
adata.obs.rename(columns=obs_column_names, inplace=True)

In [50]:
# Write the renamed object to its final h5ad file.
final_h5ad = "jdm_obj_09-21-23.h5ad"
adata.write_h5ad(final_h5ad)