# SCENIC+ ANDATA OBJECT PREPARATION

REFERENCES:

anndata object wiki = https://anndata.readthedocs.io/en/latest/index.html

anndata basics = https://anndata.readthedocs.io/en/latest/tutorials/notebooks/getting-started.html

scenicplus workflow = https://scenicplus.readthedocs.io/en/latest/pbmc_multiome_tutorial.html

Single-cell best practices = https://www.sc-best-practices.org/cellular_structure/annotation.html

## Set-up environment

In [1]:
# Supress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import sys
import os
_stderr = sys.stderr
null = open(os.devnull,'wb')

# Use non-corrupt UMAP version
!pip install -q umap-learn==0.5.1

In [2]:
import scanpy as sc
import numpy as np
import matplotlib.pyplot as pl
from scipy import io
import pandas as pd

### Store data directories

In [3]:
# Set up directories
sample = "timecourse"

work_dir = '/g/scb/zaugg/deuner/SCENIC+/'
indata_dir = "/g/scb/zaugg/deuner/SCENIC+/inputdata/"
seurat_dir = "/g/scb/zaugg/deuner/GRaNIE/tmp/"
outdata_dir = "/g/scb/zaugg/deuner/SCENIC+/outputdata/"
tmp_dir = "/g/scb/zaugg/deuner/SCENIC+/tmp/"
fig_dir = '/g/scb/zaugg/deuner/SCENIC+/figures/'

### Import Anndata (preprocessed with Seurat)

In [17]:
# Load adata object
adata = sc.read(os.path.join(seurat_dir, 'timecourse.nomicro.h5ad'))

In [5]:
# inspect the data 
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.ribo,nCount_SCT,nFeature_SCT,SCT_snn_res.0.5,seurat_clusters,pANN_0.25_0.005_794,...,wsnn_res.20,SCT_nn_res.0.5,celltype,basic_celltype,pseudotime,pseudotime_clusters_n7,pseudotime_clusters_n14,wsnn_res.0.8,celltype_wnn,sampleID
timecourse_AAACAGCCAGCCAGTT,timecourse,5020.0,2676,6.155378,3.705179,6577.0,2676,2,3,0.084507,...,73,3,diff - immature.neuron,diff,27.596102,3,6,3,diff - immature.neuron,timecourse
timecourse_AAACAGCCAGTAAAGC,timecourse,3832.0,2004,0.287056,0.730689,6634.0,2045,5,5,0.183099,...,39,9,neuron - excitatory,neuron,37.653171,4,8,6,neuron - excitatory,timecourse
timecourse_AAACAGCCATAAGTCT,timecourse,7329.0,3535,1.159776,1.132487,7330.0,3535,7,6,0.154930,...,17,7,hiPSC - start.diff,hiPSC,1.120917,1,1,5,diff - hiPSC-like,timecourse
timecourse_AAACAGCCATAGGCGA,timecourse,4511.0,2253,7.337619,1.906451,6652.0,2254,3,3,0.140845,...,1,3,diff - immature.neuron,diff,33.677399,4,7,3,diff - immature.neuron,timecourse
timecourse_AAACAGCCATCACAGC,timecourse,12095.0,4857,6.572964,1.537826,8336.0,4694,12,8,0.028169,...,35,7,hiPSC - start.diff,hiPSC,2.482587,1,1,10,hiPSC - start.diff,timecourse
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
timecourse_TTTGTTGGTGAGGTAG,timecourse,5826.0,2665,5.029180,0.944044,6678.0,2665,5,5,0.070423,...,2,9,neuron - excitatory,neuron,41.142820,5,9,6,neuron - excitatory,timecourse
timecourse_TTTGTTGGTGCGCGTA,timecourse,13128.0,5157,6.093845,1.706277,8351.0,4729,9,7,0.577465,...,51,10,mature.neuron - adhesion,neuron,57.379511,6,12,7,mature.neuron - adhesion,timecourse
timecourse_TTTGTTGGTGTTGTGA,timecourse,3832.0,2250,7.489562,3.183716,6674.0,2285,2,2,0.197183,...,92,2,diff,diff,31.413775,4,7,1,diff,timecourse
timecourse_TTTGTTGGTTCCGGCT,timecourse,6420.0,3037,7.009346,1.043614,6780.0,3035,2,2,0.098592,...,91,2,diff,diff,35.442656,4,8,1,diff,timecourse


### Add a column in adata.obs with the quivalent ATAC-seq barcode

In [18]:
# Get file with ATAC and RNA barcodes matchings
barcodes_df = pd.read_csv(os.path.join(work_dir, "inputdata/cellBarcodes.rna.atac.timecourse.txt"), sep = " ")
barcodes_dict = dict()

# Create a dictionary where keys are rna barcodes and values atac barcodes
for i in range(len(barcodes_df.index.values)):
    barcodes_dict[barcodes_df.iloc[i,0]] = barcodes_df.iloc[i,1]

In [19]:
adata.obs['barcode'] = [""]*len(adata.obs.index.values)
for i in range(len(adata.obs.index.values)):
    rna_barcodes = adata.obs.index.values
    adata.obs['barcode'][i] = barcodes_dict[rna_barcodes[i][11:]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs['barcode'][i] = barcodes_dict[rna_barcodes[i][11:]]


In [20]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.ribo,nCount_SCT,nFeature_SCT,SCT_snn_res.0.5,seurat_clusters,pANN_0.25_0.005_794,...,SCT_nn_res.0.5,celltype,basic_celltype,pseudotime,pseudotime_clusters_n7,pseudotime_clusters_n14,wsnn_res.0.8,celltype_wnn,sampleID,barcode
timecourse_AAACAGCCAGCCAGTT,timecourse,5020.0,2676,6.155378,3.705179,6577.0,2676,2,3,0.084507,...,3,diff - immature.neuron,diff,27.596102,3,6,3,diff - immature.neuron,timecourse,AAACCGTACCCGCTGT
timecourse_AAACAGCCAGTAAAGC,timecourse,3832.0,2004,0.287056,0.730689,6634.0,2045,5,5,0.183099,...,9,neuron - excitatory,neuron,37.653171,4,8,6,neuron - excitatory,timecourse,CTAGTAAACCCGCTGT
timecourse_AAACAGCCATAAGTCT,timecourse,7329.0,3535,1.159776,1.132487,7330.0,3535,7,6,0.154930,...,7,hiPSC - start.diff,hiPSC,1.120917,1,1,5,diff - hiPSC-like,timecourse,TGGCATGACCCGCTGT
timecourse_AAACAGCCATAGGCGA,timecourse,4511.0,2253,7.337619,1.906451,6652.0,2254,3,3,0.140845,...,3,diff - immature.neuron,diff,33.677399,4,7,3,diff - immature.neuron,timecourse,TTTGTGCACCCGCTGT
timecourse_AAACAGCCATCACAGC,timecourse,12095.0,4857,6.572964,1.537826,8336.0,4694,12,8,0.028169,...,7,hiPSC - start.diff,hiPSC,2.482587,1,1,10,hiPSC - start.diff,timecourse,TCCAGGAACCCGCTGT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
timecourse_TTTGTTGGTGAGGTAG,timecourse,5826.0,2665,5.029180,0.944044,6678.0,2665,5,5,0.070423,...,9,neuron - excitatory,neuron,41.142820,5,9,6,neuron - excitatory,timecourse,CAGGTCCTGAAGTACG
timecourse_TTTGTTGGTGCGCGTA,timecourse,13128.0,5157,6.093845,1.706277,8351.0,4729,9,7,0.577465,...,10,mature.neuron - adhesion,neuron,57.379511,6,12,7,mature.neuron - adhesion,timecourse,CGGCACTTGAAGTACG
timecourse_TTTGTTGGTGTTGTGA,timecourse,3832.0,2250,7.489562,3.183716,6674.0,2285,2,2,0.197183,...,2,diff,diff,31.413775,4,7,1,diff,timecourse,GGCTTCATGAAGTACG
timecourse_TTTGTTGGTTCCGGCT,timecourse,6420.0,3037,7.009346,1.043614,6780.0,3035,2,2,0.098592,...,2,diff,diff,35.442656,4,8,1,diff,timecourse,GCAGGCATGAAGTACG


### Subset the adata object for code debugging

In [21]:
adata_full = adata
adata = sc.pp.subsample(adata_full, fraction = 0.1, copy = True)

In [22]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.ribo,nCount_SCT,nFeature_SCT,SCT_snn_res.0.5,seurat_clusters,pANN_0.25_0.005_794,...,SCT_nn_res.0.5,celltype,basic_celltype,pseudotime,pseudotime_clusters_n7,pseudotime_clusters_n14,wsnn_res.0.8,celltype_wnn,sampleID,barcode
timecourse_GCGGTTGGTAACGTGC,timecourse,7372.0,3241,11.516549,2.902876,7372.0,3241,0,1,0.098592,...,0,diff - NPC-like,diff,24.902457,3,5,8,diff - NPC-like,timecourse,GCCTCAATGACAGAAC
timecourse_AAGCCTCCAATCCTGA,timecourse,5596.0,2285,0.178699,0.321658,6674.0,2283,4,4,0.056338,...,4,neuron - mature.neuron,neuron,59.514298,6,12,4,neuron - mature.neuron,timecourse,ATTGTCTACTAAAGCT
timecourse_GGCTGTCAGTTGTCCC,timecourse,6767.0,3109,9.915768,3.058963,6935.0,3108,0,1,0.056338,...,0,diff - NPC-like,diff,23.183780,3,5,8,diff - NPC-like,timecourse,TTTGTACGAGTAACTA
timecourse_AGAGATTAGGGTCTAT,timecourse,3941.0,2071,0.355240,0.761228,6663.0,2089,5,5,0.084507,...,9,neuron - excitatory,neuron,37.680784,4,8,6,neuron - excitatory,timecourse,GATTTGAGAAGTAATC
timecourse_TCGTTATTCGCCTAAG,timecourse,2149.0,1407,11.633318,2.326664,6189.0,1968,2,2,0.183099,...,2,diff,diff,31.633097,4,7,1,diff,timecourse,CCCTTTACTACAAACA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
timecourse_GAGTAACCATCCTAGA,timecourse,4619.0,2664,2.359818,2.208270,6478.0,2666,0,1,0.253521,...,0,diff - NPC-like,diff,25.236132,3,6,2,diff - NPC-like,timecourse,AAGACAAACTTGCTCA
timecourse_TCTAACCGTTGGTTCT,timecourse,6192.0,2610,4.731912,0.645995,6780.0,2609,6,5,0.070423,...,8,neuron - development,neuron,42.729866,5,9,6,neuron - excitatory,timecourse,AGCCTTGTGCATCCCT
timecourse_CATAATCCATGTCGCG,timecourse,4049.0,2435,3.754013,1.778217,6620.0,2447,0,1,0.084507,...,0,diff - NPC-like,diff,25.116435,3,6,2,diff - NPC-like,timecourse,CTAACTCACAATACCG
timecourse_TTTGCGACAGTTATCG,timecourse,22122.0,5835,0.226019,0.479161,7707.0,3437,10,7,0.070423,...,10,mature.neuron - adhesion,neuron,57.454289,6,12,7,mature.neuron - adhesion,timecourse,GGAATAAACCTCACAT


In [25]:
# Save it
adata.__dict__['_raw'].__dict__['_var'] = adata.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
adata.write(os.path.join(tmp_dir,'timecourse.nomicro.subset.adata.h5ad'), compression='gzip')