# SCENIC+ ANDATA OBJECT PREPARATION

REFERENCES:

anndata object wiki = https://anndata.readthedocs.io/en/latest/index.html

anndata basics = https://anndata.readthedocs.io/en/latest/tutorials/notebooks/getting-started.html

scenicplus workflow = https://scenicplus.readthedocs.io/en/latest/pbmc_multiome_tutorial.html ,
                      https://github.com/aertslab/pycisTopic/blob/master/notebooks/Cortex_pycisTopic.ipynb

Single-cell best practices = https://www.sc-best-practices.org/cellular_structure/annotation.html

## Set-up environment

In [9]:
# Supress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import sys
import os
_stderr = sys.stderr
null = open(os.devnull,'wb')

In [10]:
import scanpy as sc
import numpy as np
import matplotlib.pyplot as pl
from scipy import io
import pandas as pd

### Store data directories

In [11]:
# Set up directories
sample = "combined"

work_dir = '/g/scb/zaugg/deuner/SCENIC+/'
indata_dir = "/g/scb/zaugg/deuner/SCENIC+/inputdata/"
seurat_dir = "/g/scb/zaugg/deuner/GRaNIE/tmp/"
outdata_dir = "/g/scb/zaugg/deuner/SCENIC+/outputdata/"
tmp_dir = "/g/scb/zaugg/deuner/SCENIC+/tmp/combined/"
fig_dir = '/g/scb/zaugg/deuner/SCENIC+/figures/'

### Import Anndata (preprocessed with Seurat)

In [12]:
# Load adata object
adata = sc.read(os.path.join(seurat_dir, 'combined.nomicro.h5ad'))

In [13]:
# inspect the data 
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.ribo,nCount_SCT,nFeature_SCT,SCT_snn_res.0.5,seurat_clusters,pANN_0.25_0.005_794,...,wsnn_res.7,wsnn_res.8,wsnn_res.9,wsnn_res.10,wsnn_res.12,wsnn_res.14,wsnn_res.16,wsnn_res.18,wsnn_res.20,celltype_wnn
timecourse_timecourse_AAACAGCCAGCCAGTT,timecourse,5020.0,2676,6.155378,3.705179,6037.0,2676,7,25,0.084507,...,6,51,48,44,31,27,51,35,25,diff-NPC
timecourse_timecourse_AAACAGCCAGGCGAGT,timecourse,4719.0,2179,0.190718,0.466200,6032.0,2178,14,117,0.239437,...,69,72,96,96,109,114,120,115,117,neuron-1
timecourse_timecourse_AAACAGCCAGTAAAGC,timecourse,3832.0,2004,0.287056,0.730689,5979.0,2008,12,50,0.183099,...,41,46,34,32,26,23,14,62,50,diff-neuron
timecourse_timecourse_AAACAGCCATAAGTCT,timecourse,7329.0,3535,1.159776,1.132487,7247.0,3535,8,151,0.154930,...,55,52,47,49,50,47,34,147,151,hiPSC-2
timecourse_timecourse_AAACAGCCATAGGCGA,timecourse,4511.0,2253,7.337619,1.906451,6061.0,2253,7,51,0.140845,...,19,19,11,12,41,36,70,52,51,diff-NPC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cocultured28_cocultured28_TTTGTGAAGTCTATGA,cocultured28,11717.0,3893,0.162158,0.187761,7918.0,3746,5,72,,...,26,25,22,14,55,49,107,105,72,neuron-2
cocultured28_cocultured28_TTTGTGGCAGCAAGTG,cocultured28,6845.0,2998,0.613587,0.555150,6845.0,2998,14,85,,...,59,59,88,89,100,103,105,93,85,neuron-4
cocultured28_cocultured28_TTTGTGTTCTCGCCTG,cocultured28,9557.0,3435,0.292979,0.282515,7697.0,3433,6,39,,...,23,24,15,70,79,78,73,50,39,neuron-3
cocultured28_cocultured28_TTTGTGTTCTTAGCCC,cocultured28,1732.0,1112,2.078522,0.577367,5364.0,1614,6,123,,...,45,45,33,61,29,21,45,73,123,neuron-3


In [14]:
adata.var

Unnamed: 0,features
MIR1302-2HG,MIR1302-2HG
AL627309.1,AL627309.1
AL627309.5,AL627309.5
AL627309.4,AL627309.4
AP006222.2,AP006222.2
...,...
HCFC1-AS1,HCFC1-AS1
AC009494.2,AC009494.2
AC136616.3,AC136616.3
AC023491.2,AC023491.2


In [15]:
# format rownames and create a barcode column
old_names = adata.obs_names
new_names = []
barcodes = []
for name in old_names:
    name_split = str.split(name, "_")
    new_name = name_split[1] + "_" + name_split[2]
    new_names.append(new_name)
    barcodes.append(name_split[2])
    
adata.obs_names = new_names
adata.obs["barcode"] = barcodes

In [16]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.ribo,nCount_SCT,nFeature_SCT,SCT_snn_res.0.5,seurat_clusters,pANN_0.25_0.005_794,...,wsnn_res.8,wsnn_res.9,wsnn_res.10,wsnn_res.12,wsnn_res.14,wsnn_res.16,wsnn_res.18,wsnn_res.20,celltype_wnn,barcode
timecourse_AAACAGCCAGCCAGTT,timecourse,5020.0,2676,6.155378,3.705179,6037.0,2676,7,25,0.084507,...,51,48,44,31,27,51,35,25,diff-NPC,AAACAGCCAGCCAGTT
timecourse_AAACAGCCAGGCGAGT,timecourse,4719.0,2179,0.190718,0.466200,6032.0,2178,14,117,0.239437,...,72,96,96,109,114,120,115,117,neuron-1,AAACAGCCAGGCGAGT
timecourse_AAACAGCCAGTAAAGC,timecourse,3832.0,2004,0.287056,0.730689,5979.0,2008,12,50,0.183099,...,46,34,32,26,23,14,62,50,diff-neuron,AAACAGCCAGTAAAGC
timecourse_AAACAGCCATAAGTCT,timecourse,7329.0,3535,1.159776,1.132487,7247.0,3535,8,151,0.154930,...,52,47,49,50,47,34,147,151,hiPSC-2,AAACAGCCATAAGTCT
timecourse_AAACAGCCATAGGCGA,timecourse,4511.0,2253,7.337619,1.906451,6061.0,2253,7,51,0.140845,...,19,11,12,41,36,70,52,51,diff-NPC,AAACAGCCATAGGCGA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cocultured28_TTTGTGAAGTCTATGA,cocultured28,11717.0,3893,0.162158,0.187761,7918.0,3746,5,72,,...,25,22,14,55,49,107,105,72,neuron-2,TTTGTGAAGTCTATGA
cocultured28_TTTGTGGCAGCAAGTG,cocultured28,6845.0,2998,0.613587,0.555150,6845.0,2998,14,85,,...,59,88,89,100,103,105,93,85,neuron-4,TTTGTGGCAGCAAGTG
cocultured28_TTTGTGTTCTCGCCTG,cocultured28,9557.0,3435,0.292979,0.282515,7697.0,3433,6,39,,...,24,15,70,79,78,73,50,39,neuron-3,TTTGTGTTCTCGCCTG
cocultured28_TTTGTGTTCTTAGCCC,cocultured28,1732.0,1112,2.078522,0.577367,5364.0,1614,6,123,,...,45,33,61,29,21,45,73,123,neuron-3,TTTGTGTTCTTAGCCC


### Add a column in adata.obs with the quivalent ATAC-seq barcode

In [17]:
# Get file with ATAC and RNA barcodes matchings (add sample id to rna barcode because there is some barcode redundancy between samples)
timecourse_barcodes_df = pd.read_csv(os.path.join(work_dir, "inputdata/cellBarcodes.rna.atac.timecourse.txt"), sep = " ")
for i in range(len(timecourse_barcodes_df)):
    bc = timecourse_barcodes_df["rna"].iloc[i]
    timecourse_barcodes_df["rna"].iloc[i] = "timecourse_" + bc
    
Neuron_barcodes_df = pd.read_csv(os.path.join(work_dir, "inputdata/cellBarcodes.rna.atac.Neuron.txt"), sep = " ")
for i in range(len(Neuron_barcodes_df)):
    bc = Neuron_barcodes_df["rna"].iloc[i]
    Neuron_barcodes_df["rna"].iloc[i] = "Neuron_" + bc
    
NPC_barcodes_df = pd.read_csv(os.path.join(work_dir, "inputdata/cellBarcodes.rna.atac.NPC.txt"), sep = " ")
for i in range(len(NPC_barcodes_df)):
    bc = NPC_barcodes_df["rna"].iloc[i]
    NPC_barcodes_df["rna"].iloc[i] = "NPC_" + bc

cocultured28_barcodes_df = pd.read_csv(os.path.join(work_dir, "inputdata/cellBarcodes.rna.atac.cocultured28.txt"), sep = " ")
for i in range(len(cocultured28_barcodes_df)):
    bc = cocultured28_barcodes_df["rna"].iloc[i]
    cocultured28_barcodes_df["rna"].iloc[i] = "cocultured28_" + bc

# merge them
barcodes_df = pd.concat([timecourse_barcodes_df, Neuron_barcodes_df, NPC_barcodes_df, cocultured28_barcodes_df])

barcodes_dict = dict()


# Create a dictionary where keys are rna barcodes and values atac barcodes
for i in range(len(barcodes_df.index.values)):
    barcodes_dict[barcodes_df.iloc[i,0]] = barcodes_df.iloc[i,1]

In [18]:
len(barcodes_dict) == len(barcodes_df)

True

In [19]:
adata.obs['barcode'] = [""]*len(adata.obs.index.values)
for i in range(len(adata.obs.index.values)):
    rna_barcodes = adata.obs.index.values
    adata.obs['barcode'][i] = barcodes_dict[rna_barcodes[i]] #barcodes_dict[rna_barcodes[i][11:]]
# the barcode column represents the atac barcode

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs['barcode'][i] = barcodes_dict[rna_barcodes[i]] #barcodes_dict[rna_barcodes[i][11:]]


In [20]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.ribo,nCount_SCT,nFeature_SCT,SCT_snn_res.0.5,seurat_clusters,pANN_0.25_0.005_794,...,wsnn_res.8,wsnn_res.9,wsnn_res.10,wsnn_res.12,wsnn_res.14,wsnn_res.16,wsnn_res.18,wsnn_res.20,celltype_wnn,barcode
timecourse_AAACAGCCAGCCAGTT,timecourse,5020.0,2676,6.155378,3.705179,6037.0,2676,7,25,0.084507,...,51,48,44,31,27,51,35,25,diff-NPC,AAACCGTACCCGCTGT
timecourse_AAACAGCCAGGCGAGT,timecourse,4719.0,2179,0.190718,0.466200,6032.0,2178,14,117,0.239437,...,72,96,96,109,114,120,115,117,neuron-1,GTAGGTTACCCGCTGT
timecourse_AAACAGCCAGTAAAGC,timecourse,3832.0,2004,0.287056,0.730689,5979.0,2008,12,50,0.183099,...,46,34,32,26,23,14,62,50,diff-neuron,CTAGTAAACCCGCTGT
timecourse_AAACAGCCATAAGTCT,timecourse,7329.0,3535,1.159776,1.132487,7247.0,3535,8,151,0.154930,...,52,47,49,50,47,34,147,151,hiPSC-2,TGGCATGACCCGCTGT
timecourse_AAACAGCCATAGGCGA,timecourse,4511.0,2253,7.337619,1.906451,6061.0,2253,7,51,0.140845,...,19,11,12,41,36,70,52,51,diff-NPC,TTTGTGCACCCGCTGT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cocultured28_TTTGTGAAGTCTATGA,cocultured28,11717.0,3893,0.162158,0.187761,7918.0,3746,5,72,,...,25,22,14,55,49,107,105,72,neuron-2,AGCAATTGACACCTAC
cocultured28_TTTGTGGCAGCAAGTG,cocultured28,6845.0,2998,0.613587,0.555150,6845.0,2998,14,85,,...,59,88,89,100,103,105,93,85,neuron-4,TGATTAGACACCAAAC
cocultured28_TTTGTGTTCTCGCCTG,cocultured28,9557.0,3435,0.292979,0.282515,7697.0,3433,6,39,,...,24,15,70,79,78,73,50,39,neuron-3,GTTAAAGCTTGAGAAC
cocultured28_TTTGTGTTCTTAGCCC,cocultured28,1732.0,1112,2.078522,0.577367,5364.0,1614,6,123,,...,45,33,61,29,21,45,73,123,neuron-3,ACTAAGACTTGAGAAC


In [21]:
# Save it
adata.__dict__['_raw'].__dict__['_var'] = adata.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
adata.write(os.path.join(tmp_dir,'combined.nomicro.adata.h5ad'), compression='gzip')

### Subset the adata object for code debugging

In [40]:
# adata_full = adata
# adata = sc.pp.subsample(adata_full, fraction = 0.1, copy = True)

In [41]:
# adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,percent.ribo,nCount_SCT,nFeature_SCT,SCT_snn_res.0.5,seurat_clusters,pANN_0.25_0.005_794,...,wsnn_res.8,wsnn_res.9,wsnn_res.10,wsnn_res.12,wsnn_res.14,wsnn_res.16,wsnn_res.18,wsnn_res.20,celltype_wnn,barcode
NPC_TGATGAACACCTATAG,NPC,7888.0,3703,5.337221,7.264199,7521.0,3703,1,105,,...,32,67,26,24,79,74,65,105,NPC-3,GCTCGTTACCATTACC
Neuron_CCCTGGACAGGACCAA,Neuron,6155.0,2707,4.126726,0.503656,6409.0,2707,2,44,,...,53,51,48,35,32,20,16,44,neuron-5,ACCGAATACGCTTGGG
timecourse_TCCGCCATCCAAGTTA,timecourse,9927.0,4071,1.067795,3.646620,7839.0,4068,1,3,0.366197,...,40,30,28,14,13,6,5,3,hiPSC-2,CTATTAGCTATGGCTT
cocultured28_GCTGGTTCAGGCGATA,cocultured28,9908.0,3447,2.775535,0.272507,7716.0,3446,6,109,,...,30,42,45,85,87,111,99,109,neuron-3,TCAATCCACGGGCGTT
cocultured28_TCTTGTCCAGCACGTT,cocultured28,33559.0,6871,0.441014,0.187729,7548.0,3295,5,143,,...,84,101,103,118,121,136,136,143,neuron-3,CTTACTGACATAAAGG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NPC_GCATTGCCATTGACAT,NPC,4870.0,2690,4.702259,4.702259,5988.0,2690,4,13,,...,66,72,67,52,37,25,22,13,NPC-2,GCAATTGACGTGGTTA
NPC_GCAGGAAGTAGGTGTC,NPC,11199.0,4468,8.938298,10.152692,7964.0,4376,1,33,,...,32,14,26,24,85,83,81,33,NPC-3,ATGGGCGTGGCGGAAC
cocultured28_GAAGTATAGTAAGTGG,cocultured28,9366.0,3541,0.629938,0.181508,7766.0,3540,5,148,,...,29,21,18,119,122,137,140,148,neuron-2,CAGTTAGGATCAATCT
timecourse_CGCATGATCCTTCGTA,timecourse,5905.0,2565,11.075360,1.558002,6312.0,2565,7,165,0.098592,...,5,3,6,27,31,56,160,165,diff-NPC,TTAGGTTCTAAGTGGC


In [42]:
# # Save it
# adata.__dict__['_raw'].__dict__['_var'] = adata.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})
# adata.write(os.path.join(tmp_dir,'combined.nomicro.subset.adata.h5ad'), compression='gzip')