# Notebook to make SingleCellExperiment (SCE) objects, in order to run bigSCale

## _Run with Python 3 kernel_

## Imports

In [1]:
import pickle

import numpy as np
import pandas as pd
import scanpy
import scanpy.api as sc
import scipy
from scipy import io
from scipy import sparse



## Load datasets

In [7]:
# Project-local helpers: one loads the datasets, the other (judging by its
# name) keeps only the genes shared between them.
from RegulonPreprocessing import (
    load_PB_datasets,
    retain_overlapping_genes,
)

# The loader is given the 'data' path and returns three datasets.
droplet, facs, mca = load_PB_datasets('data')

In [8]:
# Rebind all three datasets to whatever retain_overlapping_genes returns
# (presumably each restricted to the genes common to all three -- confirm
# against RegulonPreprocessing).
droplet, facs, mca = retain_overlapping_genes(
    droplet,
    facs,
    mca,
)

## Save TM 10X (droplet) data in a universal format (Matrix Market + TSV)

In [9]:
# Expression matrix, written transposed (genes as rows, cells as columns).
# Wrapping it in a COO matrix makes mmwrite emit the sparse "coordinate"
# layout even if droplet.X is a dense array; for an ndarray mmwrite would
# otherwise write the dense "array" layout.
# NOTE(review): the R side reads this file with Matrix::readMM, which is
# believed to support only the coordinate layout -- confirm.
io.mmwrite('mtx.mtx', sparse.coo_matrix(droplet.X.T))

# Gene names: the same name is written twice per row, giving a two-column
# genes.tsv with no header and no index.
gene_names = droplet.var.index.values
genes = pd.DataFrame(gene_names)
genes['1'] = gene_names
genes.to_csv("genes.tsv", sep='\t', index=False, header=False)

# Cell IDs: one per line, no header and no index.
barcodes = pd.DataFrame(droplet.obs.index.values)
barcodes.to_csv("barcodes.tsv", sep='\t', index=False, header=False)

In [10]:
# Cell annotations: the 'celltype' column of droplet.obs, written as a
# header-less TSV whose first field is the obs index (the cell ID).
(
    droplet.obs
    .loc[:, ["celltype"]]
    .to_csv("annotation.tsv", sep='\t', header=False)
)


In [11]:
# Display the AnnData summary (dimensions plus obs/var column names) of the
# droplet dataset.
droplet

View of AnnData object with n_obs × n_vars = 1061 × 11245 
    obs: 'celltype'
    var: 'n_cells'

## _Run with R kernel_

## Imports

In [1]:
library(scater)
library(SingleCellExperiment)
library(Matrix)

Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
    pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames,
    

In [2]:
# Read the files.
# Both TSVs are tab-separated without a header. Parse them strictly on tabs so
# names containing spaces stay in one field, accept only the double quote as a
# quoting character, and disable '#' comments, so that names containing an
# apostrophe or '#' are read verbatim. Keep the columns as character vectors
# regardless of the R version's stringsAsFactors default.
cellbarcodes <- read.table("barcodes.tsv", sep = "\t", quote = "\"",
                           comment.char = "", stringsAsFactors = FALSE)
genenames <- read.table("genes.tsv", sep = "\t", quote = "\"",
                        comment.char = "", stringsAsFactors = FALSE)
# Expression matrix in Matrix Market format.
molecules <- Matrix::readMM("mtx.mtx")

In [3]:
# Label the matrix in one step: the first column of genes.tsv names the rows,
# the first column of barcodes.tsv names the columns.
dimnames(molecules) <- list(genenames[, 1], cellbarcodes[, 1])

In [4]:
# annotation.tsv is a header-less, tab-separated table whose first column is
# matched against the matrix column names below and whose second column holds
# the cell type. read.delim is read.csv with a tab separator; FALSE is spelled
# out because `F` is an ordinary variable that can be reassigned.
ann <- read.delim("annotation.tsv", header = FALSE)

In [5]:
# Reorder the annotation rows so they follow the column order of `molecules`.
ann_rows <- match(colnames(molecules), ann[, 1])
# A column name with no annotation row would otherwise become an all-NA row
# and an NA cell type further down; fail here instead.
stopifnot(!anyNA(ann_rows))
ann_subset <- ann[ann_rows, ]
# Second column of the annotation table is the cell type.
celltype <- ann_subset[, 2]

In [6]:
# Per-cell annotation table: a single `type` column, with rows named after the
# columns of `molecules`.
cell_anns <- data.frame(
  type = celltype,
  row.names = colnames(molecules)
)

### Make SCE object

In [7]:
# Convert the matrix read by readMM to a base R matrix once; `molecules` is
# already dense after this line, so it is passed straight through instead of
# being converted a second time.
molecules <- as.matrix(molecules)
# Build the SingleCellExperiment with the matrix as the `counts` assay and the
# per-cell annotations as colData.
sce <- SingleCellExperiment(
  assays = list(counts = molecules),
  colData = cell_anns
)

In [8]:
# Print the summary of the SingleCellExperiment object (dimensions, assays,
# row/column names, colData fields).
sce

class: SingleCellExperiment 
dim: 11245 1061 
metadata(0):
assays(1): counts
rownames(11245): Prdm2 Otud1 ... Ccdc82 Cwc15
rowData names(0):
colnames(1061): Thymus_immature T cell_2 Thymus_immature T cell_3 ...
  Marrow_immature B cell_2 Marrow_immature B cell_3
colData names(1): type
reducedDimNames(0):
spikeNames(0):

In [9]:
# Serialize the SingleCellExperiment to an RDS file in the working directory.
saveRDS(
  object = sce,
  file = "droplet_PB_sce.rds"
)