# Notebook to make scanpy objects of the Tabula Muris and MCA datasets

## Imports

In [1]:
import pickle
import pandas as pd
import numpy as np
import scanpy.api as sc
import scanpy
import scipy



## MCA

First, load the pickled Python objects of the MCA dataset that were generated in the notebook "making_python_objects.ipynb".

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Cell identifiers, gene identifiers and the expression matrix of the MCA dataset.
cell_names = _load_pickle("/work/sduknn/Andreas/TM_MCA/MCA/cell_names")
gene_names = _load_pickle("/work/sduknn/Andreas/TM_MCA/MCA/gene_names")
mca_mat = _load_pickle("/work/sduknn/Andreas/TM_MCA/MCA/MCA_py_mat")

In [3]:
# Inspect the orientation of the loaded matrix: rows are genes and columns are
# cells here, so it needs a transpose before it is used as X of the AnnData object.
mca_mat.shape

(39855, 242533)

In [4]:
# Orient the matrix as cells x genes so that its rows match `cell_names`.
# The transpose is guarded on the row count: an unconditional `mca_mat = mca_mat.T`
# flips the matrix back to genes x cells whenever this cell is run a second time.
if mca_mat.shape[0] != len(cell_names):
    mca_mat = mca_mat.T
    

In [8]:
# Build the AnnData container: cells are the observations (rows) and genes the
# variables (columns). Both annotation tables start empty and only carry the
# identifiers as their index.
cell_table = pd.DataFrame(data=None, index=cell_names)
gene_table = pd.DataFrame(data=None, index=gene_names)
mca = sc.AnnData(X=mca_mat, obs=cell_table, var=gene_table)

In [10]:
# Display the assembled object to confirm its dimensions (cells x genes).
mca

AnnData object with n_obs × n_vars = 242533 × 39855 

In [11]:
# Read the MCA cell-assignment table; it provides the Cell.name, Batch, Tissue
# and Annotation columns used in the cells below.
MCA_metadata = pd.read_csv(
    '/work/sduknn/Andreas/TM_MCA/MCA/MCA_CellAssignments.csv'
)

In [12]:
# Number of rows and columns in the metadata table before any filtering.
MCA_metadata.shape

(270848, 7)

In [13]:
# Subset the AnnData object and the metadata table so that both retain only the
# cell IDs that exist in both.
MCA_metadata = MCA_metadata[MCA_metadata['Cell.name'].isin(mca.obs.index)]
# Slicing an AnnData object returns a view of the parent. Take an explicit copy
# so that the obs columns added afterwards are written to a real, independent
# object instead of triggering an implicit copy of the view.
mca = mca[mca.obs.index.isin(MCA_metadata['Cell.name'].values)].copy()
# Both counts must be identical after the mutual filtering.
print(MCA_metadata['Cell.name'].values.shape)
print(mca.obs.index.shape)

(233994,)
(233994,)


In [15]:
# Attach the annotations by cell name rather than by row position: the metadata
# table and the AnnData object were filtered independently, so nothing guarantees
# that their rows are in the same order. A positional assignment would silently
# give cells the batch/tissue/annotation of a different cell.
if not mca.obs.index.isin(MCA_metadata["Cell.name"]).all():
    raise ValueError("Some cells in `mca` have no row in `MCA_metadata`.")
# reindex() raises if a cell name occurs more than once in the metadata, which
# would make the lookup ambiguous.
mca_meta_by_cell = MCA_metadata.set_index("Cell.name").reindex(mca.obs.index)

mca.obs["batch"] = pd.Categorical(mca_meta_by_cell["Batch"])
mca.obs["tissue"] = pd.Categorical(mca_meta_by_cell["Tissue"])
mca.obs["annotation"] = pd.Categorical(mca_meta_by_cell["Annotation"])

## TM - SS2

In [2]:
# Load the FACS metadata table and the FACS expression object.
tm_facs_metadata = pd.read_csv('/work/sduknn/Andreas/TM_MCA/TM/TM_facs_metadata.csv')
tm_facs_data = sc.read_h5ad('/work/sduknn/Andreas/TM_MCA/TM/TM_facs_mat.h5ad')

# Copy the metadata columns into obs as categoricals.
# Each pair is (obs column name, metadata column name).
# NOTE(review): values are assigned by row position, which assumes the metadata
# rows are in the same order as the cells of the h5ad file - confirm.
facs_obs_columns = [
    ("plate_barcode", "plate.barcode"),
    ("mouse_id", "mouse.id"),
    ("tissue", "tissue"),
    ("subtissue", "subtissue"),
    ("FACS_selection", "FACS.selection"),
    ("mouse_sex", "mouse.sex"),
    ("method", "method"),
    ("cell_ontology_class", "cell_ontology_class"),
    ("cell_ontology_id", "cell_ontology_id"),
    ("free_annotation", "free_annotation"),
]
for obs_name, meta_name in facs_obs_columns:
    tm_facs_data.obs[obs_name] = pd.Categorical(tm_facs_metadata[meta_name])

In [3]:
# Display the annotated FACS object and the obs columns that were added.
tm_facs_data

AnnData object with n_obs × n_vars = 53760 × 23433 
    obs: 'plate_barcode', 'mouse_id', 'tissue', 'subtissue', 'FACS_selection', 'mouse_sex', 'method', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation'

## TM - 10X

In [4]:
# Load the droplet metadata table and the droplet expression object.
tm_droplet_metadata = pd.read_csv('/work/sduknn/Andreas/TM_MCA/TM/TM_droplet_metadata.csv')
tm_droplet_data = sc.read_h5ad('/work/sduknn/Andreas/TM_MCA/TM/TM_droplet_mat.h5ad')

# Copy the metadata columns into obs as categoricals.
# Each pair is (obs column name, metadata column name).
# NOTE(review): values are assigned by row position, which assumes the metadata
# rows are in the same order as the cells of the h5ad file - confirm.
droplet_obs_columns = [
    ("channel", "channel"),
    ("mouse_id", "mouse.id"),
    ("tissue", "tissue"),
    ("subtissue", "subtissue"),
    ("mouse_sex", "mouse.sex"),
    ("method", "method"),
    ("cell_ontology_class", "cell_ontology_class"),
    ("cell_ontology_id", "cell_ontology_id"),
    ("free_annotation", "free_annotation"),
]
for obs_name, meta_name in droplet_obs_columns:
    tm_droplet_data.obs[obs_name] = pd.Categorical(tm_droplet_metadata[meta_name])

# Show the annotated object as the cell output.
tm_droplet_data

  interactivity=interactivity, compiler=compiler, result=result)


AnnData object with n_obs × n_vars = 70118 × 23433 
    obs: 'channel', 'mouse_id', 'tissue', 'subtissue', 'mouse_sex', 'method', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation'

### Save scanpy objects

In [None]:
# Persist each unprocessed AnnData object as an .h5ad file.
unprocessed_outputs = [
    (mca, '/work/sduknn/Andreas/TM_MCA/MCA/mca_scanpy_no_processing.h5ad'),
    (tm_facs_data, '/work/sduknn/Andreas/TM_MCA/TM/tm_facs_scanpy_no_processing.h5ad'),
    (tm_droplet_data, '/work/sduknn/Andreas/TM_MCA/TM/tm_droplet_scanpy_no_processing.h5ad'),
]
for adata, h5ad_path in unprocessed_outputs:
    adata.write(h5ad_path)

## Save to universal files that I can also load into R

#### MCA

In [None]:
from scipy import io

# Expression matrix in Matrix Market format, transposed so that genes are rows
# and cells are columns.
io.mmwrite('/work/sduknn/Andreas/TM_MCA/MCA/mca_mtx.mtx', mca.X.T)

# Genes: a headerless, tab-separated table whose two columns both hold the gene names.
gene_ids = mca.var.index.values
genes = pd.DataFrame(gene_ids)
genes['1'] = gene_ids
genes.to_csv(
    "/work/sduknn/Andreas/TM_MCA/MCA/genes.tsv",
    sep='\t',
    index=False,
    index_label=False,
    header=False,
)

# Cell IDs: one identifier per line, no header.
barcodes = pd.DataFrame(mca.obs.index.values)
barcodes.to_csv(
    "/work/sduknn/Andreas/TM_MCA/MCA/barcodes.tsv",
    index=False,
    index_label=False,
    header=False,
)

In [None]:
# Imported here so that this cell does not depend on the MCA export cell having
# been run first in the current kernel (otherwise `io` is undefined).
from scipy import io

# Expression matrix in Matrix Market format, transposed so that genes are rows
# and cells are columns.
# NOTE(review): the matrix is written to TM/ while genes.tsv and barcodes.tsv go
# to TM/facs/ (the droplet export keeps all three files in TM/droplet/).
# Confirm that this location is intended before changing it.
io.mmwrite('/work/sduknn/Andreas/TM_MCA/TM/facs_mtx.mtx', tm_facs_data.X.T)

# Genes: two identical columns holding the gene names.
genes = pd.DataFrame(tm_facs_data.var.index.values)
genes['1'] = pd.DataFrame(tm_facs_data.var.index.values)
# Cell IDs: one identifier per row.
barcodes = pd.DataFrame(tm_facs_data.obs.index.values)

# Both tables are written without header and without the pandas index.
genes.to_csv("/work/sduknn/Andreas/TM_MCA/TM/facs/genes.tsv", sep='\t', index = False, index_label = False, header=False)
barcodes.to_csv("/work/sduknn/Andreas/TM_MCA/TM/facs/barcodes.tsv", index = False, index_label = False, header=False)

In [None]:
# Imported here so that this cell does not depend on the MCA export cell having
# been run first in the current kernel (otherwise `io` is undefined).
from scipy import io

# Expression matrix in Matrix Market format, transposed so that genes are rows
# and cells are columns.
io.mmwrite('/work/sduknn/Andreas/TM_MCA/TM/droplet/droplet_mtx.mtx', tm_droplet_data.X.T)

# Genes: two identical columns holding the gene names.
genes = pd.DataFrame(tm_droplet_data.var.index.values)
genes['1'] = pd.DataFrame(tm_droplet_data.var.index.values)
# Cell IDs: one identifier per row.
barcodes = pd.DataFrame(tm_droplet_data.obs.index.values)

# Both tables are written without header and without the pandas index.
genes.to_csv("/work/sduknn/Andreas/TM_MCA/TM/droplet/genes.tsv", sep='\t', index = False, index_label = False, header=False)
barcodes.to_csv("/work/sduknn/Andreas/TM_MCA/TM/droplet/barcodes.tsv", index = False, index_label = False, header=False)

Save the annotations as TSV files so that they can also be imported into R.

In [21]:
# Write the cell-type label and tissue of every cell to a headerless,
# tab-separated file; the obs index (cell ID) is kept as the first column.
annotation_exports = [
    (mca, ["annotation", 'tissue'], "/work/sduknn/Andreas/TM_MCA/MCA/annotation_for_r.tsv"),
    (tm_facs_data, ['cell_ontology_class', 'tissue'], "/work/sduknn/Andreas/TM_MCA/TM/facs/annotation_for_r.tsv"),
    (tm_droplet_data, ['cell_ontology_class', 'tissue'], "/work/sduknn/Andreas/TM_MCA/TM/droplet/annotation_for_r.tsv"),
]
for adata, columns, tsv_path in annotation_exports:
    adata.obs[columns].to_csv(tsv_path, sep='\t',  header=False)