In [1]:
import anndata as ad
import scvelo as scv
import scanpy as sc
import pandas as pd
import numpy as np


In a future version of Scanpy, `scanpy.api` will be removed.
Simply use `import scanpy as sc` and `import scanpy.external as sce` instead.



## Read the files

In [2]:
d1mtx = ad.read_mtx('D1-D3-D5-byHaiping/D1/filtered_feature_bc_matrix/matrix.mtx')
d1barcodes = pd.read_csv('D1-D3-D5-byHaiping/D1/filtered_feature_bc_matrix/barcodes.tsv',header=None)
d1features = pd.read_csv('D1-D3-D5-byHaiping/D1/filtered_feature_bc_matrix/features.tsv',header=None,sep='\t')

In [3]:
d3mtx = ad.read_mtx('D1-D3-D5-byHaiping/D3/filtered_feature_bc_matrix/matrix.mtx')
d3barcodes = pd.read_csv('D1-D3-D5-byHaiping/D3/filtered_feature_bc_matrix/barcodes.tsv',header=None)
d3features = pd.read_csv('D1-D3-D5-byHaiping/D3/filtered_feature_bc_matrix/features.tsv',header=None,sep='\t')

In [4]:
d5mtx = ad.read_mtx('D1-D3-D5-byHaiping/D5/filtered_feature_bc_matrix/matrix.mtx')
d5barcodes = pd.read_csv('D1-D3-D5-byHaiping/D5/filtered_feature_bc_matrix/barcodes.tsv',header=None)
d5features = pd.read_csv('D1-D3-D5-byHaiping/D5/filtered_feature_bc_matrix/features.tsv',header=None,sep='\t')

In [5]:
d1mtx = d1mtx.transpose()
d3mtx = d3mtx.transpose()
d5mtx = d5mtx.transpose()

## Assign cell and gene lables

In [6]:
def cleanBarcodes(df):
    barcodes = df.iloc[:,0].values
    return [barcode[:-2] for barcode in barcodes]

In [7]:
d1barcodes = cleanBarcodes(d1barcodes)
d3barcodes = cleanBarcodes(d3barcodes)
d5barcodes = cleanBarcodes(d5barcodes)

In [8]:
d1mtx.obs['cellId'] = d1barcodes
d3mtx.obs['cellId'] = d3barcodes
d5mtx.obs['cellId'] = d5barcodes

In [9]:
def assignVarValues(adata,df):
    adata.var['geneName'] = df.iloc[:,1].values
    adata.var['geneId'] = df.iloc[:,0].values

In [10]:
assignVarValues(d1mtx,d1features)
assignVarValues(d3mtx,d3features)
assignVarValues(d5mtx,d5features)

In [21]:
d1lables = pd.read_csv('finalSampleLabelsD1.txt',sep='\t')
d3lables = pd.read_csv('finalSampleLabelsD3.txt',sep='\t')
d5lables = pd.read_csv('finalSampleLabelsD5.txt',sep='\t')

In [22]:
d1lables.head()

Unnamed: 0,final.calls,cellBarcodes
1,Mouse1_Collagen_D1,AAACCCACACCATAAC
2,Mouse3_Matrigel_D1,AAACCCATCGTCGGGT
3,Mouse2_Matrigel_D1,AAACGAAAGTAGTGCG
4,Doublet,AAACGAATCTGTCTCG
5,Mouse1_Matrigel_D1,AAACGCTCACCGTGGT


In [23]:
def removeDuplicates(df):
    dup = df.duplicated('cellBarcodes',keep='last')
    notDup = [not x for x in dup]
    df = df.loc[notDup,:].sort_values('cellBarcodes')
    return df

In [24]:
d1lables = removeDuplicates(d1lables)
d3lables = removeDuplicates(d3lables)
d5lables = removeDuplicates(d5lables)

In [20]:
d1mtx.obs = d1mtx.obs.set_index('cellId')
d3mtx.obs = d3mtx.obs.set_index('cellId')
d5mtx.obs = d5mtx.obs.set_index('cellId')

KeyError: "None of ['cellId'] are in the columns"

In [25]:
d1mtx = ad.AnnData(d1mtx[d1mtx.obs.index.isin(d1lables['cellBarcodes'].values)])
d3mtx = ad.AnnData(d3mtx[d3mtx.obs.index.isin(d3lables['cellBarcodes'].values)])
d5mtx = ad.AnnData(d5mtx[d5mtx.obs.index.isin(d5lables['cellBarcodes'].values)])

In [26]:
d1lables.set_index('cellBarcodes',inplace=True)
d3lables.set_index('cellBarcodes',inplace=True)
d5lables.set_index('cellBarcodes',inplace=True)

In [151]:
d1mtx.shape

(6021, 31053)

In [155]:
d1mtx.obs.head()

AAACCCACACCATAAC
AAACCCACACCGTACG
AAACCCACAGACTCTA
AAACCCACATCAACCA
AAACCCACATTGCAAC


In [29]:
d1mtx.obs['cellLabel'] = d1lables.loc[list(d1mtx.obs.index.values),'final.calls'].values
d3mtx.obs['cellLabel'] = d3lables.loc[list(d3mtx.obs.index.values),'final.calls'].values
d5mtx.obs['cellLabel'] = d5lables.loc[list(d5mtx.obs.index.values),'final.calls'].values

In [31]:
b1 = list(d1barcodes)
b3 = list(d3barcodes)
b5 = list(d5barcodes)

# common cell barcodes
print(len([x for x in b1 if x in b3]))
print(len([x for x in b1 if x in b5]))
print(len([x for x in b5 if x in b3]))

902
968
711


In [32]:
adata = d1mtx.concatenate(d3mtx,join='outer')
adata = adata.concatenate(d5mtx,join='outer')

In [33]:
adata

AnnData object with n_obs × n_vars = 14108 × 31053 
    obs: 'batch', 'cellLabel'
    var: 'geneName-0-0', 'geneId-0-0', 'geneName-1-0', 'geneId-1-0', 'geneName-1', 'geneId-1'

In [44]:
adata.var = adata.var.drop(['geneName-0-0', 'geneId-0-0', 'geneName-1-0', 'geneId-1-0'],axis=1)

In [47]:
adata.var = adata.var.rename({'geneName-1':'geneName','geneId-1':'geneId'},axis=1)

In [48]:
adata.write_h5ad('mergedAdata.h5ad')

... storing 'cellLabel' as categorical
... storing 'geneName' as categorical
