### Run the following steps in bash to build the CellTag expression matrices
1. Download BAM file for samples at day 15
2. Extract celltag reads from BAM file
3. Parse reads to extract required information
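4. Count CellTags per cell barcode to build one CellTag matrix per CellTag version

I've also added the extracted files to the repository (the matrices read below are in `./celltag_matrix/`).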

In [11]:
# #bash
# The commands below are meant to be run in a shell, which is why they are
# commented out in this Python cell.

# Step 1: download the day-15 BAM file.
# NOTE: the URL ends in `.bam.1`, while the samtools commands below read
# `hf1.d15.possorted_genome_bam.bam`; rename the download (or use `wget -O`)
# so the file names match.
# wget https://sra-pub-src-1.s3.amazonaws.com/SRR7347033/hf1.d15.possorted_genome_bam.bam.1

# Step 2: keep only the reads that contain a CellTag motif, one output file
# per CellTag version (v1, v2, v3). Each motif is a version-specific prefix,
# 8 bases, then GAATTC.
# samtools view hf1.d15.possorted_genome_bam.bam | grep -P 'GGT[ACTG]{8}GAATTC' > v1.celltag.reads.out
# samtools view hf1.d15.possorted_genome_bam.bam | grep -P 'GTGATG[ACTG]{8}GAATTC' > v2.celltag.reads.out
# samtools view hf1.d15.possorted_genome_bam.bam | grep -P 'TGTACG[ACTG]{8}GAATTC' > v3.celltag.reads.out

# Step 3: parse the extracted reads. The capture group in `tagregex` is the
# 8-base tag between the version-specific prefix and GAATTC.
# NOTE(review): the v1 parse regex uses the longer prefix CCGGT while the v1
# grep above uses GGT -- presumably intentional; confirm against the parser script.
# ./scripts/celltag.parse.reads.10x.sh -v tagregex="CCGGT([ACTG]{8})GAATTC" v1.celltag.reads.out > v1.celltag.parsed.tsv
# ./scripts/celltag.parse.reads.10x.sh -v tagregex="GTGATG([ACTG]{8})GAATTC" v2.celltag.reads.out > v2.celltag.parsed.tsv
# ./scripts/celltag.parse.reads.10x.sh -v tagregex="TGTACG([ACTG]{8})GAATTC" v3.celltag.reads.out > v3.celltag.parsed.tsv

# Step 4: build one CellTag matrix per version from the parsed reads and the
# cell barcode list; the last argument is the output prefix.
# Rscript ./scripts/matrix.count.celltags.R ./cell.barcodes/hf1.d15.barcodes.tsv v1.celltag.parsed.tsv hf1.d15.v1
# Rscript ./scripts/matrix.count.celltags.R ./cell.barcodes/hf1.d15.barcodes.tsv v2.celltag.parsed.tsv hf1.d15.v2
# Rscript ./scripts/matrix.count.celltags.R ./cell.barcodes/hf1.d15.barcodes.tsv v3.celltag.parsed.tsv hf1.d15.v3

In [12]:
import pyreadr  # reads the .Rds CellTag matrices; install with `pip install pyreadr`
import numpy as np
import pandas as pd
import celltagging_utils as ct  # project-local helpers used for every analysis step below

### Read data from celltag matrix

In [13]:
# Load the three CellTag matrices (v1 -> mef, v2 -> d3, v3 -> d13) and index
# each one by its cell barcode column.
mef, d3, d13 = (
    pyreadr.read_r(rds_path)[None].set_index("Cell.BC")
    for rds_path in (
        "./celltag_matrix/hf1.d15.v1.celltag.matrix.Rds",
        "./celltag_matrix/hf1.d15.v2.celltag.matrix.Rds",
        "./celltag_matrix/hf1.d15.v3.celltag.matrix.Rds",
    )
)

# (cells, CellTags) for each version
mef.shape, d3.shape, d13.shape

((3812, 6319), (3812, 8246), (3812, 4630))

In [14]:
# Preview the first five cells of the v1 matrix (rows: cell barcodes, columns: CellTags).
mef.head(n=5)

Unnamed: 0_level_0,AAAAAAGA,AAAAAAGC,AAAAAATA,AAAAACTC,AAAAACTG,AAAAAGAC,AAAAAGCC,AAAAAGCG,AAAAAGGG,AAAACTAA,...,TTTTTCGG,TTTTTCTA,TTTTTGAT,TTTTTGCA,TTTTTGTT,TTTTTTAT,TTTTTTCC,TTTTTTCT,TTTTTTGG,TTTTTTTT
Cell.BC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGAGTATGACA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGCAGCCTATA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGGTAAGTAGT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGTCACAACGT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACCTGTCCGCGCAA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Get stats

In [15]:
# Summary statistics for the raw v1 matrix: UMI counts and CellTag/cell tallies,
# reported per cell and per CellTag (see the columns of the table below).
ct.get_stats(mef)

Unnamed: 0,Cell_UMI_Counts,CellTags_per_Cell,CellTag_UMI_Counts,Cells_per_CellTag
count,3812.0,3812.0,6319.0,6319.0
mean,54.328437,5.917629,32.774173,3.569869
std,96.041066,6.887981,379.364072,16.430824
min,0.0,0.0,1.0,1.0
25%,2.0,2.0,1.0,1.0
50%,20.0,4.0,1.0,1.0
75%,64.0,8.0,4.0,2.0
max,1549.0,84.0,20226.0,569.0


### Binarize data
Convert the CellTag UMI count matrices into binary matrices. Any Cell Barcode/CellTag pair with a UMI count less than the cutoff (here 2) is disregarded.

In [16]:
# Binarize every version's UMI count matrix with the same cutoff (2).
mef_bin, d3_bin, d13_bin = (
    ct.single_cell_data_binarization(count_matrix, 2)
    for count_matrix in (mef, d3, d13)
)

### Filter Tags (White list)
‘Whitelisting’ is performed to remove PCR and sequencing artifacts that were not corrected in the previous step. It consists of filtering out CellTags that were not detected when sequencing the original complex CellTag library.

In [17]:
# Filter each binarized matrix against the whitelist of its own CellTag version.
mef_filt, d3_filt, d13_filt = (
    ct.single_cell_data_whitelist(binary_matrix, whitelist_csv)
    for binary_matrix, whitelist_csv in (
        (mef_bin, "./whitelist/V1.CellTag.Whitelist.csv"),
        (d3_bin, "./whitelist/V2.CellTag.Whitelist.csv"),
        (d13_bin, "./whitelist/V3.CellTag.Whitelist.csv"),
    )
)
print(mef_filt.shape, d3_filt.shape, d13_filt.shape)

(3812, 3256) (3812, 2537) (3812, 1981)


### Filter Cells
Cells with more than 20 CellTags (likely to correspond to cell multiplets) or fewer than two unique CellTags are filtered out.

In [18]:
# First pass: upper bound on CellTags per cell (threshold 20, mode "less").
# NOTE(review): whether the bound is inclusive depends on ct.metric_based_filtering -- confirm.
mef_filt, d3_filt, d13_filt = (
    ct.metric_based_filtering(whitelisted, 20, "less")
    for whitelisted in (mef_filt, d3_filt, d13_filt)
)

# Second pass: lower bound on CellTags per cell (threshold 2, mode "greater").
mef_filt, d3_filt, d13_filt = (
    ct.metric_based_filtering(capped, 2, "greater")
    for capped in (mef_filt, d3_filt, d13_filt)
)
print(mef_filt.shape, d3_filt.shape, d13_filt.shape)

(1852, 3256) (1733, 2537) (717, 1981)


### Jaccard Analysis (Cell-Cell)
To call clones, Jaccard coefficient scores are calculated for all pairs of cells to assess the similarity of their CellTag expression signatures, thereby identifying clonally related cells.

In [19]:
# Pairwise cell-cell Jaccard similarity for each CellTag version; `id` labels the run.
mef_sim, d3_sim, d13_sim = (
    ct.jaccard_analysis(filtered_matrix, id=run_id)
    for filtered_matrix, run_id in (
        (mef_filt, 'mef'),
        (d3_filt, 'd3'),
        (d13_filt, 'd13'),
    )
)

Calculating Jaccard Similarities:   0%|          | 0/1852 [00:00<?, ?it/s]

Calculating Jaccard Similarities:   0%|          | 0/1733 [00:00<?, ?it/s]

Calculating Jaccard Similarities:   0%|          | 0/717 [00:00<?, ?it/s]

### Clone Calling
Cells whose similarity exceeds a given threshold (here 0.7) are assigned the same clone index. Clones with only one cell are disregarded.

In [20]:
# Call clones for each CellTag version using a Jaccard similarity threshold of 0.7.
# Each call returns the clone table (clone_id, cell_barcode) and a clone-size
# table (Clone_ID, Frequency); the second argument is a CSV path, presumably
# where the clone table is written -- confirm in ct.clone_calling.
mef_clones, mef_clone_size = ct.clone_calling(mef_sim, "./hf1.d15.v1.clones.csv", 0.7)
d3_clones, d3_clone_size = ct.clone_calling(d3_sim, "./hf1.d15.v2.clones.csv", 0.7)
# The V3 output was previously named "v13" (a d13/v3 mix-up); use "v3" so the
# file name follows the same v1/v2/v3 pattern as the other inputs and outputs.
d13_clones, d13_clone_size = ct.clone_calling(d13_sim, "./hf1.d15.v3.clones.csv", 0.7)
print(mef_clones.head())
print(mef_clone_size.head())

   clone_id      cell_barcode
0         1  CTAGCCTAGTGTCCAT
1         1  AGCTTGAAGTACACCT
2         1  CTGAAGTAGAGTTGGC
3         1  CAGATCACAACTTGAC
4         1  ATCCACCTCGAATGCT
   Clone_ID  Frequency
0         2        228
1         4        202
2        16         65
3        13         56
4         8         53


### Lineage and Visualization

In [21]:
# Rename the first column (the clone id) of each clone table, in place, to the
# name of the CellTag version it belongs to.
for clone_table, version_label in (
    (mef_clones, "CellTagV1"),
    (d3_clones, "CellTagV2"),
    (d13_clones, "CellTagV3"),
):
    clone_table.rename(columns={clone_table.columns[0]: version_label}, inplace=True)

In [23]:
# Every barcode that belongs to a clone in at least one CellTag version, deduplicated.
clone_cells = pd.concat(
    [clone_table["cell_barcode"] for clone_table in (mef_clones, d3_clones, d13_clones)]
).unique()

# Empty cell x version table; it starts as all-NaN and is filled with clone ids later.
celltag_data = pd.DataFrame(
    columns=["CellTagV1", "CellTagV2", "CellTagV3"],
    index=clone_cells,
)

In [24]:
# Append "-1" to every barcode in the index (presumably to match the suffix used
# by the expression data -- confirm). The guard keeps this cell idempotent:
# without it, re-running the cell would look up un-suffixed barcodes in an
# already-suffixed index and then append "-1" a second time.
celltag_data.index = celltag_data.index.map(
    lambda barcode: barcode if barcode.endswith("-1") else barcode + "-1"
)

# Write each version's clone id into its column, addressing rows by the suffixed
# barcode. `.values` drops the clone table's own index so the assignment is
# positional (i-th barcode gets i-th clone id) rather than index-aligned.
celltag_data.loc[mef_clones["cell_barcode"] + "-1", "CellTagV1"] = mef_clones["CellTagV1"].values
celltag_data.loc[d3_clones["cell_barcode"] + "-1", "CellTagV2"] = d3_clones["CellTagV2"].values
celltag_data.loc[d13_clones["cell_barcode"] + "-1", "CellTagV3"] = d13_clones["CellTagV3"].values
celltag_data

Unnamed: 0,CellTagV1,CellTagV2,CellTagV3
CTAGCCTAGTGTCCAT-1,1,,
AGCTTGAAGTACACCT-1,1,,
CTGAAGTAGAGTTGGC-1,1,,
CAGATCACAACTTGAC-1,1,,
ATCCACCTCGAATGCT-1,1,,
...,...,...,...
GGAGCAAGTTACCGAT-1,,,45
TCTCATAGTCCGTTAA-1,,,48
TGACAACCAAAGGAAG-1,,,48
TGCGCAGAGATATGCA-1,,,50


### Create a link list of the relations between cells and CellTags

In [25]:
# Convert the cell x CellTag-version clone table into a link list with
# source/target/tag columns: CellTag -> cell links plus "hidden" links between
# CellTag versions (see the progress log and the table below).
link_list = ct.convert_cell_tag_matrix_to_link_list(celltag_data)
# Derive the node table from the link list.
# NOTE(review): the schema of `nodes` is not shown in this notebook section.
nodes = ct.get_nodes_from_link_list(link_list)

Preprocessing data..
Cells that have CellTagV1: 1402
Cells that have CellTagV2: 1174
Cells that have CellTagV3: 100
find connection between [celltag -> cells]...
find hidden links [CellTagV2 -> CellTagV3], or [CellTagV1 -> CellTagV3]...
find hidden links [CellTagV1 -> CellTagV2]...
finished


In [26]:
# Display the link list (pandas truncates the middle rows in the rendered output).
link_list

Unnamed: 0,source,target,tag,target_unmodified
0,CellTagV3_30,GCGGGTTTCATGCTCC-1_V3,CellTagV3,GCGGGTTTCATGCTCC-1
1,CellTagV3_46,GGTGAAGCAGTCACTA-1_V3,CellTagV3,GGTGAAGCAGTCACTA-1
2,CellTagV3_12,ATCATCTCAATGGAGC-1_V3,CellTagV3,ATCATCTCAATGGAGC-1
3,CellTagV3_3,AGGCCACAGCGGCTTC-1_V3,CellTagV3,AGGCCACAGCGGCTTC-1
4,CellTagV3_30,CGAGCACAGCACCGTC-1_V3,CellTagV3,CGAGCACAGCACCGTC-1
...,...,...,...,...
2802,CellTagV2_176,CellTagV3_39,CellTagV2,CellTagV3_39
2803,CellTagV2_17,CellTagV3_2,CellTagV2,CellTagV3_2
2804,CellTagV2_100,CellTagV3_14,CellTagV2,CellTagV3_14
2805,CellTagV2_114,CellTagV3_17,CellTagV2,CellTagV3_17
