In [1]:
import glob
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import pysam
import time
from collections import defaultdict
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
from bam_barcodes_function import extract_barcode_info

In [3]:
BAM = "../data/test/aml027_post_transplant_possorted_genome_bam.MT.bam"
BARCODES = "../data/raw/Human2_002_Barcodes.txt"
NUM_CORES = 32
OUT_DIR = "../data/scBam/"
BARCODE_INFO = "../data/test/barcode_data_aml027_post_transplant_possorted_genome_bam.MT.p"

## Get info from BAM files

In [4]:
extract_barcode_info(BAM, BARCODE_INFO)

8292887it [01:32, 89290.45it/s] 


(defaultdict(int,
             {'CCGCCGGGCTGGCC': 1,
              'GGGGGGGGGGTTGG': 25,
              'GGTGAACCTATTCA': 17,
              'GCGCACGACAGCGG': 1,
              'GGGGGGGGGGGGGG': 50467,
              'GTTGAGTTGAGTTG': 1,
              'GTGGATCTCGGCTT': 1,
              'ACCCCCGCGGCGCC': 2,
              'CTTGCACTCACTGA': 4,
              'AATTACACACTATA': 1,
              'CGATAATCTTACAA': 1,
              'AGTGCAACCCAATA': 1,
              'TAAGCACTGGTCCC': 1,
              'TTTGTACTGTCTTT': 1,
              'ATGTACCTCTACCA': 5,
              'ATACCACTCGCACT': 2,
              'TTGATCTGAATAAG': 1,
              'CAGTAGCTGGCCTT': 1,
              'AAACGCACACGCAT': 190,
              'AAACGGCTTGTCCC': 346,
              'AACAATACTCTAGG': 30,
              'AACAGAGACGAGAG': 1200,
              'AACCAGTGTTTCGT': 157,
              'AACCTACTCCAACA': 157,
              'AACGTGTGTTGTCT': 31,
              'AACGTTCTACAGTC': 1661,
              'AACTACCTTTATCC': 48,
              

In [4]:
[CR_read_number,CB_read_number,BC_read_number, barcodes, corrected_barcodes, barcode_pairs] = pickle.load(open(BARCODE_INFO,"rb"))

In [5]:
len(CB_read_number)

143877

In [6]:
len(CR_read_number)

810680

In [8]:
print(len(BC_read_number))
BC_read_number

8


defaultdict(int,
            {'AGGGCGTT': 1004988,
             'GCTCGTCA': 1063341,
             'TACATAAG': 924429,
             'CTATACGC': 1037301,
             'ATCGCCAT': 998068,
             'CATAAAGG': 1047861,
             'GGGTTTCC': 1237219,
             'TCACGGTA': 979680})

# Similar to the paper:
For the AML datasets previously generated by 10X Genomics (Zheng et al., 2017b), cells from two patients (AML027 and AML035) were analyzed for mitochondrial genotypes. Aligned and processed .bam files were downloaded from the 10X website (https:// support.10xgenomics.com/single-cell-gene-expression/datasets/) and further processed using custom Python scripts. Cell barco- des associated with at least 200 reads uniquely aligning to the mitochondrial genome were considered for downstream analysis. Barcodes were further filtered by requiring coverage by at least one read at two specific variants at mtDNA positions 3010 and 9698. We note that we did not observe a barcode that contained a read to support both alternate alleles (3010G > A and 9698T > C). We determined that 4 out of 1,077 cells were derived from the recipient (Figure 7M), a higher estimate than in the previously reported analysis performed with nuclear genome variants (reported exactly 0%) (Zheng et al., 2017b), though these four cells were not included in the published analysis as they did not pass the author’s barcode/ transcriptome filters. We did not observe a well-covered set of variants separating the donor/ recipient pair in the AML027 dataset, and did not further analyze it for mutations but only for determining well-covered barcodes (Figures S7G and S7H

## Filter for reads with more than 200 bps

In [10]:
count = 0
CB_filter = set()
for key in CB_read_number:
    if CB_read_number[key] > 200:
        CB_filter.add(key)
        count += 1
print(count)

7716


## Filter for reads with variants at 3010G and 9698T)


In [15]:
# genome_fa = "/data2/genome/human_GRCh38/cellranger/refdata-cellranger-GRCh38-3.0.0/fasta/genome.fa"

# cmd = f'samtools mpileup -g -f  {genome_fa} {BAM} > {BAM.replace(".bam",".bcf")}'
# print(cmd)
# !{cmd}

samtools mpileup -g -f  /data2/genome/human_GRCh38/cellranger/refdata-cellranger-GRCh38-3.0.0/fasta/genome.fa ../data/test/aml027_post_transplant_possorted_genome_bam.MT.bam > ../data/test/aml027_post_transplant_possorted_genome_bam.MT.bcf
[mpileup] 1 samples in 1 input files
<mpileup> Set max per-file depth to 8000


samtools view -b aml027_post_transplant_possorted_genome_bam.MT.bam MT:3010-3010 | samtools fillmd -e - /data2/genome/human_GRCh38/cellranger/refdata-cellranger-GRCh38-3.0.0/fasta/genome.fa | grep -v "^@"| awk -v pos="3010" 'BEGIN {OFS = FS = "\t" } ; {n=split($10,a,"") ; if(a[(pos-$4)+1] != "=" ) print $0, "V:" a[(pos-$4)+1]}' > 3010_reads.txt

samtools view -b aml027_post_transplant_possorted_genome_bam.MT.bam MT:9698-9698 | samtools fillmd -e - /data2/genome/human_GRCh38/cellranger/refdata-cellranger-GRCh38-3.0.0/fasta/genome.fa | grep -v "^@"| awk -v pos="9698" 'BEGIN {OFS = FS = "\t" } ; {n=split($10,a,"") ; if(a[(pos-$4)+1] != "=" ) print $0, "V:" a[(pos-$4)+1]}' > 9698_reads.txt

In [12]:
with open("../data/test/3010_reads.txt", "r") as f:
    lines = list(map(lambda x: x.strip(), f.readlines()))
    
    
lines
ref = "A"
var_3010=defaultdict(int)
for i in lines:
    if "CB:Z:" in i:
        if i[-1] != ref and i[-1] != ":":
            curr_bc = i.split("CB:Z:")[1].split("\t")[0].split("-")[0]
            var_3010[curr_bc] +=1

print(f"Total reads mapping there: {len(lines)}")
print(len(var_3010))


Total reads mapping there: 2395
498


In [13]:
with open("../data/test/9698_reads.txt", "r") as f:
    lines = list(map(lambda x: x.strip(), f.readlines()))
    
    
lines
ref = "C"
var_9698=defaultdict(int)
for i in lines:
    if "CB:Z:" in i:
        if i[-1] != ref and i[-1] != ":":
            curr_bc = i.split("CB:Z:")[1].split("\t")[0].split("-")[0]
            var_9698[curr_bc] +=1
print(f"Total reads mapping there: {len(lines)}")
print(len(var_9698))


Total reads mapping there: 1676
589


In [16]:
overlap_var = set(var_9698.keys()).union(set(var_3010.keys()))
print(f"Number of unique barcodes across both variants: {len(overlap_var)}")

Number of unique barcodes across both variants: 1069


In [17]:
print(f"Number of unique barcodes across both variants and with more than 200 reads: {len(overlap_var.intersection(CB_filter))}")

Number of unique barcodes across both variants and with more than 200 reads: 661
