## Trying to understand how to properly process data from /data2/isshamie/mito_lineage/data/processed/200403_A00953_0090_BHC23FDSXY/trim, which is amplified by MT primers and done with 5'-end sequencing with 10x genomics. 

## Libraries and files

In [1]:
from os.path import basename, join
import os
import yaml
import pandas as pd
from glob import glob
import pysam


In [2]:
params = "../parameters/trim_adaptors.yaml"
params = yaml.load(open(params), Loader=yaml.FullLoader)
params

{'samples': 'parameters/samples.tsv',
 'raw_folder': '/data2/isshamie/mito_lineage/data/raw/',
 'dataset': '200403_A00953_0090_BHC23FDSXY',
 'primers_list': '/data2/mito_lineage/BWA-Primers-MT/final_primers_with_tag.csv',
 'adapter': 'AAGCAGTGGTATCAACGCAGAGTAC',
 'ref_fa': '/data2/genome/human_GRCh38/cellranger/refdata-cellranger-GRCh38-3.0.0/fasta/genome.fa',
 'transcriptome_dir': 'transcriptome=/data2/genome/human_GRCh38/cellranger/refdata-cellranger-GRCh38-3.0.0/'}

In [3]:
proc_dir = "/data2/isshamie/mito_lineage/data/processed/200403_A00953_0090_BHC23FDSXY/trim/"
#sample_name = "bc5prime_sc5p_pe_lanes"
sample_name = "bc5prime_sc5p_r2_A_1"
data = join(proc_dir, sample_name,"outs/")
bam_f = glob(join(data, "*.bam"))[0]
print(bam_f)

/data2/isshamie/mito_lineage/data/processed/200403_A00953_0090_BHC23FDSXY/trim/bc5prime_sc5p_r2_A_1/outs/possorted_genome_bam.bam


In [4]:
outdir = join(proc_dir, "picard") #, sample_name)
if not os.path.exists(outdir):
    os.mkdir(outdir)
    
out_f = join(outdir, sample_name+'.txt')
print(out_f)

/data2/isshamie/mito_lineage/data/processed/200403_A00953_0090_BHC23FDSXY/trim/picard/bc5prime_sc5p_r2_A_1.txt


## After trimming the adaptor and running cellranger, check the bam file (which had no alignment)

## A. looking at using the -SC5P R2 in 10x

# Check Picard stats on bam file 

In [5]:
cmd = f"picard CollectAlignmentSummaryMetrics \
R={params['ref_fa']} \
I={bam_f} \
O={out_f}"
print(cmd)

if not os.path.exists(out_f):
    os.system(cmd)
else:
    print('command already done')

picard CollectAlignmentSummaryMetrics R=/data2/genome/human_GRCh38/cellranger/refdata-cellranger-GRCh38-3.0.0/fasta/genome.fa I=/data2/isshamie/mito_lineage/data/processed/200403_A00953_0090_BHC23FDSXY/trim/bc5prime_sc5p_r2_A_1/outs/possorted_genome_bam.bam O=/data2/isshamie/mito_lineage/data/processed/200403_A00953_0090_BHC23FDSXY/trim/picard/bc5prime_sc5p_r2_A_1.txt
command already done


In [6]:
align_df = pd.read_csv(out_f, sep="\t",comment="#")
align_df.transpose()

Unnamed: 0,0
CATEGORY,UNPAIRED
TOTAL_READS,14207734
PF_READS,14207734
PCT_PF_READS,1
PF_NOISE_READS,0
PF_READS_ALIGNED,51330
PCT_PF_READS_ALIGNED,0.003613
PF_ALIGNED_BASES,4535922
PF_HQ_ALIGNED_READS,13418
PF_HQ_ALIGNED_BASES,1146468


## Read in portion of sam file

In [7]:
samfile = pysam.AlignmentFile(join(data,'tmp_100.sam'))

for read in samfile.fetch():
    print(read)
    break

A00953:90:HC23FDSXY:3:1559:18982:5400	272	0	379486	3	9M253923N20M	-1	-1	29	TCTAACCATGGCCCGTATTTACCCTATAG	array('B', [37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 25, 37, 37, 37])	[('NH', 2), ('HI', 2), ('AS', 21), ('nM', 0), ('RE', 'I'), ('li', 0), ('CR', 'CACCAAGCACCGAAAA'), ('CY', ',,FFFFFF:FFFFFFF'), ('UR', 'TACCCTAAAT'), ('UY', 'FFFFFFFFF:'), ('UB', 'TACCCTAAAT'), ('RG', 'bc5prime_sc5p_r2_A_1:0:1:HC23FDSXY:3')]


In [19]:
sam_df = pd.read_csv(join(data,'tmp_100.sam.txt'),sep="\t", header=None,usecols= list(range(23)))
sam_df = sam_df.rename({9:"Sequence"},axis=1)
sam_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,Sequence,...,13,14,15,16,17,18,19,20,21,22
0,A00953:90:HC23FDSXY:3:1559:18982:5400,272,1,379487,3,9M253923N20M,*,0,0,TCTAACCATGGCCCGTATTTACCCTATAG,...,AS:i:21,nM:i:0,RE:A:I,li:i:0,CR:Z:CACCAAGCACCGAAAA,"CY:Z:,,FFFFFF:FFFFFFF",UR:Z:TACCCTAAAT,UY:Z:FFFFFFFFF:,UB:Z:TACCCTAAAT,RG:Z:bc5prime_sc5p_r2_A_1:0:1:HC23FDSXY:3
1,A00953:90:HC23FDSXY:3:1559:18982:5400,16,1,607673,3,9M25737N20M,*,0,0,TCTAACCATGGCCCGTATTTACCCTATAG,...,AS:i:21,nM:i:0,RE:A:I,xf:i:0,li:i:0,CR:Z:CACCAAGCACCGAAAA,"CY:Z:,,FFFFFF:FFFFFFF",UR:Z:TACCCTAAAT,UY:Z:FFFFFFFFF:,UB:Z:TACCCTAAAT
2,A00953:90:HC23FDSXY:3:2608:15130:23218,272,1,629113,0,8S20M,*,0,0,CTCTGCCTATCGAATACGCCGCAGGCCC,...,AS:i:19,nM:i:0,RE:A:I,li:i:0,CR:Z:CTGCCTATCGAATACG,CY:Z::FFFFFFFFFFFFFFF,UR:Z:CCGCAGGCCC,UY:Z:FFFFFFFFFF,UB:Z:CCGCAGGCCC,RG:Z:bc5prime_sc5p_r2_A_1:0:1:HC23FDSXY:3
3,A00953:90:HC23FDSXY:3:2608:13169:23484,272,1,629113,0,8S20M,*,0,0,CTCTGCCTATCGAATACGCCGCAGGCCC,...,AS:i:19,nM:i:0,RE:A:I,li:i:0,CR:Z:CTGCCTATCGAATACG,CY:Z::FFFFFFFFFFFFFFF,UR:Z:CCGCAGGCCC,UY:Z:FFFFFFFFFF,UB:Z:CCGCAGGCCC,RG:Z:bc5prime_sc5p_r2_A_1:0:1:HC23FDSXY:3
4,A00953:90:HC23FDSXY:3:1668:20066:23343,272,1,629118,0,15M,*,0,0,ATACGCCGCAGGCCC,...,AS:i:14,nM:i:0,RE:A:I,li:i:0,CR:Z:TTTAAACAAAAAATAA,"CY:Z:FFF:,:FFFF,:F::F",UR:Z:AAAACCCATC,UY:Z::FFFFFFFFF,UB:Z:AAAACCCATC,RG:Z:bc5prime_sc5p_r2_A_1:0:1:HC23FDSXY:3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,A00953:90:HC23FDSXY:3:2619:22869:1297,272,1,631081,0,101M,*,0,0,CCGACCGTTGACTATTCTCTACAAACCACAAAGACATTGGAACACT...,...,AS:i:99,nM:i:0,RE:A:I,li:i:0,CR:Z:AACTCTTGTCCTAGCG,CY:Z:FFFFFFFFFFFFFFFF,CB:Z:AACTCTTGTCCTAGCG-1,UR:Z:ACGAGCAAGG,UY:Z:FFFFFFFFFF,UB:Z:ACGAGCAAGG
96,A00953:90:HC23FDSXY:3:1307:20980:30217,0,1,631084,3,101M,*,0,0,ACCGTTGACTATTCTCTACAAACCACAAAGACATTGGAACACTATA...,...,AS:i:99,nM:i:0,RE:A:I,xf:i:0,li:i:0,CR:Z:GCCAAATCATTGGGCC,CY:Z:FFFFFFFFFFFFFFFF,CB:Z:GCCAAATCATTGGGCC-1,UR:Z:ACCTAATTGG,UY:Z:FFFFFFFFFF
97,A00953:90:HC23FDSXY:3:2519:6587:10175,272,1,631090,0,101M,*,0,0,GACTATTCTCTACAAACCACAAAGACATTGGAACACTATACCTATT...,...,AS:i:99,nM:i:0,RE:A:I,li:i:0,CR:Z:GGTATTGAGCTAAGAT,CY:Z:FFFFFFFFFFFFFFFF,CB:Z:GGTATTGAGCTAAGAT-1,UR:Z:TTAAATGGAT,UY:Z:FFFFFFFFFF,UB:Z:TTAAATGGAT
98,A00953:90:HC23FDSXY:3:1459:29975:9815,272,1,631218,0,101M,*,0,0,GGTAACGACCACATCTACAACGTTATCGTCACAGCCCATGCATTTG...,...,AS:i:99,nM:i:0,RE:A:I,li:i:0,CR:Z:CTCTACGCAAACGCGA,CY:Z:FFFFFFFFFFFFFFFF,CB:Z:CTCTACGCAAACGCGA-1,UR:Z:GTCAAAGTAA,UY:Z:FFFFFFFFFF,UB:Z:GTCAAAGTAA


## See if the fastq and bam contain the primers we designed

In [10]:
primers = pd.read_csv(params["primers_list"])
primers

Unnamed: 0,Name,Sequence 5',Gene,Strand,MT Position,Sequence 5' with tag,Name with tag
0,gene60922_0,TGTTTATGGGGTGATGTGAG,gene60922,-,646,AAGCAGTGGTATCAACGCAGAGTACTGTTTATGGGGTGATGTGAG,tag_gene60922_0
1,gene60923_349,CTGAGCAAGAGGTGGTGAGG,gene60923,-,1251,AAGCAGTGGTATCAACGCAGAGTACCTGAGCAAGAGGTGGTGAGG,tag_gene60923_349
2,gene60923_830,TGCTGCGTGCTTGATGCTTG,gene60923,-,770,AAGCAGTGGTATCAACGCAGAGTACTGCTGCGTGCTTGATGCTTG,tag_gene60923_830
3,gene60923_499,TAGGGCTAAGCATAGTGGGG,gene60923,-,1101,AAGCAGTGGTATCAACGCAGAGTACTAGGGCTAAGCATAGTGGGG,tag_gene60923_499
4,gene60923_0,GTTCGTCCAAGTGCACTTTC,gene60923,-,1600,AAGCAGTGGTATCAACGCAGAGTACGTTCGTCCAAGTGCACTTTC,tag_gene60923_0
...,...,...,...,...,...,...,...
117,gene60956_981,GCATGGCTAGGAATAGTCCT,gene60956,-,14905,AAGCAGTGGTATCAACGCAGAGTACGCATGGCTAGGAATAGTCCT,tag_gene60956_981
118,gene60956_831,AATATAGGCCTCGCCCGATG,gene60956,-,15055,AAGCAGTGGTATCAACGCAGAGTACAATATAGGCCTCGCCCGATG,tag_gene60956_831
119,gene60956_0,AGGCCCATTTGAGTATTTTG,gene60956,-,15886,AAGCAGTGGTATCAACGCAGAGTACAGGCCCATTTGAGTATTTTG,tag_gene60956_0
120,gene60957_0,TGTCCTTGGAAAAAGGTTTT,gene60957,-,15952,AAGCAGTGGTATCAACGCAGAGTACTGTCCTTGGAAAAAGGTTTT,tag_gene60957_0


## For each one, see if it's in the SAM strings

In [38]:
from collections import defaultdict 


In [40]:
count_primer = defaultdict(int)
count_fq = defaultdict(int)
for ind,val in primers.iterrows():
    for ind2, val2 in sam_df.iterrows():
        if val["Sequence 5'"] in val2["Sequence"]:
            print("Here!")
            count_primer[val["Sequence 5'"]] += 1
        if  val2["Sequence"] in val["Sequence 5'"]:
            print("Hi")
            count_fq[val2["Sequence"]] += 1

print(sam_df.shape)
print(count_fq)
print(count_primer)



Here!
Hi
Here!
Hi
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Here!
Hi
Hi
Hi
(100, 23)
defaultdict(<class 'int'>, {'CTAGGACTATGAGAAT': 1, 'CTAGGACTATGAGAATCGAA': 1, 'TAAGGACTGCAAAACCCC': 2, 'TAAGCACCCTAATCAA': 1})
defaultdict(<class 'int'>, {'CTAGGACTATGAGAATCGAA': 15, 'TAAGGACTGCAAAACCCCAC': 6})


In [42]:
def check_bam(sample_name):
    #sample_name = "bc5prime_sc5p_pe_lanes"
    #sample_name = "bc5prime_sc5p_pe_lanes"
    data = join(proc_dir, sample_name,"outs/")
    bam_f = glob(join(data, "*.bam"))[0]
    print(bam_f)
    outdir = join(proc_dir, "picard") #, sample_name)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    out_f = join(outdir, sample_name+'.txt')
    print(out_f)

    cmd = f"picard CollectAlignmentSummaryMetrics \
    R={params['ref_fa']} \
    I={bam_f} \
    O={out_f}"
    print(cmd)

    if not os.path.exists(out_f):
        os.system(cmd)
    else:
        print('command already done')
        
    align_df = pd.read_csv(out_f, sep="\t",comment="#")
    print(align_df.transpose())
    

In [43]:
check_bam("bc5prime_sc5p_pe_A_1")


/data2/isshamie/mito_lineage/data/processed/200403_A00953_0090_BHC23FDSXY/trim/bc5prime_sc5p_pe_A_1/outs/possorted_genome_bam.bam
/data2/isshamie/mito_lineage/data/processed/200403_A00953_0090_BHC23FDSXY/trim/picard/bc5prime_sc5p_pe_A_1.txt
picard CollectAlignmentSummaryMetrics     R=/data2/genome/human_GRCh38/cellranger/refdata-cellranger-GRCh38-3.0.0/fasta/genome.fa     I=/data2/isshamie/mito_lineage/data/processed/200403_A00953_0090_BHC23FDSXY/trim/bc5prime_sc5p_pe_A_1/outs/possorted_genome_bam.bam     O=/data2/isshamie/mito_lineage/data/processed/200403_A00953_0090_BHC23FDSXY/trim/picard/bc5prime_sc5p_pe_A_1.txt


FileNotFoundError: [Errno 2] File b'/data2/isshamie/mito_lineage/data/processed/200403_A00953_0090_BHC23FDSXY/trim/picard/bc5prime_sc5p_pe_A_1.txt' does not exist: b'/data2/isshamie/mito_lineage/data/processed/200403_A00953_0090_BHC23FDSXY/trim/picard/bc5prime_sc5p_pe_A_1.txt'

In [None]:
/data/isshamie/software/cellranger-3.1.0/martian-cs/v3.2.3/adapters/python
