In [1]:
import numpy as np
import pandas as pd
import gzip
from collections import defaultdict
from tqdm import tqdm
from os.path import join
from Bio import SeqIO
import os

In [2]:
from alignment import *

### Set variables

In [3]:
data_path = '/data2/genome/human_GRCh38/ncbi/'
mito_chrom = 'NC_012920.1'

window =150 #bps covered for sequencing


## Load genomic info and extract mitochondria information
- MT fasta sequence
- gff file
- MT genes in fasta

In [4]:
mt_fasta = join(data_path, 'GCF_000001405.38_GRCh38.p12_genomic.fna')

if not os.path.exists("MT.fasta"):
    ref_seq = SeqIO.index(mt_fasta, "fasta")

    mt_seq = ref_seq[mito_chrom]
    with open("MT.fasta", "w") as output_handle:
        SeqIO.write(mt_seq, output_handle, "fasta")
    mt_seq_biopython = mt_seq.seq
    mt_seq = str(mt_seq.seq)
else:
    mt_seq = str(list(SeqIO.parse("MT.fasta", format='fasta'))[0].seq)
    mt_seq_biopython = list(SeqIO.parse("MT.fasta", format='fasta'))[0].seq
    #mt_seq_biopython_reverse = mt_seq_biopython.reverse_complement()

In [5]:
full_gff_filename = join(data_path,'GRCh38_latest_genomic.gff')
gff_filename = "MT.gff"
if not os.path.exists(gff_filename):
    cmd = f"grep {mito_chrom} {full_gff_filename} > {gff_filename}"
    print(cmd)
    os.system(cmd)

gff_info = read_gff(gff_filename, file_format=None)
## Analyzing GFF
gff_info.Type.unique()

gff_info = gff_info.loc[gff_info.Location == mito_chrom].reset_index(drop=True)
gff_info

Reading line 0


Unnamed: 0,Id,Type,Start,End,Strand,Location
0,id1959760,region,1,16569,+,NC_012920.1
1,gene60922,gene,577,647,+,NC_012920.1
2,rna171173,tRNA,577,647,+,NC_012920.1
3,id1959761,exon,577,647,+,NC_012920.1
4,gene60923,gene,648,1601,+,NC_012920.1
...,...,...,...,...,...,...
96,id1959784,exon,15888,15953,+,NC_012920.1
97,gene60958,gene,15956,16023,-,NC_012920.1
98,rna171196,tRNA,15956,16023,-,NC_012920.1
99,id1959785,exon,15956,16023,-,NC_012920.1


In [6]:
gff_info_genes = gff_info[gff_info["Type"]=='gene']
gff_info_genes

Unnamed: 0,Id,Type,Start,End,Strand,Location
1,gene60922,gene,577,647,+,NC_012920.1
4,gene60923,gene,648,1601,+,NC_012920.1
7,gene60924,gene,1602,1670,+,NC_012920.1
10,gene60925,gene,1671,3229,+,NC_012920.1
14,gene60926,gene,3230,3304,+,NC_012920.1
17,gene60927,gene,3307,4262,+,NC_012920.1
19,gene60928,gene,4263,4331,+,NC_012920.1
22,gene60929,gene,4329,4400,-,NC_012920.1
25,gene60930,gene,4402,4469,+,NC_012920.1
28,gene60931,gene,4470,5511,+,NC_012920.1


In [7]:
#cmd = f"bedtools getfasta -fi MT.fasta -bed {gff_filename} -name -fo MT.genes.fasta"
if not os.path.exists('MT.genes.fasta'):
    cmd = 'gff2bed < MT.gff > MT.bed'
    print(cmd)
    os.system(cmd)
    cmd = f"bedtools getfasta -fi MT.fasta -bed MT.bed -s -name -fo MT.genes.fasta"
    print(cmd)
    os.system(cmd) #gff_filename = data_path + 'MT.gff'


In [8]:
genes_seq = SeqIO.to_dict(SeqIO.parse('MT.genes.fasta',format='fasta'))

## Clean up genes_seq names
genes_dict = dict()
for g in genes_seq:
    if "gene" in g:
        
        genes_dict[g.split(":")[0]] = genes_seq[g]
genes_dict

{'gene60922': SeqRecord(seq=Seq('GTTTATGTAGCTTACCTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTC...ACA', SingleLetterAlphabet()), id='gene60922::NC_012920.1:576-647(+)', name='gene60922::NC_012920.1:576-647(+)', description='gene60922::NC_012920.1:576-647(+)', dbxrefs=[]),
 'gene60923': SeqRecord(seq=Seq('AATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAAG...AAC', SingleLetterAlphabet()), id='gene60923::NC_012920.1:647-1601(+)', name='gene60923::NC_012920.1:647-1601(+)', description='gene60923::NC_012920.1:647-1601(+)', dbxrefs=[]),
 'gene60924': SeqRecord(seq=Seq('CAGAGTGTAGCTTAACACAAAGCACCCAACTTACACTTAGGAGATTTCAACTTA...TGA', SingleLetterAlphabet()), id='gene60924::NC_012920.1:1601-1670(+)', name='gene60924::NC_012920.1:1601-1670(+)', description='gene60924::NC_012920.1:1601-1670(+)', dbxrefs=[]),
 'gene60925': SeqRecord(seq=Seq('GCTAAACCTAGCCCCAAACCCACTCCACCTTACTACCAGACAACCTTAGCCAAA...TTT', SingleLetterAlphabet()), id='gene60925::NC_012920.1:1670-3229(+)', name='gene60925::NC_012920.1:1670

### Ensure the gff + means the mt_seq sequence and negative is reverse 

In [9]:
print("Positive strand")
for g in genes_seq:
    if 'gene' in g:
        curr_gene = g.split(":")[0]
        curr_gff = gff_info_genes[gff_info_genes["Id"] == curr_gene]
        if curr_gff["Strand"].values[0] == '+':
            
            print(curr_gff)
            print("MT seq fasta  ", mt_seq[int(curr_gff["Start"])-1:int(curr_gff["End"])])
            print("MT genes fasta", genes_seq[g].seq)

print("Negative strand")
for g in genes_seq:
    if 'gene' in g:
        curr_gene = g.split(":")[0]
        curr_gff = gff_info_genes[gff_info_genes["Id"] == curr_gene]
        if curr_gff["Strand"].values[0] == '-':
            print(curr_gff)
            print("MT seq fasta  ", mt_seq_biopython[int(curr_gff["Start"]-1):int(curr_gff["End"])].reverse_complement())
            print("MT genes fasta", genes_seq[g].seq)
        

Positive strand
          Id  Type  Start  End Strand     Location
1  gene60922  gene    577  647      +  NC_012920.1
MT seq fasta   GTTTATGTAGCTTACCTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACA
MT genes fasta GTTTATGTAGCTTACCTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACA
          Id  Type  Start   End Strand     Location
4  gene60923  gene    648  1601      +  NC_012920.1
MT seq fasta   AATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAAGCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGAACAAGCATCAAGCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAAACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAACTCCAGTTGACACAAAATAGACTACGAAAGTGGCTTTAACATATCTGAACACACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAGAACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTC

22  gene60929  gene   4329  4400      -  NC_012920.1
MT seq fasta   TAGGATGGGGTGTGATAGGTGGCACGGAGAATTTTGGATTCTCAGGGATGGGTTCGATTCTCATAGTCCTAG
MT genes fasta TAGGATGGGGTGTGATAGGTGGCACGGAGAATTTTGGATTCTCAGGGATGGGTTCGATTCTCATAGTCCTAG
           Id  Type  Start   End Strand     Location
33  gene60933  gene   5587  5655      -  NC_012920.1
MT seq fasta   AAGGGCTTAGCTTAATTAAAGTGGCTGATTTGCGTTCAGTTGATGCAGAGTGGGGTTTTGCAGTCCTTA
MT genes fasta AAGGGCTTAGCTTAATTAAAGTGGCTGATTTGCGTTCAGTTGATGCAGAGTGGGGTTTTGCAGTCCTTA
           Id  Type  Start   End Strand     Location
36  gene60934  gene   5657  5729      -  NC_012920.1
MT seq fasta   TAGATTGAAGCCAGTTGATTAGGGTGCTTAGCTGTTAACTAAGTGTTTGTGGGTTTAAGTCCCATTGGTCTAG
MT genes fasta TAGATTGAAGCCAGTTGATTAGGGTGCTTAGCTGTTAACTAAGTGTTTGTGGGTTTAAGTCCCATTGGTCTAG
           Id  Type  Start   End Strand     Location
39  gene60935  gene   5761  5826      -  NC_012920.1
MT seq fasta   AGCTCCGAGGTGATTTTCATATTGAATTGCAAATTCGAAGAAGCAGCTTCAAACCTGCCGGGGCTT
MT genes fasta AGCTCCGA

# Load previous primer data
10.1186/1471-2164-10-139

### Primers

In [10]:
primers = pd.read_excel('96-Primers.xlsx')
## remove the first 4, which are restriction enzymes
primers = primers[~(primers["Name"].str.contains("amp"))]
primers

Unnamed: 0,Name,Sequence 5',Corresponding fragment
4,F1,CCGCTTCTGGCCACAGCACT,B
5,F2,GGTTGGTCAATTTCGTGCCAG,B
6,R1,ACTTGGGTTAATCGTGTGACC,B
7,F3,CATCAAGCACGCAGCAATG,B
8,F4,CTCACCACCTCTTGCTCAGC,B
...,...,...,...
95,F69,CGGCTTCGACCCTATATCC,A
96,R24,GGTAAAAGGAGGGCAATTTCT,A
97,F70,CTACTCTCATAACCCTCAACACC,A
98,F71,ATTAAACCAGACCCAGCTACG,A


## Add to primer dataframe gene and what positions of the gene is covered

In [11]:
primers_ext = primers.copy()
primers_ext["Gene"] = ""
primers_ext["Start"] = -1
primers_ext["End"] = -1
for g in tqdm(genes_dict):
    #curr_gene = str(genes_dict[g].seq)
    curr_gene_reverse = str(genes_dict[g].seq.reverse_complement())
    for p_ind,p_val in primers_ext.iterrows():
        pos = curr_gene_reverse.find(p_val["Sequence 5'"])
        if pos != -1:
            if primers_ext.loc[p_ind,"Gene"] == "":
                primers_ext.loc[p_ind,"Gene"] = g
                primers_ext.loc[p_ind,"Start"] = pos
                primers_ext.loc[p_ind,"End"] = pos + window
            else:
                if not "Gene_B" in primers_ext.columns.values:
                    primers_ext["Gene_B"] = ""
                    primers_ext["Start_B"] = -1
                    primers_ext["End_B"] = -1
                if primers_ext.loc[p_ind,"Gene_B"] == "":
                    primers_ext.loc[p_ind,"Start_B"] = pos
                    primers_ext.loc[p_ind,"End_B"] = pos + window
                else:
                    print("More than 2 genes")

100%|██████████| 37/37 [00:00<00:00, 74.17it/s]


In [12]:
(primers_ext["Gene"] != "").sum()
primers_ext[primers_ext["Gene"] != ""]

Unnamed: 0,Name,Sequence 5',Corresponding fragment,Gene,Start,End
6,R1,ACTTGGGTTAATCGTGTGACC,B,gene60923,680,830
14,R2,CTGTTTGTCGTAGGCAGATGG,B,gene60949,537,687
20,R3,GATATCGCCGATACGGTTG,B,gene60953,1272,1422
21,R4,AGCGGATGAGTAAGAAGATTCC,B,gene60953,1025,1175
22,R5,TTGAAGAAGGCGTGGGTACAG,B,gene60953,805,955
29,F21,TCCAAAGACAACCATCATTCC,B,gene60954,321,471
30,R6,TTATCGGAATGGGAGGTGATTC,B,gene60956,491,641
36,R8,TCATAAGGGCTATCGTAGTTTTC,B,gene60923,199,349
42,R9,GCTGTGTTGGCATCTGCTC,B,gene60953,1312,1462
45,F33,CCTTCATAAATTATTCAGCTTCCT,B,gene60954,139,289


In [13]:
(primers_ext["Gene"] == '').sum()

69

In [14]:
(primers_ext["Gene"] != '').sum()

27

## Determine the region of genes that are covered or not

In [15]:
def cover_regions(primers,genes_dict,frag_size= 50 ):
    print("here")
    genes_covered = defaultdict(list)
    
    # For each gene, determine which regions are covered by primer, and where they start and stop
    for g in genes_dict:
        genes_covered[g] = []
        matches = primers[primers["Gene"] == g]
        for m_ind, m_val in matches.iterrows():
            genes_covered[g].append((m_val["Start"],m_val["End"]))

    # For each gene create a boolean vector for each position if it is covered or not
    gene_positions = dict()
    for g in genes_dict:
        gene_positions[g] = [False]*len(genes_dict[g].seq)
        for regions in genes_covered[g]:
            gene_positions[g][regions[0]:regions[1]] = [True]*(regions[1]-regions[0])    

    #Determine the uncovered regoins 
    genes_uncovered = defaultdict(list)
    for g in gene_positions:
        curr_start = 0
        curr_bool = gene_positions[g][0]
        for ind,val in enumerate(gene_positions[g]):        
            if not val == curr_bool:
                if curr_bool == False:
                    genes_uncovered[g].append((curr_start,ind))
                curr_bool = val
                curr_start = ind
        #If end is uncovered
        if curr_bool == False:
            genes_uncovered[g].append((curr_start,len(gene_positions[g])))
    
    print("Regions not covered")
    count = 0
    for g in genes_uncovered:
        print(g)
        for p in genes_uncovered[g]:
            print(p)
            print("Fragment size:", p[1]-p[0])
            count += 1
    print(f"Number of uncovered regions: {count}")
    
    ## Add primers in uncovered regions at the beginning of each gene
    new_primers = dict()
    for g in genes_uncovered:
        curr_seq_reverse = str(genes_dict[g].seq.reverse_complement())
        for r in genes_uncovered[g]:
            if r[1]-r[0] >= frag_size:
                new_primers[f"{g}_{r[0]}"] = curr_seq_reverse[r[0]:r[0]+20]

    
    final_primers = primers[primers["Gene"] != ""]
    for p in new_primers:
        final_primers = pd.concat((final_primers, 
                                   pd.DataFrame({"Sequence 5'":new_primers[p], "Gene":p.split("_")[0],"Name":p,
                                                "Start":int(p.split("_")[1]), "End":int(p.split("_")[1])+window},
                                                index=[p])),sort=False)
    
    
    if len(primers) == len(final_primers):
        return final_primers, genes_uncovered
    else: # If there is more to change, run again. 
        return cover_regions(final_primers,genes_dict,frag_size= 50 )
    return 

In [16]:
final_primers, genes_uncovered = cover_regions(primers_ext,genes_dict,frag_size= 50 )
final_primers = final_primers[["Name","Sequence 5'", "Start","End", "Gene"]]

here
Regions not covered
gene60922
(0, 71)
Fragment size: 71
gene60923
(0, 199)
Fragment size: 199
(349, 680)
Fragment size: 331
(830, 954)
Fragment size: 124
gene60924
(0, 69)
Fragment size: 69
gene60925
(0, 223)
Fragment size: 223
(373, 1500)
Fragment size: 1127
gene60926
(0, 75)
Fragment size: 75
gene60927
(0, 705)
Fragment size: 705
(855, 956)
Fragment size: 101
gene60928
(0, 69)
Fragment size: 69
gene60929
(0, 72)
Fragment size: 72
gene60930
(0, 68)
Fragment size: 68
gene60931
(0, 1042)
Fragment size: 1042
gene60932
(0, 68)
Fragment size: 68
gene60933
(0, 69)
Fragment size: 69
gene60934
(0, 43)
Fragment size: 43
gene60935
(0, 44)
Fragment size: 44
gene60936
(0, 66)
Fragment size: 66
gene60937
(0, 190)
Fragment size: 190
(340, 765)
Fragment size: 425
(915, 919)
Fragment size: 4
(1069, 1542)
Fragment size: 473
gene60938
(0, 69)
Fragment size: 69
gene60939
(0, 68)
Fragment size: 68
gene60940
(0, 165)
Fragment size: 165
(315, 477)
Fragment size: 162
(627, 684)
Fragment size: 57
gene60

In [17]:
genes_uncovered

defaultdict(list,
            {'gene60923': [(150, 199), (649, 680)],
             'gene60934': [(0, 43)],
             'gene60935': [(0, 44)],
             'gene60937': [(150, 190), (915, 919), (1519, 1542)],
             'gene60940': [(150, 165), (465, 477)],
             'gene60944': [(737, 784)],
             'gene60954': [(289, 321)],
             'gene60955': [(0, 27)],
             'gene60956': [(450, 491), (641, 681), (1131, 1141)]})

### Add in strand

In [18]:
final_primers["Strand"] = ""
for ind,p in final_primers.iterrows():
    strand = gff_info_genes.set_index("Id").loc[p["Gene"],"Strand"]
    if (strand) == "+":
        final_primers.loc[ind,"Strand"] = "-"
    else:
        final_primers.loc[ind,"Strand"] = "+"
final_primers

Unnamed: 0,Name,Sequence 5',Start,End,Gene,Strand
6,R1,ACTTGGGTTAATCGTGTGACC,680,830,gene60923,-
14,R2,CTGTTTGTCGTAGGCAGATGG,537,687,gene60949,-
20,R3,GATATCGCCGATACGGTTG,1272,1422,gene60953,-
21,R4,AGCGGATGAGTAAGAAGATTCC,1025,1175,gene60953,-
22,R5,TTGAAGAAGGCGTGGGTACAG,805,955,gene60953,-
...,...,...,...,...,...,...
gene60931_750,gene60931_750,TTATTAATGATGAGTATTGA,750,900,gene60931,-
gene60953_750,gene60953_750,TTGTTCATTGTTAAGGTTGT,750,900,gene60953,-
gene60925_1273,gene60925_1273,ACATAGACGGGTGTGCTCTT,1273,1423,gene60925,-
gene60931_900,gene60931_900,TTTTTTTGGTTAGAACTGGA,900,1050,gene60931,-


### Add in MT position (and if its negative strand, then position is reverse complement position

In [19]:
final_primers["MT Position"] = -1
for ind,p in final_primers.iterrows():
    if p["Strand"] == "+":
        pos = mt_seq.find(p["Sequence 5'"])
    else:
        pos = mt_seq_biopython.reverse_complement().find(p["Sequence 5'"])
        if pos == -1:
            print("Not here!")
        else:
            pos = len(mt_seq) - pos - 1
    if pos == -1:
        print("Not here!")
    else:
        final_primers.loc[ind,"MT Position"] = pos
        
final_primers

Unnamed: 0,Name,Sequence 5',Start,End,Gene,Strand,MT Position
6,R1,ACTTGGGTTAATCGTGTGACC,680,830,gene60923,-,920
14,R2,CTGTTTGTCGTAGGCAGATGG,537,687,gene60949,-,11599
20,R3,GATATCGCCGATACGGTTG,1272,1422,gene60953,-,12875
21,R4,AGCGGATGAGTAAGAAGATTCC,1025,1175,gene60953,-,13122
22,R5,TTGAAGAAGGCGTGGGTACAG,805,955,gene60953,-,13342
...,...,...,...,...,...,...,...
gene60931_750,gene60931_750,TTATTAATGATGAGTATTGA,750,900,gene60931,-,4760
gene60953_750,gene60953_750,TTGTTCATTGTTAAGGTTGT,750,900,gene60953,-,13397
gene60925_1273,gene60925_1273,ACATAGACGGGTGTGCTCTT,1273,1423,gene60925,-,1955
gene60931_900,gene60931_900,TTTTTTTGGTTAGAACTGGA,900,1050,gene60931,-,4610


# SAVE

In [20]:
final_primers[["Name","Sequence 5'","Gene", "Strand", "MT Position"]].sort_values("Gene").to_csv("final_primers.csv", index=None)

In [21]:
final_primers

Unnamed: 0,Name,Sequence 5',Start,End,Gene,Strand,MT Position
6,R1,ACTTGGGTTAATCGTGTGACC,680,830,gene60923,-,920
14,R2,CTGTTTGTCGTAGGCAGATGG,537,687,gene60949,-,11599
20,R3,GATATCGCCGATACGGTTG,1272,1422,gene60953,-,12875
21,R4,AGCGGATGAGTAAGAAGATTCC,1025,1175,gene60953,-,13122
22,R5,TTGAAGAAGGCGTGGGTACAG,805,955,gene60953,-,13342
...,...,...,...,...,...,...,...
gene60931_750,gene60931_750,TTATTAATGATGAGTATTGA,750,900,gene60931,-,4760
gene60953_750,gene60953_750,TTGTTCATTGTTAAGGTTGT,750,900,gene60953,-,13397
gene60925_1273,gene60925_1273,ACATAGACGGGTGTGCTCTT,1273,1423,gene60925,-,1955
gene60931_900,gene60931_900,TTTTTTTGGTTAGAACTGGA,900,1050,gene60931,-,4610


In [22]:
# Regions covered in DNA 
pos_mt = [False]*(len(mt_seq))
neg_mt = [False]*(len(mt_seq))
for ind,val in final_primers.iterrows(): 
    if val["Strand"] == "-":
        neg_mt[val["MT Position"]:min(val["MT Position"]+150,len(mt_seq))] = [True]*(min(val["MT Position"]+150,len(mt_seq))-val["MT Position"])
    elif val["Strand"] == "+" :
        pos_mt[val["MT Position"]:min(val["MT Position"]+150,len(mt_seq))] = [True]*(min(val["MT Position"]+150,len(mt_seq))-val["MT Position"])
    
print(np.array(pos_mt).sum())
print(np.array(neg_mt).sum())

1509
14223


In [23]:
gene_pos_mt = [False]*(len(mt_seq))
gene_neg_mt = [False]*(len(mt_seq))
for ind, val in gff_info_genes.iterrows():
    if val["Strand"] == "-":
        gene_neg_mt[val["Start"]:val["End"]] = [True]*(val["End"]-val["Start"])
    else:
        gene_pos_mt[val["Start"]:val["End"]] = [True]*(val["End"]-val["Start"])
    

print(np.array(gene_pos_mt).sum())
print(np.array(gene_neg_mt).sum())

14260
1068


# Include all the missing primers

### Find the primers used and not used in old set

In [24]:
primers_used = set()
primers_not_used = set()
for p in primers["Name"].values:
    if p in final_primers["Name"].values:
        primers_used.add(p)
    else:
        primers_not_used.add(p)
primers_used
primers_not_used

{'F1',
 'F10',
 'F11',
 'F12',
 'F13',
 'F14',
 'F15',
 'F16',
 'F17',
 'F18',
 'F19',
 'F2',
 'F20',
 'F22',
 'F23',
 'F24',
 'F25',
 'F26',
 'F27',
 'F28',
 'F29',
 'F3',
 'F30',
 'F31',
 'F32',
 'F35',
 'F37',
 'F38',
 'F39',
 'F4',
 'F40',
 'F41',
 'F42',
 'F43',
 'F44',
 'F45',
 'F47',
 'F48',
 'F49',
 'F5',
 'F50',
 'F51',
 'F52',
 'F53',
 'F54',
 'F55',
 'F56',
 'F57',
 'F58',
 'F59',
 'F6',
 'F60',
 'F61',
 'F63',
 'F64',
 'F65',
 'F66',
 'F67',
 'F68',
 'F69',
 'F7',
 'F70',
 'F71',
 'F72',
 'F8',
 'F9',
 'R14',
 'R18',
 'R7'}

In [25]:
len(primers_not_used)

69

In [26]:
len(primers_used)

27

In [27]:
final_primers_withAllOld = final_primers.copy()
primers_ext = primers_ext.set_index("Name", drop=False)

for p in primers_not_used:    
    print(p)
    curr_seq = primers_ext.loc[p,"Sequence 5'"]
    pos = mt_seq.find(curr_seq)
    if pos == -1:
        #Try negative
        pos = str(mt_seq_biopython.reverse_complement()).find(curr_seq)
        if pos != -1:
            strand = "-"
            pos = len(mt_seq) - pos - 1
    else:
        strand = "+"
        
    final_primers_withAllOld = pd.concat((final_primers_withAllOld, 
                       pd.DataFrame({"Sequence 5'":primers_ext.loc[p,"Sequence 5'"],"Name":p,
                                    "Strand":strand,"MT Position": pos}, index=[p])),sort=False)


final_primers_withAllOld[["Name","Sequence 5'","Gene", "Strand", "MT Position"]].sort_values("Gene").to_csv("final_primers_withAllOld.csv", index=None)

final_primers_withAllOld

F7
F28
F5
F57
R18
F8
F6
F24
F29
F70
F35
F44
F11
F58
F4
F69
F9
F65
F52
F54
F22
F45
F53
F13
F16
F56
F27
F3
F67
F49
F19
F2
F1
F41
F39
F40
F59
F37
F71
F17
F10
F51
F50
F12
F42
R7
F23
F14
F30
F32
F25
R14
F47
F48
F61
F31
F68
F72
F18
F60
F55
F63
F43
F20
F26
F38
F66
F15
F64


Unnamed: 0,Name,Sequence 5',Start,End,Gene,Strand,MT Position
6,R1,ACTTGGGTTAATCGTGTGACC,680.0,830.0,gene60923,-,920
14,R2,CTGTTTGTCGTAGGCAGATGG,537.0,687.0,gene60949,-,11599
20,R3,GATATCGCCGATACGGTTG,1272.0,1422.0,gene60953,-,12875
21,R4,AGCGGATGAGTAAGAAGATTCC,1025.0,1175.0,gene60953,-,13122
22,R5,TTGAAGAAGGCGTGGGTACAG,805.0,955.0,gene60953,-,13342
...,...,...,...,...,...,...,...
F26,F26,GTGGCAAGAAATGGGCTAC,,,,+,1347
F38,F38,CGACCTCGATGTTGGATCAGGACA,,,,+,2981
F66,F66,AACAACCGACTAATCACCACCCAACAATG,,,,+,8640
F15,F15,TTCATCCCTGTAGCATTGTTCG,,,,+,12600


In [28]:
# Regions covered in DNA 
pos_mt = [False]*(len(mt_seq))
neg_mt = [False]*(len(mt_seq))
for ind,val in final_primers_withAllOld.iterrows(): 
    if val["Strand"] == "-":
        neg_mt[val["MT Position"]:min(val["MT Position"]+150,len(mt_seq))] = [True]*(min(val["MT Position"]+150,len(mt_seq))-val["MT Position"])
    elif val["Strand"] == "+" :
        pos_mt[val["MT Position"]:min(val["MT Position"]+150,len(mt_seq))] = [True]*(min(val["MT Position"]+150,len(mt_seq))-val["MT Position"])
    
print(np.array(pos_mt).sum())
print(np.array(neg_mt).sum())

9862
14504
