In [1]:
import numpy as np
import pandas as pd

import gzip

from collections import defaultdict

from tqdm import tqdm
from os.path import join
from Bio import SeqIO

import os

In [2]:
from alignment import *

### Set variables

In [3]:
data_path = '/data2/genome/human_GRCh38/ncbi/'
mito_chrom = 'NC_012920.1'

window =150 #bps covered for sequencing


## Load genomic info and extract mitochondria information
- fasta sequence
- gff file
- genes as fasta

In [4]:
mt_fasta = join(data_path, 'GCF_000001405.38_GRCh38.p12_genomic.fna')

if not os.path.exists("MT.fasta"):
    ref_seq = SeqIO.index(mt_fasta, "fasta")

    mt_seq = ref_seq[mito_chrom]
    with open("MT.fasta", "w") as output_handle:
        SeqIO.write(mt_seq, output_handle, "fasta")
    mt_seq_biopython = mt_seq.seq
    mt_seq = str(mt_seq.seq)
else:
    mt_seq = str(list(SeqIO.parse("MT.fasta", format='fasta'))[0].seq)
    mt_seq_biopython = list(SeqIO.parse("MT.fasta", format='fasta'))[0].seq
    #mt_seq_biopython_reverse = mt_seq_biopython.reverse_complement()

In [5]:
full_gff_filename = join(data_path,'GRCh38_latest_genomic.gff')
gff_filename = "MT.gff"
if not os.path.exists(gff_filename):
    cmd = f"grep {mito_chrom} {full_gff_filename} > {gff_filename}"
    print(cmd)
    os.system(cmd)

gff_info = read_gff(gff_filename, file_format=None)
## Analyzing GFF
gff_info.Type.unique()

gff_info = gff_info.loc[gff_info.Location == mito_chrom].reset_index(drop=True)
gff_info

Reading line 0


Unnamed: 0,Id,Type,Start,End,Strand,Location
0,id1959760,region,1,16569,+,NC_012920.1
1,gene60922,gene,577,647,+,NC_012920.1
2,rna171173,tRNA,577,647,+,NC_012920.1
3,id1959761,exon,577,647,+,NC_012920.1
4,gene60923,gene,648,1601,+,NC_012920.1
...,...,...,...,...,...,...
96,id1959784,exon,15888,15953,+,NC_012920.1
97,gene60958,gene,15956,16023,-,NC_012920.1
98,rna171196,tRNA,15956,16023,-,NC_012920.1
99,id1959785,exon,15956,16023,-,NC_012920.1


In [6]:
#cmd = f"bedtools getfasta -fi MT.fasta -bed {gff_filename} -name -fo MT.genes.fasta"
if not os.path.exists('MT.genes.fasta'):
    cmd = 'gff2bed < MT.gff > MT.bed'
    print(cmd)
    os.system(cmd)
    cmd = f"bedtools getfasta -fi MT.fasta -bed MT.bed -name -fo MT.genes.fasta"
    print(cmd)
    os.system(cmd) #gff_filename = data_path + 'MT.gff'


In [10]:
mt_fasta = join(data_path, 'GCF_000001405.38_GRCh38.p12_genomic.fna')
ref_seq = get_read_sequences(mt_fasta, file_format='fasta', compression=None)
mt_seq = ref_seq[mito_chrom]
mt_seq

Loading /data2/genome/human_GRCh38/ncbi/GCF_000001405.38_GRCh38.p12_genomic.fna


'GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATCTTTTGGCGGTATGCACTTTTAACAGTCACCCCCCAACTAACACATTATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATACAACCCCCGCCCATCCTACCCAGCACACACACACCGCTGCTAACCCCATACCCCGAACCAACCAAACCCCAAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAAATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAAGCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGAACAAGCATCAAGCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAAACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAAC

In [None]:
gff_filename = join(data_path,'GRCh38_latest_genomic.gff')
gff_info = read_gff(gff_filename, file_format=None)
## Analyzing GFF
gff_info.Type.unique()

gff_info = gff_info.loc[gff_info.Location == mito_chrom].reset_index(drop=True)

# Load data

### Primers

In [7]:
primers = pd.read_excel('96-Primers.xlsx')

In [8]:
primers.head()

Unnamed: 0,Name,Sequence 5',Corresponding fragment
0,FampA,AAATCTTACCCCGCCTGTTT,A
1,RampA,AATTAGGCTGTGGGTGGTTG,A
2,FampB,GCCATACTAGTCTTTGCCGC,B
3,RampB,GGCAGGTCAATTTCACTGGT,B
4,F1,CCGCTTCTGGCCACAGCACT,B


In [9]:
## remove the first 4, which are restriction enzymes
primers = primers[~(primers["Name"].str.contains("amp"))]
primers

Unnamed: 0,Name,Sequence 5',Corresponding fragment
4,F1,CCGCTTCTGGCCACAGCACT,B
5,F2,GGTTGGTCAATTTCGTGCCAG,B
6,R1,ACTTGGGTTAATCGTGTGACC,B
7,F3,CATCAAGCACGCAGCAATG,B
8,F4,CTCACCACCTCTTGCTCAGC,B
...,...,...,...
95,F69,CGGCTTCGACCCTATATCC,A
96,R24,GGTAAAAGGAGGGCAATTTCT,A
97,F70,CTACTCTCATAACCCTCAACACC,A
98,F71,ATTAAACCAGACCCAGCTACG,A


In [10]:
with open('96-Primers.fasta', mode='w') as f:
    for row in primers.iterrows():
        print('>Primer-{}|MT|Fragment-{}'.format(row[1]['Name'], row[1]['Corresponding fragment']), file=f)
        print(row[1]["Sequence 5'"], file=f)

In [11]:
len(primers)

96

In [12]:
labels = dict(zip(primers['Sequence 5\''].values.tolist(), primers['Name'].values.tolist())) 
labels

{'CCGCTTCTGGCCACAGCACT': 'F1',
 'GGTTGGTCAATTTCGTGCCAG': 'F2',
 'ACTTGGGTTAATCGTGTGACC': 'R1',
 'CATCAAGCACGCAGCAATG': 'F3',
 'CTCACCACCTCTTGCTCAGC': 'F4',
 'CTTGACCGCTCTGAGCTAAAC': 'F5',
 'AAGCTAAGACCCCCGAAACC': 'F6',
 'AAACCTACCGAGCCTGGTG': 'F7',
 'GAGGAACAGCTCTTTGGACAC': 'F8',
 'TCGTCCCAACAATTATATTACTACCA': 'F9',
 'CTGTTTGTCGTAGGCAGATGG': 'R2',
 'AACGCCACTTATCCAGTGAACC': 'F10',
 'GACTCCCTAAAGCCCATGTCG': 'F11',
 'CATCTGCCTACGACAAACA': 'F12',
 'ACAGCCATTCTCATCCAAACCC': 'F13',
 'AACCACGTTCTCCTGATCAAA': 'F14',
 'GATATCGCCGATACGGTTG': 'R3',
 'AGCGGATGAGTAAGAAGATTCC': 'R4',
 'TTGAAGAAGGCGTGGGTACAG': 'R5',
 'TTCATCCCTGTAGCATTGTTCG': 'F15',
 'TTGCTCATCAGTTGATGATACG': 'F16',
 'CACTCTGTTCGCAGCAGTATG': 'F17',
 'CATCATCGAAACCGCAAAC': 'F18',
 'TTTCTCCAACATACTCGGATTC': 'F19',
 'ACAAACAATGGTCAACCAGTAAC': 'F20',
 'TCCAAAGACAACCATCATTCC': 'F21',
 'TTATCGGAATGGGAGGTGATTC': 'R6',
 'TACTCACCAGACGCCTCAACCG': 'F22',
 'AGTCCCACCCTCACACGATTC': 'F23',
 'CGCCTACACAATTCTCCGATC': 'F24',
 'CGGTTGTTGATGGGTGAGTC'

# BWA

In [13]:
complementarity = {'A' : 'T',
                   'C' : 'G',
                   'G' : 'C',
                   'T' : 'A',
                   'N' : 'N'
                   }


## Perform search

In [14]:
primers["Sequence 5'"].values.tolist()

['CCGCTTCTGGCCACAGCACT',
 'GGTTGGTCAATTTCGTGCCAG',
 'ACTTGGGTTAATCGTGTGACC',
 'CATCAAGCACGCAGCAATG',
 'CTCACCACCTCTTGCTCAGC',
 'CTTGACCGCTCTGAGCTAAAC',
 'AAGCTAAGACCCCCGAAACC',
 'AAACCTACCGAGCCTGGTG',
 'GAGGAACAGCTCTTTGGACAC',
 'TCGTCCCAACAATTATATTACTACCA',
 'CTGTTTGTCGTAGGCAGATGG',
 'AACGCCACTTATCCAGTGAACC',
 'GACTCCCTAAAGCCCATGTCG',
 'CATCTGCCTACGACAAACA',
 'ACAGCCATTCTCATCCAAACCC',
 'AACCACGTTCTCCTGATCAAA',
 'GATATCGCCGATACGGTTG',
 'AGCGGATGAGTAAGAAGATTCC',
 'TTGAAGAAGGCGTGGGTACAG',
 'TTCATCCCTGTAGCATTGTTCG',
 'TTGCTCATCAGTTGATGATACG',
 'CACTCTGTTCGCAGCAGTATG',
 'CATCATCGAAACCGCAAAC',
 'TTTCTCCAACATACTCGGATTC',
 'ACAAACAATGGTCAACCAGTAAC',
 'TCCAAAGACAACCATCATTCC',
 'TTATCGGAATGGGAGGTGATTC',
 'TACTCACCAGACGCCTCAACCG',
 'AGTCCCACCCTCACACGATTC',
 'CGCCTACACAATTCTCCGATC',
 'CGGTTGTTGATGGGTGAGTC',
 'AAATGGGCCTGTCCTTGTAG',
 'TCATAAGGGCTATCGTAGTTTTC',
 'GTGGCAAGAAATGGGCTAC',
 'AACATATAACTGAACTCCTCACACC',
 'GCCGCAGTACTCTTAAAACTAGG',
 'AGGACTCAACATACTAGTCACAGC',
 'GCCATACTAGTCTTTGCCGC',
 'GC

In [15]:
result = search_positions(primers["Sequence 5'"].values.tolist(), mt_seq, K=100)
result = result.dropna().reset_index(drop=True)

In [16]:
result.head(10)

Unnamed: 0,Pattern,Start-Position,End-Position,Strand
0,CCGCTTCTGGCCACAGCACT,314.0,333.0,+
1,GGTTGGTCAATTTCGTGCCAG,873.0,893.0,+
2,CATCAAGCACGCAGCAATG,756.0,774.0,+
3,CTCACCACCTCTTGCTCAGC,1234.0,1253.0,+
4,CTTGACCGCTCTGAGCTAAAC,1657.0,1677.0,+
5,AAGCTAAGACCCCCGAAACC,1892.0,1911.0,+
6,AAACCTACCGAGCCTGGTG,1993.0,2011.0,+
7,GAGGAACAGCTCTTTGGACAC,2105.0,2125.0,+
8,TCGTCCCAACAATTATATTACTACCA,10773.0,10798.0,+
9,AACGCCACTTATCCAGTGAACC,11001.0,11022.0,+


In [17]:
result['Pattern'].unique().shape

(93,)

In [18]:
result['Label'] = result.Pattern.map(labels)
result

Unnamed: 0,Pattern,Start-Position,End-Position,Strand,Label
0,CCGCTTCTGGCCACAGCACT,314.0,333.0,+,F1
1,GGTTGGTCAATTTCGTGCCAG,873.0,893.0,+,F2
2,CATCAAGCACGCAGCAATG,756.0,774.0,+,F3
3,CTCACCACCTCTTGCTCAGC,1234.0,1253.0,+,F4
4,CTTGACCGCTCTGAGCTAAAC,1657.0,1677.0,+,F5
...,...,...,...,...,...
88,GTGCTTTCTCGTGTTACATCG,9382.0,9403.0,-,R20
89,GAAAGTTGAGCCAATAATGACG,9826.0,9848.0,-,R21
90,AGTTACAATATGGGAGATTATTCC,6656.0,6680.0,-,R22
91,ATCTGTTTTTAAGCCTAATGTGG,8081.0,8104.0,-,R23


In [19]:
result.to_excel('96-Primers-Alignment.xlsx', index=False)

# Map positions into metadata in GFF

## Analyzing GFF

In [20]:
gff_info.Type.unique()

array(['region', 'gene', 'tRNA', 'exon', 'rRNA', 'sequence_feature',
       'CDS', 'D_loop'], dtype=object)

In [21]:
gff_info = gff_info.loc[gff_info.Location == mito_chrom].reset_index(drop=True)

In [22]:
gff_info.loc[gff_info.Type == 'gene']

Unnamed: 0,Id,Type,Start,End,Strand,Location
1,gene60922,gene,577,647,+,NC_012920.1
4,gene60923,gene,648,1601,+,NC_012920.1
7,gene60924,gene,1602,1670,+,NC_012920.1
10,gene60925,gene,1671,3229,+,NC_012920.1
14,gene60926,gene,3230,3304,+,NC_012920.1
17,gene60927,gene,3307,4262,+,NC_012920.1
19,gene60928,gene,4263,4331,+,NC_012920.1
22,gene60929,gene,4329,4400,-,NC_012920.1
25,gene60930,gene,4402,4469,+,NC_012920.1
28,gene60931,gene,4470,5511,+,NC_012920.1


## Mapping

### Map GFF elements into positions

In [23]:
pos_info = defaultdict(lambda: {'elements' : set(), 'primers' : set()})

In [24]:
gff_transcript = gff_info.loc[gff_info.Type == 'gene'].reset_index(drop=True)

for i in tqdm(range(1, len(mt_seq)+1)):
    mapped_df = gff_transcript.loc[gff_transcript.apply(lambda row: between(row['Start'], row['End'], i),
                                                        axis=1)]
    pos_info[i]['elements'] = pos_info[i]['elements'].union(set(mapped_df.Id.unique()))

100%|██████████| 16569/16569 [00:48<00:00, 342.08it/s]


In [25]:
pos_info[7444]

{'elements': {'gene60937'}, 'primers': set()}

### Map Primers into positions

In [26]:
location_info = []
for row in result.iterrows():
    r = row[1]
    # See if primer is located in a gene and in the opposite strand (complementary to the transcript)
    df = map_element_with_gff(r['Start-Position'],
                              r['End-Position'],
                              gff_transcript,
                              #strand=r['Strand']
                             )
    if len(df) != 0:
        # When gene is in - and primer in +
        if r['Strand'] == '+':
            start = int(r['Start-Position'])
            end = start + window - 1 
        elif r['Strand'] == '-':
            end = int(r['End-Position'])
            start = end - window +1
            
        for i in range(start, end+1):
            pos_info[i]['primers'].add(r['Label'])
            
        location_info.append((int(r['Start-Position']), r['Label'], df.loc[0, 'Id']))

print(len(location_info))
print('-'*100)
print('Primers that are complementary to the transcripts')
print('-'*100)
for l in sorted(location_info):
        print('Primer {} located in gene {} at {}'.format(l[1], l[2], l[0]))

87
----------------------------------------------------------------------------------------------------
Primers that are complementary to the transcripts
----------------------------------------------------------------------------------------------------
Primer F3 located in gene gene60923 at 756
Primer F35 located in gene gene60923 at 870
Primer F2 located in gene gene60923 at 873
Primer R1 located in gene gene60923 at 900
Primer F4 located in gene gene60923 at 1234
Primer F26 located in gene gene60923 at 1348
Primer R8 located in gene gene60923 at 1379
Primer F36 located in gene gene60925 at 1709
Primer F6 located in gene gene60925 at 1892
Primer F7 located in gene gene60925 at 1993
Primer F8 located in gene gene60925 at 2105
Primer F27 located in gene gene60925 at 2238
Primer F56 located in gene gene60925 at 2579
Primer F38 located in gene gene60925 at 2982
Primer R12 located in gene gene60925 at 2985
Primer F39 located in gene gene60926 at 3234
Primer F40 located in gene gene60927 

In [27]:
pos_info[13123]

{'elements': {'gene60953'}, 'primers': {'R4'}}

## Extract covered and uncovered regions when using primers for sequencing the transcripts

In [28]:
regions = dict()

tmp_cov = []
tmp_uncov = []
for i in tqdm(range(1, len(mt_seq)+1)):
    if len(pos_info[i]['primers']) > 0:
        tmp_cov.append(i)
    elif len(pos_info[i]['elements']) > 0:
        tmp_uncov.append(i)
        
regions['covered'] = list(interval_extract(tmp_cov))
regions['uncovered'] = list(interval_extract(tmp_uncov))
    

100%|██████████| 16569/16569 [00:00<00:00, 1128685.49it/s]


In [29]:
print('-'*100)
print('Covered regions')
print('-'*100)
for r in regions['covered']:
    size = r[1] - r[0] + 1
    print('Fragment size: {} - Located at {}-{}'.format(size, r[0], r[1]))
    
print('-'*100)
print('Uncovered regions')
print('-'*100)
for r in regions['uncovered']:
    size = r[1] - r[0] + 1
    print('Fragment size: {} - Located at {}-{}'.format(size, r[0], r[1]))

----------------------------------------------------------------------------------------------------
Covered regions
----------------------------------------------------------------------------------------------------
Fragment size: 267 - Located at 756-1022
Fragment size: 264 - Located at 1234-1497
Fragment size: 150 - Located at 1580-1729
Fragment size: 496 - Located at 1892-2387
Fragment size: 150 - Located at 2579-2728
Fragment size: 275 - Located at 2857-3131
Fragment size: 150 - Located at 3234-3383
Fragment size: 183 - Located at 3408-3590
Fragment size: 231 - Located at 3850-4080
Fragment size: 150 - Located at 4196-4345
Fragment size: 150 - Located at 4613-4762
Fragment size: 150 - Located at 4797-4946
Fragment size: 150 - Located at 5032-5181
Fragment size: 228 - Located at 5318-5545
Fragment size: 255 - Located at 5700-5954
Fragment size: 439 - Located at 6242-6680
Fragment size: 150 - Located at 6809-6958
Fragment size: 254 - Located at 7075-7328
Fragment size: 194 - Locate

## Construct additional primers for regions that are not covered (only if minimum of 25 bp uncovered)


In [30]:
## Loop through uncovered regions, if positive strand take the start plus the first 20 bp,
##                                 if negative strand take the reverse complement of the end - 20bp
##                                 e.g. start=20, end=150, take 130-150 then reverse complement
new_primers = dict()
for i in regions["uncovered"]:
    if i[1] - i[0] + 1 >= 25:
        curr_gene = list(pos_info[i[0]+2]["elements"])[0]
        curr = gff_transcript[gff_transcript['Id']==list(pos_info[i[0]]['elements'])[0]]
        if curr["Strand"].values[0] == '+':
            new_primers["F_p" + str(i[0])  ] = [str(mt_seq_biopython[i[0]:i[0]+20]), curr_gene]
        elif curr["Strand"].values[0] == "-":
            new_primers["R_p" + str(i[1])] = [str(mt_seq_biopython[i[1]-20:i[1]].reverse_complement()), curr_gene]
        else:
            print("No strand information")
print(len(new_primers))
new_primers

44


{'F_p577': ['TTTATGTAGCTTACCTCCTC', 'gene60922'],
 'F_p1023': ['GAAAGTGGCTTTAACATATC', 'gene60923'],
 'F_p1498': ['TCAAGTATACTTCAAAGGAC', 'gene60923'],
 'F_p1730': ['ACCCAAATAAAGTATAGGCG', 'gene60925'],
 'F_p2388': ['CAATCAACCAACAAGTCATT', 'gene60925'],
 'F_p2729': ['ATGGAGCTTTAATTTATTAA', 'gene60925'],
 'F_p3132': ['ACAAGAGAAATAAGGCCTAC', 'gene60925'],
 'F_p3591': ['GTCAACCTCAACCTAGGCCT', 'gene60927'],
 'F_p4081': ['TTGTCACCAAGACCCTACTT', 'gene60927'],
 'R_p4400': ['TAGGATGGGGTGTGATAGGT', 'gene60929'],
 'F_p4402': ['GTAAGGTCAGCTAAATAAGC', 'gene60930'],
 'F_p4763': ['ATAATAGCTATAGCAATAAA', 'gene60931'],
 'F_p4947': ['TATCCATCATAGCAGGCAGT', 'gene60931'],
 'F_p5182': ['ATGACTAACACCCTTAATTC', 'gene60931'],
 'F_p5546': ['AAGCCCTCAGTAAGTTGCAA', 'gene60932'],
 'R_p5655': ['AAGGGCTTAGCTTAATTAAA', 'gene60933'],
 'R_p5699': ['GCTGTTAACTAAGTGTTTGT', 'gene60934'],
 'F_p5955': ['TATACCTATTATTCGGCGCA', 'gene60937'],
 'F_p6681': ['ACTACTCCGGAAAAAAAGAA', 'gene60937'],
 'F_p6959': ['CTGACTGGCATTGTATTA

## Combine the primers that map and the additional primers and save to csv (Name, Sequence)

In [31]:
final_primers = pd.DataFrame(columns=["Sequence", "Gene"])#, index="Name")
for l in location_info:
    curr_seq = primers[primers["Name"] == l[1]]["Sequence 5'"].values[0]
    final_primers = pd.concat((final_primers, 
                               pd.DataFrame({"Sequence":curr_seq, "Gene":l[2]},index=[l[1]])))
final_primers

for p in new_primers:
    final_primers = pd.concat((final_primers, 
                               pd.DataFrame({"Sequence":new_primers[p][0], "Gene":new_primers[p][1]},
                                            index=[p])))
final_primers.sort_index().to_csv("final_primers.csv")
final_primers

Unnamed: 0,Sequence,Gene
F2,GGTTGGTCAATTTCGTGCCAG,gene60923
F3,CATCAAGCACGCAGCAATG,gene60923
F4,CTCACCACCTCTTGCTCAGC,gene60923
F6,AAGCTAAGACCCCCGAAACC,gene60925
F7,AAACCTACCGAGCCTGGTG,gene60925
...,...,...
R_p14700,GTCGTGGTTGTAGTCCGTGC,gene60954
F_p14851,CTCCTTGGCGCCTGCCTGAT,gene60956
F_p15410,CTTACTACACAATCAAAGAC,gene60956
F_p15724,CTCCTAGCCGCAGACCTCCT,gene60956


## Map to MT regardless genes

In [None]:
location_info2 = []
for row in result.iterrows():
    r = row[1]

    # When gene is in - and primer in +
    if r['Strand'] == '+':
        start = int(r['Start-Position'])
        end = start + window - 1 
    elif r['Strand'] == '-':
        end = int(r['End-Position'])
        start = end - window +1

    for i in range(start, end+1):
        location_info2.append(i)

In [None]:
regions2 = dict()

tmp_cov = []
tmp_uncov = []
for i in tqdm(range(1, len(mt_seq)+1)):
    if i in location_info2:
        tmp_cov.append(i)
    else:
        tmp_uncov.append(i)
        
regions2['covered'] = list(interval_extract(tmp_cov))
regions2['uncovered'] = list(interval_extract(tmp_uncov))
    

In [None]:
print('-'*100)
print('Covered regions')
print('-'*100)
for r in regions2['covered']:
    size = r[1] - r[0] + 1
    print('Fragment size: {} - Located at {}-{}'.format(size, r[0], r[1]))
    
print('-'*100)
print('Uncovered regions')
print('-'*100)
for r in regions2['uncovered']:
    size = r[1] - r[0] + 1
    print('Fragment size: {} - Located at {}-{}'.format(size, r[0], r[1]))

## Double check by loading in the genes fasta

In [68]:
mt_genes = SeqIO.to_dict(SeqIO.parse("MT.genes.fasta", "fasta"))
mt_genes

primers_used = set()
genes_seen = set()

pos_primers_used = set()
pos_genes_seen = set()
for gene in tqdm(mt_genes):
    if 'gene' in gene:
        curr_for = mt_genes[gene].seq
        curr_rev = mt_genes[gene].seq.reverse_complement()
        #print(curr_rev)
        for p_ind, p_val in final_primers.iterrows():
    #             print('p_ind',p_ind)
    #             print('p_val',p_val["Sequence"])
            if p_val["Sequence"] in curr_rev:
                primers_used.add(p_ind)
                genes_seen.add(gene)

            if p_val["Sequence"] in curr_for:
                pos_primers_used.add(p_ind)
                pos_genes_seen.add(gene)

primers_used

100%|██████████| 100/100 [00:00<00:00, 142.66it/s]


{'F21',
 'F33',
 'F34',
 'F36',
 'F46',
 'F62',
 'R1',
 'R10',
 'R11',
 'R12',
 'R13',
 'R15',
 'R16',
 'R17',
 'R19',
 'R2',
 'R20',
 'R21',
 'R22',
 'R23',
 'R24',
 'R3',
 'R4',
 'R5',
 'R6',
 'R8',
 'R9'}

In [69]:
len(primers_used)

27

In [70]:
pos_primers_used

{'F10',
 'F11',
 'F12',
 'F13',
 'F14',
 'F15',
 'F16',
 'F18',
 'F19',
 'F2',
 'F22',
 'F23',
 'F24',
 'F26',
 'F27',
 'F28',
 'F29',
 'F3',
 'F30',
 'F31',
 'F32',
 'F35',
 'F37',
 'F38',
 'F39',
 'F4',
 'F40',
 'F41',
 'F42',
 'F44',
 'F45',
 'F47',
 'F48',
 'F49',
 'F50',
 'F51',
 'F52',
 'F53',
 'F54',
 'F55',
 'F56',
 'F57',
 'F58',
 'F59',
 'F6',
 'F60',
 'F61',
 'F63',
 'F64',
 'F65',
 'F66',
 'F67',
 'F68',
 'F69',
 'F7',
 'F70',
 'F72',
 'F8',
 'F9',
 'F_p10072',
 'F_p1023',
 'F_p10325',
 'F_p10923',
 'F_p11151',
 'F_p11804',
 'F_p12097',
 'F_p12943',
 'F_p13124',
 'F_p13344',
 'F_p13777',
 'F_p14049',
 'F_p14851',
 'F_p1498',
 'F_p15410',
 'F_p15724',
 'F_p1730',
 'F_p2388',
 'F_p2729',
 'F_p3132',
 'F_p3591',
 'F_p4081',
 'F_p4402',
 'F_p4763',
 'F_p4947',
 'F_p5182',
 'F_p5546',
 'F_p577',
 'F_p5955',
 'F_p6681',
 'F_p6959',
 'F_p7329',
 'F_p7518',
 'F_p7793',
 'F_p8482',
 'F_p8791',
 'F_p9060',
 'F_p9459',
 'R14',
 'R_p14469',
 'R_p14700',
 'R_p16023',
 'R_p4400',
 'R_p56

In [72]:
len(pos_primers_used)

104

In [73]:
len(pos_genes_seen)

27

In [53]:
len(final_primers)

131

In [57]:
len(primers_used)

29