In [1]:
from Bio import SeqIO
import click
import pandas as pd
import sys
sys.path.append("../")
from src.config import ROOT_DIR
from os.path import join

import tqdm

Project Directory: /data2/mito_lineage


In [2]:
ref_fa = join(ROOT_DIR, "data/processed/genomes/refdata-cellranger-GRCh38-3.0.0/MT.fasta")
out_f = join(ROOT_DIR, "data/processed/amplicons/MT_amplicons.bed")

In [3]:
chromosome="MT" 
amplicon_length=150 
pcr_length=(18,22)


In [4]:
record_dict = SeqIO.to_dict(SeqIO.parse(ref_fa, "fasta"))
try:
    seq = record_dict[chromosome]
except KeyError:
    print(f"{chromosome} not found in {ref_fa}. Please check if you have the proper name")
    
print(seq)

ID: MT
Name: MT
Description: MT dna:chromosome chromosome:GRCh38:MT:1:16569:1 REF
Number of features: 0
Seq('GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGG...ATG', SingleLetterAlphabet())


### Create bed file of the MT regions broken up into approximate pcr sizes

In [5]:
bed_df = pd.DataFrame(columns=["chrom", "chromStart", "chromEnd", "name", "score", "strand"])
#sequences = SeqIO.fas
for ind in tqdm.tqdm(range(50,len(seq)-100,amplicon_length)):
    bed_df = pd.concat((bed_df, pd.DataFrame({"chrom":chromosome,
                                    "chromStart":ind, "chromEnd":min(ind+amplicon_length, len(seq)-2), "name":ind, "score":0, "strand":"+"},
                                           index=[ind])))

bed_df.to_csv(out_f, header=None, sep="\t", index=False)
#SeqIO.write(sequences,open(out_f + ".fasta", "w"))
bed_df

100%|██████████| 110/110 [00:00<00:00, 196.52it/s]


Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand
50,MT,50,200,50,0,+
200,MT,200,350,200,0,+
350,MT,350,500,350,0,+
500,MT,500,650,500,0,+
650,MT,650,800,650,0,+
...,...,...,...,...,...,...
15800,MT,15800,15950,15800,0,+
15950,MT,15950,16100,15950,0,+
16100,MT,16100,16250,16100,0,+
16250,MT,16250,16400,16250,0,+


### Make much fewer number of amplicons and see if works

In [7]:
out_f = join(ROOT_DIR, "data/processed/amplicons/MT_amplicons_every100bp.bed")

bed_df = pd.DataFrame(columns=["chrom", "chromStart", "chromEnd", "name", "score", "strand"])
#sequences = SeqIO.fas
for ind in tqdm.tqdm(range(500,len(seq)-500,amplicon_length+100)):
    bed_df = pd.concat((bed_df, pd.DataFrame({"chrom":chromosome,
                                    "chromStart":ind, "chromEnd":min(ind+amplicon_length, len(seq)-2), "name":ind, "score":0, "strand":"+"},
                                           index=[ind])))

bed_df.to_csv(out_f, header=None, sep="\t", index=False)
#SeqIO.write(sequences,open(out_f + ".fasta", "w"))
bed_df

100%|██████████| 63/63 [00:00<00:00, 188.87it/s]


Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand
500,MT,500,650,500,0,+
750,MT,750,900,750,0,+
1000,MT,1000,1150,1000,0,+
1250,MT,1250,1400,1250,0,+
1500,MT,1500,1650,1500,0,+
...,...,...,...,...,...,...
15000,MT,15000,15150,15000,0,+
15250,MT,15250,15400,15250,0,+
15500,MT,15500,15650,15500,0,+
15750,MT,15750,15900,15750,0,+


## Save as chrM instead of MT for UCSC naming

In [8]:
out_f = join(ROOT_DIR, "data/processed/amplicons/chrM_amplicons_every100bp.bed")

bed_df = pd.DataFrame(columns=["chrom", "chromStart", "chromEnd", "name", "score", "strand"])
#sequences = SeqIO.fas
for ind in tqdm.tqdm(range(500,len(seq)-500,amplicon_length+100)):
    bed_df = pd.concat((bed_df, pd.DataFrame({"chrom":chromosome,
                                    "chromStart":ind, "chromEnd":min(ind+amplicon_length, len(seq)-2), "name":ind, "score":0, "strand":"+"},
                                           index=[ind])))
bed_df["chrom"] = "chrM"
bed_df.to_csv(out_f, header=None, sep="\t", index=False)
#SeqIO.write(sequences,open(out_f + ".fasta", "w"))
bed_df

100%|██████████| 63/63 [00:00<00:00, 234.08it/s]


Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand
500,chrM,500,650,500,0,+
750,chrM,750,900,750,0,+
1000,chrM,1000,1150,1000,0,+
1250,chrM,1250,1400,1250,0,+
1500,chrM,1500,1650,1500,0,+
...,...,...,...,...,...,...
15000,chrM,15000,15150,15000,0,+
15250,chrM,15250,15400,15250,0,+
15500,chrM,15500,15650,15500,0,+
15750,chrM,15750,15900,15750,0,+


## Create the sequences into fasta

In [5]:
out_f = join(ROOT_DIR, "data/processed/amplicons/amplicons_every100bp.fasta")

sequences = dict()

for ind in tqdm.tqdm(range(500,len(seq)-500,amplicon_length+100)):
    print(ind)
    sequences[f"{ind}-{ind+100+amplicon_length}"] = str(seq.seq[ind:min(ind+500, len(seq))])



lines = ""
for i in sequences:
    lines = f"{lines}>{i}\n{sequences[i]}\n" 
with open(out_f , "w") as f:
    f.write(lines)
    
sequences

100%|██████████| 63/63 [00:00<00:00, 7793.80it/s]

500
750
1000
1250
1500
1750
2000
2250
2500
2750
3000
3250
3500
3750
4000
4250
4500
4750
5000
5250
5500
5750
6000
6250
6500
6750
7000
7250
7500
7750
8000
8250
8500
8750
9000
9250
9500
9750
10000
10250
10500
10750
11000
11250
11500
11750
12000
12250
12500
12750
13000
13250
13500
13750
14000
14250
14500
14750
15000
15250
15500
15750
16000





{'500-750': 'CCATCCTACCCAGCACACACACACCGCTGCTAACCCCATACCCCGAACCAACCAAACCCCAAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAAATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAAGCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGAACAAGCATCAAGCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAAACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAACT',
 '750-1000': 'ACAAGCATCAAGCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAAACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAACTCCAGTTGACACAAAATAGACTACGAAAGTGGCTTTAACATATCTGAACACACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAGAACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGGAGCCTGTTCTGTAATCGATA