# 5. Mitochondrial genome assembly
## 5.1 mitobim assembly
### 5.1.1 Choose parameters
Choose a reference mitochondrial genome for each sample, and the maximum number of mismatches allowed per read:

In [29]:
# Per-sample mitobim settings, keyed by sample ID.
# Each value is a two-item list:
#   [reference mitochondrial genome ID, maximum mismatches per read]
smpl2ref = {
    'MentL30': ['NC_024097.1', 15],
    'MareHarA': ['NC_026554.1', 4],
    'MjavVW4': ['NC_026556.1', 1],
    'MjavVW5': ['NC_026556.1', 1],
    'MareL28': ['NC_026554.1', 1],
    'MincA14': ['NC_024097.1', 1],
    'MjavLD17': ['NC_026556.1', 1],
    'MincL27': ['NC_024097.1', 1],
    'MjavLD15': ['NC_026556.1', 1],
    'MjavL57': ['NC_026556.1', 1],
    'MincVW6': ['NC_024097.1', 1],
    'MincL9': ['NC_024097.1', 1],
    'Minc557R': ['NC_024097.1', 1],
    'MincL19': ['NC_024097.1', 1],
    'MareL32': ['NC_026554.1', 4],
    'MincHarC': ['NC_024097.1', 1],
    'MfloSJF1': ['NC_024097.1', 15],
    'MfloJB5': ['NC_024097.1', 6],
}

### 5.1.2 Prepare a subset of 25M reads

In [None]:
def run_parallel_subset(smpl2ref):
    """Create the 25M-read subset file for every sample, one thread per sample.

    For each key of ``smpl2ref`` the trimmed read file
    ``raw_reads/<sample>_1_trimmomatic.fastq.gz`` is inspected. If it holds
    more than 25,000,000 reads, ``misc.sed_subset_fastq_gz`` is called to
    write ``raw_reads/<sample>_25M_SE_subset.fastq.gz``; otherwise that
    path is created as a symbolic link to the trimmed file.

    All threads are started together and the function returns only after
    every one of them has been joined.

    Args:
        smpl2ref: dict keyed by sample ID. Only the keys are used here.
    """
    import threading
    import os, misc

    max_reads = 25000000

    class MyThread(threading.Thread):
        """Worker that prepares the subset file for a single sample."""

        def __init__(self, smpl):
            threading.Thread.__init__(self)
            self.smpl = smpl

        def run(self):
            trimmed = 'raw_reads/%s_1_trimmomatic.fastq.gz' % self.smpl
            # input still has old sample IDs for L17 and L15
            trimmed = (trimmed.replace('MjavLD17', 'MincL17')
                              .replace('MjavLD15', 'MincL15'))
            # The subset output will have the new sample IDs
            subset = 'raw_reads/%s_25M_SE_subset.fastq.gz' % self.smpl

            # How many reads in the trimmed reads file? Counts the lines
            # that consist of a lone '+'.
            # NOTE(review): assumes misc.execute_cline runs the command
            # through a shell and returns (stdout, stderr) - confirm.
            out, err = misc.execute_cline(
                'zcat %s | grep -c "^+$"' % trimmed)
            if int(out) > max_reads:  # then get a subset
                misc.sed_subset_fastq_gz(trimmed, max_reads, subset)
            else:  # just make a link
                # A relative symlink target is resolved against the
                # directory that holds the link, so linking to
                # 'raw_reads/<file>' from inside raw_reads/ would dangle.
                # Link to the absolute path of the trimmed file instead.
                out, err = misc.execute_cline(
                    'ln -s %s %s' % (os.path.abspath(trimmed), subset))

    threads = [MyThread(smpl) for smpl in smpl2ref.keys()]

    for t in threads:
        t.start()

    for t in threads:
        t.join()
        
# Process the samples in two consecutive batches (the first ten, then the
# rest). Each run_parallel_subset call starts one thread per sample and
# waits for all of them, so the second batch starts only after the first
# has finished.
# Materialise the keys as a list before slicing: on Python 3, dict.keys()
# returns a view that cannot be sliced. On Python 2 this is equivalent to
# slicing smpl2ref.keys() directly.
sample_ids = list(smpl2ref.keys())
smpl2ref1 = {key: smpl2ref[key] for key in sample_ids[:10]}
smpl2ref2 = {key: smpl2ref[key] for key in sample_ids[10:]}
run_parallel_subset(smpl2ref1)
run_parallel_subset(smpl2ref2)

### 5.1.3 Run mitobim

In [32]:
import misc
# Command line templates.
# cdcline wraps a command so that it runs from inside a work directory:
# filled with (workdir, cline).
cdcline = "cd %s && %s && cd .." #%(workdir, cline)
# mitobimcline placeholders: {0} = sample ID, {1} = reference ID,
# {2} = value passed to --missmatch.
mitobimcline = (
    "MITObim_1.8.pl -start 0 -end 40 --clean "
    "-sample {0} -ref {1} "
    "-readpool raw_reads/{0}_25M_SE_subset.fastq.gz "
    "--quick ../mito_references/{1}_genes.fasta "
    "--missmatch {2} "
)

# Run mitobim once per sample, each in its own "<sample>_mitobim" directory.
for smpl, params in smpl2ref.items():
    ref, mismatch = params[0], params[1]
    workdir = smpl + "_mitobim"
    misc.makedir(workdir)
    cline = cdcline % (workdir, mitobimcline.format(smpl, ref, mismatch))
    out, err = misc.execute_cline(cline)
    # Only the first returned value is saved to <workdir>/log; err is not
    # written anywhere.
    with open(workdir + '/log', 'wt') as log:
        log.write(out)