# 2. Get genes, CDSs and protein for samples without assemblies
## 2.1. Map and assemble genes with BWA
Map the read sub sample from the `mitogenomes` directory to the baits to get a "genes" fasta for each sample. Only retain the backbone file of iteration1 and delete everything else.

### 2.1.1 Determine which reference sample to use for each sample, and what mismatch percentage
The javanica and incognita samples are expected to be almost identical to their reference genomes, and the arenaria samples slightly divergent. It is important to use a low mismatch factor to avoid spurious mapping, but not one so low that homologous but divergent reads fail to map.

In [14]:
# For every sample without an assembly: the reference sample whose genes are
# used as mapping baits, and the maximal mismatch handed to `bwa aln -n`.
smpl2ref = {
    # sample:   [reference,  max mismatch]
    'MjavVW5':  ['MjavVW4',  2],
    'MareL28':  ['MareHarA', 5],
    'MincA14':  ['MincW1',   2],
    'MincL17':  ['MjavVW4',  2],
    'MincL27':  ['MincW1',   2],
    'MincL15':  ['MjavVW4',  2],
    'MjavL57':  ['MjavVW4',  2],
    'MincVW6':  ['MincW1',   2],
    'MincL9':   ['MincW1',   2],
    'Minc557R': ['MincW1',   2],
    'MincL19':  ['MincW1',   2],
    'MareL32':  ['MareHarA', 5],
    'MincHarC': ['MincW1',   2],
}

# SRA accession and downloaded file names of the raw reads of each sample.
# Still empty: the download procedure sketched below is not implemented yet.
raw_data = {
    # smpl_id: [sra_accession, read1_fname, read2_fname]
} # to do

# The directory in which the trimming step looks for the raw fastq files
misc.makedir('raw_reads')

# Planned download procedure (pseudo-code only):
# for smpl_id in raw_data:
#     get sra accession
#     dump fastq files to raw_reads
#     rename read1_fname to smpl_id_1.fastq.gz
#     rename read2_fname to smpl_id_2.fastq.gz

### 2.1.2 Download the read data from SRA and quality trim

In [None]:
import misc, re, os


# Parts of the Trimmomatic command line that are identical for all samples
adapters = "../mitogenomes/adapters.fasta"
a = "java -jar /usr/bin/trimmomatic.jar PE -phred33 %s %s %s %s %s %s "
b = "ILLUMINACLIP:%s:2:30:10 LEADING:30 TRAILING:30 SLIDINGWINDOW:4:15 MINLEN:80"
c = a + b

for smpl in smpl2ref:

    # Raw paired-end read files of the sample
    raw1 = "raw_reads/%s_1.fastq.gz" % smpl
    raw2 = "raw_reads/%s_2.fastq.gz" % smpl

    # Trimmomatic output for read pairs (paired) and single reads (unpaired)
    pairedtrimmed1 = raw1.replace('.fastq.gz', '_trimmomatic.fastq.gz')
    pairedtrimmed2 = raw2.replace('.fastq.gz', '_trimmomatic.fastq.gz')
    unpairedtrimmed1 = raw1.replace('.fastq.gz', '_trimmomatic_up.fastq.gz')
    unpairedtrimmed2 = raw2.replace('.fastq.gz', '_trimmomatic_up.fastq.gz')

    # Only trim samples that have no trimmed output yet
    if not os.path.exists(pairedtrimmed1):
        cline = c % (raw1,
                     raw2,
                     pairedtrimmed1,
                     unpairedtrimmed1,
                     pairedtrimmed2,
                     unpairedtrimmed2,
                     adapters)

        err, out = misc.execute_cline(cline)


### 2.1.3 Run BWA
#### 2.1.3.1 Prerequisites

In [None]:
#--------------------------------
# File and command line templates
# -------------------------------

# Path templates. `readpool` is filled with (sample, read number), `ref` with
# a reference sample name and `bwadir` with a sample name.
# `ref` is written relative to a `bwadir` sub-directory of cwd;
# index_bwa_refs drops its first character to reach the same file from cwd.
readpool = "raw_reads/%s_%i_trimmomatic.fastq.gz"
ref = "../all_gene_ref_reviewed/%s_ref.nt.fasta"
bwadir = "./%s_bwa/"

In [None]:
# Command line templates. The trailing comment of each template lists the
# values it is filled with, in order.
# cdcline wraps another command so that it runs inside a sample's directory.
cdcline = "cd %s && %s && cd .." #%(bwadir, cline)
indexcline = "bwa index %s"   #%(ref)
aligncline = "bwa aln -n %i %s %s > %s%i.sai" #%(missmatch, ref, readpool, smpl, read)
sampecline = "bwa sampe %s %s1.sai %s2.sai %s %s > %s.sam" #%(ref, smpl, smpl, readpool1, readpool2, smpl)
sortcline  = "samtools sort -o %s.bam -T %s -@ 1 -O bam %s.sam" #%(smpl, smpl, smpl)
idxbamcline= "samtools index %s.bam" #%(smpl)
fbayescline= "freebayes -f %s -p 1 --report-monomorphic %s.bam > %s.vcf" #%(ref, smpl, smpl)

#---------------------------------------------------------------
# A function for a single run that happens in a new subdirectory
#---------------------------------------------------------------

def index_bwa_refs(smpl2ref):
    """Run ``bwa index`` once for every reference named in smpl2ref.

    The reference sample is the first item of each smpl2ref value. The
    module-level ``ref`` template is written relative to a per-sample
    sub-directory, so its first character is dropped in order to address the
    same fasta file from the current directory. The result of each command is
    handed to ``misc.printoe``.
    """

    import misc

    # Several samples share a reference, index each reference only once
    distinct_refs = {settings[0] for settings in smpl2ref.values()}

    for ref_name in distinct_refs:
        ref_fasta = ref[1:] % ref_name
        result = misc.execute_cline(indexcline % ref_fasta)
        misc.printoe(result)
        


def run_map_assembly(smpl, smpl2ref, vcf=True, fasta=True):
    """Map the trimmed reads of one sample to its reference genes with BWA.

    This is a very context dependent function: it relies on the module-level
    path and command line templates and runs every command inside the
    sample's own sub-directory (``bwadir % smpl``).

    The workflow is ``bwa aln`` for each read file, ``bwa sampe``,
    ``samtools sort`` and removal of the intermediate files. Then, if
    requested, the bam is indexed and a vcf is made with freebayes, and the
    vcf is converted to a genes fasta. Finally the vcf is deleted and the
    output directory is compressed.

    smpl:     the sample name, a key of smpl2ref
    smpl2ref: dict of sample -> [reference sample, max mismatch]
    vcf:      if true, index the bam and call variants with freebayes
    fasta:    if true, write the genes fasta from the vcf with vcf2con
              (this reads the vcf, so it only works together with vcf=True)

    Nothing is returned. The sample is skipped with a warning if the output
    seems to exist already or if the reads or the reference are not found.
    """

    import misc
    import os
    import sys
    from warnings import warn

    # A subdirectory of cwd in which the commands will run for smpl
    outdir = bwadir % smpl
    misc.makedir(outdir)

    # Specific input fpaths for smpl. The read files are located relative to
    # cwd, whereas the reference template is relative to outdir.
    smpl_readpool_1 = readpool % (smpl, 1)
    smpl_readpool_2 = readpool % (smpl, 2)
    smpl_ref = ref % smpl2ref[smpl][0]
    missmatch = smpl2ref[smpl][1]

    # The commands run after `cd outdir`, so they need the read paths
    # expressed relative to outdir rather than to cwd
    cline_readpool_1 = os.path.relpath(smpl_readpool_1, outdir)
    cline_readpool_2 = os.path.relpath(smpl_readpool_2, outdir)

    # Check the inputs exist and the output doesn't
    readpool_exists = (os.path.exists(smpl_readpool_1) and
                       os.path.exists(smpl_readpool_2))
    ref_exists = os.path.exists(outdir + smpl_ref)
    # NOTE(review): outdir ends with '/', so this tests '<outdir>/.gz'.
    # Confirm against the file name misc.compressgz actually writes.
    output_exists = os.path.exists(outdir + '.gz')

    if output_exists:
        warn('skipping %s, output exists'%smpl)
        return

    if not readpool_exists:
        warn('skipping %s, cannot access readpool'%smpl)
        return

    if not ref_exists:
        warn('skipping %s, cannot access ref'%smpl)
        return

    # Make sample specific clines
    clines = [
        # bwa aln
        aligncline % (missmatch, smpl_ref, cline_readpool_1, smpl, 1),
        aligncline % (missmatch, smpl_ref, cline_readpool_2, smpl, 2),
        # bwa sampe
        sampecline % (smpl_ref, smpl, smpl,
                      cline_readpool_1, cline_readpool_2, smpl),
        # sort sam by start
        sortcline % (smpl, smpl, smpl),
        # clean
        'rm *.*ai', 'rm *.sam',
    ]

    if vcf:
        # index bam file
        clines.append(idxbamcline % smpl)
        # make vcf file
        clines.append(fbayescline % (smpl_ref, smpl, smpl))

    # run the workflow
    for cline in clines:
        full_cline = cdcline % (outdir, cline)
        sys.stdout.write("executing %s\n" % full_cline)
        sys.stdout.flush()
        misc.printoe(
            misc.execute_cline(full_cline)
        )

    # make genes fasta
    if fasta:
        vcf2con("%s.vcf"%smpl, outdir, smpl)

    # The bam file is deliberately kept for now

    # delete vcf
    misc.printoe(
        misc.execute_cline(cdcline % (outdir, 'rm *.vcf'))
    )

    # compress the output dir
    misc.printoe(
        misc.compressgz(outdir)
    )

#---------------------------------------------------------------
# A function to convert vcf to assembly
#---------------------------------------------------------------

def vcf2con(vcfin, workdir, smpl):

    """This will use christophs vcf_2_consensus
    to take a vcf file and write separate fasta
    entries in one file for each assembled gene.

    vcfin:   name of the vcf file inside workdir
    workdir: the directory of the sample, ending with a path separator
    smpl:    the sample name, used to name the output

    The vcf lines are grouped by their first column (the gene in the
    reference file). Each group is written with the vcf header to
    workdir/tmp.vcf and converted by running
    `python vcf_2_consensus.py -f -o <smpl> tmp.vcf` inside workdir. The
    resulting <smpl>.fas is read, renamed after the gene and appended to
    workdir/<smpl>.nt.fasta. The temporary files are removed."""

    from Bio import SeqIO
    import misc
    import os

    # The conversion command is the same for all the genes
    cdcln = "cd %s && %s && cd .." #%(workdir, cline)
    executable = "python vcf_2_consensus.py"
    convcln = "%s -f -o %s tmp.vcf"%(executable,smpl)
    cline = cdcln%(workdir, convcln)

    tmp_vcf = workdir+'tmp.vcf'
    tmp_fasta = workdir+smpl+'.fas'

    def write_locus(out, header, locus, locus_vcf_lines):
        """Append the assembly of a single gene to the open fasta file"""

        # a vcf file with lines belonging only to one gene
        with open(tmp_vcf,'wt') as tmp:
            tmp.writelines(header+locus_vcf_lines)

        # Write a fasta file with an assembly of the single gene
        misc.execute_cline(cline)

        # Add the fasta assembly to the fasta file with all the
        # assembled genes
        record = SeqIO.read(tmp_fasta,'fasta')
        record.id = locus
        record.description = ''
        out.write(record.format('fasta'))
        os.remove(tmp_fasta)
        os.remove(tmp_vcf)

    header = []
    locus_vcf_lines = []
    current_locus = ''

    # output fasta file with assembled genes
    with open("%s/%s.nt.fasta"%(workdir,smpl),'wt') as out:

        with open(workdir+vcfin, 'r') as lines:

            for line in lines:

                # collect header lines in vcf
                if line.startswith('#'):
                    header.append(line)
                    continue

                # nothing to parse in an empty line
                if not line.strip():
                    continue

                # The bait name (gene in the reference file)
                line_locus = line.split()[0]

                # If this line refers to a new gene, write the
                # previous gene and start a list for the new one
                if locus_vcf_lines and line_locus != current_locus:
                    write_locus(out, header, current_locus, locus_vcf_lines)
                    locus_vcf_lines = []

                current_locus = line_locus
                locus_vcf_lines.append(line)

        # The last gene has no following line to trigger its
        # writing, so it is written here
        if locus_vcf_lines:
            write_locus(out, header, current_locus, locus_vcf_lines)
                            

#--------------------------------------------------------------
# A function to run several gene assembly processes in parallel
#--------------------------------------------------------------

def run_parallel_map_assmbly(smpl2ref):

    """ This will run run_map_assembly
    for all the samples in parallel.

    The references are indexed first with index_bwa_refs. Then one
    thread per sample of smpl2ref runs run_map_assembly, and all the
    threads are joined. Finally, the files that were created next to the
    reference fasta files (their name is the fasta name plus a further
    extension, such as the bwa index files) are removed."""

    import threading
    import os
    from glob import glob

    index_bwa_refs(smpl2ref)

    threads = [
        threading.Thread(target=run_map_assembly, args=(smpl, smpl2ref))
        for smpl in smpl2ref
    ]

    for t in threads:
        t.start()

    for t in threads:
        t.join()

    # Clean up in the directory in which index_bwa_refs made the index
    # files: the same `ref` template, addressed from cwd, for any reference
    for f in glob(ref[1:] % '*' + '.*'):
        os.remove(f)

#### 2.1.3.2 Make gene files

In [None]:
# Index the references and map-assemble the genes of all the samples in parallel
run_parallel_map_assmbly(smpl2ref)

## 2.2. Get CDSs and proteins from the map-assembled genes, using exonerate, for samples without a genome assembly
Search for a CDS in each assembled gene, using a protein as the query.  

In [None]:
def parse_a_single_record(record):
    """Parse the lines of one exonerate alignment record into a dict.

    record: list of lines, starting with the 'C4 Alignment:' line

    The returned dict has the keys
    Query, Target, Score, Target_start, Target_end: taken from fixed
        line positions in the record header, as int where the value is
        a number. ' [revcomp]' is removed from the value.
    Gene, CDS, GFF: the lines found between the start and end markers
        of each block, joined to a single string.

    Raises RuntimeError if a header line lacks its expected label.
    """

    output = {}

    def parse_line_element(ind, parter, element_name):
        elementline = record[ind]

        # The value is what follows the label
        try:
            value = elementline.rstrip().split(parter)[1]
        except IndexError:
            raise RuntimeError("%s not found"%parter)

        if ' [revcomp]' in elementline:
            value = value.replace(' [revcomp]','')
        # The range line has both start and end, as 'start -> end'
        elif ' -> ' in value and element_name == 'Target_start':
            value = value.split(' -> ')[0]
        elif ' -> ' in value and element_name == 'Target_end':
            value = value.split(' -> ')[1]

        # Keep non numeric values (names) as strings
        try:
            value = int(value)
        except ValueError:
            pass

        output[element_name] = value

    # [line index in the record, label, output key]
    line_elements = [[2, 'Query: ', 'Query'],
                     [3, 'Target: ', 'Target'],
                     [5, 'Raw score: ', 'Score'],
                     [7, 'Target range: ', 'Target_start'],
                     [7, 'Target range: ', 'Target_end']]

    for ind, parter, element_name in line_elements:
        parse_line_element(ind, parter, element_name)

    def parse_block_element(init, halt, element_name):
        block_lines = []
        get = False
        for line in record:
            if init in line:
                get = True
            elif halt in line:
                get = False
            elif get:
                block_lines.append(line)
        output[element_name] = "".join(block_lines)

    # [start marker, end marker, output key]
    block_elements = [['STARTGENE','ENDGENE','Gene'],
                      ['STARTCDS','ENDCDS','CDS'],
                      ['--- START OF GFF DUMP ---',
                       '--- END OF GFF DUMP ---',
                       'GFF']
                     ]

    for init, halt, element_name in block_elements:
        parse_block_element(init, halt, element_name)

    return output

def iter_records(fpath):
    """Yield the alignment records of an exonerate output file.

    fpath: path of a text file with exonerate output

    Each record is the list of lines from a line containing
    'C4 Alignment:' up to, but excluding, the next such line or the end of
    the file. Lines that precede the first alignment are ignored, and
    nothing is yielded for a file with no alignment.
    """
    marker = 'C4 Alignment:'
    record = []

    with open(fpath,'r') as lines:
        for line in lines:
            if marker in line:
                # A new alignment starts, hand over the previous one
                if record:
                    yield record
                record = [line]
            elif record:
                # record is only non-empty once the first marker was seen
                record.append(line)

    # The last alignment has no following marker to trigger its yield
    if record:
        yield record

def single_gene_exonerate(prot, gene, chain = ''):
    """Search a CDS in one gene with exonerate, using one protein as query.

    prot:  path of the fasta file with the query protein
    gene:  path of the fasta file with the target gene
    chain: prefix of the temporary output file name, '<chain>exo', which
           lets concurrent runs use separate files

    Returns the best alignment as the dict made by parse_a_single_record,
    or None if the output has no alignment that can be parsed. The
    temporary file is removed in both cases.
    """

    import misc, os

    ## exonerate cline
    # query and target
    a = "exonerate -q %s -t %s "%(prot, gene)
    # search CDS in DNA and get only the best result
    b = "-m protein2genome -n 1 "
    # format the output into gene and CDS fasta (exonerate roll your own syntax
    # plus proper Python escapes: backslash for quotation and backslash,
    # percent for percent)
    c = '--ryo \"STARTGENE\\n>{0}qi\\n{0}tas\\nENDGENE\\nSTARTCDS\\n>{0}qi\\n{0}tcs\\nENDCDS\\n \" '.format('%')
    # gff
    d = "--showtargetgff "
    cline = a+b+c+d

    out, err = misc.execute_cline(cline)

    # iter_records reads from a file, so the output is written to one
    exo_fpath = str(chain)+'exo'
    with open(exo_fpath,'wt') as outhndl:
        outhndl.write(out)

    try:
        records = list(iter_records(exo_fpath))
        return parse_a_single_record(records[0])

    except Exception:
        # No alignment, or one that cannot be parsed: no result for this gene
        return None

    finally:
        # Do not leave the temporary file behind, whether parsing
        # worked or not
        os.remove(exo_fpath)

def dump_single_record_to_fasta_and_gff(record, ref_prot_seq_str):
    """Turn a parsed exonerate record into CDS, protein and gff strings.

    record:           dict with at least the 'CDS' (fasta text) and 'GFF' keys
    ref_prot_seq_str: the sequence of the reference protein, as a string

    The CDS is translated in three frames and the frame with the smallest
    Levenshtein distance to the reference protein is kept. A single stop
    at the very end of the protein is removed.

    Returns (cds, prot, gff, bad_prot): cds and prot are single fasta
    entries, gff is the record's gff preceded by a '#GlobalCounter' line,
    and bad_prot is True if the protein has more than one stop or a stop
    that is not its last character.
    """

    from Bio import SeqIO
    from StringIO import StringIO
    from distance import levenshtein

    seqrecord = SeqIO.read(StringIO(record['CDS']), 'fasta')
    seqrecord.description = ''

    gff = '#GlobalCounter%s\n' % seqrecord.id + record['GFF']

    # Translate each of the three forward frames and pick the one that
    # resembles the reference protein best
    all_frames = [str(seqrecord.seq[offset:].translate())
                  for offset in (0, 1, 2)]
    dists = [levenshtein(ref_prot_seq_str, translation)
             for translation in all_frames]
    frame = dists.index(min(dists))

    prot_seq = all_frames[frame]
    cds_seq = str(seqrecord.seq)[frame:]

    # Only a single, terminal stop is acceptable
    stop_count = prot_seq.count('*')
    prot_endswith_stop = prot_seq.endswith('*')
    bad_prot = stop_count > 1 or (stop_count == 1 and not prot_endswith_stop)

    if prot_endswith_stop:
        prot_seq = prot_seq[:-1]

    prot = ">%s\n%s\n" % (seqrecord.id, prot_seq)
    cds = ">%s\n%s\n" % (seqrecord.id, cds_seq)

    return cds, prot, gff, bad_prot

def single_sample_per_gene_exonerate(smpl, smpl2ref, chain = ''):
    """Get the CDS, protein and gff of each map-assembled gene of a sample.

    smpl:     the sample name, a key of smpl2ref
    smpl2ref: dict of sample -> [reference sample, max mismatch]
    chain:    passed on to single_gene_exonerate

    Each gene in '<smpl>_bwa/<smpl>.nt.fasta' is searched with exonerate,
    using the protein with the same id in the reference protein file of
    the sample as query. Results go to the 'cdss', 'gffs' and 'proteins'
    directories, or to their 'stopped_' counterparts if the protein was
    flagged by dump_single_record_to_fasta_and_gff. Genes for which
    exonerate gives no record are skipped.
    """

    from Bio import SeqIO
    import os, sys, misc
    from warnings import warn

    sys.stdout.write("%s\n" % smpl)
    sys.stdout.flush()

    smpl_genes_fpath = '%s_bwa/%s.nt.fasta'%(smpl,smpl)
    ref_smpl = smpl2ref[smpl][0]
    ref_prot_fpath = 'all_protein_ref_reviewed/%s_ref.aa.fasta'%ref_smpl

    genes = SeqIO.parse(smpl_genes_fpath,'fasta')
    ref_prots = SeqIO.to_dict(SeqIO.parse(ref_prot_fpath,'fasta'))

    for dirname in ('cdss', 'gffs', 'proteins',
                    'stopped_cdss', 'stopped_gffs', 'stopped_proteins'):
        misc.makedir(dirname)

    # Temporary single sequence input files for exonerate
    temp_prot = '%s_temp.prot'%smpl
    temp_gene = '%s_temp.gene'%smpl

    # Keep track of the open output files, so that all of them are
    # closed even if something fails half way
    opened = []

    def open_output(fpath):
        hndl = open(fpath,'wt')
        opened.append(hndl)
        return hndl

    try:
        out_cdss = open_output('cdss/%s.cds.fasta'%smpl)
        gff_out = open_output('gffs/%s.gff'%smpl)
        prot_out = open_output('proteins/%s.aa.fasta'%smpl)

        stopped_out_cdss = open_output('stopped_cdss/%s.cds.fasta'%smpl)
        stopped_gff_out = open_output('stopped_gffs/%s.gff'%smpl)
        stopped_prot_out = open_output('stopped_proteins/%s.aa.fasta'%smpl)

        for gener in genes:
            protr = ref_prots[gener.id]

            try:
                with open(temp_gene,'wt') as hndl:
                    hndl.write(gener.format('fasta'))

                with open(temp_prot,'wt') as hndl:
                    hndl.write(protr.format('fasta'))

                record = single_gene_exonerate(temp_prot,
                                               temp_gene,
                                               chain = chain)
            finally:
                # Remove the temporary input also if there was no record
                for temp_fpath in (temp_prot, temp_gene):
                    if os.path.exists(temp_fpath):
                        os.remove(temp_fpath)

            if not record:
                continue

            c, p, g, bad_prot = dump_single_record_to_fasta_and_gff(record, str(protr.seq))

            if all([c, p, g]):
                if not bad_prot:
                    out_cdss.write(c)
                    prot_out.write(p)
                    gff_out.write(g)
                else:
                    stopped_out_cdss.write(c)
                    stopped_prot_out.write(p)
                    stopped_gff_out.write(g)
            else:
                warn("sample %s gene %s had no output"%(smpl,gener.id))

    finally:
        for hndl in opened:
            hndl.close()

    sys.stderr.flush()

In [None]:
# Get the CDSs, proteins and gffs of the samples, one sample at a time
for smpl in smpl2ref:
    single_sample_per_gene_exonerate(smpl, smpl2ref)

In [15]:
import misc

# For each type of output, the file of a sample and its 'stopped_'
# counterpart are concatenated into one file in an 'all_' directory.
# (directory to create, command line template to fill with the sample name)
merge_jobs = [
    ('all_gffs',
     "cat gffs/{0}.gff "
     "stopped_gffs/{0}.gff > "
     "all_gffs/{0}.gff"),
    ('all_proteins',
     "cat proteins/{0}.aa.fasta "
     "stopped_proteins/{0}.aa.fasta > "
     "all_proteins/{0}.aa.fasta"),
    ('all_cdss',
     "cat cdss/{0}.cds.fasta "
     "stopped_cdss/{0}.cds.fasta > "
     "all_cdss/{0}.cds.fasta"),
]

for merged_dir, cline in merge_jobs:

    misc.makedir(merged_dir)

    for smpl in smpl2ref:

        misc.execute_cline(cline.format(smpl))