In [9]:
import gzip
import os
import sys
import numpy as np
import re

from pprint import pprint
from itertools import groupby
import pandas as pd

###############
### Small functions
###############
def rev_comp(seq):
    complement_dict = {'A':'T','T':'A','G':'C','C':'G','N':'N'}
    return "".join([complement_dict[base.upper()] for base in reversed(seq)])

def translate_dna2aa(dna_seq):
    dna_seq = dna_seq.upper()
    codon_table = {
    'GCT':"A",
    'GCC':"A",
    'GCA':"A",
    'GCG':"A",
    'TGT':"C",
    'TGC':"C",
    'GAT':"D",
    'GAC':"D",
    'GAA':"E",
    'GAG':"E",
    'TTT':'F',
    'TTC':'F',
    'GGT':'G',
    'GGC':'G',
    'GGA':'G',
    'GGG':'G',
    'CAT':'H',
    'CAC':'H',
    'ATT':'I',
    'ATC':'I',
    'ATA':'I',
    'AAA':'K',
    'AAG':'K',
    'TTA':'L',
    'TTG':'L',
    'CTT':'L',
    'CTC':'L',
    'CTA':'L',
    'CTG':'L',
    'ATG':'M',
    'AAT':'N', 'AAC':'N',
    'CCT':'P', 'CCC':'P', 'CCA':'P', 'CCG':'P',
    'CAA':'Q', 'CAG':'Q',
    'CGT':'R', 'CGC':'R', 'CGA':'R', 'CGG':'R', 'AGA':'R', 'AGG':'R',
    'TCT':'S', 'TCC':'S', 'TCA':'S', 'TCG':'S', 'AGT':'S', 'AGC':'S',
    'ACT':'T', 'ACC':'T', 'ACA':'T', 'ACG':'T',
    'GTT':'V', 'GTC':'V', 'GTA':'V', 'GTG':'V',
    'TGG':'W',
    'TAT':'Y', 'TAC':'Y',
    'TAG':"*",'TGA':"*",'TAA':"*",

    #Inverse table for the standard genetic code (compressed using IUPAC notation)
    'GCN':"A",
    'CGN':"R",'AGR':"R",'CGY':"R",'MGR':"R",
    'AAY':"N",
    'GAY':"D",
    'RAY':"B",
    'TGY':"C",
    'CAR':"Q",
    'GAR':"E",
    'SAR':"Z",
    'GGN':"G",
    'CAY':"H",
    'ATH':"I",
    'CTN':"L",'TTR':"L",'CTY':"L",'YTR':"L",
    'AAR':"K",
    'TTY':"F",
    'CCN':"P",
    'TCN':"S",'AGY':"S",
    'ACN':"T",
    'TAY':"Y",
    'GTN':"V",
    'TRA':"*",'TAR':"*"
                }

    aa_seq = ""
    for i in range(1,int(len(dna_seq)/3)+1):
        try:
            aa_seq += codon_table[dna_seq[(i-1)*3:i*3]]
        except KeyError:
            #print("Undefined codon: '%s' was translated to X "%(dna_seq[(i-1)*3:i*3]))
            aa_seq += 'X'
    return(aa_seq)

def fasta2dict(f):
    d = {}
    with  open(f,"r") as F:
        for line in F:
            if line[0]==">":
                name = line[1:].split("\n")[0].split("|")[0]
                d[name] = ''
            else:
                #print(name)
                d[name]+= line.split("\n")[0]
    F.close()

    return d

        


def make_fasta_from_listoftups(L,name):
    with open("%s"%name,"w") as f:
        for I in L:
            #print I
            f.write('>%s\n'%(str(I[0])))
            f.write('%s\n'%(str(I[1])))
        f.close()
        #print("Made fna file : %s"%(name))


def get_blastout(x):
    files = []
    lis = os.listdir("%s" %x)
    for i in lis:
        if i[-4:] == ".out" :
            files.append(i)
    return list(files)

def parse_blastout(dir_out):
    lis = os.listdir(dir_out)
    fs = ["%s/%s" %(dir_out,f) for f in lis]
    D = {}
    for f in fs:
        if f.split("/")[-1][0] != ".":
            genome = f.split("/")[-1].split(".blast")[0]
            with open(f, "r") as F:
                hit_n = 0
                for line in F:
                    """
                    'qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore', 
                    """
                    hit_n+=1
                    hit         = hit_n   
                    cols        =    line.split(",")
                    contig      =    cols[0]
                    subject     =    cols[1]
                    pident      =    float(cols[2])
                    str_on_q    =    int(cols[6])
                    end_on_q    =    int(cols[7])
                    str_on_s    =    int(cols[8])
                    end_on_s    =    int(cols[9])
                    evalue      =    float(cols[10])


                    el = {}
                    el["contig"]   = contig
                    el["subject"]    = subject
                    el["str_s"]      = str_on_s
                    el["end_s"]      = end_on_s
                    el["str_q"]      = str_on_q
                    el["end_q"]      = end_on_q
                    el["evalue"]   = evalue
                    el["pident"] = pident
                    try:
                        D[genome][hit] = el
                    except KeyError:        
                        D[genome] = {}
                        D[genome][hit] = el

            F.close()
    return D

In [5]:
dir_genomes = "./genomes"
db          = "./blast/db/HRR25_geneanalysis_nt_translatorXinput.fna"


In [6]:
out_dir     =  "./blast_/out"



In [7]:
genome_fs = os.listdir(dir_genomes)

In [8]:
# For each genome, blast the orthologs from YGOB

i = 0
for f in genome_fs:
    i +=1
    print("%d out of %d files. Curently working on %s"%(i,len(genome_fs),f))
    nm = f
    command = "blastn -task blastn -query %s/%s -db %s -outfmt 10 -evalue 1e-20 > %s/%s.blast"%(dir_genomes,f,db, out_dir,nm)
    os.system(command)
    
    

1 out of 68 files. Curently working on GCA_000292725.1_SacArb1.0_genomic.fna
2 out of 68 files. Curently working on GCA_030580015.1_ASM3058001v1_genomic.fna
3 out of 68 files. Curently working on GCA_030557885.1_ASM3055788v1_genomic.fna
4 out of 68 files. Curently working on GCF_001298625.1_SEUB3.0_genomic.fna
5 out of 68 files. Curently working on yHAB133_kazachstania_unispora_160519.fas
6 out of 68 files. Curently working on GCA_000710315.1_Eremothecium_coryli_genomic.fna
7 out of 68 files. Curently working on yHAB160_kazachstania_kunashirensis_160519.fas
8 out of 68 files. Curently working on .DS_Store
9 out of 68 files. Curently working on yHAB154_kazachstania_transvaalensis_160519.fas


Error: NCBI C++ Exception:
    T0 "/Users/coremake/release_build/build/PrepareRelease_IntelMAC-Clang36_JSID_01_90273_130.14.22.10_9008__PrepareRelease_IntelMAC-Clang36_1481139955/c++/compilers/unix/../../src/objtools/readers/fasta.cpp", line 2428: Error: CFastaReader: Near line 1, there's a line that doesn't look like plausible data, but it's not marked as defline or comment. (m_Pos = 1)



10 out of 68 files. Curently working on GCF_947241705.1_Smik-IFO1815_genomic.fna
11 out of 68 files. Curently working on yHAB153_kazachstania_rosinii_160519.fas
12 out of 68 files. Curently working on GCA_017309295.1_JHU_Cniv_v1_genomic.fna
13 out of 68 files. Curently working on yHMPu5000034877_tetrapisispora_namnaonensis_160519.fas
14 out of 68 files. Curently working on GCA_030556345.1_ASM3055634v1_genomic.fna
15 out of 68 files. Curently working on GCF_001417885.1_Kmar_1.0_genomic.fna
16 out of 68 files. Curently working on yHMPu5000034881_torulaspora_pretoriensis_160519.fas
17 out of 68 files. Curently working on tetrapisispora_phaffii.fas
18 out of 68 files. Curently working on GCA_030569995.1_ASM3056999v1_genomic.fna
19 out of 68 files. Curently working on yHMPu5000026152_torulaspora_franciscae_160519.fas
20 out of 68 files. Curently working on GCA_900074735.1_LAFA0_genomic.fna
21 out of 68 files. Curently working on yHAB159_kazachstania_solicola_160519.fas
22 out of 68 files. C

In [10]:
# merge blast result to dictionary
blast_d = parse_blastout(out_dir)

In [11]:
regions_of_interest = pd.DataFrame( columns=["Genome","Contig","q_start","q_end","Subject","s_start","s_end","strand"])

for genome in blast_d:
    for hit in blast_d[genome]:
        el = blast_d[genome][hit]
        identity = el["pident"]

        if identity == 100:
            contig   =  el["contig"]
            str_on_q = el["str_q"]
            end_on_q = el["end_q"]
            subject  =  el["subject"]
            str_on_s = el["str_s"]
            end_on_s = el["end_s"]
            
            if str_on_s < end_on_s:
                strand = 1
            elif str_on_s > end_on_s:
                strand = -1
        
    
        
        
            new_row  = {'Genome':genome.split(".blast")[0], 
                        'Contig':contig, 
                        'q_start':str_on_q, 
                        'q_end':end_on_q,
                        'Subject':subject,
                        's_start':str_on_s, 
                        's_end':end_on_s,
                        'strand':strand
                       }
            regions_of_interest = regions_of_interest.append(new_row, ignore_index=True)
            
regions_of_interest.head()

Unnamed: 0,Genome,Contig,q_start,q_end,Subject,s_start,s_end,strand
0,GCA_030569995.1_ASM3056999v1_genomic.fna,JAJLUF010000032.1,8108,9496,HRR25_60.1,1389,1,-1
1,tetrapisispora_phaffii.fas,NC_016524.1,591805,593361,HRR25_35.1,1,1557,1
2,tetrapisispora_phaffii.fas,NC_016529.1,523631,525016,HRR25_35.2,1386,1,-1
3,GCA_900074765.1_LAFE0_genomic.fna,LT598485.1,406885,408273,HRR25_61.1,1,1389,1
4,yHAB155_kazachstania_spencerorum_160519.fas,scf7180000055587,23649,25022,HRR25_15.1,1,1374,1


In [12]:
regions_of_interest.to_csv(".blast_hits.csv")