In [2]:
import os,re,subprocess,copy
import pandas as pd
from Bio import SeqIO
from natsort import natsorted

# Proteogenomics job directory; `event` is the tab-separated event table read with pandas below.
path = '/data/s3cha/CHO_ENOSI_JOB/e22076b9e7ab40ef8fc2c63eddf67eba'
event = path + '/all_events.txt'
# Reference genome FASTA, passed to gffread through -g when proteins are extracted.
ref_fa = '/data/genome/hamster/picr_old/picr.fa'

# PASA/StringTie working directory and the files read from it.
pasa_path = '/data/shangzhong/Picr_assembly/Annotation/PASA/pasa_stringtie'
pasa_assm_fa = pasa_path + '/pasa_assem/picr_db_stringtie.assemblies.fasta'  # indexed with SeqIO as transcript sequences
pasa_gff = pasa_path + '/pasa_assem/picr_db_stringtie.pasa_assemblies.gff3'  # parsed for per-assembly alignment blocks (Target=...)
gff = pasa_path + '/03_pasa_stringtie.gff3'  # annotation that is filtered and updated in this notebook
gff_pr = pasa_path + '/03_pasa_stringtie_pr.fa'  # indexed with SeqIO as the proteins of that annotation

# TransDecoder outputs: CDS coordinates on the genome and the predicted peptides.
transde_gff = pasa_path + '/transdecoder/picr_db_stringtie.transdecoder.genome.gff3'
transde_pep = pasa_path + '/transdecoder/picr_db_stringtie.assemblies.fasta.transdecoder.pep'

#### For transcripts whose confidence score is 4 or 5, we check whether there is evidence (proteomics or riboseq) to support them; if there is none, we remove their CDS.

In [4]:
# Reduce the event table to its distinct (Event, Peptide, Chromosome, Location, Strand)
# rows. The result is cached on disk, so the work is skipped when the file exists.
uniq_event = path + '/all_events_uniq.txt'
if not os.path.exists(uniq_event):
    event_cols = ['Event','Peptide','Chromosome','Location','Strand']
    df = pd.read_csv(event,sep='\t',header=0)
    df = df[event_cols].drop_duplicates()
    df.to_csv(uniq_event,sep='\t',index=False)

In [3]:
# Parse the annotation GFF3 once and build four lookup tables keyed by gene / transcript id.
# Every table stores the *same* split row lists (`item`), so an in-place change to a row
# made through one table is visible through the others.
all_gff = {} # {geneid:[gene_line,{asm_id:[asm_id_lines]}]}
gene_rna_dic = {}  # {geneid:[gene_line,[asm_ids]]}
rna_all_dic = {} # {asmbl_id:[line1,line2,...]}  (transcript row first, then its child rows)
gene_pos = {}  # {geneid:[chr,strand,start,end]}
with open(gff) as in_f:
    for line in in_f:
        if line.startswith('#'):continue
        item = line.strip().split('\t')
        if item[2] == 'gene':
            # gene_id must be followed by ';' for this pattern to match
            geneid = re.search('(?<=gene_id=).+?(?=;)',item[8]).group(0)
            gene_pos[geneid] = [item[0],item[6],int(item[3]),int(item[4])]
            gene_rna_dic[geneid]=[item,[]]
            all_gff[geneid] = [item,{}]
        elif item[2] in ['mRNA','lncRNA']:
            rnaid = re.search('(?<=ID=).+?(?=;)',item[8]).group(0)
            rna_all_dic[rnaid] = [item]
            chrom = item[0]
            strand = item[6]
            geneid = re.search('(?<=gene_id=).+?(?=;)',item[8]).group(0)
            gename = re.search('(?<=Name=).+?(?=;|$)',item[8]).group(0)
            # the gene row must already have been seen, otherwise these lookups raise KeyError
            gene_rna_dic[geneid][1].append(rnaid)
            all_gff[geneid][1][rnaid] = [item]
        else:
            # Any other feature type is attached to the most recently seen transcript:
            # `rnaid` and `geneid` are carried over from the previous mRNA/lncRNA row,
            # so this relies on child rows directly following their transcript in the file.
            rna_all_dic[rnaid].append(item)
            all_gff[geneid][1][rnaid].append(item)

In [5]:
# # get riboseq expression to filter the reads
# ribo = '/data/shangzhong/RibosomeProfiling/previous_data'
# folders = natsorted([f for f in os.listdir(ribo) if f.endswith('quant')])

# assm_ribo_expressed  = []
# for f in folders:
#     df = pd.read_csv(ribo+'/'+f+'/quant.sf',sep='\t',header=0)
#     assms = df.query('TPM>=1')['Name'].tolist()
#     assm_ribo_expressed.extend(assms)
# assm_ribo_expressed = list(set(assm_ribo_expressed))

# Transcripts with proteogenomic support: first column of the known-assembly table.
known_pep = '/data/s3cha/CHO_ENOSI_JOB/e22076b9e7ab40ef8fc2c63eddf67eba/known_asmbl.txt'
pep_assm = pd.read_csv(known_pep,sep='\t',header=None)[0].tolist()
proved_assm = list(set(pep_assm)) # +assm_ribo_expressed

# Scan the mRNA rows: remember each transcript's gene and collect the transcripts
# whose attribute string ends in '4' or '5' (the confidence score).
low_qal_rna = []
rna_gid_dic = {}
with open(gff) as f:
    for line in f:
        if line.startswith('#'):
            continue
        item = line.strip().split('\t')
        if item[2] != 'mRNA':
            continue
        attrs = item[8]
        cs = attrs[-1]
        gid = re.search('(?<=gene_id=).+?(?=;)',attrs).group(0)
        assm = re.search('(?<=ID=).+?(?=;)',attrs).group(0)
        rna_gid_dic[assm] = gid
        if cs == '4' or cs == '5':
            low_qal_rna.append(assm)

# Low-confidence transcripts that have no supporting evidence.
fail_assm = set(low_qal_rna).difference(proved_assm)#-set(assm_ribo_expressed)

There are 12325 transcripts whose confidence score is 4 or 5 and that have no spectrum supporting them.

In [6]:
# Drop the CDS rows of every unsupported low-confidence transcript from the in-memory table.
for gene,v in all_gff.iteritems():
    transcripts = v[1]
    for assm in transcripts:
        if assm not in fail_assm:
            continue
        transcripts[assm] = [feat for feat in transcripts[assm] if feat[2]!='CDS']

# Write the filtered annotation: every line is copied through except the CDS rows
# whose Parent transcript is in fail_assm. Skipped when the output already exists.
fil_gff = pasa_path+'/04_fil_pasa_stringtie.gff3'
if not os.path.exists(fil_gff):
    with open(gff) as in_f,open(fil_gff,'w') as out_f:
        for line in in_f:
            keep = True
            if not line.startswith('#'):
                item = line.strip().split('\t')
                if item[2] == 'CDS':
                    assm = re.search('(?<=Parent=).+?(?=;)',line).group(0)
                    keep = assm not in fail_assm
            if keep:
                out_f.write(line)

In [7]:
# TransDecoder CDS coordinates per predicted protein:
# {protein_id: [chrom+strand, [start,end], [start,end], ...]}
trans_pr_pos_dic = {}
with open(transde_gff) as f:
    for line in f:
        if line.strip() == '':
            continue
        item = line.split('\t')
        chrom,start,end,strand,anno = item[0],int(item[3]),int(item[4]),item[6],item[8]
        if item[2] != 'CDS':
            continue
        # the first four characters of the CDS ID are dropped to obtain the protein id
        fid = re.search('(?<=ID=).+?(?=;)',anno).group(0)[4:]
        trans_pr_pos_dic.setdefault(fid,[chrom+strand]).append([start,end])

In [8]:
# Index of the PASA assembly (transcript) sequences, looked up by assembly id
# when a transcript is written out for ORF prediction.
pasa_assm_dic = SeqIO.index(pasa_assm_fa,'fasta')

In [9]:
# Genomic alignment blocks of each PASA assembly:
# {assm: [chrom+strand, [start,end], [start,end], ...]}
pasa_pos_dic = {}
with open(pasa_gff) as f:
    for line in f:
        if line.startswith('#') or line.strip() == '':
            continue
        item = line.strip().split('\t')
        chrom,start,end,strand,anno = item[0],int(item[3]),int(item[4]),item[6],item[8]
        # the assembly id is the first token of the Target attribute
        tr = re.search('(?<=Target=).+?(?=\ )',anno).group(0)
        pasa_pos_dic.setdefault(tr,[chrom+strand]).append([start,end])

In [10]:
# Index of the protein sequences of the current annotation; it is queried by
# transcript id when checking whether a peptide is already part of a known protein.
gff_pr_dic = SeqIO.index(gff_pr,'fasta')

In [11]:
# Map each transcript to its TransDecoder proteins: {transcript_id: [protein_ids]}.
# The transcript id is the part of the protein id before the first '.'.
transde_pep_dic = SeqIO.index(transde_pep,'fasta')
tr_pr_dic = {}
for prid in transde_pep_dic:
    tr = prid.split('.')[0]
    tr_pr_dic.setdefault(tr,[]).append(prid)

In [12]:
def genes_overlap_with_pep(item,gene_pos):
    '''Return the genes that strictly contain the peptide of one event line.

    * item: split event line; item[2] is the chromosome, item[3] the peptide
            location as ';'-separated 'start-end' blocks, item[-1] the strand.
    * gene_pos: {geneid:[chrom,strand,start,end]}

    Returns (gene_ids,p_s,p_e): ids of the genes on the same chromosome and
    strand with gene_start < p_s < p_e < gene_end, followed by the smallest and
    largest peptide coordinate. The comparison is strict, so a peptide that
    touches a gene boundary is not reported for that gene.
    '''
    chrom = item[2]
    pep_str = item[-1]
    # flatten every block boundary into one list of coordinates
    pos = [int(p) for ps in item[3].split(';') for p in ps.split('-')]
    p_s = min(pos)
    p_e = max(pos)
    # .items() (instead of .iteritems()) works on Python 2 and Python 3 alike
    gene_ids = [k for k,v in gene_pos.items()
                if chrom == v[0] and pep_str == v[1] and v[2]<p_s<p_e<v[3]]
    return gene_ids,p_s,p_e

In [13]:
def getORF(in_fa,pep):
    '''Predict the ORFs of an RNA FASTA file and return the longest one that
    contains the given peptide.

    * in_fa: path of a FASTA file holding the transcript; the file is deleted
             before returning.
    * pep: peptide sequence searched for in every predicted ORF.

    Returns the matching ORF record with the longest sequence (the first one
    wins on ties), or the empty string '' when no predicted ORF contains pep.
    Callers tell the two cases apart with isinstance(result,str).
    '''
    inter_fa = 'inter.fa'
    # An argument list avoids passing the file name through a shell;
    # '~' is expanded here because no shell does it for us any more.
    cmd = [os.path.expanduser('~/Installation/ORFfinder'),
           '-in',in_fa,'-strand','plus','-out',inter_fa]
    res = ''
    longest = 0
    try:
        subprocess.call(cmd)
        # single pass over the predictions; no second temporary file is needed
        for record in SeqIO.parse(inter_fa,'fasta'):
            if pep in record.seq and len(record.seq) > longest:
                res = record
                longest = len(record.seq)
    finally:
        # remove the temporary files even when ORFfinder or the parsing fails
        for tmp in (inter_fa,in_fa):
            if os.path.exists(tmp):
                os.remove(tmp)
    return res

def getCDS_gff(assm_gff,pr_seq):
    '''Project an ORF given in transcript coordinates onto the genome.

    * assm_gff: [chrom+strand,[start,end],[start,end],...] alignment blocks of
                the transcript; only the last character of the first element
                (the strand) is read. The list is not modified.
    * pr_seq: object whose .id has the form '<name>:<start>:<end>', with start
              and end being 0-based positions on the transcript.

    Returns [assm_gff[0],[start,end],...] with one genomic interval per block
    that the ORF touches. Every interval has start <= end; intervals are listed
    in ascending order on '+' and in descending order on any other strand.
    '''
    strand = assm_gff[0][-1]
    forward = strand == '+'
    step = 1 if forward else -1
    # Work on a private, transcript-ordered copy so the caller's list stays untouched.
    # On the reverse strand each block is flipped so that block[0] is its 5' end.
    blocks = sorted(assm_gff[1:],key=lambda x:x[0],reverse=not forward)
    if not forward:
        blocks = [[p[1],p[0]] for p in blocks]
    fields = pr_seq.id.split(':')
    cds_s = int(fields[1])+1  # convert to 1-based transcript coordinates
    cds_e = int(fields[2])+1
    cds_gff = [assm_gff[0]]
    consumed = 0  # transcript bases covered by the blocks seen so far
    for pos in blocks:
        first = consumed+1
        last = consumed+abs(pos[1]-pos[0])+1
        consumed = last
        if cds_s > last or cds_e < first:
            continue  # block lies entirely before or after the ORF
        # clip the block to the ORF; an end outside the block keeps the block boundary
        s = pos[0] + step*(cds_s-first) if cds_s >= first else pos[0]
        e = pos[0] + step*(cds_e-first) if cds_e <= last else pos[1]
        cds_gff.append([s,e])
    cds_gff[1:] = sorted(cds_gff[1:],key=lambda x:x[0],reverse=not forward)
    if not forward:
        # flip back so that every interval is reported as [smaller,larger]
        cds_gff[1:] = [[p[1],p[0]] for p in cds_gff[1:]]
    return cds_gff

##### Need to process the grouped peptides.
 There are two cases.
 1. The new peptide maps to a transcript that already has a protein, but it maps to the non-coding part. The original gene id and name remain the same. The newly added gene is named in the format gene_asmbl_idx.
 2. The new peptide maps to a transcript that doesn't have a protein. This is easy: we predict the ORFs of the transcript and keep the one that includes the new peptide. The gene id and gene name don't change.

In [1]:
def new_transcript_CDS(new_event_gff_dic,item,rm_genes,pasa_assm_dic,pasa_pos_dic,gene_rna_dic,rna_all_dic,gene_pos,trans_pr_pos_dic,gff_pr_dic,tr_pr_dic):
    '''
    Build replacement GFF records for every gene that contains the peptide of one event line.

    For each such gene the known coding transcripts are re-emitted (collapsed to their
    unique CDS structures) or, when there are none, the transcripts are copied without
    CDS rows. A CDS supporting the peptide is then added, either from an ORF predicted
    on the transcript or from a TransDecoder protein that already contains the peptide.

    * new_event_gff_dic = {} # {geneid:[[geneline],{assm:[lines]}]}; updated in place and returned
    * item: one split line of the event results (peptide in item[1], strand in item[-1])
    * rm_genes: list that receives every gene id handled here; updated in place and returned
    * pasa_assm_dic: transcript sequences, looked up by assembly id
    * pasa_pos_dic: {assm:[chrom+strand,[start,end],...]}
    * gene_rna_dic = {}  # {geneid:[[gene_line],[asm_ids]]}
    * rna_all_dic = {} # {asmbl_id:[[line1],[line2]]}
    * gene_pos = {}  # {geneid:[chr,strand,start,end]}
    * trans_pr_pos_dic {transdecoder_pr:[chrstr,[pos]]}
    * gff_pr_dic: annotation proteins, looked up by transcript id
    * tr_pr_dic: {transcript_id:[transdecoder protein ids]}

    Also reads the module-level `transde_pep_dic`, which is not passed as a parameter.

    Returns (new_event_gff_dic,rm_genes,update_assm) where update_assm lists the
    assemblies for which an ORF containing the peptide was found.
    '''
    pep=''.join(item[1].split(':'))
    pep_str = item[-1]
    update_assm = []
    # 1. get all gene ids that overlap with the peptide
    gene_ids,p_s,p_e = genes_overlap_with_pep(item,gene_pos)
    # 2. for each gene id get the transcripts that has the peptide sequence
    # NOTE(review): match_prs is created once, outside the gene loop, so matches found
    # for an earlier gene are still present when a later gene is processed - confirm intended.
    match_prs = []
    for gene in gene_ids:
        rm_genes.append(gene)
        # the gene row list is shared with gene_rna_dic, not copied
        new_event_gff_dic[gene]=[gene_rna_dic[gene][0],{}]
        assm_ids = gene_rna_dic[gene][1]
        gene_cds_pos = []  # flat list of every CDS start/end seen for this gene
        cds_poses = []     # one list of (start,end) CDS tuples per assembly, in assm_ids order
        cs = []            # confidence score of each assembly (last character of its attributes)
        for assm in assm_ids:
            '''if assm already has CDS, then update gene position to min and max of all CDS pos
            gene name does not change.'''
            assm_cds_pos = []
            cds_poses.append([])
            # get boundary for all CDS
            known_assm_pos = rna_all_dic[assm]
            cs.append(int(known_assm_pos[0][-1][-1]))
            if assm in tr_pr_dic:  # assemblies have coding protein
                prs = tr_pr_dic[assm] # dic from transdecoder pep
                for pr in prs:
                    pr_tr = pr.split('.')[0]
                    if pr_tr in gff_pr_dic:  # the protein is in the annotation
                        if pep in gff_pr_dic[pr_tr].seq:
                            print pep,'peptide already in known sequence',pr
                            # this is because the new peptide is the beginning of a database protein,
                            # but not the real beginning of known protein. It was not identified in known database.
                            # (the break only leaves the loop over this assembly's proteins)
                            break
                    if pep in transde_pep_dic[pr].seq:
                        match_prs.append(pr)
                    # get cds start and end
                    # NOTE(review): this scan sits inside the `for pr in prs` loop, so the CDS
                    # rows of the assembly are collected once per protein; with several proteins
                    # the same (start,end) tuples are appended repeatedly - confirm intended.
                    for line in known_assm_pos:
                        if line[2] == 'CDS':
                            gene_cds_pos.extend([int(line[3]),int(line[4])])
                            cds_poses[-1].append((int(line[3]),int(line[4])))
                    # only the first assembly of the gene is inspected here
                    if cds_poses[0] != []: # has proteins, then split out the known protein as new genes
                        # writes ints into the shared gene row (start/end shrink to the CDS range)
                        new_event_gff_dic[gene][0][3] = min(gene_cds_pos)
                        new_event_gff_dic[gene][0][4] = max(gene_cds_pos)
        # NOTE(review): cds_poses[0] raises IndexError when the gene has no assemblies, and
        # the test ignores CDS found only on later assemblies - confirm both are acceptable.
        if cds_poses[0] != []: 
            '''the gene already has protines, need to split between old and new
            change transcript id from asmbl_num to geneid_asmbl_index.
            geneid does not change'''
            chrom = new_event_gff_dic[gene][0][0]
            strand = new_event_gff_dic[gene][0][6]
            name = re.search('(?<=Name=).+?(?=;|$)',new_event_gff_dic[gene][0][8]).group(0)
            # one synthetic transcript per distinct CDS structure among the assemblies
            uni_cdses = set([tuple(x) for x in cds_poses if x!=[]])
            n = 0
            for uni in uni_cdses:
                n += 1
                rnaid = gene+'_assm_'+str(n)
                anno = 'ID={r};Parent={g};gene_id={g};Name={na};transcript_id={r};cs={cs}'.format(
                            r=rnaid,g=gene,na=name,cs=str(max(cs)))
                s = min(uni,key=lambda x:x[0])[0]
                e = max(uni,key=lambda x:x[1])[1]
                new_event_gff_dic[gene][1][rnaid] = [[chrom,'merge','mRNA',str(s),str(e),'.',strand,
                                                         '.',anno]]
                # the CDS intervals are emitted twice: first as exon rows, then as CDS rows
                i = 0
                for u in uni:
                    i += 1
                    anno = 'ID={r}.exon{i};Parent={r};gene_id={g};Name={na};transcript_id={r}'.format(
                            r=rnaid,i=i,g=gene,na=name)
                    new_event_gff_dic[gene][1][rnaid].append([chrom,'merge','exon',str(u[0]),str(u[1]),'.',
                                strand,'.',anno])
                i = 0
                for u in uni:
                    i += 1
                    anno = 'ID={r}.CDS{i};Parent={r};gene_id={g};Name={na};protein_id={r}'.format(
                            r=rnaid,i=i,g=gene,na=name)
                    new_event_gff_dic[gene][1][rnaid].append([chrom,'merge','CDS',str(u[0]),str(u[1]),'.',
                                strand,'.',anno])
        else: # the gene doesn't have proteins
            # keep every assembly as it is, minus any CDS rows
            for assm in assm_ids:
                known_assm_pos = [line for line in rna_all_dic[assm] if line[2]!='CDS']
                new_event_gff_dic[gene][1][assm]=known_assm_pos
        #------------- add new peptide to the gff file 
        if match_prs == []:# mapped gene doesn't have CDS,or have CDS but don't include pep
            n = 1
            for assm in assm_ids:
                # write the transcript to '<assm>.fa'; getORF removes that file again
                assm_record = pasa_assm_dic[assm]
                with open(assm+'.fa','w') as f:
                    SeqIO.write(assm_record,f,'fasta')
                orf = getORF(assm+'.fa',pep)
                assm_pos = copy.copy(pasa_pos_dic[assm])
                chrom = assm_pos[0][:-1]
                strand = assm_pos[0][-1]
                rnaid = assm
                gene_id = gene
                '''here assign to gene, in the followings if need to split the original genes,
                the gene id would be gene_asmbl_num. Otherwise, do not change gene id, directly
                add cds '''
                # getORF returns '' when no predicted ORF of this transcript contains the peptide
                if isinstance(orf,str):  # in another transcripts
                    continue
                cds_pos = getCDS_gff(assm_pos,orf)
                # write gene
                if gene_cds_pos!=[]:
                    # the gene already codes for a protein: emit the new ORF as a separate
                    # gene 'gene_<assm>' holding one transcript '<assm>_<n>'
                    print gene_id,assm,'new pep maps to it which already has coding protein'
                    update_assm.append(assm)
                    gene_id = 'gene_'+rnaid
                    rnaid = rnaid+'_'+str(n)
                    start = min(cds_pos[1:],key=lambda x:x[0])[0]
                    end = max(cds_pos[1:],key=lambda x:x[1])[1]
                    anno='ID={g};gene_id={g};Name={g}'.format(g=gene_id)
                    new_event_gff_dic[gene_id] = [[chrom,'merge','gene',str(start),str(end),
                                                '.',strand,'.',anno],{}]
                    anno=('ID={r};Parent={g};gene_id={g};Name={g};transcript_id={r};cs=4').format(
                                                    r=rnaid,g=gene_id)
                    new_event_gff_dic[gene_id][1][rnaid]=[[chrom,'merge','mRNA',str(start),str(end),
                                    '.',strand,'.',anno]]
                    # write assm
                    i = 0
                    for exon_p in cds_pos[1:]:
                        i +=1
                        anno=('ID={r}.exon{i};Parent={r};gene_id={g};Name={g};transcript_id={r}').format(
                                               r=rnaid,i=str(i),g=gene_id)
                        new_event_gff_dic[gene_id][1][rnaid].append([chrom,'merge','exon',str(exon_p[0]),
                                            str(exon_p[1]),'.',strand,'.',anno])
                else:
                    # non-coding gene: the CDS rows below are appended to the existing transcript
                    print gene_id,assm,'new pep maps to it without coding protein'
                    update_assm.append(assm)
                i = 0
                for cds_p in cds_pos[1:]:
                    i +=1
                    anno=('ID={r}.cds{i};Parent={r};gene_id={g};Name={g};protein_id={r}').format(
                                           r=rnaid,i=str(i),g=gene_id)
                    new_event_gff_dic[gene_id][1][rnaid].append([chrom,'merge','CDS',str(cds_p[0]),
                                        str(cds_p[1]),'.',strand,'.',anno])
            # NOTE(review): this increment is outside the `for assm` loop, so n is 1 for
            # every assembly above - confirm whether it was meant to be inside the loop.
            n += 1
        elif len(match_prs) > 1:
            # ambiguous case: only reported, nothing is added for the peptide
            print pep + ' maps to two or more rnas'
        else:
            for match_pr in match_prs[0:1]: # predicted protein has the peptide
                n = 1
                assm = match_pr.split('.')[0]
                print assm
                if gene_cds_pos != []: # known gff already has protein
                    known_cds_s,known_cds_e = min(gene_cds_pos),max(gene_cds_pos)
                    # skip when the peptide span and the known CDS span intersect
                    if p_s<=known_cds_s<=p_e or known_cds_s<=p_s<=known_cds_e:
                        print match_pr + ' new peptide overlap with known'
                        continue
                    transde_pr_pos = trans_pr_pos_dic[match_pr]
                    # create new gff
                    chrom = transde_pr_pos[0][:-1]
                    strand = transde_pr_pos[0][-1]
                    rnaid = assm+'_'+str(n)
                    gene_id = 'gene_'+rnaid
                    all_pos = [p for cds_p in transde_pr_pos[1:] for p in cds_p]
                    start = min(all_pos);end=max(all_pos)
                    anno='ID={g};gene_id={g};Name={g}'.format(g=gene_id)
                    new_event_gff_dic[gene_id] = [[chrom,'merge','gene',str(start),str(end),
                                                '.',strand,'.',anno],{}]
                    # unlike the ORF branch above, this mRNA row carries no cs attribute
                    anno=('ID={r};Parent={g};gene_id={g};Name={g};transcript_id={r}').format(
                                                    r=rnaid,g=gene_id)
                    new_event_gff_dic[gene_id][1][rnaid]=[[chrom,'merge','mRNA',str(start),str(end),
                                    '.',strand,'.',anno]]
                    # each TransDecoder CDS interval yields one exon row and one CDS row
                    i = 0
                    for cds_p in transde_pr_pos[1:]:
                        i += 1
                        anno=('ID={r}.exon{i};Parent={r};gene_id={g};Name={g};transcript_id={r}').format(
                                           r=rnaid,i=str(i),g=gene_id)
                        new_event_gff_dic[gene_id][1][rnaid].append([chrom,'merge','exon',str(cds_p[0]),
                                            str(cds_p[1]),'.',strand,'.',anno])
                        anno=('ID={r}.cds{i};Parent={r};gene_id={g};Name={g};protein_id={r}').format(
                                           r=rnaid,i=str(i),g=gene_id)
                        new_event_gff_dic[gene_id][1][rnaid].append([chrom,'merge','CDS',str(cds_p[0]),
                                            str(cds_p[1]),'.',strand,'.',anno])
            n += 1
    return new_event_gff_dic,rm_genes,update_assm

In [15]:
# Build the updated annotation from the proteogenomic events and write it to disk.
# NOTE(review): update_assms is only assigned inside this `if`; when 05_updated.gff3
# already exists the name stays undefined for the cells that use it later.
update_gff = pasa_path+'/05_updated.gff3'
if not os.path.exists(update_gff):
    # 1. generate the update assemblies
    new_event_gff_dic = {} # {geneid:[[geneline],{assm:[lines]}]}
    rm_genes = [] # stores which genes are modified using proteomics evidence.
    update_assms = []
    with open(uniq_event) as f:
        for line in f:
            if line.startswith('#'): continue
            item = line.strip().split('\t')
            # only events of this type are used
            if item[0] == 'transcript gene(non CDS)':
                new_event_gff_dic,rm_genes,update_assm = new_transcript_CDS(new_event_gff_dic,item,rm_genes,
                                    pasa_assm_dic,pasa_pos_dic,gene_rna_dic,rna_all_dic,
                                   gene_pos,trans_pr_pos_dic,gff_pr_dic,tr_pr_dic)
                update_assms += update_assm
    # 2. merge updated assemblies with the old one
    # update all_gff 1. copy the original gff. 2. remove the genes that should be updated. 3. add updated genes.
    # dict.copy() is shallow: the row lists are still shared with all_gff
    cp_all_gff = all_gff.copy()
    for rm_gene in set(rm_genes):
        del cp_all_gff[rm_gene]
    cp_all_gff.update(new_event_gff_dic)
    # output the updated gff file
    with open(update_gff,'w') as f:
        for gene,v in cp_all_gff.iteritems():
            # start/end may have been replaced by ints upstream; join() needs strings
            v[0][3] = str(v[0][3])
            v[0][4] = str(v[0][4])
            f.write('\t'.join(v[0])+'\n')
            for assm,items in v[1].iteritems():
                for item in items:
                    item[3] = str(item[3])
                    item[4] = str(item[4])
                    f.write('\t'.join(item)+'\n')

gene_1415 asmbl_4507 new pep maps to it without coding protein
gene_11811_1 asmbl_33799 new pep maps to it without coding protein
gene_28383 asmbl_80128 new pep maps to it without coding protein
gene_4459 asmbl_13136 new pep maps to it which already has coding protein
gene_4459 asmbl_13138 new pep maps to it which already has coding protein
gene_4459 asmbl_13139 new pep maps to it which already has coding protein
gene_4666 asmbl_13701 new pep maps to it without coding protein
gene_4666 asmbl_13702 new pep maps to it without coding protein
gene_16694 asmbl_47213 new pep maps to it without coding protein
gene_28567 asmbl_80639 new pep maps to it without coding protein
gene_29483 asmbl_83539 new pep maps to it which already has coding protein
gene_29483 asmbl_83541 new pep maps to it which already has coding protein
gene_17008_1 asmbl_48309 new pep maps to it without coding protein
gene_17008_1 asmbl_48287 new pep maps to it without coding protein
gene_17008_4 asmbl_48305 new pep maps to 

gene_23679 asmbl_67112 new pep maps to it without coding protein
gene_23679 asmbl_67113 new pep maps to it without coding protein
gene_11884 asmbl_33992 new pep maps to it without coding protein
gene_11884 asmbl_33993 new pep maps to it without coding protein
gene_21542 asmbl_60477 new pep maps to it which already has coding protein
gene_23802 asmbl_67366 new pep maps to it without coding protein
gene_23802 asmbl_67369 new pep maps to it without coding protein
gene_23802 asmbl_67371 new pep maps to it without coding protein
gene_23802 asmbl_67372 new pep maps to it without coding protein
gene_7045 asmbl_21010 new pep maps to it without coding protein
gene_7045 asmbl_21009 new pep maps to it without coding protein
gene_7045 asmbl_21011 new pep maps to it without coding protein
gene_11002_3 asmbl_32066 new pep maps to it without coding protein
gene_11002_3 asmbl_32067 new pep maps to it without coding protein
gene_11002_3 asmbl_32068 new pep maps to it without coding protein
gene_9399 as

gene_8430 asmbl_24871 new pep maps to it without coding protein
gene_8430 asmbl_24875 new pep maps to it without coding protein
gene_8430 asmbl_24872 new pep maps to it without coding protein
gene_24146 asmbl_68477 new pep maps to it without coding protein
gene_13453 asmbl_38708 new pep maps to it without coding protein
gene_22404 asmbl_63225 new pep maps to it without coding protein
gene_18714_3 asmbl_52746 new pep maps to it without coding protein
gene_18714_3 asmbl_52732 new pep maps to it without coding protein
gene_28529 asmbl_80518 new pep maps to it without coding protein


136 genes are updated. 208 transcripts map without coding proteins. 38 transcripts already have coding protein.

In [17]:
# add frame information
def add_cds_frame(in_gff,frame_gff):
    '''Copy in_gff to frame_gff and fill in column 8 (phase) of every CDS row.

    * in_gff: path of the input GFF3 file.
    * frame_gff: path of the output file; it is overwritten.

    Consecutive CDS rows that share a protein_id are treated as the segments of
    one coding sequence in the order in which they appear in the file. The first
    segment gets phase 0. Each following segment gets the GFF3 phase, i.e. the
    number of bases to skip at its start to reach the next codon:
    (3 - n % 3) % 3, where n is the number of CDS bases before it.

    Comment lines, rows of other feature types, rows with fewer than nine
    columns and CDS rows without a protein_id attribute are copied unchanged.
    '''
    # NOTE(review): segments are assumed to be listed 5'->3'; verify this holds
    # for reverse-strand transcripts in the input.
    with open(frame_gff,'w') as out, open(in_gff) as inf:
        pre_pr = None
        n = 0  # CDS bases of the current protein seen so far
        for line in inf:
            item = line.split('\t')
            if line.startswith('#') or len(item) < 9 or item[2] != 'CDS':
                out.write(line)
                continue
            hit = re.search('(?<=protein_id=).+?(?=$|;)',item[8])
            if hit is None:
                out.write(line)
                continue
            pr = hit.group(0)
            if pr != pre_pr:
                # first segment of a new protein: restart the base count
                pre_pr = pr
                n = 0
            item[7] = str((3 - n % 3) % 3)
            n += abs(int(item[3])-int(item[4]))+1
            out.write('\t'.join(item))

# Add phase information to the updated annotation; skipped when the output already exists.
# update_gff is assigned again here so this step does not depend on the cell that wrote the file.
update_gff = pasa_path+'/05_updated.gff3'
up_cds_frame_gff = pasa_path+'/06_updated_cds_frame.gff3'
if not os.path.exists(up_cds_frame_gff):
    add_cds_frame(update_gff,up_cds_frame_gff)

In [18]:
# extract proteins from gff
import sarge
# Translate the phase-annotated GFF into protein sequences with gffread
# (-g reference genome, -y protein FASTA output); skipped when the FASTA exists.
up_cds_frame_pr = pasa_path+'/06_updated_cds_frame_pr.fa'
if not os.path.exists(up_cds_frame_pr):
    gffread_args = dict(gff=up_cds_frame_gff,ref=ref_fa,fa=up_cds_frame_pr)
    cmd = ('gffread {gff} -g {ref} -y {fa}').format(**gffread_args)
    sarge.run(cmd)

#####  Functional annotation of the new added proteins.

In [19]:
#--------- 1. extract low quality proteins (new added) based on the transcripts id mapping.
# NOTE(review): update_assms is created only when the cell that writes 05_updated.gff3
# actually runs its body; on a fresh kernel with that file present this raises NameError.
low_qual_pr_fn = pasa_path+'/07_update_low_qual_pr.fa'
if not os.path.exists(low_qual_pr_fn):
    update_pr_index = SeqIO.index(up_cds_frame_pr,'fasta')
    with open(low_qual_pr_fn,'w') as f:
        for assm in update_pr_index:
            # keeps only proteins whose FASTA id equals an updated assembly id
            if assm in update_assms:
                SeqIO.write(update_pr_index[assm],f,'fasta')
#---------- 2. blastp these proteins to uniprot
uniprot_db = pasa_path+'/uniprot_blastDb/uniprot'
low_qual_pr2uni = pasa_path + '/08_update_low_qual_pr2uni.txt'
if not os.path.exists(low_qual_pr2uni):
    # tabular output (-outfmt 6), at most one aligned subject per query
    cmd = ('blastp -db {uni} -query {q} -out {out} -outfmt 6 -num_alignments 1 -num_threads 9 -evalue 1').format(
            uni=uniprot_db,q=low_qual_pr_fn,out=low_qual_pr2uni)
    sarge.run(cmd)
#---------- 3. add gene name to the blastp mapping results.
# first build {uniprot_accession: gene name } dictionary
uni_id_map_fn = pasa_path + '/uniprot_idmap.txt'
df = pd.read_csv(uni_id_map_fn,sep='\t',header=None,names=['acc','name'])
df['name'] = df['name'].map(lambda x: str(x).upper())
acc_name_dic = df.set_index('acc')['name'].to_dict()
# second build {transcripts:gene name} dictionary
df = pd.read_csv(low_qual_pr2uni,sep='\t',header=None)
# the accession is taken as the second '|'-separated field of the subject id (column 1)
df['acc'] = df[1].map(lambda x: x.split('|')[1])
# accessions missing from the id map keep the accession itself as their name
df['gene'] =df['acc'].map(lambda x:acc_name_dic[x] if x in acc_name_dic else x)
# if a query occurs on several rows, to_dict() keeps the value of the last one
low_assm_gnm_dic = df.set_index(0)['gene'].to_dict()
low_assm_acc_dic = df.set_index(0)[1].to_dict()
# third output the new gene name
update_gnm_fn = pasa_path + '/10_update_gnm.txt'
if not os.path.exists(update_gnm_fn):
    with open(update_gnm_fn,'w') as out:
        # one row per transcript: transcript id, gene name, blast subject id
        for k,v in low_assm_gnm_dic.iteritems():
            out.write(k+'\t'+v+'\t'+low_assm_acc_dic[k]+'\n')

In [20]:
# update gene name in the updated gff file
# Every feature belonging to a transcript listed in low_assm_gnm_dic gets its Name=
# attribute replaced by the mapped gene name. A gene row is held back in `gene` and
# written together with its first mRNA/lncRNA row, renamed if that transcript is mapped.
# NOTE(review): a gene row that is not followed by any mRNA/lncRNA row is never written.
final_update_gff = pasa_path+'/09_updated_final.gff3'
up_cds_frame_gff = pasa_path+'/06_updated_cds_frame.gff3'
if not os.path.exists(final_update_gff):
    with open(up_cds_frame_gff) as in_f, open(final_update_gff,'w') as out:
        for line in in_f:
            if line.startswith('#'):
                out.write(line)
                continue
            item = line.strip().split('\t')
            if item[2] == 'gene':
                # buffer the raw gene line; n counts the rows seen since this gene row
                gene = line
                n = 1
            elif item[2] in ['mRNA','lncRNA']:
                rnaid = re.search('(?<=ID=).+?(?=;|$)',line).group(0)
                if rnaid in low_assm_gnm_dic:
                    if n == 1:
                        # first row after the gene row: rename the buffered gene and flush it
                        gene = re.sub('(?<=Name=).+?(?=;|$)',low_assm_gnm_dic[rnaid],gene)
                        out.write(gene)
                    line = re.sub('(?<=Name=).+?(?=;|$)',low_assm_gnm_dic[rnaid],line)
                else:
                    if n == 1:
                        out.write(gene)
                out.write(line)
                n += 1
            else:
                # child rows (exon, CDS, ...) are matched through their Parent transcript
                rnaid = re.search('(?<=Parent=).+?(?=;|$)',line).group(0)
                if rnaid in low_assm_gnm_dic:
                    line = re.sub('(?<=Name=).+?(?=;|$)',low_assm_gnm_dic[rnaid],line)
                out.write(line)
                n += 1

In [21]:
# Translate the final (renamed) annotation into protein sequences with gffread.
# The output path must be update_final_pr, the same file the guard checks. Writing to
# up_cds_frame_pr instead overwrote the step-06 protein FASTA and never created
# 09_updated_final_pr.fa, so the guard could never become false.
update_final_pr = pasa_path + '/09_updated_final_pr.fa'
if not os.path.exists(update_final_pr):
    cmd = ('gffread {gff} -g {ref} -y {fa}').format(gff=final_update_gff,
                                                    ref=ref_fa,
                                                    fa=update_final_pr)
    sarge.run(cmd)