# Parse fDOG-Assembly gff files to get genomic locations of all core groups and save them in a pkl file

1) get the core group
2) get the present species
3) open the gff files and parse them
4) write a dict
5) write dict to pkl file

In [1]:
import os
import pickle
import re

In [2]:
def parse_gene_list(path):
    """Read a text file and return its lines as a set.

    Trailing whitespace (including the newline) is stripped from every line
    before it is added, so duplicate lines collapse into a single entry.
    Blank lines are kept as the empty string, exactly as read.

    Args:
        path: Path of the text file to read.

    Returns:
        set: The right-stripped lines of the file.
    """
    # The context manager guarantees the handle is closed, which the previous
    # open()/readlines() version never did.
    with open(path, 'r') as handle:
        return {line.rstrip() for line in handle}

# BUSCO vs. fDOG-Assembly

In [3]:
# Load the identifiers listed in the gene list of the augustus_busco_core results.
gene_set = parse_gene_list(
    path='/share/gluster/Projects/hannah/fDOG-assembly/benchmark/results/augustus_busco_core/gene_list.txt'
)

In [4]:
def get_gff_pathes(path, gene_set):
    """Map every gene in *gene_set* to the content of its ``gff`` directory.

    The directory that is listed is ``path + gene + '/gff/'`` (plain string
    concatenation, so *path* has to end with a separator). Genes whose
    directory does not exist are printed and left out of the result.

    Args:
        path: Directory prefix that contains one sub-directory per gene.
        gene_set: Iterable of gene/group names.

    Returns:
        dict: gene name -> list of entry names returned by ``os.listdir``.
    """
    listings = {}
    for gene in gene_set:
        gff_dir = path + gene + '/gff/'
        try:
            entries = os.listdir(gff_dir)
        except FileNotFoundError:
            # Report the gene that has no gff directory and move on.
            print(gene)
        else:
            listings[gene] = entries
    return listings

In [5]:
def parse_gff(path, position_dict_cds, gene_name, ass):
    """Add the CDS features of one GFF file to *position_dict_cds*.

    The dictionary is nested as
    ``position_dict_cds[ass][contig][gene_name][transcript_id]`` and each
    leaf is a list of ``[source, type, start, end, strand, phase, attributes]``
    records, where ``start <= end`` are integers and the remaining fields are
    the raw column strings.

    A CDS is attached to the most recent ``transcript``/``mRNA`` line (its ID
    is taken from the ``ID=`` attribute). CDS lines that appear before both a
    ``gene`` line and a ``transcript``/``mRNA`` line were seen are reported on
    stdout and skipped.

    Args:
        path: Path of the GFF file (nine tab-separated columns per feature).
        position_dict_cds: Dictionary that is updated in place.
        gene_name: Key used on the gene level of the dictionary.
        ass: Key used on the top level of the dictionary.

    Returns:
        dict: The same *position_dict_cds* object that was passed in.

    Raises:
        ValueError: If a feature line does not have exactly nine columns or
            its start/end columns are not integers.
        AttributeError: If a ``transcript``/``mRNA`` line has no ``ID=``
            attribute.
    """
    # Explicit "not seen yet" markers replace the reliance on
    # UnboundLocalError, which was only caught on some code paths.
    geneid = None
    transcript_id = None

    # The context manager closes the file; the handle used to be leaked.
    with open(path, 'r') as gff:
        for line in gff:
            line = line.rstrip()
            # Skip comment lines and blank lines (the latter would otherwise
            # fail the nine-column unpacking below).
            if not line or line.startswith('#'):
                continue
            contig, source, typ, start_gff, end_gff, _score, strand, phase, att = line.split('\t')
            # Normalise the coordinates so that start <= end.
            start = min(int(start_gff), int(end_gff))
            end = max(int(start_gff), int(end_gff))

            if typ == 'gene':
                geneid = gene_name
            elif typ == 'transcript' or typ == 'mRNA':
                # Accept the ID both when it is followed by ';' and when it
                # is the last attribute on the line.
                transcript_id = re.search(r'ID=(.*?)(?:;|$)', att).group(1)
            elif typ == 'CDS':
                if geneid is None or transcript_id is None:
                    # Same diagnostic the UnboundLocalError handler printed.
                    print(line)
                    print(ass, gene_name, path)
                    print('ULE')
                    continue
                record = [source, typ, start, end, strand, phase, att]
                # Create any missing nesting level on the fly, then append.
                by_contig = position_dict_cds.setdefault(ass, {})
                by_gene = by_contig.setdefault(contig, {})
                by_transcript = by_gene.setdefault(geneid, {})
                by_transcript.setdefault(transcript_id, []).append(record)

    return position_dict_cds
        

## Augustus results

In [6]:
# Result directory that is scanned below; the trailing slash is required
# because the paths are built by string concatenation.
path_results = '/share/gluster/Projects/hannah/fDOG-assembly/benchmark/results/augustus_busco_core/'
# Group name -> entries of its gff/ directory; groups without one are printed.
gff_files = get_gff_pathes(path=path_results, gene_set=gene_set)

366124at33208


In [10]:
# Start from an empty dictionary; parse_gff adds the CDS records of every
# GFF file found under path_results to it.
position_dict_cds = {}

for group in gff_files:
    for file in gff_files[group]:
        if not file.endswith('.gff'):
            # Report and skip entries of the gff/ directory that are not GFF files.
            print(file)
            continue
        # Remove only the trailing extension; str.replace would also delete
        # a '.gff' that occurs inside the name itself.
        ass = file[:-len('.gff')]
        path = path_results + group + '/gff/' + file
        position_dict_cds = parse_gff(path, position_dict_cds, group, ass)

In [12]:
# Spot check: print the transcript -> CDS records stored for one group on one
# contig of one assembly (keys are hard-coded for this data set).
print(position_dict_cds['RATNO@10116@QfO22']['CM026976.1']['147873at33208'])
#147873at33208

{'147873at33208_CM026976_1_1_g3.t1': [['AUGUSTUS', 'CDS', 133350340, 133350517, '-', '1', 'ID=147873at33208_CM026976_1_1_g3.t1.cds;Parent=147873at33208_CM026976_1_1_g3.t1'], ['AUGUSTUS', 'CDS', 133351128, 133351286, '-', '1', 'ID=147873at33208_CM026976_1_1_g3.t1.cds;Parent=147873at33208_CM026976_1_1_g3.t1'], ['AUGUSTUS', 'CDS', 133352079, 133352170, '-', '0', 'ID=147873at33208_CM026976_1_1_g3.t1.cds;Parent=147873at33208_CM026976_1_1_g3.t1'], ['AUGUSTUS', 'CDS', 133352664, 133352816, '-', '0', 'ID=147873at33208_CM026976_1_1_g3.t1.cds;Parent=147873at33208_CM026976_1_1_g3.t1'], ['AUGUSTUS', 'CDS', 133354193, 133354243, '-', '0', 'ID=147873at33208_CM026976_1_1_g3.t1.cds;Parent=147873at33208_CM026976_1_1_g3.t1']]}


In [13]:
# Labels that are combined into the name of the output file.
tool = 'augustus'
gp_tool = 'fdog_assembly'
core_set = 'busco'
out_path = '../pkl_files/%s_%s_%s_CDS_positions_galga_v2.pkl'%(gp_tool,tool, core_set)
# Write the collected positions; the context manager closes the file even if
# pickling fails part-way.
with open(out_path, 'wb') as out:
    pickle.dump(position_dict_cds, out)

## MetaEuk results

In [8]:
# Result directory that is scanned below; the trailing slash is required
# because the paths are built by string concatenation.
path_results = '/share/gluster/Projects/hannah/fDOG-assembly/benchmark/results/metaeuk_busco_core/'
# Group name -> entries of its gff/ directory; groups without one are printed.
gff_files = get_gff_pathes(path=path_results, gene_set=gene_set)

366124at33208


In [9]:
# Start from an empty dictionary; parse_gff adds the CDS records of every
# GFF file found under path_results to it.
position_dict_cds = {}

for group in gff_files:
    for file in gff_files[group]:
        if not file.endswith('.gff'):
            # Report and skip entries of the gff/ directory that are not GFF files.
            print(file)
            continue
        # Remove only the trailing extension; str.replace would also delete
        # a '.gff' that occurs inside the name itself.
        ass = file[:-len('.gff')]
        path = path_results + group + '/gff/' + file
        position_dict_cds = parse_gff(path, position_dict_cds, group, ass)

In [10]:
# Labels that are combined into the name of the output file.
tool = 'metaeuk'
gp_tool = 'fdog_assembly'
core_set = 'busco'
out_path = '../pkl_files/%s_%s_%s_CDS_positions_galga_v2.pkl'%(gp_tool,tool, core_set)
# Write the collected positions; the context manager closes the file even if
# pickling fails part-way.
with open(out_path, 'wb') as out:
    pickle.dump(position_dict_cds, out)

# Human Proteome

In [22]:
# Replace gene_set with the identifiers listed in the human seed gene file.
gene_set = parse_gene_list(
    path='/share/gluster/Projects/hannah/fDOG-assembly/benchmark/data/fDOG/human_seed_genes/genes.txt'
)

In [23]:
# Result directory that is scanned below; the trailing slash is required
# because the paths are built by string concatenation.
path_results = '/share/gluster/Projects/hannah/fDOG-assembly/benchmark/results/human_proteom/augustus/'
# Group name -> entries of its gff/ directory; groups without one are printed.
gff_files = get_gff_pathes(path=path_results, gene_set=gene_set)

In [24]:
# Start from an empty dictionary; parse_gff adds the CDS records of every
# GFF file found under path_results to it.
position_dict_cds = {}

for group in gff_files:
    for file in gff_files[group]:
        if not file.endswith('.gff'):
            # Report and skip entries of the gff/ directory that are not GFF files.
            print(file)
            continue
        # Remove only the trailing extension; str.replace would also delete
        # a '.gff' that occurs inside the name itself.
        ass = file[:-len('.gff')]
        path = path_results + group + '/gff/' + file
        position_dict_cds = parse_gff(path, position_dict_cds, group, ass)

In [25]:
# Labels that are combined into the name of the output file.
tool = 'augustus_human_proteom'
gp_tool = 'fdog_assembly'
# NOTE(review): core_set stays 'busco' here although this file name is used
# for the human_proteom results - confirm that the label is intended before
# changing it, since downstream code may rely on the resulting file name.
core_set = 'busco'
out_path = '../pkl_files/%s_%s_%s_CDS_positions.pkl'%(gp_tool,tool, core_set)
# Write the collected positions; the context manager closes the file even if
# pickling fails part-way.
with open(out_path, 'wb') as out:
    pickle.dump(position_dict_cds, out)