# Mapping by genomic location

1) open pkl files and save as dict
2) open gff file
3) parse gff file, save gene ids and check if CDS positions overlap
4) compute overlap and save gene ids and overlap size for all
5) filter overlaps by overlap size -> which overlap length shold we use?

In [16]:
import pickle
import re
import pandas as pd
import random

In [17]:
def open_pkl(path):
    
    pkl_file = open(path, 'rb')
    dictionary = pickle.load(pkl_file)
    pkl_file.close()
    
    return dictionary

In [18]:
busco_augustus_pkl = open_pkl('../pkl_files/busco_augustus_CDS_positions_galga_v2.pkl')
busco_metaeuk_pkl = open_pkl('../pkl_files/busco_metaeuk_CDS_positions_galga_v2.pkl')
busco_augustus_species_pkl = open_pkl('../pkl_files/busco_augustus_with_species_parameter_CDS_positions_galga_v2.pkl')
fdog_augustus_pkl = open_pkl('../pkl_files/fdog_assembly_augustus_busco_CDS_positions_galga_v2.pkl')
fdog_metaeuk_pkl = open_pkl('../pkl_files/fdog_assembly_metaeuk_busco_CDS_positions_galga_v2.pkl')
fdog_metaeuk_sens_pkl = open_pkl('../pkl_files/fdog_assembly_metaeuk_busco_CDS_positions_galga_v2_sens.pkl')
compleasm_pkl = open_pkl('../pkl_files/compleasm_CDS_positions_galga_v2.pkl')

In [None]:
#Example NEMVE from ENA -> mapping with gene att: Name= or CDS att: protein_id=
DS480309.1      Genbank region  1       1000    .       +       .       ID=DS480309.1:1..1000;Dbxref=taxon:45351;dev-stage=larval;gbkey=Src;genome=genomic;mol_type=genomic DNA;strain=CH2 x CH6
DS480309.1      Genbank gene    668     847     .       +       .       ID=gene-NEMVEDRAFT_v1g157624;Name=NEMVEDRAFT_v1g157624;end_range=847,.;gbkey=Gene;gene_biotype=protein_coding;locus_tag=NEMVEDRAFT_v1g157624;partial=true;start_range=.,668
DS480309.1      Genbank mRNA    668     847     .       +       .       ID=rna-NEMVEDRAFT_v1g157624;Parent=gene-NEMVEDRAFT_v1g157624;end_range=847,.;gbkey=mRNA;locus_tag=NEMVEDRAFT_v1g157624;partial=true;product=predicted protein;start_range=.,668
DS480309.1      Genbank exon    668     847     .       +       .       ID=exon-NEMVEDRAFT_v1g157624-1;Parent=rna-NEMVEDRAFT_v1g157624;end_range=847,.;gbkey=mRNA;locus_tag=NEMVEDRAFT_v1g157624;partial=true;product=predicted protein;start_range=.,668
DS480309.1      Genbank CDS     668     847     .       +       0       ID=cds-EDO25302.1;Parent=rna-NEMVEDRAFT_v1g157624;Dbxref=NCBI_GP:EDO25302.1;Name=EDO25302.1;gbkey=CDS;locus_tag=NEMVEDRAFT_v1g157624;partial=true;product=predicted protein;protein_id=EDO25302.1;start_range=.,668

In [None]:
#Example Ensembl 10116   Rattus norvegicus       UP000002494     Ensembl GCA_015227675.2
#maping possible by gene att: ID=gene:
1       mRatBN7.2       region  1       260522016       .       .       .       ID=region:1;Alias=CM026974.1,NC_051336.1
1       ensembl gene    76834   358271  .       +       .       ID=gene:ENSRNOG00000070568;Name=Vom2r3;biotype=protein_coding;description=vomeronasal 2 receptor%2C 3 [Source:RGD Symbol%3BAcc:1565892];gene_id=ENSRNOG00000070568;version=1
1       ensembl mRNA    76834   358271  .       +       .       ID=transcript:ENSRNOT00000096177;Parent=gene:ENSRNOG00000070568;Name=Vom2r3-201;biotype=protein_coding;tag=Ensembl_canonical;transcript_id=ENSRNOT00000096177;version=1
1       ensembl five_prime_UTR  76834   76908   .       +       .       Parent=transcript:ENSRNOT00000096177
1       ensembl exon    76834   77114   .       +       .       Parent=transcript:ENSRNOT00000096177;constitutive=1;exon_id=ENSRNOE00000506230;rank=1;version=2
1       ensembl CDS     76909   77114   .       +       0       ID=CDS:ENSRNOP00000084716;Parent=transcript:ENSRNOT00000096177;protein_id=ENSRNOP00000084716;version=1
1       ensembl exon    79753   80035   .       +       .       Parent=transcript:ENSRNOT00000096177;constitutive=1;exon_id=ENSRNOE00000620383;rank=2;version=1
1       ensembl CDS     79753   80035   .       +       1       ID=CDS:ENSRNOP00000084716;Parent=transcript:ENSRNOT00000096177;protein_id=ENSRNOP00000084716;version=1

In [19]:
def parse_gff_ena(path, sp, mapping_uniprot_id):
    gff_file = open(path, 'r')
    lines = gff_file.readlines()
    gff_dic = {}
    for line in lines:
        line = line.rstrip()
        if line.startswith('#'):
            continue
        else:
            contig, source, typ, start, end, score, strand, phase, att = line.split('\t')
            if typ == 'region':
                gff_dic[contig] = {}
            elif typ == 'gene':
                gene_name = re.search(r'Name=(.*?);', att).group(1)
                transcript = 0
            elif typ == 'mRNA':
                transcript += 1
            elif typ == 'CDS':
                # all CDs have the same ids so we can not save them as a dict
                cds_names = set()
                try:
                    cds_name = re.search(r'Name=(.*?);', att).group(1)
                    
                except AttributeError:
                    cds_name = re.search(r'ID=(.*?);', att).group(1)
                cds_names.add(cds_name)
                try:
                    cds_locus = re.search(r'locus_tag=(.*?);', att).group(1)
                except AttributeError:
                    cds_locus = re.search(r'gene=(.*?);', att).group(1)
                cds_names.add(cds_locus)
                try:
                    cds_standard = re.search(r'standard_name=(.*?$)', att).group(1)
                    cds_names.add(cds_standard)
                except AttributeError:
                    pass
                
                try:
                    gff_dic[contig][gene_name].append([int(start), int(end), score, strand, phase, cds_names, transcript])
                except KeyError:
                    gff_dic[contig][gene_name] = []
                    transcript = 1
                    gff_dic[contig][gene_name].append([int(start), int(end), score, strand, phase, cds_names, transcript])
    return gff_dic           

In [20]:
def parse_gff_ensembl(path, sp, mapping_uniprot_id):
    gff_file = open(path, 'r')
    lines = gff_file.readlines()
    gff_dic = {}
    contig_mapping = {}
    for line in lines:
        line = line.rstrip()
        if line.startswith('#'):
            continue
        else:
            contig, source, typ, start, end, score, strand, phase, att = line.split('\t')
            if typ == 'region':
                #print(line)
                try:
                    alias = re.search(r';Alias=(.*?$)', att).group(1)
                    for i in alias.split(','):
                        contig_mapping[i] = contig
                except AttributeError:
                    pass
                gff_dic[contig] = {}
            elif typ == 'gene':
                cds_names = set()
                gene_name = re.search(r'ID=gene:(.*?);', att).group(1)
                try:
                    gene_alias = re.search(r';Name=(.*?);', att).group(1)
                    cds_names.add(gene_alias)
                except AttributeError:
                    pass
                try:
                    entrenz_alias = re.search(r'BAcc:(.*?)]', att).group(1)
                    cds_names.add(entrenz_alias)
                except AttributeError:
                    pass
                cds_names.add(gene_name)
                try:
                    print(gff_dic[contig][gene_name])
                    pass
                except KeyError:
                    try:
                        gff_dic[contig][gene_name] = []
                    except KeyError:
                        try:
                            gff_dic[contig][gene_name] = []
                        except KeyError:
                            gff_dic[contig] = {}
                            gff_dic[contig][gene_name] = []
            elif typ == 'ncRNA_gene':
                cds_names = set()
                gene_name = re.search(r'ID=gene:(.*?);', att).group(1)
                try:
                    gene_alias = re.search(r';Name=(.*?);', att).group(1)
                    cds_names.add(gene_alias)
                except AttributeError:
                    pass
                cds_names.add(gene_name)
                #print(line)
                try:
                    print(gff_dic[contig][gene_name])
                except KeyError:
                    try:
                        gff_dic[contig][gene_name] = []
                    except KeyError:
                        try:
                            gff_dic[contig][gene_name] = []
                        except KeyError:
                            gff_dic[contig] = {}
            elif typ == 'mRNA':
                transcript = re.search(r'ID=transcript:(.*?);', att).group(1)
            elif typ == 'CDS':
                # all CDs have the same ids so we can not save them as a dict
                cds_name = re.search(r'ID=CDS:(.*?);', att).group(1)
                cds_names.add(cds_name)
                try:
                    gff_dic[contig][gene_name].append([int(start), int(end), score, strand, phase, cds_names, transcript])
                except KeyError:
                    print(line)
                    gff_dic[contig][gene_name].append([int(start), int(end), score, strand, phase, cds_names, transcript])
    return gff_dic, contig_mapping

In [21]:
def parse_species_file(file):
    lines = file.readlines()
    species_dict = {}
    for line in lines:
        line = line.rstrip()
        ncbi, name, uniprot_acc, source, refseq_acc = line.split('\t')
        species_dict[ncbi] = {'name': name, 'uniprot': uniprot_acc, 'source': source, 'refseq': refseq_acc}
    return species_dict

In [22]:
def mapping_uniprot(path):
    uniprot_mapping_dict = {}
    file = open(path, 'r')
    lines = file.readlines()
    counter = 0
    #not_unique_source = {'ZFIN','Xenbase','RGD','CRC64','GI','UniParc','OMA','ESTHER','MEROPS','Gene_ORFName','DNASU','Gene_Synonym', 'UniRef100', 'UniRef90', 'UniRef50', 'NCBI_TaxID', 'OrthoDB', 'EMBL', 'EMBL-CDS', 'VEuPathDB', 'eggNOG', 'GeneTree', 'ChiTaRS', 'HOGENOM', 'UniPathway','GlyConnect', 'GuidetoPHARMACOLOGY', 'DrugBank', 'BioGRID', 'UCSC', 'PDB', 'ComplexPortal', 'TCDB', 'ChEMBL', 'DIP', 'KEGG', 'TreeFam', 'Reactome', 'STRING', 'Allergome'}
    for line in lines:
        line = line.rstrip()
        uniprot, source, source_id = line.split('\t')
        #if source in not_unique_source:
         #   continue
        if source.startswith('Ensembl'):
            # some of the ensample names also have versions given at the end of the id seperated by a dot. But the version number is often not given in the gff gene name
            prefix_source_id = source_id.split('.')[0]
            try:
                uniprot_mapping_dict[prefix_source_id].add(uniprot)
            except KeyError:
                uniprot_mapping_dict[prefix_source_id] = set()
                uniprot_mapping_dict[prefix_source_id].add(uniprot)
        try:
            uniprot_mapping_dict[source_id].add(uniprot)
        except KeyError:
            uniprot_mapping_dict[source_id] = set()
            uniprot_mapping_dict[source_id].add(uniprot)
    return uniprot_mapping_dict
    

In [23]:
# read in assembly information
#species_file = open('../../data/fDOG-assembly/species_set_benchmark.tsv', 'r')
species_file = open('../../data/fDOG-assembly/species_set_benchmark_v2.tsv', 'r')
species_dict = parse_species_file(species_file)
species_file.close()

In [24]:
gff_path  = '../../data/qfo_eukaryota_2022/ncbi_download/ncbi_dataset/data/'
positions_gff = {}
contig_mapping_dict = {}
mapping_files_path = '/share/gluster/Projects/hannah/fDOG-assembly/benchmark/data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/'
for sp in species_dict:
    print(sp)
    mapping_uniprot_ids = mapping_uniprot(mapping_files_path + species_dict[sp]['uniprot'] + '_' + sp + '.idmapping')
    if species_dict[sp]['source'] == 'Ensembl':
        print(gff_path + species_dict[sp]['refseq']+ '/genes.gff3')
        try:
            print('Ensembl')
            species_gff_dict, contig_mapping = parse_gff_ensembl(gff_path + species_dict[sp]['refseq']+ '/genes.gff3', sp, mapping_uniprot_ids)
        except FileNotFoundError:
            print('ENA')
            species_gff_dict = parse_gff_ena(gff_path + species_dict[sp]['refseq']+ '/genomic.gff', sp, mapping_uniprot_ids)
        positions_gff[sp] = species_gff_dict
        contig_mapping_dict[sp] = contig_mapping
    else:
        print(gff_path + species_dict[sp]['refseq']+ '/genomic.gff')
        print('ENA')
        species_gff_dict = parse_gff_ena(gff_path + species_dict[sp]['refseq']+ '/genomic.gff', sp, mapping_uniprot_ids)
        positions_gff[sp] = species_gff_dict

45351
../../data/qfo_eukaryota_2022/ncbi_download/ncbi_dataset/data/GCA_000209225.1/genomic.gff
ENA
10116
../../data/qfo_eukaryota_2022/ncbi_download/ncbi_dataset/data/GCA_015227675.2/genes.gff3
Ensembl
9031
../../data/qfo_eukaryota_2022/ncbi_download/ncbi_dataset/data/GCA_000002315.5/genes.gff3
Ensembl
8364
../../data/qfo_eukaryota_2022/ncbi_download/ncbi_dataset/data/GCF_000004195.4/genomic.gff
ENA
7955
../../data/qfo_eukaryota_2022/ncbi_download/ncbi_dataset/data/GCF_000002035.6/genomic.gff
ENA
7227
../../data/qfo_eukaryota_2022/ncbi_download/ncbi_dataset/data/GCA_000001215.4/genomic.gff
ENA
7070
../../data/qfo_eukaryota_2022/ncbi_download/ncbi_dataset/data/GCA_000002335.3/genomic.gff
ENA
6945
../../data/qfo_eukaryota_2022/ncbi_download/ncbi_dataset/data/GCA_000208615.1/genes.gff3
Ensembl
ENA
6412
../../data/qfo_eukaryota_2022/ncbi_download/ncbi_dataset/data/GCA_000326865.1/genes.gff3
Ensembl
ENA
6239
../../data/qfo_eukaryota_2022/ncbi_download/ncbi_dataset/data/GCA_000002985.3/geno

In [25]:
def sort_ref_positional_dict(contig_gff):
    # input dict structure: positions_gff[ncbi][contig][gene][transcripts_list[start,end,score, strand, phase, cds_names, transcript]] 
    # should sort genes according to start position on contig
    sorted_dict = dict(sorted(contig_gff.items(), key=lambda item: item[0][0]))
    return sorted_dict
    
    

In [26]:
for species in positions_gff:
    for contig in positions_gff[species]:
        positions_gff[species][contig] = sort_ref_positional_dict(positions_gff[species][contig])
        break

In [27]:
def give_overlap(start, end, strand, start_r, end_r, strand_r):
    overlap = 0
    if strand != strand_r:
        return overlap
    elif end > start_r and end <= end_r:
        overlap = end - max(start_r, start)
    elif start >= start_r and start < end_r:
        overlap = min(end_r, end) - start
    return overlap

In [28]:
def get_coding_length(cds_list):
    cds_dict = {}
    for cds_ref in cds_list:
        start_r, end_r, score_r, strand_r, phase_r, cds_name_r, transcript_id = cds_ref
        try:
            cds_dict[transcript_id] += int(end_r) - int(start_r)
        except KeyError:
            cds_dict[transcript_id] = int(end_r) - int(start_r)
    return cds_dict

In [29]:
def get_overlap_table(pkl_file, positions_gff = positions_gff):    
    mapping_table = []
    mapping_files_path = '/share/gluster/Projects/hannah/fDOG-assembly/benchmark/data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/'
    overlap_dict = {'Species': [], 'GeneID':[], 'transcript':[], 'overlap': [], 'uniprotID': [], 'coverage': []}
    cant_map = set()
    multi_map = set()
    check = False
    p = False
    statistic_dict = {'Species': [], '#Orthologs': [], 'No_overlap': []}
    for key in pkl_file:
        # no overlap with any CDS of a gene in ref annotation file
        no_overlap = 0
        # number of orthologs predicted in total
        gene_count = 0
        cant_map = set()
        multi_map = set()
        print(key)
        ncbi = key.split('@')[1]
        #if key == 'GALGA@9031@000002315_5':
        if key == 'GALGA@9031@QfO22':
           continue
        ######## for human proteom bechmark ###########
        #if key != 'RATNO@10116@QfO22' and key != 'NEMVE@45351@QfO22' :
         #   print('Not right key')
          #  print(key)
           # continue
        ################testing###########
        #if key != 'RATNO@10116@QfO22':
         #   continue
        ########### teating############
        #dict scheme: position_dict_cds[ass]:[contig]:[geneid]:[transcript_id]:list([source, typ, int(start), int(end), strand, phase, att])
        #dict scheme contig_mapping_dict: gff_dic[species]:[contig]:[gene_name]:list([start, end, score, strand, phase, cds_name])
        print(species_dict[ncbi])
        mapping_uniprot_ids = mapping_uniprot(mapping_files_path + species_dict[ncbi]['uniprot'] + '_' + ncbi + '.idmapping')
        for contig in pkl_file[key]:
            try:
                genes = positions_gff[ncbi][contig]
                contig_ref = contig
            except KeyError:
                try:
                    alias = contig_mapping_dict[ncbi][contig]
                    genes = positions_gff[ncbi][alias]
                    contig_ref = alias
                except KeyError:
                    print(key)
                    print(contig)
                    print(ncbi)
                    print(species_dict[ncbi])
            for gene in pkl_file[key][contig]:
                ###### testing #############
                #if gene == '147873at33208':
                 #   p = True
                  #  print('Gene found in pkl')
                #else:
                 #   p = False
                ###### testing #############
                for transcript in pkl_file[key][contig][gene]:
                    cds_list = pkl_file[key][contig][gene][transcript]
                    ###### testing #########
                    #if p == True:
                    #    print('CDS list')
                    #    print(cds_list)
                    #else:
                    #    continue
                    ###### testing #########
                    overlaps = {'uniprotID':[], 'overlap':[], 'transcript_id': [], 'gene_ref_name':[]}
                    length_transcript = 0
                    #cds_sum_length_dict = {}
                    cds_sum_dict = {}
                    for cds in cds_list:
                        #try:
                        source, typ, start, end, strand, phase, att = cds
                        length_transcript += end - start
                        #except ValueError:
                         #   print(cds)
                        for gene_ref in positions_gff[ncbi][contig_ref]:
                            #if check == True:
                              #  check = False
                              #  break
                            for cds_ref in positions_gff[ncbi][contig_ref][gene_ref]:
                                try:
                                    start_r, end_r, score_r, strand_r, phase_r, cds_name_r, transcript_id = cds_ref
                                except ValueError:
                                    print(cds_ref)
                                try:
                                    cds_sum_dict[transcript_id]
                                except KeyError:
                                    cds_sum_dict[transcript_id] = int(end_r) - int(start_r)
                                #if min(int(end_r),int(start_r)) > max(int(end),int(start)):
                                    #check = True
                                    #break
                                    #is not working because the dictionaries are not correctly sorted
                                overlap = give_overlap(int(start), int(end), strand, int(start_r), int(end_r), strand_r)
                                if overlap != 0:
                                    multi = False
                                    if type(cds_name_r) == set:
                                        uniprot_id_r = None
                                        for i in cds_name_r:
                                            try:
                                                uniprot_ids = mapping_uniprot_ids[i]
                                                if len(uniprot_ids) > 1:
                                                    multi = True
                                                else:
                                                    uniprot_id_r = list(uniprot_ids)[0]
                                                    break
                                            except KeyError:
                                                pass
                                        if uniprot_id_r != None:
                                            overlaps['uniprotID'].append(uniprot_id_r)
                                        elif uniprot_id_r == None and multi == True:
                                            uniprot_id_r = 'NA'
                                            multi_map.add(next(iter(cds_name_r)))
                                            overlaps['uniprotID'].append(uniprot_id_r)
                                        else:
                                            #print(cds_name_r)
                                            if uniprot_id_r == None:
                                                uniprot_id_r = 'NA'
                                            overlaps['uniprotID'].append(uniprot_id_r)
                                            cant_map.add(next(iter(cds_name_r)))
                                    else:
                                        try:
                                            uniprot_ids = mapping_uniprot_ids[i]
                                            if len(uniprot_ids) > 1:
                                                    multi = True
                                                    multi_map.add(next(iter(cds_name_r)))
                                                    overlaps['uniprotID'].append('NA')
                                            else:
                                                uniprot_id_r = list(uniprot_ids)[0]
                                                overlaps['uniprotID'].append(uniprot_id_r)
                                        except KeyError: 
                                            cant_map.add(cds_name_r)
                                            overlaps['uniprotID'].append('NA')
                                    overlaps['overlap'].append(overlap)
                                    overlaps['transcript_id'].append(transcript_id)
                                    overlaps['gene_ref_name'].append(gene_ref)
                            cds_sum_dict[gene_ref] = get_coding_length(positions_gff[ncbi][contig_ref][gene_ref])
                    #print(overlaps)
                    df = pd.DataFrame(overlaps)
                    ###### testing #############
                    #if p == True:
                    #    print('overlaps')
                    #    print(df)
                    ###### testing #############
                    df_sum = df.groupby(['uniprotID', 'transcript_id', 'gene_ref_name']).sum()
                    ##### Only Max Uniprot###################
                    #max_ids = df_sum.reset_index().iloc[df_sum.reset_index().overlap.idxmax()]
                    #print(max_ids)
                    #overlap_dict['Species'].append(ncbi)
                    #overlap_dict['GeneID'].append(gene)
                    #overlap_dict['TranscriptID'].append(transcript)
                    #overlap_dict['overlap'].append(max_ids['overlap'])
                    #overlap_dict['uniprotID'].append(max_ids['uniprotID'])
                    #print(overlap_dict)
                    #### Max per Uniprot#######################
                    max_length = df_sum.groupby(['uniprotID', 'transcript_id', 'gene_ref_name'])['overlap'].max()
                    sum_lengths_list = max_length.reset_index().values.tolist()
                    gene_count += 1
                    ###### testing #############
                    #if p == True:
                    #    print('df_sum')
                    #    print(df_sum)
                    #    print('max_length')
                    #    print(max_length)
                    #    print('sum_lenght_list')
                    #    print(sum_lengths_list)
                    ###### testing #############
                    for row in sum_lengths_list:
                        #print(row)
                        overlap_dict['Species'].append(ncbi)
                        overlap_dict['GeneID'].append(gene)
                        overlap_dict['transcript'].append(transcript)
                        overlap_dict['overlap'].append(row[3])
                        if row[0] == 'NA':
                            overlap_dict['uniprotID'].append(None)
                        else: 
                            overlap_dict['uniprotID'].append(row[0])
                        overlap_dict['coverage'].append(int(row[3])/min(length_transcript,cds_sum_dict[row[2]][row[1]]))
                        cov = int(row[3])/min(length_transcript,cds_sum_dict[row[2]][row[1]])
                        if cov > 1:
                            print(ncbi)
                            print(gene)
                            print(row)
                            print(length_transcript)
                            print(cds_sum_dict[row[2]][row[1]])
                            print(cds_list)
                            print(df)
                            print(cov)
                    if overlaps['uniprotID'] == []:
                        #print(ncbi)
                        #print(gene)
                        no_overlap += 1
                        overlap_dict['Species'].append(ncbi)
                        overlap_dict['GeneID'].append(gene)
                        overlap_dict['transcript'].append(transcript)
                        overlap_dict['overlap'].append(None)
                        overlap_dict['uniprotID'].append(None)
                        overlap_dict['coverage'].append(None)
        
        print(f'Gene_count %d, No overlap %d, Cant map set %d, Multi map set %d'% (gene_count, no_overlap, len(cant_map), len(multi_map)))
        print('Multi map set')
        print(multi_map)
        print('Cant map examples')
        if len(cant_map) > 3:
            print(random.sample(sorted(cant_map), 3))
        else:
            print(cant_map)
        #statistic_dict = {'Species': [], '#Orthologs': [], 'No_overlap': []}
        statistic_dict['Species'].append(ncbi)
        statistic_dict['#Orthologs'].append(gene_count)
        statistic_dict['No_overlap'].append(no_overlap)
    df_stat = pd.DataFrame(statistic_dict)
    display(df_stat)
        
    #print(len(overlap_dict['Species']), len(overlap_dict['GeneID']), len(overlap_dict['transcript']), len(overlap_dict['overlap']), len(overlap_dict['uniprotID']), len(overlap_dict[ 'coverage']))
    df_overlap = pd.DataFrame(overlap_dict)
    
    return df_overlap

In [30]:
print("Compleasm")
compleasm_df = get_overlap_table(compleasm_pkl)
compleasm_df.to_csv('../overlap_tables/compleasm_overlap_gff_files_gallus_v2.tsv', sep='\t', index=False)

Compleasm
CAEEL@6239@QfO22
{'name': 'Caenorhabditis elegans', 'uniprot': 'UP000001940', 'source': 'ENA/EMBL', 'refseq': 'GCA_000002985.3'}
Gene_count 729, No overlap 0, Cant map set 0, Multi map set 3
Multi map set
{'CELE_Y39A3CR.1', 'Y39A3CR.1d', 'Y39A3CR.1b'}
Cant map examples
set()
DANRE@7955@QfO22
{'name': 'Danio rerio', 'uniprot': 'UP000000437', 'source': 'RefSeq', 'refseq': 'GCF_000002035.6'}
Gene_count 1155, No overlap 1, Cant map set 0, Multi map set 1
Multi map set
{'NP_001017751.2'}
Cant map examples
set()
DROME@7227@QfO22
{'name': 'Drosophila melanogaster', 'uniprot': 'UP000000803', 'source': 'ENA/EMBL', 'refseq': 'GCA_000001215.4'}
Gene_count 957, No overlap 1, Cant map set 7, Multi map set 1
Multi map set
{'Dmel_CG7843'}
Cant map examples
['Dmel_CG9742', 'Dmel_CG1821', 'AAF48554.1']
GALGA@9031@000002315_5
{'name': 'Gallus gallus', 'uniprot': 'UP000000539', 'source': 'Ensembl', 'refseq': 'GCA_000002315.5'}
Gene_count 918, No overlap 8, Cant map set 18, Multi map set 2
Multi

Unnamed: 0,Species,#Orthologs,No_overlap
0,6239,729,0
1,7955,1155,1
2,7227,957,1
3,9031,918,8
4,6412,921,22
5,6945,928,67
6,45351,948,28
7,10116,1060,33
8,7070,955,3
9,8364,958,1


In [36]:
print("BSUCO augustus species")
busco_augustus_df = get_overlap_table(busco_augustus_species_pkl)
busco_augustus_df.to_csv('../overlap_tables/busco_augustus_species_overlap_gff_files_gallus_v2.tsv', sep='\t', index=False)

BSUCO augustus species
CAEEL@6239@QfO22
{'name': 'Caenorhabditis elegans', 'uniprot': 'UP000001940', 'source': 'ENA/EMBL', 'refseq': 'GCA_000002985.3'}
Gene_count 731, No overlap 1, Cant map set 10, Multi map set 3
Multi map set
{'Y39A3CR.1c', 'Y39A3CR.1d', 'Y39A3CR.1b'}
Cant map examples
['cds-CELE_Y71A12C.3', 'Y57G11C.21', 'cds-CELE_F27D4.3']
DANRE@7955@QfO22
{'name': 'Danio rerio', 'uniprot': 'UP000000437', 'source': 'RefSeq', 'refseq': 'GCF_000002035.6'}
Gene_count 1241, No overlap 16, Cant map set 0, Multi map set 1
Multi map set
{'NP_001017751.2'}
Cant map examples
set()
DROME@7227@QfO22
{'name': 'Drosophila melanogaster', 'uniprot': 'UP000000803', 'source': 'ENA/EMBL', 'refseq': 'GCA_000001215.4'}
Gene_count 958, No overlap 0, Cant map set 8, Multi map set 2
Multi map set
{'Dmel_CG7843', 'AAF49536.1'}
Cant map examples
['AAF58920.1', 'AAF48634.1', 'AAS65037.3']
GALGA@9031@000002315_5
{'name': 'Gallus gallus', 'uniprot': 'UP000000539', 'source': 'Ensembl', 'refseq': 'GCA_00000231

Unnamed: 0,Species,#Orthologs,No_overlap
0,6239,731,1
1,7955,1241,16
2,7227,958,0
3,9031,912,5
4,6412,894,16
5,6945,889,67
6,45351,931,36
7,10116,1018,60
8,7070,956,3
9,8364,958,1


In [None]:
print("BUSCO augustus")
busco_augustus_df = get_overlap_table(busco_augustus_pkl)
#busco_augustus_df.to_csv('../overlap_tables/busco_augustus_overlap_gff_files_gallus_v2.tsv', sep='\t', index=False)

BUSCO augustus
CAEEL@6239@QfO22
{'name': 'Caenorhabditis elegans', 'uniprot': 'UP000001940', 'source': 'ENA/EMBL', 'refseq': 'GCA_000002985.3'}


In [20]:
print("BUSCO metaeuk")
busco_metaeuk_df = get_overlap_table(busco_metaeuk_pkl)
#busco_metaeuk_df.to_csv('../overlap_tables/busco_metaeuk_overlap_gff_files.tsv', sep='\t', index=False)

BUSCO metaeuk
CAEEL@6239@QfO22
{'name': 'Caenorhabditis elegans', 'uniprot': 'UP000001940', 'source': 'ENA/EMBL', 'refseq': 'GCA_000002985.3'}
Gene_count 739, No overlap 1, Cant map set 4, Multi map set 3
Multi map set
{'Y39A3CR.1c', 'CAH2176098.1', 'CAH2176100.1'}
Cant map examples
['cds-CELE_B0250.3', 'F59E12.16b', 'F59E12.16a']
DANRE@7955@QfO22
{'name': 'Danio rerio', 'uniprot': 'UP000000437', 'source': 'RefSeq', 'refseq': 'GCF_000002035.6'}
Gene_count 1156, No overlap 2, Cant map set 0, Multi map set 1
Multi map set
{'NP_001017751.2'}
Cant map examples
set()
DROME@7227@QfO22
{'name': 'Drosophila melanogaster', 'uniprot': 'UP000000803', 'source': 'ENA/EMBL', 'refseq': 'GCA_000001215.4'}
Gene_count 961, No overlap 0, Cant map set 6, Multi map set 1
Multi map set
{'AAM68345.2'}
Cant map examples
['AAF45822.1', 'AAF48554.1', 'AAN12244.1']
GALGA@9031@000002315_5
{'name': 'Gallus gallus', 'uniprot': 'UP000000539', 'source': 'Ensembl', 'refseq': 'GCA_000002315.5'}
Gene_count 917, No overl

Unnamed: 0,Species,#Orthologs,No_overlap
0,6239,739,1
1,7955,1156,2
2,7227,961,0
3,9031,917,8
4,6412,972,64
5,6945,919,66
6,45351,957,20
7,10116,1276,169
8,7070,960,4
9,8364,958,4


In [15]:
print('##### fDOG-Assembly Augustus ########')
fdog_busco_augustus_df = get_overlap_table(fdog_augustus_pkl)
#fdog_busco_augustus_df.to_csv('../overlap_tables/fdog_ass_busco_augustus_overlap_gff_files_gallus_v2.tsv', sep='\t', index=False)

##### fDOG-Assembly Augustus ########
GALGA@9031@000002315_5
{'name': 'Gallus gallus', 'uniprot': 'UP000000539', 'source': 'Ensembl', 'refseq': 'GCA_000002315.5'}
Gene_count 897, No overlap 7, Cant map set 20, Multi map set 2
Multi map set
{'ENSGALP00000078034', 'TRAPPC2L'}
Cant map examples
['ENSGALG00000059040', 'ENSGALP00000091600', 'ENSGALG00000059518']
RATNO@10116@QfO22
{'name': 'Rattus norvegicus', 'uniprot': 'UP000002494', 'source': 'Ensembl', 'refseq': 'GCA_015227675.2'}
Gene_count 1009, No overlap 45, Cant map set 1, Multi map set 9
Multi map set
{'Ublcp1', '1561342', 'Rnf10', '620507', 'ENSRNOP00000058286', 'ENSRNOG00000055257', 'ENSRNOG00000013223', 'ENSRNOG00000002241', 'Ak6'}
Cant map examples
{'ENSRNOP00000049054'}
TRICA@7070@QfO22
{'name': 'Tribolium castaneum', 'uniprot': 'UP000007266', 'source': 'ENA/EMBL', 'refseq': 'GCA_000002335.3'}
Gene_count 945, No overlap 6, Cant map set 0, Multi map set 0
Multi map set
set()
Cant map examples
set()
CAEEL@6239@QfO22
{'name': 'Ca

Unnamed: 0,Species,#Orthologs,No_overlap
0,9031,897,7
1,10116,1009,45
2,7070,945,6
3,6239,728,3
4,7227,926,6
5,45351,872,32
6,6945,863,107
7,7955,1164,6
8,6412,803,26
9,8364,949,5


In [16]:
print('##### fDOG-Assembly MetaEuk ########')
fdog_busco_metaeuk_df = get_overlap_table(fdog_metaeuk_pkl)
#fdog_busco_metaeuk_df.to_csv('../overlap_tables/fdog_ass_busco_metaeuk_overlap_gff_files_gallus_v2.tsv', sep='\t', index=False)

##### fDOG-Assembly MetaEuk ########
CAEEL@6239@QfO22
{'name': 'Caenorhabditis elegans', 'uniprot': 'UP000001940', 'source': 'ENA/EMBL', 'refseq': 'GCA_000002985.3'}
Gene_count 730, No overlap 2, Cant map set 9, Multi map set 1
Multi map set
{'CELE_Y39A3CR.1'}
Cant map examples
['CELE_F52C6.13', 'F47G3.2', 'CELE_F55A3.7']
DANRE@7955@QfO22
{'name': 'Danio rerio', 'uniprot': 'UP000000437', 'source': 'RefSeq', 'refseq': 'GCF_000002035.6'}
Gene_count 1159, No overlap 9, Cant map set 0, Multi map set 1
Multi map set
{'NP_001017751.2'}
Cant map examples
set()
TRICA@7070@QfO22
{'name': 'Tribolium castaneum', 'uniprot': 'UP000007266', 'source': 'ENA/EMBL', 'refseq': 'GCA_000002335.3'}
Gene_count 949, No overlap 11, Cant map set 0, Multi map set 0
Multi map set
set()
Cant map examples
set()
DROME@7227@QfO22
{'name': 'Drosophila melanogaster', 'uniprot': 'UP000000803', 'source': 'ENA/EMBL', 'refseq': 'GCA_000001215.4'}
Gene_count 934, No overlap 10, Cant map set 10, Multi map set 1
Multi map set

Unnamed: 0,Species,#Orthologs,No_overlap
0,6239,730,2
1,7955,1159,9
2,7070,949,11
3,7227,934,10
4,45351,1039,73
5,6945,967,176
6,6412,871,22
7,8364,974,29
8,9031,921,28
9,10116,1071,112


In [16]:
print('##### fDOG-Assembly MetaEuk sens########')
fdog_busco_metaeuk_df = get_overlap_table(fdog_metaeuk_sens_pkl)
fdog_busco_metaeuk_df.to_csv('../overlap_tables/fdog_ass_busco_metaeuk_overlap_gff_files_gallus_v2_sens.tsv', sep='\t', index=False)

##### fDOG-Assembly MetaEuk sens########
CAEEL@6239@QfO22
{'name': 'Caenorhabditis elegans', 'uniprot': 'UP000001940', 'source': 'ENA/EMBL', 'refseq': 'GCA_000002985.3'}
Gene_count 737, No overlap 11, Cant map set 8, Multi map set 2
Multi map set
{'Y39A3CR.1c', 'CELE_Y39A3CR.1'}
Cant map examples
['CELE_Y102A5C.6', 'cds-CELE_F52C6.13', 'cds-CELE_F47G3.2']
TRICA@7070@QfO22
{'name': 'Tribolium castaneum', 'uniprot': 'UP000007266', 'source': 'ENA/EMBL', 'refseq': 'GCA_000002335.3'}
Gene_count 955, No overlap 16, Cant map set 0, Multi map set 0
Multi map set
set()
Cant map examples
set()
DROME@7227@QfO22
{'name': 'Drosophila melanogaster', 'uniprot': 'UP000000803', 'source': 'ENA/EMBL', 'refseq': 'GCA_000001215.4'}
Gene_count 944, No overlap 16, Cant map set 7, Multi map set 1
Multi map set
{'Dmel_CG7843'}
Cant map examples
['AAF48634.1', 'AAF48554.1', 'Dmel_CG46509']
NEMVE@45351@QfO22
{'name': 'Nematostella vectensis', 'uniprot': 'UP000001593', 'source': 'ENA/EMBL', 'refseq': 'GCA_0002092

Unnamed: 0,Species,#Orthologs,No_overlap
0,6239,737,11
1,7070,955,16
2,7227,944,16
3,45351,1048,81
4,7955,1172,10
5,9031,926,39
6,6412,871,28
7,8364,981,39
8,10116,1080,108
9,6945,972,187


In [None]:
###### output fDOG-Assembly augustus, new chicken, always iterating whole contig
RATNO@10116@QfO22
{'name': 'Rattus norvegicus', 'uniprot': 'UP000002494', 'source': 'Ensembl', 'refseq': 'GCA_015227675.2'}
Gene_count 1008, No overlap 45, Cant map 1, Multi map 9
Multi map set
{'1561342', 'ENSRNOG00000063018', '61932', 'ENSRNOP00000075579', '620870', 'Rnf10', 'ENSRNOP00000014632', 'ENSRNOG00000004477', 'ENSRNOP00000058286'}
Cant map examples
{'ENSRNOP00000049054'}
GALGA@9031@000002315_5
{'name': 'Gallus gallus', 'uniprot': 'UP000000539', 'source': 'Ensembl', 'refseq': 'GCA_000002315.5'}
Gene_count 898, No overlap 7, Cant map 20, Multi map 2
Multi map set
{'ENSGALP00000074383', 'ENSGALP00000090837'}
Cant map examples
['ENSGALP00000076009', 'ENSGALP00000078129', 'THUMPD1']
TRICA@7070@QfO22
{'name': 'Tribolium castaneum', 'uniprot': 'UP000007266', 'source': 'ENA/EMBL', 'refseq': 'GCA_000002335.3'}
Gene_count 946, No overlap 6, Cant map 0, Multi map 0
Multi map set
set()
Cant map examples
set()
DROME@7227@QfO22
{'name': 'Drosophila melanogaster', 'uniprot': 'UP000000803', 'source': 'ENA/EMBL', 'refseq': 'GCA_000001215.4'}
Gene_count 927, No overlap 6, Cant map 5, Multi map 1
Multi map set
{'AAM68345.2'}
Cant map examples
['Dmel_CG9177', 'Dmel_CG1524', 'Dmel_CG9742']
DANRE@7955@QfO22
{'name': 'Danio rerio', 'uniprot': 'UP000000437', 'source': 'RefSeq', 'refseq': 'GCF_000002035.6'}
Gene_count 1163, No overlap 6, Cant map 0, Multi map 1
Multi map set
{'osgep'}
Cant map examples
set()
IXOSC@6945@QfO22
{'name': 'Ixodes scapularis', 'uniprot': 'UP000001555', 'source': 'Ensembl', 'refseq': 'GCA_000208615.1'}
Gene_count 863, No overlap 107, Cant map 11, Multi map 0
Multi map set
set()
Cant map examples
['IscW_ISCW018510', 'EEC16001.1', 'IscW_ISCW018975']
GALGA@9031@QfO22
XENTR@8364@QfO22
{'name': 'Xenopus tropicalis', 'uniprot': 'UP000008143', 'source': 'RefSeq', 'refseq': 'GCF_000004195.4'}
Gene_count 949, No overlap 5, Cant map 0, Multi map 47
Multi map set
{'mrps5', 'XP_004914765.1', 'rrn3', 'XP_004913000.1', 'XP_012814324.1', 'c1qbp', 'XP_002938755.1', 'XP_002937906.1', 'supt4h1', 'NP_001007514.1', 'tbc1d7', 'XP_012814323.1', 'NP_001011344.1', 'XP_004918010.1', 'XP_012809948.1', 'XP_004912881.1', 'rpl28', 'nup107', 'gbf1', 'NP_001016280.1', 'XP_002934120.1', 'ctu2', 'c12orf4', 'NP_001016587.1', 'XP_012814321.1', 'XP_002939365.2', 'XP_012822075.1', 'XP_012814396.2', 'ankmy2', 'XP_004912577.1', 'XP_017947958.1', 'NP_989259.2', 'XP_002938554.1', 'NP_001165133.1', 'det1', 'NP_001016895.1', 'mrpl47', 'XP_012814322.1', 'fdxr', 'mst1r', 'tmem70', 'utp18', 'NP_001027490.1', 'XP_004912704.1', 'poldip2', 'pus1', 'ubiad1'}
Cant map examples
set()
HELRO@6412@QfO22
{'name': 'Helobdella robusta', 'uniprot': 'UP000015101', 'source': 'Ensembl', 'refseq': 'GCA_000326865.1'}
Gene_count 803, No overlap 26, Cant map 0, Multi map 0
Multi map set
set()
Cant map examples
set()
NEMVE@45351@QfO22
{'name': 'Nematostella vectensis', 'uniprot': 'UP000001593', 'source': 'ENA/EMBL', 'refseq': 'GCA_000209225.1'}
Gene_count 873, No overlap 32, Cant map 39, Multi map 0
Multi map set
set()
Cant map examples
['cds-NEMVEDRAFT_v1g41253', 'NEMVEDRAFT_v1g2076', 'NEMVEDRAFT_v1g89647']
CAEEL@6239@QfO22
{'name': 'Caenorhabditis elegans', 'uniprot': 'UP000001940', 'source': 'ENA/EMBL', 'refseq': 'GCA_000002985.3'}
Gene_count 727, No overlap 3, Cant map 6, Multi map 3
Multi map set
{'Y39A3CR.1d', 'Y39A3CR.1b', 'Y39A3CR.1c'}
Cant map examples
['Y57G11C.21', 'cds-CELE_F47G3.2', 'F55A3.7']
14300 14300 14300 14300 14300 14300

In [21]:
print(positions_gff['10116']['5']['ENSRNOG00000019655'])

[[131423669, 131423695, '.', '-', '0', {'Atp6v0b', 'ENSRNOP00000026704', 'ENSRNOG00000019655'}, 'ENSRNOT00000026704'], [131424277, 131424467, '.', '-', '2', {'Atp6v0b', 'ENSRNOP00000026704', 'ENSRNOG00000019655'}, 'ENSRNOT00000026704'], [131424738, 131424789, '.', '-', '0', {'Atp6v0b', 'ENSRNOP00000026704', 'ENSRNOG00000019655'}, 'ENSRNOT00000026704'], [131424914, 131424983, '.', '-', '1', {'Atp6v0b', 'ENSRNOP00000026704', 'ENSRNOG00000019655'}, 'ENSRNOT00000026704'], [131425179, 131425256, '.', '-', '1', {'Atp6v0b', 'ENSRNOP00000026704', 'ENSRNOG00000019655'}, 'ENSRNOT00000026704'], [131425386, 131425444, '.', '-', '0', {'Atp6v0b', 'ENSRNOP00000026704', 'ENSRNOG00000019655'}, 'ENSRNOT00000026704']]


In [23]:
print(positions_gff['8364']['NC_030686.2']['rpl23'])

[[10661863, 10661875, '.', '+', '0', {'rpl23', 'NP_001011231.1'}, 1], [10663371, 10663454, '.', '+', '2', {'rpl23', 'NP_001011231.1'}, 1], [10665060, 10665188, '.', '+', '2', {'rpl23', 'NP_001011231.1'}, 1], [10665520, 10665633, '.', '+', '2', {'rpl23', 'NP_001011231.1'}, 1], [10667663, 10667745, '.', '+', '2', {'rpl23', 'NP_001011231.1'}, 1]]


In [30]:
print(fdog_augustus_pkl['RATNO@10116@QfO22']['CM026978.1']['537470at33208'])

{'537470at33208_CM026978_1_1_g4.t1': [['AUGUSTUS', 'CDS', 131423669, 131423695, '-', '0', 'ID=537470at33208_CM026978_1_1_g4.t1.cds;Parent=537470at33208_CM026978_1_1_g4.t1'], ['AUGUSTUS', 'CDS', 131424277, 131424467, '-', '2', 'ID=537470at33208_CM026978_1_1_g4.t1.cds;Parent=537470at33208_CM026978_1_1_g4.t1'], ['AUGUSTUS', 'CDS', 131424738, 131424789, '-', '0', 'ID=537470at33208_CM026978_1_1_g4.t1.cds;Parent=537470at33208_CM026978_1_1_g4.t1'], ['AUGUSTUS', 'CDS', 131424914, 131424983, '-', '1', 'ID=537470at33208_CM026978_1_1_g4.t1.cds;Parent=537470at33208_CM026978_1_1_g4.t1'], ['AUGUSTUS', 'CDS', 131425179, 131425256, '-', '1', 'ID=537470at33208_CM026978_1_1_g4.t1.cds;Parent=537470at33208_CM026978_1_1_g4.t1'], ['AUGUSTUS', 'CDS', 131425386, 131425469, '-', '1', 'ID=537470at33208_CM026978_1_1_g4.t1.cds;Parent=537470at33208_CM026978_1_1_g4.t1'], ['AUGUSTUS', 'CDS', 131425680, 131425728, '-', '2', 'ID=537470at33208_CM026978_1_1_g4.t1.cds;Parent=537470at33208_CM026978_1_1_g4.t1'], ['AUGUSTUS

## Human proteom fDOG-Assembly augustus

In [25]:
#Augustus with mapping file for augustus ref species, all proteins
fdog_human_proteom_augustus_pkl = open_pkl('../pkl_files/fdog_assembly_augustus_human_proteom_busco_CDS_positions.pkl')
fdog_human_proteom_augustus_df = get_overlap_table(fdog_human_proteom_augustus_pkl)
fdog_human_proteom_augustus_df.to_csv('../overlap_tables/fdog_ass_human_proteom_augustus_overlap_gff_files_rat_nema.tsv', sep='\t', index=False)

RATNO@10116@QfO22
{'name': 'Rattus norvegicus', 'uniprot': 'UP000002494', 'source': 'Ensembl', 'refseq': 'GCA_015227675.2'}
RATNO@10116@QfO22
AY172581.1
10116
{'name': 'Rattus norvegicus', 'uniprot': 'UP000002494', 'source': 'Ensembl', 'refseq': 'GCA_015227675.2'}
Gene_count 24025, No overlap 2668, Cant map 10, Multi map 130
Multi map set
{'ENSRNOP00000048081', '1303057', '1303008', 'ENSRNOG00000016029', 'Eci2', 'ENSRNOG00000003846', '2197', 'Il17f', '621512', 'ENSRNOG00000042111', '1303049', 'ENSRNOG00000033747', 'Ggt1', '2497', 'ENSRNOG00000024243', 'ENSRNOP00000059218', 'Ppp1r2', 'ENSRNOG00000018650', '1307768', 'Slc30a8', '2504', '621704', 'ENSRNOP00000024406', 'ENSRNOG00000003259', '1306313', 'ENSRNOG00000023433', 'Fev', '620371', 'ENSRNOG00000005592', 'ENSRNOG00000013223', 'ENSRNOP00000001449', 'Ncmap', 'ENSRNOP00000068417', 'Taco1', 'ENSRNOP00000006573', 'Hand2', '735199', 'Tor2a', '620438', 'ENSRNOP00000074503', 'ENSRNOP00000072729', 'ENSRNOG00000032517', 'Selenop', 'ENSRNOP000

In [22]:
#Augustus with mapping file for augustus ref species, 5000 human seed genes
fdog_human_proteom_augustus_pkl = open_pkl('../pkl_files/fdog_assembly_augustus_human_proteom_5t_CDS_positions.pkl')
fdog_human_proteom_augustus_df = get_overlap_table(fdog_human_proteom_augustus_pkl)
fdog_human_proteom_augustus_df.to_csv('../overlap_tables/fdog_ass_human_proteom_augustus_overlap_gff_files_rat_nema_5t.tsv', sep='\t', index=False)

TRICA@7070@QfO22
Not right key
TRICA@7070@QfO22
DROME@7227@QfO22
Not right key
DROME@7227@QfO22
NEMVE@45351@QfO22
{'name': 'Nematostella vectensis', 'uniprot': 'UP000001593', 'source': 'ENA/EMBL', 'refseq': 'GCA_000209225.1'}
Gene_count 3246, No overlap 262, Cant map set 158, Multi map set 0
Multi map set
set()
Cant map examples
['cds-NEMVEDRAFT_v1g61375', 'cds-NEMVEDRAFT_v1g61676', 'NEMVEDRAFT_v1g12630']
IXOSC@6945@QfO22
Not right key
IXOSC@6945@QfO22
DANRE@7955@QfO22
Not right key
DANRE@7955@QfO22
HELRO@6412@QfO22
Not right key
HELRO@6412@QfO22
GALGA@9031@QfO22
XENTR@8364@QfO22
Not right key
XENTR@8364@QfO22
RATNO@10116@QfO22
{'name': 'Rattus norvegicus', 'uniprot': 'UP000002494', 'source': 'Ensembl', 'refseq': 'GCA_015227675.2'}
Gene_count 5749, No overlap 664, Cant map set 3, Multi map set 26
Multi map set
{'ENSRNOG00000016429', 'Slc30a8', 'ENSRNOP00000048081', 'Fbp1', 'Selenop', 'ENSRNOP00000065923', 'Xkr6', '61932', 'ENSRNOP00000007455', 'ENSRNOG00000018650', 'ENSRNOG00000009640'

Unnamed: 0,Species,#Orthologs,No_overlap
0,45351,3246,262
1,10116,5749,664


In [21]:
#MetaEuk, 5000 human seed genes
fdog_human_proteom_metaeuk_pkl = open_pkl('../pkl_files/fdog_assembly_metaeuk_human_proteom_5t_CDS_positions.pkl')
fdog_human_proteom_metaeuk_df = get_overlap_table(fdog_human_proteom_metaeuk_pkl)
fdog_human_proteom_metaeuk_df.to_csv('../overlap_tables/fdog_ass_human_proteom_metaeuk_overlap_gff_files_rat_nema_5t.tsv', sep='\t', index=False)

HELRO@6412@QfO22
Not right key
HELRO@6412@QfO22
GALGA@9031@QfO22
XENTR@8364@QfO22
Not right key
XENTR@8364@QfO22
RATNO@10116@QfO22
{'name': 'Rattus norvegicus', 'uniprot': 'UP000002494', 'source': 'Ensembl', 'refseq': 'GCA_015227675.2'}
RATNO@10116@QfO22
AY172581.1
10116
{'name': 'Rattus norvegicus', 'uniprot': 'UP000002494', 'source': 'Ensembl', 'refseq': 'GCA_015227675.2'}
Gene_count 5647, No overlap 732, Cant map set 2, Multi map set 27
Multi map set
{'Slc30a8', 'ENSRNOP00000048081', 'Fbp1', 'Selenop', 'ENSRNOP00000065923', 'Xkr6', 'Cyp4f6', '61932', 'ENSRNOP00000007455', 'ENSRNOG00000018650', 'Syn3', 'Gdf10', 'ENSRNOG00000009640', '1310453', 'Rps4x-ps9', '628789', 'ENSRNOG00000049334', 'Arx', 'Gpr3', 'ENSRNOP00000059218', '1592346', 'Spem2', 'ENSRNOP00000071934', 'Rb1', '2913', 'Msh5', 'Aebp1'}
Cant map examples
{'41253011', 'ENSRNOG00000032348'}
CAEEL@6239@QfO22
Not right key
CAEEL@6239@QfO22
TRICA@7070@QfO22
Not right key
TRICA@7070@QfO22
IXOSC@6945@QfO22
Not right key
IXOSC@6945

Unnamed: 0,Species,#Orthologs,No_overlap
0,10116,5647,732
1,45351,4164,643


In [23]:
#MetaEuk sens, 5000 human seed genes
fdog_human_proteom_metaeuk_pkl = open_pkl('../pkl_files/fdog_assembly_metaeuk_sens_human_proteom_5t_CDS_positions.pkl')
fdog_human_proteom_metaeuk_df = get_overlap_table(fdog_human_proteom_metaeuk_pkl)
fdog_human_proteom_metaeuk_df.to_csv('../overlap_tables/fdog_ass_human_proteom_metaeuk_sens_overlap_gff_files_rat_nema_5t.tsv', sep='\t', index=False)

RATNO@10116@QfO22
{'name': 'Rattus norvegicus', 'uniprot': 'UP000002494', 'source': 'Ensembl', 'refseq': 'GCA_015227675.2'}
RATNO@10116@QfO22
AY172581.1
10116
{'name': 'Rattus norvegicus', 'uniprot': 'UP000002494', 'source': 'Ensembl', 'refseq': 'GCA_015227675.2'}
Gene_count 5689, No overlap 760, Cant map set 2, Multi map set 27
Multi map set
{'Slc30a8', 'ENSRNOP00000048081', 'Fbp1', 'Selenop', 'Xkr6', 'ENSRNOP00000065923', 'Cyp4f6', '61932', 'ENSRNOP00000007455', 'ENSRNOG00000018650', 'Syn3', 'ENSRNOG00000009640', 'Gdf10', '1310453', 'Rps4x-ps9', '628789', 'ENSRNOG00000049334', 'Arx', 'Gpr3', '1592346', 'Spem2', 'ENSRNOP00000059218', 'ENSRNOP00000071934', 'Rb1', '2913', 'Msh5', 'Aebp1'}
Cant map examples
{'41253011', 'ENSRNOG00000032348'}
NEMVE@45351@QfO22
{'name': 'Nematostella vectensis', 'uniprot': 'UP000001593', 'source': 'ENA/EMBL', 'refseq': 'GCA_000209225.1'}
Gene_count 4235, No overlap 742, Cant map set 131, Multi map set 0
Multi map set
set()
Cant map examples
['cds-NEMVEDRAF

Unnamed: 0,Species,#Orthologs,No_overlap
0,10116,5689,760
1,45351,4235,742
