# Script to analyse the Orthologs that were only found by the assembly based methods fDOG-Assembly and BUSCO

In [15]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [4]:
tools_ref = ['bbh', 'domainoid', 'ensemble', 'hieranoid', 'inparanoid', 'metaphors', 'oma_pairs', 'orthoffgc', 'orthofinder', 'orthoinspector', 'panther', 'rsd', 'sonicparanoid']
#tools = ['busco_metazoa_augustus', 'busco_metazoa_metaeuk', 'fdog_assembly_metazoa_augustus_galga_v2','fdog_assembly_metazoa_augustus', 'fdog_assembly_metazoa_augustus_fly', 'fdog_assembly_metazoa_metaeuk', 'fdog_assembly_metazoa_sens_metaeuk']
tools = ['busco_metazoa_augustus_gallus_v2', 'busco_metazoa_augustus_species_gallus_v2', 'busco_metazoa_metaeuk_gallus_v2', 'compleasm_metazoa_gallus_v2', 'fdog_assembly_metazoa_augustus_gallus_v2', 'fdog_assembly_metazoa_metaeuk_gallus_v2_sens']
path = '../../results/qfo_input/'

In [5]:
def create_set_of_sets(path):
    file = open(path, 'r')
    pairs_set = set()
    lines = file.readlines()
    for line in lines:
        line = line.rstrip()
        pairs = frozenset(line.split('\t'))
        pairs_set.add(pairs)
    #print(len(lines))
    print(len(pairs_set))
    return pairs_set

In [6]:
dict_of_sets = {}
for i in tools_ref:
    pairs_set = create_set_of_sets(path + i + '.tsv')
    dict_of_sets[i] = pairs_set

8738
8945
8168
8422
8487
8979
7973
8616
9372
9006
8906
8606
9126


In [7]:
for t in tools:
    print(t)
    pairs_set = create_set_of_sets(path + t + '.tsv')
    dict_of_sets[t] = pairs_set

busco_metazoa_augustus_gallus_v2
8567
busco_metazoa_augustus_species_gallus_v2
8588
busco_metazoa_metaeuk_gallus_v2
8621
compleasm_metazoa_gallus_v2
8714
fdog_assembly_metazoa_augustus_gallus_v2
8232
fdog_assembly_metazoa_metaeuk_gallus_v2_sens
8384


## Analyse assigned genes

In [8]:
#get the exclusive pairs for each assembly_based tool
exclusive_pair_dict = {}
for t in tools:
    exclusive_pairs = dict_of_sets[t]
    for ref_t in tools_ref:
        exclusive_pairs = exclusive_pairs - dict_of_sets[ref_t]
    print(t)
    print('All pairs')
    print(len(dict_of_sets[t]))
    print('Not found by any of the protein based tools')
    print(len(exclusive_pairs))
    exclusive_pair_dict[t] = exclusive_pairs


busco_metazoa_augustus_gallus_v2
All pairs
8567
Not found by any of the protein based tools
492
busco_metazoa_augustus_species_gallus_v2
All pairs
8588
Not found by any of the protein based tools
354
busco_metazoa_metaeuk_gallus_v2
All pairs
8621
Not found by any of the protein based tools
168
compleasm_metazoa_gallus_v2
All pairs
8714
Not found by any of the protein based tools
91
fdog_assembly_metazoa_augustus_gallus_v2
All pairs
8232
Not found by any of the protein based tools
133
fdog_assembly_metazoa_metaeuk_gallus_v2_sens
All pairs
8384
Not found by any of the protein based tools
127


In [9]:
#get seed genes
seed_file = open('../uniprotid_to_group_assignment/mapping_busco_id_uniport_id.tsv', 'r')
lines = seed_file.readlines()
seed_genes = set()
busco_vs_uniprot_dict = {}
for line in lines:
    line = line.rstrip()
    busco_id, uniprot_id = line.split('\t')
    seed_genes.add(uniprot_id)
    busco_vs_uniprot_dict[busco_id] = uniprot_id

In [10]:
def parse_species_file(file):
    lines = file.readlines()
    species_dict = {}
    for line in lines:
        line = line.rstrip()
        ncbi, name, uniprot_acc, source, refseq_acc = line.split('\t')
        species_dict[ncbi] = {'name': name, 'uniprot': uniprot_acc, 'source': source, 'refseq': refseq_acc}
    return species_dict

In [11]:
def get_uniprot_ids_for_species(file_path):
    file = open(file_path, 'r')
    lines = file.readlines()
    id_set = set()
    for line in lines:
        line = line.rstrip()
        uniprot_id = line.split('\t')[0]
        id_set.add(uniprot_id)
    file.close()
    return id_set

In [12]:
#create species to uniprot mapping
species_file = open('../../data/fDOG-assembly/species_set_benchmark_v2.tsv', 'r')
species_dict = parse_species_file(species_file)
species_file.close()

species_list = []
uniprot_dict = {} # dictionary species_ncbi_id:set(uniprot_ids of species)
for species in species_dict:
    print(species_dict[species]['name'])
    path = '../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/' + species_dict[species]['uniprot'] + '_' + species + '.idmapping'
    print(path)
    uniprot_dict[species] = get_uniprot_ids_for_species(path)
    species_list.append(species)

Nematostella vectensis
../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/UP000001593_45351.idmapping
Rattus norvegicus
../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/UP000002494_10116.idmapping
Gallus gallus
../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/UP000000539_9031.idmapping
Xenopus tropicalis
../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/UP000008143_8364.idmapping
Danio rerio
../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/UP000000437_7955.idmapping
Drosophila melanogaster
../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/UP000000803_7227.idmapping
Tribolium castaneum
../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/UP000007266_7070.idmapping
Ixodes scapularis
../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/UP000001555_6945.idmapping
Helobdella robusta
../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/UP000015101_6412.idmapping
Caenorhabditis elegans
../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/UP000001940_6239.idmappin

In [13]:
def assign_species(species_dict, gene):
    for sp in species_dict:
        if gene in species_dict[sp]:
            return sp
    print(gene)
    return 'Error'

In [16]:
#create matrix seed gene vs species and count how many tools found at least one pair
# blank matrix seed vs species
df_ref_tools = pd.DataFrame(0, index=list(seed_genes), columns=species_list) 
#print(df_ref_tools)
seed_dict = {}

#fill matrix
for tool in tools_ref:
    #print(tool)
    for pair in dict_of_sets[tool]:
        gene1, gene2 = pair
        if gene1 in seed_genes:
            seed = gene1
            ortholog = gene2
        else:
            ortholog = gene1
            seed = gene2
        species = assign_species(uniprot_dict, ortholog)
        if species == 'Error':
            print('Ortholog gene could not be assigned to species')
            print(ortholog)
            break
        try:
            seed_dict[seed][species].add(tool)
        except KeyError:
            try:
                seed_dict[seed][species] = set()
                seed_dict[seed][species].add(tool)
            except KeyError:    
                seed_dict[seed] = {}
                seed_dict[seed][species] = set()
                seed_dict[seed][species].add(tool)

for seed in seed_dict:
    for species in seed_dict[seed]:
        no_tools = len(seed_dict[seed][species])
        df_ref_tools.loc[seed, species] = no_tools

#print(df_ref_tools)

In [17]:
def evaluate_exclusive_genes_assigned_to_uniprot(exclusive_pairs, seed_genes, uniprot_dict, df_ref_tools):
    # genes that fill a gap in the profile -> turn 0 in 1
    fills_gap = 0
    # genes found in all the other species
    conserved = 0
    # genes other tools found an ortholog but another one 
    no_gap = 0
    # the protein-based tools don't include self-assignments 
    self_assignment = 0
    species_counter = {}

    filled_gaps = set()
    conserved_filled_gaps = set()
    no_gaps = set()
    
    for pair in exclusive_pairs:
        try:
            gene1, gene2 = pair
        except ValueError:
            #print(pair)
            self_assignment += 1
            gene1 = list(pair)[0]
            gene2 = gene1
        #assign genes to seed and ortholog
        if gene1 in seed_genes:
            seed = gene1
            ortholog = gene2
        else:
            seed = gene2
            ortholog = gene1
        #assign ortholog to species
        if species == 'Error':
            print('Ortholog gene could not be assigned to species')
            print(ortholog)
            #break
                
        if df_ref_tools.loc[seed, species] == 0:
            fills_gap += 1
            filled_gaps.add(str(species) + '_' + seed)
            #check conservation
            zeros = (df_ref_tools.loc[seed] == 0).sum()
            if zeros <= 1:
                conserved += 1
                if str(species) + '_' + seed in conserved_filled_gaps:
                    continue
                else:
                    conserved_filled_gaps.add(str(species) + '_' + seed)
                    try:
                        species_counter[species] += 1
                    except KeyError:
                        species_counter[species] = 1
                
            #else:
             #   print(df_ref_tools.loc[seed])
        else:
            no_gap += 1
            no_gaps.add(str(species) + '_' + seed)
            
    return fills_gap, len(filled_gaps), conserved, len(conserved_filled_gaps), no_gap, len(no_gaps), self_assignment, species_counter


In [18]:
for t in tools:
    print(t)
    print(len(exclusive_pair_dict[t]))
    print("Pairs that filled a gap, Filled gaps in Profile, Pairs that filled a gap of a conserved gene, Conserved gaps filled, Pairs that did not fill a gap, On gene level, Self-assignment")
    print(evaluate_exclusive_genes_assigned_to_uniprot(exclusive_pair_dict[t], seed_genes, uniprot_dict, df_ref_tools))

busco_metazoa_augustus_gallus_v2
492
Pairs that filled a gap, Filled gaps in Profile, Pairs that filled a gap of a conserved gene, Conserved gaps filled, Pairs that did not fill a gap, On gene level, Self-assignment
(31, 26, 26, 21, 461, 356, 13, {'6945': 21})
busco_metazoa_augustus_species_gallus_v2
354
Pairs that filled a gap, Filled gaps in Profile, Pairs that filled a gap of a conserved gene, Conserved gaps filled, Pairs that did not fill a gap, On gene level, Self-assignment
(16, 14, 14, 12, 338, 271, 13, {'6945': 12})
busco_metazoa_metaeuk_gallus_v2
168
Pairs that filled a gap, Filled gaps in Profile, Pairs that filled a gap of a conserved gene, Conserved gaps filled, Pairs that did not fill a gap, On gene level, Self-assignment
(12, 10, 11, 9, 156, 133, 14, {'6945': 9})
compleasm_metazoa_gallus_v2
91
Pairs that filled a gap, Filled gaps in Profile, Pairs that filled a gap of a conserved gene, Conserved gaps filled, Pairs that did not fill a gap, On gene level, Self-assignment
(5

## Analyse gap distribution

In [19]:
# how many gaps are there in total in the df_ref_tools?
zeros = (df_ref_tools == 0).sum()
print(zeros)

45351    33
10116     7
9031     40
8364     18
7955      1
7227     19
7070      5
6945     52
6412     36
6239     80
dtype: int64


In [20]:
# how many of them are conserved?
count_exactly_one_zero = (df_ref_tools == 0).sum(axis=1).eq(1).sum()
print(count_exactly_one_zero)
display(df_ref_tools)

214


Unnamed: 0,45351,10116,9031,8364,7955,7227,7070,6945,6412,6239
P22570,12,12,13,13,13,13,13,12,13,12
Q16795,13,13,13,13,13,12,13,11,13,12
Q9Y3D8,12,12,12,12,12,12,0,11,12,12
Q8NI36,0,13,12,12,12,12,12,12,12,11
Q15061,13,13,13,12,13,13,11,12,12,7
...,...,...,...,...,...,...,...,...,...,...
Q9GZP4,13,13,13,12,13,13,13,12,13,13
Q8TAM2,13,13,13,13,13,13,13,6,13,13
Q9Y276,13,13,13,13,13,13,13,12,13,13
Q96MW1,0,13,13,13,12,13,13,6,7,0


In [21]:
# Identify rows with exactly one 0
rows_with_one_zero = (df_ref_tools == 0).sum(axis=1).eq(1)

# Filter the DataFrame to include only rows with exactly one 0
filtered_df = df_ref_tools[rows_with_one_zero]

# Count occurrences of 0 in each column within the filtered rows
column_zero_counts = (filtered_df == 0).sum()

print("Count of columns where the single zeros are located:")
print(column_zero_counts)

Count of columns where the single zeros are located:
45351    26
10116     5
9031     32
8364     14
7955      1
7227     13
7070      4
6945     43
6412     21
6239     55
dtype: int64


In [22]:
# Has it worked correctly?
print(df_ref_tools[df_ref_tools['10116'] == 0])

        45351  10116  9031  8364  7955  7227  7070  6945  6412  6239
Q9BSF4     13      0     0    13    13    12     0     2     0    13
Q96MX6     13      0    13    13    13    13    13     7    13     1
Q8TDD1     13      0    13    12    13    13    13    12    12    13
O75419     13      0    13    13    13    13    13    12    13    13
P61758     13      0    13    13    13    13    13    12    12    13
Q96AB6      8      0    13    13    11    13    13    12    13     0
O43462     13      0    13    13    13    13    13    12    13    13


## Analyse unassigned genes

In [25]:
#get genes that were not assigned to a UniProt ID
# for that we have to parse the overlap tables and filter out the genes that have no overlap at all with a CDS in the reference gff files or that we could not assign to a UniProt gene
busco_augustus_df = pd.read_csv('../overlap_tables/busco_augustus_overlap_gff_files_gallus_v2.tsv', delimiter='\t')
busco_augustus_species_df = pd.read_csv('../overlap_tables/busco_augustus_species_overlap_gff_files_gallus_v2.tsv', delimiter='\t')
busco_metaeuk_df = pd.read_csv('../overlap_tables/busco_metaeuk_overlap_gff_files_gallus_2.tsv', delimiter='\t')
fa_augustus_df = pd.read_csv('../overlap_tables/fdog_ass_busco_augustus_overlap_gff_files_gallus_v2.tsv', delimiter='\t')
fa_metaeuk_df = pd.read_csv('../overlap_tables/fdog_ass_busco_metaeuk_overlap_gff_files_gallus_v2_sens.tsv', delimiter='\t')
compleasm_df = pd.read_csv('../overlap_tables/compleasm_overlap_gff_files_gallus_v2.tsv', delimiter='\t')

In [26]:
def evaluate_unassigned_genes(exclusive_genes, df_ref_tools):
    # genes that fill a gap in the profile -> turn 0 in 1
    fills_gap = 0
    # genes found in all the other species
    conserved = 0
    # genes other tools found an ortholog but another one 
    no_gap = 0
    # the protein-based tools don't include self-assignments 
    self_assignment = 0

    species_counter = {}

    filled_gaps = set()
    conserved_filled_gaps = set()
    no_gaps = set()

    for pair in exclusive_genes:
        species, seed = pair
        if df_ref_tools.loc[seed, str(species)] == 0:
            fills_gap += 1
            filled_gaps.add(str(species) + '_' + seed)
            if species == 10116:
                print('Seed')
                print(seed)
            #check conservation
            zeros = (df_ref_tools.loc[seed] == 0).sum()
            if zeros <= 1:
                conserved += 1
                if str(species) + '_' + seed in conserved_filled_gaps:
                    continue
                else:
                    conserved_filled_gaps.add(str(species) + '_' + seed)
                    try:
                        species_counter[species] += 1
                    except KeyError:
                        species_counter[species] = 1
            #else:
             #   print(df_ref_tools.loc[seed])
        else:
            no_gap += 1
            no_gaps.add(str(species) + '_' + seed)
        

    return fills_gap, len(filled_gaps), conserved, len(conserved_filled_gaps), no_gap, len(no_gaps), self_assignment, species_counter, conserved_filled_gaps

In [27]:
def get_unassigned_exclusive_pairs(df, busco_to_uniprot):
    unassigned = df[df.isnull().any(axis=1)]
    #print(unassigned)
    pairs_rows = unassigned[['Species','GeneID']].copy()
    pairs_rows['GeneID'] = pairs_rows['GeneID'].map(busco_to_uniprot)
    pairs_rows.dropna(inplace = True)
    #print(pairs_rows)
    print(pairs_rows['Species'].value_counts())
    #unassigned_pairs = pairs_rows.apply(frozenset, axis=1)
    unassigned_pairs = pairs_rows.values.tolist()
    return unassigned_pairs
    

In [28]:
def number_seed_genes(unassigned_pairs):
    seeds = set()
    pairs = set()
    for i in unassigned_pairs:
        sp, seed = i
        seeds.add(seed)
        pairs.add(str(sp) + '_' + seed)
    return len(seeds), len(pairs)

In [29]:
#fDOG-Assembly Augustus
unassigned_pairs = get_unassigned_exclusive_pairs(fa_augustus_df, busco_vs_uniprot_dict)
#how many pairs are included?
print(len(unassigned_pairs))
# how many different seed genes are included ?
print(number_seed_genes(unassigned_pairs))
print('################')
print(evaluate_unassigned_genes(unassigned_pairs, df_ref_tools))
conserved_gaps_fda_aug = evaluate_unassigned_genes(unassigned_pairs, df_ref_tools)[-1]

6945     113
45351     71
8364      57
10116     51
9031      38
6412      25
6239      12
7227      11
7070       6
7955       6
Name: Species, dtype: int64
390
(286, 332)
################
Seed
Q96MX6
Seed
Q96AB6
(72, 68, 55, 53, 318, 264, 0, {9031: 4, 10116: 1, 7070: 2, 6239: 1, 45351: 14, 6945: 23, 7955: 1, 6412: 5, 8364: 2}, {'6945_Q9NRN9', '45351_Q14CX7', '6945_Q8N5C7', '6945_P62277', '45351_Q8IWT0', '6945_Q9BS26', '7070_P57081', '45351_Q96A65', '6945_Q5T280', '6945_P62829', '6412_Q9Y3D3', '6945_Q9UJK0', '6945_Q9P2I0', '8364_Q9UK45', '45351_Q9NXG2', '6945_Q92542', '45351_Q969N2', '9031_Q9BRT9', '7070_Q9Y3D8', '6945_Q8N6T3', '6945_Q9H4B6', '10116_Q96MX6', '45351_Q9UBL3', '45351_Q92845', '6412_P40938', '9031_Q13405', '45351_Q9NQ89', '45351_Q9P1Q0', '45351_P53990', '6945_Q05048', '6945_Q9UJX6', '6945_P48553', '6945_Q9H3H5', '6239_Q9ULC3', '6945_P49959', '6945_Q330K2', '6945_Q9BXS1', '9031_Q96L58', '45351_P61221', '45351_Q9BPX7', '6412_O75380', '8364_Q9Y2S7', '6945_Q9Y248', '9031_Q9HC

In [30]:
# for some additional investigations
#unassigned_pairs[species].value_counts()
#missing in overlap table:  why? -> fixed, due to None as UniProt IDs -> error by grouping the df during overlap table reconstruction
    147873at33208_CM026976_1_1_g3.t1
	325183at33208_CM026974_1_1_g2.t1
	488635at33208_CM026978_1_1_g3.t1
	491289at33208_CM026989_1_1_g2.t1
	526673at33208_CM026983_1_1_g1.t1
	535367at33208_CM026984_1_1_g2.t1
	539043at33208_CM026975_1_1_g2.t1
	557834at33208_CM026985_1_1_g1.t1
	577339at33208_CM026975_1_1_g1.t1
	599320at33208_CM026979_1_1_g1.t1


IndentationError: unexpected indent (2803421596.py, line 4)

In [39]:
#fDOG-Assembly MetaEuk
unassigned_pairs = get_unassigned_exclusive_pairs(fa_metaeuk_df, busco_vs_uniprot_dict)
#print(unassigned_pairs)
print(len(unassigned_pairs))
print(evaluate_unassigned_genes(unassigned_pairs, df_ref_tools))
conserved_gaps_fda_meta = evaluate_unassigned_genes(unassigned_pairs, df_ref_tools)[-1]

6945     189
45351    120
10116    112
8364      88
9031      69
6412      26
7227      23
6239      19
7070      15
7955      10
Name: Species, dtype: int64
671
Seed
Q9BSF4
Seed
Q96MX6
Seed
Q96AB6
Seed
Q96AB6
Seed
P61758
(102, 91, 76, 69, 569, 466, 0, {7070: 2, 45351: 17, 9031: 6, 6412: 7, 8364: 1, 10116: 2, 6945: 34}, {'45351_Q92665', '6945_Q16514', '6945_Q9NRN9', '45351_Q14CX7', '6945_P62277', '6945_Q96K21', '6945_Q8N5C7', '45351_Q8IWT0', '6945_Q9BS26', '7070_P57081', '6945_Q9Y315', '45351_Q96A65', '6945_Q5T280', '10116_P61758', '6945_P62829', '6412_Q9Y3D3', '6945_Q9NX55', '6945_Q9UJK0', '6945_Q9P2I0', '8364_Q9UK45', '6945_Q9Y4Y9', '45351_Q9NXG2', '6945_O00623', '6945_Q92542', '45351_Q969N2', '9031_Q9BRT9', '7070_Q9Y3D8', '6945_Q9H4B6', '6945_Q8N6T3', '10116_Q96MX6', '6945_Q9NWB7', '45351_Q9UBL3', '45351_Q92845', '6412_P40938', '45351_Q9NQ89', '45351_Q9P1Q0', '9031_Q13405', '45351_P53990', '6412_Q96J42', '6945_Q05048', '6945_P48553', '6945_Q9UJX6', '6412_Q9BV87', '6945_Q9H3H5', '694

In [32]:
#BUSCO Augustus
unassigned_pairs = get_unassigned_exclusive_pairs(busco_augustus_df, busco_vs_uniprot_dict)
#print(unassigned_pairs)
print(len(unassigned_pairs))
print(evaluate_unassigned_genes(unassigned_pairs, df_ref_tools))
conserved_gaps_busco_aug = evaluate_unassigned_genes(unassigned_pairs, df_ref_tools)[-1]

6945     64
10116    48
7955     39
45351    36
6239     24
6412     18
9031      5
7070      3
8364      1
Name: Species, dtype: int64
238
Seed
Q96AB6
Seed
Q96MX6
(57, 56, 45, 44, 181, 171, 0, {9031: 1, 6412: 8, 6945: 24, 45351: 8, 10116: 1, 7070: 2}, {'6945_Q9H6L2', '45351_Q92665', '6945_Q9NRN9', '6945_P62277', '6945_Q8N5C7', '6412_Q7L592', '7070_P57081', '6945_Q5T280', '6945_Q9BUB7', '6412_A8K0Z3', '6412_Q9Y3D3', '6945_Q96NB3', '6945_Q9UJK0', '6412_Q969H6', '45351_Q9NXG2', '6945_Q92542', '9031_Q9BRT9', '6945_Q8N6T3', '6945_Q9H4B6', '7070_Q9Y3D8', '10116_Q96MX6', '6412_P40938', '45351_P53990', '6945_Q05048', '6945_Q9UJX6', '6945_P48553', '6412_Q9BV87', '6945_Q9H3H5', '6945_Q9BU89', '6945_P49959', '6945_Q9H270', '6412_O75380', '45351_Q9BPX7', '45351_P61221', '45351_P49458', '6945_Q9NX38', '6945_Q9Y248', '6945_O94817', '6412_Q32P41', '45351_P12004', '6945_Q07864', '6945_O14653', '45351_Q9NQ50', '6945_Q8TAT6'})
Seed
Q96AB6
Seed
Q96MX6


In [36]:
#BUSCO Augustus Species
unassigned_pairs = get_unassigned_exclusive_pairs(busco_augustus_species_df, busco_vs_uniprot_dict)
#print(unassigned_pairs)
print(len(unassigned_pairs))
print(evaluate_unassigned_genes(unassigned_pairs, df_ref_tools))
conserved_gaps_busco_aug_sp = evaluate_unassigned_genes(unassigned_pairs, df_ref_tools)[-1]

6945     78
45351    69
10116    66
8364     54
9031     35
7955     16
6412     16
6239     12
7227      8
7070      3
Name: Species, dtype: int64
357
Seed
Q96AB6
Seed
Q96MX6
(81, 78, 60, 59, 276, 236, 0, {9031: 4, 6412: 8, 6945: 24, 45351: 18, 10116: 1, 7070: 2, 8364: 2}, {'6945_Q9H6L2', '45351_Q92665', '6945_Q9NRN9', '6945_P62277', '6945_Q8N5C7', '45351_Q14CX7', '45351_Q8IWT0', '7070_P57081', '45351_Q96A65', '6945_Q5T280', '6945_Q9BUB7', '6412_A8K0Z3', '6412_Q9Y3D3', '6945_Q96NB3', '6945_Q9UJK0', '6412_Q969H6', '8364_Q9UK45', '45351_Q9NXG2', '6945_Q92542', '45351_Q969N2', '9031_Q9BRT9', '6945_Q8N6T3', '6945_Q9H4B6', '7070_Q9Y3D8', '10116_Q96MX6', '45351_Q9UBL3', '6412_P40938', '45351_Q92845', '9031_Q13405', '45351_Q9NQ89', '45351_Q9P1Q0', '45351_P53990', '6945_Q05048', '6945_Q9UJX6', '6945_P48553', '6412_Q9BV87', '6945_Q9H3H5', '6945_Q9BU89', '6945_P49959', '6412_Q8IWA0', '6945_Q9H270', '9031_Q96L58', '6412_O75380', '45351_P61221', '45351_Q9BPX7', '45351_Q8NI36', '45351_P49458', '83

In [33]:
#BUSCO MetaEuk
unassigned_pairs = get_unassigned_exclusive_pairs(busco_metaeuk_df, busco_vs_uniprot_dict)
#print(unassigned_pairs)
print(len(unassigned_pairs))
print(evaluate_unassigned_genes(unassigned_pairs, df_ref_tools))
conserved_gaps_busco_meta = evaluate_unassigned_genes(unassigned_pairs, df_ref_tools)[-1]

10116    160
6945      64
6412      63
45351     20
9031       8
7070       4
8364       4
7955       2
6239       1
Name: Species, dtype: int64
326
Seed
Q9BSF4
Seed
Q96AB6
Seed
Q96MX6
Seed
P61758
(74, 74, 56, 56, 252, 117, 0, {9031: 2, 6412: 11, 6945: 32, 45351: 8, 10116: 2, 7070: 1}, {'45351_Q92665', '6945_Q16514', '6945_Q9NRN9', '6945_P62277', '6945_Q96K21', '6945_Q8N5C7', '6945_Q9BS26', '6412_Q7L592', '6945_Q9Y315', '7070_P57081', '6945_Q5T280', '10116_P61758', '6945_Q9BUB7', '6412_A8K0Z3', '6945_P62829', '6412_Q9Y3D3', '6945_Q96NB3', '6945_Q9UJK0', '6945_Q9P2I0', '6412_Q969H6', '6945_O00623', '45351_Q9NXG2', '6945_Q92542', '9031_Q9BRT9', '6945_Q9H4B6', '6945_Q8N6T3', '10116_Q96MX6', '6945_Q9NWB7', '6412_P40938', '45351_P53990', '6412_Q96J42', '6945_Q05048', '6945_P48553', '6945_Q9UJX6', '6412_Q9BV87', '6945_Q99547', '6945_Q9H3H5', '6945_Q9BU89', '6945_P49959', '6945_Q330K2', '6945_Q9H270', '6412_O75380', '45351_P61221', '9031_Q96EK9', '45351_Q9BPX7', '45351_P49458', '6945_Q9Y248',

In [37]:
#compleasm 
#BUSCO Augustus
unassigned_pairs = get_unassigned_exclusive_pairs(compleasm_df, busco_vs_uniprot_dict)
#print(unassigned_pairs)
print(len(unassigned_pairs))
print(evaluate_unassigned_genes(unassigned_pairs, df_ref_tools))
conserved_gaps_busco_aug = evaluate_unassigned_genes(unassigned_pairs, df_ref_tools)[-1]

6945     81
45351    65
8364     52
10116    42
9031     39
6412     20
7227      8
6239      3
7070      3
7955      2
Name: Species, dtype: int64
315
Seed
P61758
Seed
Q96AB6
Seed
Q96MX6
Seed
Q9BSF4
(102, 98, 77, 75, 213, 181, 0, {9031: 7, 6412: 10, 6945: 33, 45351: 19, 10116: 2, 7070: 2, 8364: 2}, {'45351_Q92665', '6945_Q16514', '6945_Q9NRN9', '6945_P62277', '6945_Q8N5C7', '45351_Q14CX7', '45351_Q8IWT0', '6945_Q9BS26', '6412_Q7L592', '6945_Q9Y315', '7070_P57081', '45351_Q96A65', '6945_Q5T280', '10116_P61758', '6412_A8K0Z3', '6945_P62829', '6412_Q9Y3D3', '6945_Q96NB3', '6945_Q9UJK0', '6945_Q9P2I0', '6412_Q969H6', '8364_Q9UK45', '6945_Q9Y4Y9', '6945_O00623', '45351_Q9NXG2', '6945_Q92542', '45351_Q969N2', '45351_Q9UNX4', '9031_Q9BRT9', '6945_Q8N6T3', '6945_Q9H4B6', '7070_Q9Y3D8', '10116_Q96MX6', '6945_Q9NWB7', '45351_Q9UBL3', '6412_P40938', '45351_Q92845', '9031_Q13405', '45351_Q9NQ89', '45351_Q9P1Q0', '45351_P53990', '6412_Q96J42', '6945_Q05048', '9031_Q8TDP1', '6945_P48553', '6945_Q9U

In [26]:
# Note for my self:
#I did not consider if I only found an additional co-ortholog or if I found additional orthologs and none of the found ones were supported by my results

In [34]:
# Do the results of filled gaps by the different methods overlap?
union_all = conserved_gaps_fda_aug.union(conserved_gaps_busco_aug, conserved_gaps_fda_aug, conserved_gaps_busco_meta)
species = []
uniprot = []
count = []
list_of_sets = [conserved_gaps_fda_aug, conserved_gaps_fda_meta, conserved_gaps_busco_aug, conserved_gaps_busco_meta]
for i in union_all:
    sp, uniprot_id = i.split('_')
    count_val = 0
    for t in list_of_sets:
        if i in t:
            count_val += 1
    species.append(sp)
    uniprot.append(uniprot_id)
    count.append(count_val)

zipped = list(zip(species, uniprot, count))

df = pd.DataFrame(zipped, columns=['Species', 'UniProtID', 'Found by x tools'])
df.to_csv('conserved_gaps_filled_by_assembly_based_methods.tsv', sep="\t", index=False)

In [35]:
display(df)

Unnamed: 0,Species,UniProtID,Found by x tools
0,6945,Q9H6L2,1
1,6945,Q8N5C7,4
2,6945,Q9BS26,3
3,7070,P57081,4
4,6945,Q5T280,4
...,...,...,...
72,6945,O94817,3
73,45351,P12004,4
74,6945,O14653,4
75,45351,Q9NVU0,2
