# Create input files for the QfO server

In [19]:
import pickle as pkl
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
#from matplotlib_venn import venn2
import re

In [65]:
#read in mapping tables
#busco_augustus_df = pd.read_csv('../overlap_tables/busco_augustus_overlap_gff_files_gallus_v2.tsv', delimiter='\t')
busco_augustus_species_df = pd.read_csv('../overlap_tables/busco_augustus_species_overlap_gff_files_gallus_v2.tsv', delimiter='\t')
busco_metaeuk_df = pd.read_csv('../overlap_tables/busco_metaeuk_overlap_gff_files_gallus_2.tsv', delimiter='\t')
#fa_augustus_df = pd.read_csv('../overlap_tables/fdog_ass_busco_augustus_overlap_gff_files.tsv', delimiter='\t')
fa_augustus_df = pd.read_csv('../overlap_tables/fdog_ass_busco_augustus_overlap_gff_files_gallus_v2.tsv', delimiter='\t')
#fa_augustus_fly_df = pd.read_csv('../overlap_tables/fdog_ass_busco_augustus_fly_overlap_gff_files.tsv', delimiter='\t')
fa_metaeuk_df = pd.read_csv('../overlap_tables/fdog_ass_busco_metaeuk_overlap_gff_files_gallus_v2.tsv', delimiter='\t')
fa_metaeuk_sens_df = pd.read_csv('../overlap_tables/fdog_ass_busco_metaeuk_overlap_gff_files_gallus_v2_sens.tsv', delimiter='\t')
#fa_augustus_human_proteom = pd.read_csv('../overlap_tables/fdog_ass_human_proteom_augustus_overlap_gff_files.tsv', delimiter='\t')
compleasm_df = pd.read_csv('../overlap_tables/compleasm_overlap_gff_files_gallus_v2.tsv', delimiter='\t')

In [35]:
def open_pkl(path):
    pkl_file = open(path, 'rb')
    dictionary = pkl.load(pkl_file)
    pkl_file.close()
    return dictionary

In [36]:
def parse_species_file(file):
    lines = file.readlines()
    species_dict = {}
    for line in lines:
        line = line.rstrip()
        ncbi, name, uniprot_acc, source, refseq_acc = line.split('\t')
        species_dict[ncbi] = {'name': name, 'uniprot': uniprot_acc, 'source': source, 'refseq': refseq_acc}
    return species_dict

In [37]:
species_file = open('../../data/fDOG-assembly/species_set_benchmark_v2.tsv', 'r')
species_dict = parse_species_file(species_file)
species_file.close()

In [38]:
def create_isoform_dict(file, iso_dict):
    try:
        fasta = open(file, 'r')
    except FileNotFoundError:
        return iso_dict
    lines = fasta.readlines()
    for line in lines:
        line = line.rstrip()
        if line.startswith('>'):
            uniprot_canonical = re.search(r'Isoform of (.*?),', line).group(1)
            uniprot_isoform = line.split('|')[1]
            iso_dict[uniprot_isoform] = uniprot_canonical
    return iso_dict
    

In [39]:
isoform_dict = {}
for key in species_dict:
    file = '../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/' + species_dict[key]['uniprot'] + '_' + key + '_additional.fasta'
    isoform_dict = create_isoform_dict(file, isoform_dict)
print(isoform_dict['Q9XXH4'])

G5EFZ3


In [40]:
isoform_out = open("../overlap_tables/isoform_mapping.tsv", "w")
for i in isoform_dict:
    isoform_out.write(i + "\t" + isoform_dict[i] + "\n")
isoform_out.close()

In [41]:
#read in mapping busco_uniprot
busco_vs_uniprot_human = open_pkl('../pkl_files/busco_group_vs_uniprot_ident_90_length_70.pkl')
busco_vs_uniprot_drome = open_pkl('../pkl_files/busco_group_vs_uniprot_ident_90_length_70_drome.pkl')

In [42]:
seed_dict = {}
seed_file = open('../uniprotid_to_group_assignment/seed_species_list_busco.tsv','r')
lines = seed_file.readlines()
for line in lines:
    line = line.rstrip()
    group, species = line.split('\t')
    seed_dict[group] = species
busco_vs_uniprot = {}
ids = 0
human_seed = 0
drome_seed = 0
for key in seed_dict:
    if seed_dict[key] == '9606':
        human_seed += 1
        try:
            uniprot_id = busco_vs_uniprot_human[key]
            #print('Human')
            #print(key, uniprot_id)
            ids += 1
        except KeyError:
            print(key)
            continue
    else:
        drome_seed += 1
        try:
            uniprot_id = busco_vs_uniprot_drome[key]
            #print('Drome')
            #print(key, uniprot_id)
            ids += 1
        except KeyError:
            continue
    busco_vs_uniprot[key] = uniprot_id
    #print(key, busco_vs_uniprot[key])
print(ids)
print(len(busco_vs_uniprot))
print(human_seed, drome_seed)

372257at33208
514121at33208
514296at33208
545744at33208
565053at33208
568839at33208
604211at33208
618516at33208
629763at33208
631767at33208
638541at33208
642552at33208
259581at33208
349064at33208
368875at33208
492033at33208
560694at33208
561054at33208
572845at33208
576938at33208
618986at33208
286553at33208
364383at33208
378913at33208
379636at33208
390402at33208
412370at33208
490248at33208
502691at33208
517686at33208
549156at33208
552201at33208
568022at33208
576633at33208
615531at33208
643883at33208
917
917
938 15


In [43]:
mapping_seed_out = open("../uniprotid_to_group_assignment/mapping_busco_id_uniport_id.tsv", "w")
for key in busco_vs_uniprot:
    mapping_seed_out.write(key + "\t" + busco_vs_uniprot[key] + "\n")
mapping_seed_out.close()

In [44]:
drome_set = set()
counter = 0
for i in busco_vs_uniprot_drome:
    if busco_vs_uniprot_drome[i] in drome_set:
        print(i, busco_vs_uniprot_drome[i])
        counter += 1
    else:
        drome_set.add(busco_vs_uniprot_drome[i])
print(counter)

human_set = set()
counter = 0
for i in busco_vs_uniprot_human:
    #print(i)
    #print(busco_vs_uniprot_human[i])
    break
    if busco_vs_uniprot_human[i] in human_set:
        print(i, busco_vs_uniprot_human[i])
        counter += 1
    else:
        human_set.add(busco_vs_uniprot_human[i])
print(counter)


0
0


In [45]:
def filter_dataframe(cutoff, category, df, busco_to_uniprot, isoform_dict, max_per_gene=True):
    print(df.head())
    df.drop_duplicates(inplace=True)   
    df['GeneID'] = df['GeneID'].map(busco_to_uniprot)
    df['uniprotID'] = df['uniprotID'].map(lambda x: isoform_dict.get(x, x))
    #print(df.head())
    if max_per_gene == True:
        max_values = df.groupby(['Species', 'GeneID', 'transcript'])['coverage'].transform(max)
        #print(max_values.head())
        max_df = df[df['coverage'] == max_values]
        df = max_df
    filtered_df = df[df[category] >= cutoff].copy()
    #print(filtered_df.head())
    filtered_df.dropna(subset=['uniprotID'], inplace=True)
    filtered_df.dropna(subset=['GeneID'], inplace=True)
    #print(filtered_df.head())
    return filtered_df[['GeneID', 'uniprotID']].copy(), filtered_df[['GeneID', 'transcript' , 'uniprotID']].copy()

In [28]:
compleasm_df_sub, mapping_transcript_uniprot = filter_dataframe(0.5, 'coverage', compleasm_df, busco_vs_uniprot, isoform_dict)
compleasm_df_sub.drop_duplicates(keep='first',inplace=True)
compleasm_df_sub.to_csv('../qfo_input/compleasm_metazoa_gallus_v2.tsv', sep='\t', index=False, header=False)
mapping_transcript_uniprot.to_csv('../qfo_input/compleasm_metazoa_gallus_v2_mapping_transcript_uniprot.tsv', sep='\t', index=False, header=False)

In [66]:
#busco_augustus_species_df['transcript'] = busco_augustus_species_df['Species'].astype(str) + "_" + busco_augustus_species_df['GeneID'].astype(str)+ "_" + busco_augustus_species_df['transcript'].astype(str)
busco_augustus_species_df_sub, mapping_transcript_uniprot = filter_dataframe(0.5, 'coverage', busco_augustus_species_df, busco_vs_uniprot, isoform_dict)
busco_augustus_species_df_sub.drop_duplicates(keep='first',inplace=True)
busco_augustus_species_df_sub.to_csv('../qfo_input/busco_metazoa_augustus_species_gallus_v2.tsv', sep='\t', index=False, header=False)
mapping_transcript_uniprot.to_csv('../qfo_input/busco_metazoa_augustus_species_gallus_v2_mapping_transcript_uniprot.tsv', sep='\t', index=False, header=False)

   Species         GeneID                                transcript  overlap  \
0     6239  102804at33208  BX284601.5:10789087-10797610_r2.m1.g3.t1   1573.0   
1     6239  102804at33208  BX284601.5:10789087-10797610_r2.m1.g3.t1   3034.0   
2     6239  130245at33208    BX284601.5:6420887-6425669_r2.m1.g1.t1   2359.0   
3     6239   13816at33208    BX284601.5:8824397-8841505_r2.m1.g2.t1   8764.0   
4     6239   13816at33208    BX284601.5:8824397-8841505_r2.m1.g2.t1    948.0   

  uniprotID  coverage  
0    Q9N5R8  0.678602  
1    Q9N5R9  0.982831  
2    O01763  0.989929  
3    G5EDT9  0.995118  
4    Q65ZA8  1.000000  


In [63]:
busco_augustus_df['transcript'] = busco_augustus_df['GeneID'].astype(str)+ "_" + busco_augustus_df['transcript'].astype(str)
busco_augustus_df_sub, mapping_transcript_uniprot = filter_dataframe(0.5, 'coverage', busco_augustus_df, busco_vs_uniprot, isoform_dict)
busco_augustus_df_sub.drop_duplicates(keep='first',inplace=True)
busco_augustus_df_sub.to_csv('../qfo_input/busco_metazoa_augustus_gallus_v2.tsv', sep='\t', index=False, header=False)

NameError: name 'busco_augustus_df' is not defined

In [64]:
#busco_metaeuk_df['transcript'] = busco_metaeuk_df['Species'].astype(str) + "_" + busco_metaeuk_df['GeneID'].astype(str)+ "_" + busco_metaeuk_df['transcript'].astype(str)
busco_metaeuk_sub, mapping_transcript_uniprot = filter_dataframe(0.5, 'coverage', busco_metaeuk_df, busco_vs_uniprot, isoform_dict)
busco_metaeuk_sub.drop_duplicates(keep='first',inplace=True)
busco_metaeuk_sub.to_csv('../qfo_input/busco_metazoa_metaeuk_gallus_v2.tsv', sep='\t', index=False, header=False)
mapping_transcript_uniprot.to_csv('../qfo_input/busco_metazoa_metaeuk_gallus_v2_mapping_transcript_uniprot.tsv', sep='\t', index=False, header=False)

   Species         GeneID                                         transcript  \
0     6239  446000at33208  446000at33208_6239_0:00007b|BX284601.5|+|55821...   
1     6239  446000at33208  446000at33208_6239_0:000685|BX284601.5|+|49019...   
2     6239  642773at33208  642773at33208_6239_0:000102|BX284601.5|+|74923...   
3     6239  357580at33208  357580at33208_6239_0:00073f|BX284601.5|+|41932...   
4     6239  603370at33208  603370at33208_6239_0:000148|BX284601.5|-|83291...   

   overlap uniprotID  coverage  
0    631.0    P91402  0.718679  
1    870.0    P91266  0.987514  
2    310.0    Q22850  0.990415  
3    956.0    O02097  0.971545  
4    431.0    Q20647  0.990805  


In [16]:
fa_augustus_sub, mapping_transcript_uniprot = filter_dataframe(0.5, 'coverage', fa_augustus_df, busco_vs_uniprot, isoform_dict)
fa_augustus_sub.drop_duplicates(keep='first',inplace=True)
fa_augustus_sub.to_csv('../qfo_input/fdog_assembly_metazoa_augustus_gallus_v2.tsv', sep='\t', index=False, header=False)
mapping_transcript_uniprot.to_csv('../qfo_input/fdog_assembly_metazoa_augustus_gallus_v2_mapping_transcript_uniprot.tsv', sep='\t', index=False, header=False)

  max_values = df.groupby(['Species', 'GeneID', 'transcript'])['coverage'].transform(max)


In [17]:
fa_metaeuk_sub, mapping_transcript_uniprot = filter_dataframe(0.5, 'coverage', fa_metaeuk_df, busco_vs_uniprot, isoform_dict)
print('Duplicates')
print(fa_metaeuk_sub[fa_metaeuk_sub.duplicated(keep=False)])
fa_metaeuk_sub.drop_duplicates(keep='first',inplace=True)
fa_metaeuk_sub.to_csv('../qfo_input/fdog_assembly_metazoa_metaeuk_gallus_v2.tsv', sep='\t', index=False, header=False)
mapping_transcript_uniprot.to_csv('../qfo_input/fdog_assembly_metazoa_metaeuk_gallus_v2_mapping_transcript_uniprot.tsv', sep='\t', index=False, header=False)

Duplicates
       GeneID   uniprotID
9      Q9UBS4      C1P627
10     Q9UBS4      C1P627
12     Q92845      G5EEE6
13     Q92845      G5EEE6
28     Q13769      C2BR94
...       ...         ...
14131  Q9P2J5      Q5PPJ6
14135  P51659      P97852
14139  P51659      P97852
14202  Q8WXX5  A0A8I5ZVX9
14205  Q8WXX5  A0A8I5ZVX9

[805 rows x 2 columns]


  max_values = df.groupby(['Species', 'GeneID', 'transcript'])['coverage'].transform(max)


In [18]:
fa_metaeuk_sub, mapping_transcript_uniprot = filter_dataframe(0.3, 'coverage', fa_metaeuk_df, busco_vs_uniprot, isoform_dict)
fa_metaeuk_sub.drop_duplicates(keep='first',inplace=True)
fa_metaeuk_sub.to_csv('../qfo_input/fdog_assembly_metazoa_metaeuk_gallus_v2_cov_30.tsv', sep='\t', index=False, header=False)
mapping_transcript_uniprot.to_csv('../qfo_input/fdog_assembly_metazoa_metaeuk_gallus_v2_mapping_transcript_uniprot_cov_30.tsv', sep='\t', index=False, header=False)

  max_values = df.groupby(['Species', 'GeneID', 'transcript'])['coverage'].transform(max)


In [19]:
fa_metaeuk_sens_sub, mapping_transcript_uniprot = filter_dataframe(0.3, 'coverage', fa_metaeuk_sens_df, busco_vs_uniprot, isoform_dict)
fa_metaeuk_sens_sub.drop_duplicates(keep='first',inplace=True)
fa_metaeuk_sens_sub.to_csv('../qfo_input/fdog_assembly_metazoa_metaeuk_gallus_v2_sens.tsv', sep='\t', index=False, header=False)
mapping_transcript_uniprot.to_csv('../qfo_input/fdog_assembly_metazoa_metaeuk_gallus_v2_mapping_transcript_uniprot_sens.tsv', sep='\t', index=False, header=False)

  max_values = df.groupby(['Species', 'GeneID', 'transcript'])['coverage'].transform(max)


# Filter qfo22 uploaded data from other tools for BUSCO and fDOG-Assembly comparison

In [20]:
uniprot_ids_search_species = set()
qfo_path = '../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/'
for key in species_dict:
    print(key)
    file = open(qfo_path + species_dict[key]['uniprot'] + '_' + key + '.fasta', 'r')
    lines = file.readlines()
    for line in lines:
        if line.startswith('>'):
            uniprot_ids_search_species.add(line.split('|')[1])

45351
10116
9031
8364
7955
7227
7070
6945
6412
6239


In [21]:
uniprot_ref_species = set()
for key in busco_vs_uniprot:
    uniprot_ref_species.add(busco_vs_uniprot[key])

In [22]:
def filter_files(path, set_search_species, set_ref_species):
    file = open(path, 'r')
    out_list = []
    lines = file.readlines()
    for line in lines:
        line = line.rstrip()
        id1, id2 = line.split('\t')
        if id1 in set_ref_species and id2 in set_search_species:
            out_list.append(line)
        elif id2 in set_ref_species and id1 in set_search_species:
            out_list.append(line)
        #else:
            #print(id1, id2)
    return out_list

In [23]:
inparanoid = '../../data/qfo_22_results_different_tools/inparanoid/InParanoid_QFO22.pairs_1'
inparanoid_out = filter_files(inparanoid, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/inparanoid.tsv','w')
for i in inparanoid_out:
    out_file.write(i + '\n')
out_file.close()
print(len(inparanoid_out))

8487


In [24]:
oma_pairs = '../../data/qfo_22_results_different_tools/oma_pairs/OMA.2.5.0-VPairs.txt'
oma_out = filter_files(oma_pairs, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/oma_pairs.tsv', 'w')
for i in oma_out:
    out_file.write(i + '\n')
out_file.close()
print(len(oma_out))

7973


In [25]:
bbh = '../../data/qfo_22_results_different_tools/bbh/bbh_2columns.tsv'
bbh_out = filter_files(bbh, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/bbh.tsv','w')
for i in bbh_out:
    out_file.write(i + '\n')
out_file.close()
print(len(bbh_out))

8738


In [26]:
domainoid = '../../data/qfo_22_results_different_tools/domainoid/mergedResults_domainoid.tsv'
domainoid_out = filter_files(domainoid, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/domainoid.tsv','w')
for i in domainoid_out:
    out_file.write(i + '\n')
out_file.close()
print(len(domainoid_out))

8945


In [26]:
ensamble = '../../data/qfo_22_results_different_tools/ensamble_compara/ensamble_compara.tsv'
ensamble_out = filter_files(ensamble, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/ensamble.tsv','w')
for i in ensamble_out:
    out_file.write(i + '\n')
out_file.close()
print(len(ensamble_out))

8168


In [27]:
hieranoid = '../../data/qfo_22_results_different_tools/hieranoid/pairs_hieranoid-diamond.tsv'
hieranoid_out = filter_files(hieranoid, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/hieranoid.tsv','w')
for i in hieranoid_out:
    out_file.write(i + '\n')
out_file.close()
print(len(hieranoid_out))

8422


In [28]:
metaphors = '../../data/qfo_22_results_different_tools/metaphors/metaphors.Oldest_seed.CS0.5.2022.tsv'
metaphors_out = filter_files(metaphors, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/metaphors.tsv','w')
for i in metaphors_out:
    out_file.write(i + '\n')
out_file.close()
print(len(metaphors_out))

8979


In [29]:
orthoffgc = '../../data/qfo_22_results_different_tools/orthoffgc/orthoffgc.tsv'
orthoffgc_out = filter_files(orthoffgc, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/orthoffgc.tsv','w')
for i in orthoffgc_out:
    out_file.write(i + '\n')
out_file.close()
print(len(orthoffgc_out))

8616


In [30]:
orthofinder = '../../data/qfo_22_results_different_tools/orthofinder/OrthoFinder.txt'
orthofinder_out = filter_files(orthofinder, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/orthofinder.tsv','w')
for i in orthofinder_out:
    out_file.write(i + '\n')
out_file.close()
print(len(orthofinder_out))

9372


In [31]:
orthoinspector = '../../data/qfo_22_results_different_tools/orthoinspector/outputbench2022.tsv'
orthoinspector_out = filter_files(orthoinspector, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/orthoinspector.tsv','w')
for i in orthoinspector_out:
    out_file.write(i + '\n')
out_file.close()
print(len(orthoinspector_out))

9006


In [32]:
panther = '../../data/qfo_22_results_different_tools/panther_v18_all/PANTHER18_all_1.tsv'
panther_out = filter_files(panther, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/panther.tsv','w')
for i in panther_out:
    out_file.write(i + '\n')
out_file.close()
print(len(panther_out))

8906


In [33]:
rsd = '../../data/qfo_22_results_different_tools/rsd/rsd_2columns.tsv'
rsd_out = filter_files(rsd, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/rsd.tsv','w')
for i in rsd_out:
    out_file.write(i + '\n')
out_file.close()
print(len(rsd_out))

8606


In [34]:
sonicparanoid = '../../data/qfo_22_results_different_tools/sonicparanoid2_sens/qfo22-challenge.sp2-sens.tsv'
sonicparanoid_out = filter_files(sonicparanoid, uniprot_ref_species, uniprot_ids_search_species)
out_file = open('../qfo_input/sonicparanoid.tsv','w')
for i in sonicparanoid_out:
    out_file.write(i + '\n')
out_file.close()
print(len(sonicparanoid_out))

9126


# Create QfO input from human proteom benchmark

In [43]:
#fa_augustus_human_proteom = pd.read_csv('../overlap_tables/fdog_ass_human_proteom_augustus_overlap_gff_files.tsv', delimiter='\t')
#human whole proteom
fa_augustus_human_proteom = pd.read_csv('../overlap_tables/fdog_ass_human_proteom_augustus_overlap_gff_files_rat_nema.tsv', delimiter='\t')

In [51]:
def filter_dataframe_human_proteom(cutoff, category, df, max_per_gene=True):
    print(df.head())
    df.drop_duplicates(inplace=True)
    #df['GeneID'] = df['GeneID'].map(busco_to_uniprot)
    #df['uniprotID'] = df['uniprotID'].map(lambda x: isoform_dict.get(x, x))
    print(df.head())
    if max_per_gene == True:
        max_values = df.groupby(['Species', 'GeneID', 'transcript'])['coverage'].transform(max).copy()
        print(max_values.head())
        max_df = df[df['coverage'] == max_values]
        df = max_df
    filtered_df = df[df[category] >= cutoff].copy()
    filtered_df['uniprotID'] = filtered_df['uniprotID'].map(lambda x: isoform_dict.get(x, x))
    #filtered_df
    #print(filtered_df.head())
    filtered_df.dropna(subset=['GeneID'], inplace=True)
    #print(filtered_df.head())
    return filtered_df[['GeneID', 'uniprotID']].copy(), filtered_df[['GeneID', 'transcript' , 'uniprotID']].copy()

In [None]:
fa_augustus_human_proteom_sub = filter_dataframe_human_proteom(0.5, 'coverage', fa_augustus_human_proteom)
fa_augustus_human_proteom_sub.drop_duplicates(keep='first',inplace=True)
fa_augustus_human_proteom_sub.dropna(inplace=True)
fa_augustus_human_proteom_sub.to_csv('../qfo_input/human_proteom/fdog_assembly_human_proteom_augustus.tsv', sep='\t', index=False, header=False)

In [43]:
# Still some genes have a -1 or -<number> ending which indicates a isoform of this gene, this needs to be removed
!sed -i 's/-1//' ../qfo_input/human_proteom/fdog_assembly_human_proteom_augustus.tsv

#### Augustus 5000 human proteins

In [52]:
# 5000 random human genes Augustus run
fa_augustus_human_proteom_5t = pd.read_csv('../overlap_tables/fdog_ass_human_proteom_augustus_overlap_gff_files_rat_nema_5t.tsv', delimiter='\t')

In [53]:
fa_augustus_human_proteom_sub, mapping_transcript_uniprot = filter_dataframe_human_proteom(0.5, 'coverage', fa_augustus_human_proteom_5t)
fa_augustus_human_proteom_sub.drop_duplicates(keep='first',inplace=True)
fa_augustus_human_proteom_sub.dropna(inplace=True)
fa_augustus_human_proteom_sub.to_csv('../qfo_input/human_proteom/fdog_assembly_human_proteom_augustus_5t_cov_50.tsv', sep='\t', index=False, header=False)
!sed -i 's/-1//' ../qfo_input/human_proteom/fdog_assembly_human_proteom_augustus_5t_cov_50.tsv

   Species  GeneID                 transcript  overlap uniprotID  coverage
0    45351  P54577  P54577_DS469824_1_1_g2.t1   1455.0    A7SV20  0.922638
1    45351  P55196  P55196_DS469824_1_1_g1.t1   2941.0    A7SV25  0.771713
2    45351  P55196  P55196_DS469824_1_1_g1.t1   2853.0    A7SV26  0.800281
3    45351  Q8IYE0  Q8IYE0_DS469508_1_1_g2.t1     34.0    A7RFV7  0.100592
4    45351  Q8IYE0  Q8IYE0_DS469508_1_1_g2.t1   2774.0    A7RFV9  0.981252
   Species  GeneID                 transcript  overlap uniprotID  coverage
0    45351  P54577  P54577_DS469824_1_1_g2.t1   1455.0    A7SV20  0.922638
1    45351  P55196  P55196_DS469824_1_1_g1.t1   2941.0    A7SV25  0.771713
2    45351  P55196  P55196_DS469824_1_1_g1.t1   2853.0    A7SV26  0.800281
3    45351  Q8IYE0  Q8IYE0_DS469508_1_1_g2.t1     34.0    A7RFV7  0.100592
4    45351  Q8IYE0  Q8IYE0_DS469508_1_1_g2.t1   2774.0    A7RFV9  0.981252
0    0.922638
1    0.800281
2    0.800281
3    0.981252
4    0.981252
Name: coverage, dtype: float64

  max_values = df.groupby(['Species', 'GeneID', 'transcript'])['coverage'].transform(max).copy()


#### MetaEuk 5000human proteins

In [49]:
# 5000 random human genes Augustus run
fa_metaeuk_human_proteom_5t = pd.read_csv('../overlap_tables/fdog_ass_human_proteom_metaeuk_sens_overlap_gff_files_rat_nema_5t.tsv', delimiter='\t')

In [50]:
fa_metaeuk_human_proteom_sub, mapping_transcript_uniprot = filter_dataframe_human_proteom(0.3, 'coverage', fa_metaeuk_human_proteom_5t)
fa_metaeuk_human_proteom_sub.drop_duplicates(keep='first',inplace=True)
fa_metaeuk_human_proteom_sub.dropna(inplace=True)
fa_metaeuk_human_proteom_sub.to_csv('../qfo_input/human_proteom/fdog_assembly_human_proteom_metaeuk_sens_5t_cov_30.tsv', sep='\t', index=False, header=False)
!sed -i 's/-1//' ../qfo_input/human_proteom/fdog_assembly_human_proteom_metaeuk_sens_5t_cov_30.tsv

   Species  GeneID                  transcript  overlap   uniprotID  coverage
0    10116  P61009  P61009_CM026989.1_1_5_mRNA    357.0  A0A8I6A2B6  0.681298
1    10116  P61009  P61009_CM026989.1_1_5_mRNA    200.0  A0A8I6A2B6  0.400000
2    10116  Q8NHS2  Q8NHS2_CM026989.1_1_9_mRNA    602.0      Q6AY54  0.545290
3    10116  Q8NHS2  Q8NHS2_CM026989.1_1_9_mRNA    463.0      Q6AY54  0.492553
4    10116  Q8NHS2  Q8NHS2_CM026989.1_1_9_mRNA    463.0      Q6AY54  0.429898
   Species  GeneID                  transcript  overlap   uniprotID  coverage
0    10116  P61009  P61009_CM026989.1_1_5_mRNA    357.0  A0A8I6A2B6  0.681298
1    10116  P61009  P61009_CM026989.1_1_5_mRNA    200.0  A0A8I6A2B6  0.400000
2    10116  Q8NHS2  Q8NHS2_CM026989.1_1_9_mRNA    602.0      Q6AY54  0.545290
3    10116  Q8NHS2  Q8NHS2_CM026989.1_1_9_mRNA    463.0      Q6AY54  0.492553
4    10116  Q8NHS2  Q8NHS2_CM026989.1_1_9_mRNA    463.0      Q6AY54  0.429898
0    0.681298
1    0.681298
2    0.545290
3    0.545290
4    0.5

  max_values = df.groupby(['Species', 'GeneID', 'transcript'])['coverage'].transform(max).copy()


## Filter QfO22 tool data for orthologs of human genes

In [30]:
uniprot_ids_search_species = set()
qfo_path = '../../data/qfo_eukaryota_2022/qfo_data_2022/Eukaryota/'
for key in species_dict:
    file = open(qfo_path + species_dict[key]['uniprot'] + '_' + key + '.fasta', 'r')
    if key != '10116' and key != '45351':
        continue
    print(key)
    lines = file.readlines()
    for line in lines:
        if line.startswith('>'):
            uniprot_ids_search_species.add(line.split('|')[1])

45351
10116


#### If whole human proteom was used

In [48]:
#get human ids which were used as seed
uniprot_ref_species = set()
file = open(qfo_path + 'UP000005640_9606.fasta', 'r')
lines = file.readlines()
for line in lines:
    if line.startswith('>'):
        uniprot_ref_species.add(line.split('|')[1])

#### If the reduced human proteom was used

In [50]:
uniprot_ref_species = set()
file = open('../human_proteom/metaeuk/batch_files/random_5000_genes.txt', 'r')
lines = file.readlines()
for line in lines:
    line = line.rstrip()
    uniprot_ref_species.add(line)
file.close()

In [32]:
def filter_files(path, set_search_species, set_ref_species):
    file = open(path, 'r')
    out_list = []
    lines = file.readlines()
    for line in lines:
        line = line.rstrip()
        id1, id2 = line.split('\t')
        if id1 in set_ref_species and id2 in set_search_species:
            out_list.append(line)
        elif id2 in set_ref_species and id1 in set_search_species:
            out_list.append(line)
        #else:
            #print(id1, id2)
    return out_list

In [34]:
inparanoid = '../../data/qfo_22_results_different_tools/inparanoid/InParanoid_QFO22.pairs_1'
inparanoid_out = filter_files(inparanoid, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/inparanoid.tsv','w')
out_file = open('../qfo_input/human_proteom/inparanoid_5t.tsv','w')
for i in inparanoid_out:
    out_file.write(i + '\n')
out_file.close()
print(len(inparanoid_out))

8366


In [35]:
oma_pairs = '../../data/qfo_22_results_different_tools/oma_pairs/OMA.2.5.0-VPairs.txt'
oma_out = filter_files(oma_pairs, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/oma_pairs.tsv', 'w')
out_file = open('../qfo_input/human_proteom/oma_pairs_5t.tsv', 'w')
for i in oma_out:
    out_file.write(i + '\n')
out_file.close()
print(len(oma_out))

9445


In [36]:
bbh = '../../data/qfo_22_results_different_tools/bbh/bbh_2columns.tsv'
bbh_out = filter_files(bbh, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/bbh.tsv','w')
out_file = open('../qfo_input/human_proteom/bbh_5t.tsv','w')
for i in bbh_out:
    out_file.write(i + '\n')
out_file.close()
print(len(bbh_out))

6027


In [49]:
domainoid = '../../data/qfo_22_results_different_tools/domainoid/mergedResults_domainoid.tsv'
domainoid_out = filter_files(domainoid, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/domainoid.tsv','w')
out_file = open('../qfo_input/human_proteom/domainoid_5t.tsv','w')
for i in domainoid_out:
    out_file.write(i + '\n')
out_file.close()
print(len(domainoid_out))

42426


In [39]:
ensamble = '../../data/qfo_22_results_different_tools/ensamble_compara/ensamble_compara.tsv'
ensamble_out = filter_files(ensamble, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/ensamble.tsv','w')
out_file = open('../qfo_input/human_proteom/ensamble_5t.tsv','w')
for i in ensamble_out:
    out_file.write(i + '\n')
out_file.close()
print(len(ensamble_out))

21850


In [40]:
hieranoid = '../../data/qfo_22_results_different_tools/hieranoid/pairs_hieranoid-diamond.tsv'
hieranoid_out = filter_files(hieranoid, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/hieranoid.tsv','w')
out_file = open('../qfo_input/human_proteom/hieranoid_5t.tsv','w')
for i in hieranoid_out:
    out_file.write(i + '\n')
out_file.close()
print(len(hieranoid_out))

7223


In [41]:
metaphors = '../../data/qfo_22_results_different_tools/metaphors/metaphors.Oldest_seed.CS0.5.2022.tsv'
metaphors_out = filter_files(metaphors, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/metaphors.tsv','w')
out_file = open('../qfo_input/human_proteom/metaphors_5t.tsv','w')
for i in metaphors_out:
    out_file.write(i + '\n')
out_file.close()
print(len(metaphors_out))

10256


In [42]:
orthoffgc = '../../data/qfo_22_results_different_tools/orthoffgc/orthoffgc.tsv'
orthoffgc_out = filter_files(orthoffgc, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/orthoffgc.tsv','w')
out_file = open('../qfo_input/human_proteom/orthoffgc_5t.tsv','w')
for i in orthoffgc_out:
    out_file.write(i + '\n')
out_file.close()
print(len(orthoffgc_out))

6071


In [43]:
orthofinder = '../../data/qfo_22_results_different_tools/orthofinder/OrthoFinder.txt'
orthofinder_out = filter_files(orthofinder, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/orthofinder.tsv','w')
out_file = open('../qfo_input/human_proteom/orthofinder_5t.tsv','w')
for i in orthofinder_out:
    out_file.write(i + '\n')
out_file.close()
print(len(orthofinder_out))

9651


In [44]:
orthoinspector = '../../data/qfo_22_results_different_tools/orthoinspector/outputbench2022.tsv'
orthoinspector_out = filter_files(orthoinspector, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/orthoinspector.tsv','w')
out_file = open('../qfo_input/human_proteom/orthoinspector_5t.tsv','w')
for i in orthoinspector_out:
    out_file.write(i + '\n')
out_file.close()
print(len(orthoinspector_out))

7833


In [45]:
panther = '../../data/qfo_22_results_different_tools/panther_v18_all/PANTHER18_all_1.tsv'
panther_out = filter_files(panther, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/panther.tsv','w')
out_file = open('../qfo_input/human_proteom/panther_5t.tsv','w')
for i in panther_out:
    out_file.write(i + '\n')
out_file.close()
print(len(panther_out))

12511


In [46]:
rsd = '../../data/qfo_22_results_different_tools/rsd/rsd_2columns.tsv'
rsd_out = filter_files(rsd, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/rsd.tsv','w')
out_file = open('../qfo_input/human_proteom/rsd_5t.tsv','w')
for i in rsd_out:
    out_file.write(i + '\n')
out_file.close()
print(len(rsd_out))

6069


In [47]:
sonicparanoid = '../../data/qfo_22_results_different_tools/sonicparanoid2_sens/qfo22-challenge.sp2-sens.tsv'
sonicparanoid_out = filter_files(sonicparanoid, uniprot_ref_species, uniprot_ids_search_species)
#out_file = open('../qfo_input/human_proteom/sonicparanoid.tsv','w')
out_file = open('../qfo_input/human_proteom/sonicparanoid_5t.tsv','w')
for i in sonicparanoid_out:
    out_file.write(i + '\n')
out_file.close()
print(len(sonicparanoid_out))

11938
