## Assignment of human Uniprot Identifier to all BUSCO and fDOG-Assembly core groups

### BUSCO

In [1]:
def get_representatives(path, species_list):
    # the orthoDB gene ids have the following format: taxid_version:geneID
    file = open(path, 'r')
    lines = file.readlines()
    rep = set()
    species_set = set(species_list)
    print(species_set)
    orthoDB_groups = set()
    orthoGroup_vs_geneId = {}
    geneId_vs_orthoGroup = {}
    for line in lines:
        line = line.rstrip()
        gene_id, buscoGroup = line.split('\t')
        orthoDB_groups.add(buscoGroup)
        species = gene_id.split('_')[0]
        if int(species) in species_set:
            rep.add(gene_id)
            orthoGroup_vs_geneId[buscoGroup] = gene_id
            geneId_vs_orthoGroup[gene_id] = buscoGroup
    
    return rep, orthoGroup_vs_geneId, geneId_vs_orthoGroup, orthoDB_groups

In [18]:
species = [9606]
path = '../../data/busco/busco_downloads/lineages/metazoa_odb10/info/ogs.id.info'
rep, orthoGroup_vs_geneId, geneId_vs_orthoGroup, orthoDB_groups = get_representatives(path, species)
print(len(rep))

{9606}
938


In [12]:
species = [7227]
path = '../../data/busco/busco_downloads/lineages/metazoa_odb10/info/ogs.id.info'
rep, orthoGroup_vs_geneId, geneId_vs_orthoGroup, orthoDB_groups = get_representatives(path, species)
print(len(rep))

{7227}
927


In [4]:
from Bio import SeqIO
path_all_busco = '../../results/uniprotid_to_group_assignment/odb10v1_all_og_fasta.tab'
outpath = '../../results/uniprotid_to_group_assignment/busco/human_busco_genes.faa'

out_list = []

counter = 0
with open(path_all_busco) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        header = record.id
        if header.split('\t')[0] in rep:
            counter += 1
            out_list.append(record)
print(counter)          
with open(outpath, "w") as output_handle:
    SeqIO.write(out_list, output_handle, 'fasta-2line')
        

KeyboardInterrupt: 

In [15]:
#started blastp with the following command
# blastp -db ../human_qfo22_blastdb/human_qfo22_blastdb -query human_busco_genes.faa -out ./blastp_out_human_busco_vs_human_qfo22.out -outfmt '6 qseqid sseqid qlen slen length pident nident mismatch gaps qcov'
# blastp -db ../drome_qfo22_blastdb/drome_qfo22_blastdb -query drome_busco_genes.faa -out ./blastp_out_drome_busco_vs_drome_qfo22.out -outfmt '6 qseqid sseqid qlen slen length pident nident mismatch gaps qcov'

In [6]:
# Parse blastp output and assign a uniprot id
def parse_blast_out(min_seqid, cov, path) :
    blastp_out = open(path, 'r')
    lines = blastp_out.readlines()
    perfekt_match = set()
    good_match = set()
    accepted = set()
    dict_busco_uniprot = {}
    uniprot_ids = set()
    for line in lines:
        line = line.rstrip()
        qseqid, sseqid, qlen, slen, length, pident, nident, mismatch, gaps = line.split('\t')
        sseqid = sseqid.split('|')[1]
        if pident == '100.000' and gaps == '0' and nident == min(qlen, slen) and qseqid:
            perfekt_match.add(sseqid)
            accepted.add(qseqid)
            try:
                dict_busco_uniprot[qseqid].append(sseqid)
            except KeyError:
                dict_busco_uniprot[qseqid] = [sseqid]
            uniprot_ids.add(sseqid)
        elif float(pident) >= min_seqid and int(length)/min(int(qlen), int(slen)) > cov and qseqid:
            good_match.add(sseqid)
            accepted.add(qseqid)
            try:
                dict_busco_uniprot[qseqid].append(sseqid)
            except KeyError:
                dict_busco_uniprot[qseqid] = [sseqid]
            uniprot_ids.add(sseqid)
    return  len(perfekt_match), (len(good_match)), dict_busco_uniprot, uniprot_ids


In [20]:
path_human = '../../results/uniprotid_to_group_assignment/busco/blastp_out_human_busco_vs_human_qfo22.out'
path = '../../results/uniprotid_to_group_assignment/busco/blastp_out_drome_busco_vs_drome_qfo22.out'
perfekt_match, good_match, dict_busco_uniprot, uniprot_ids = parse_blast_out(98, 0.8, path)
print(perfekt_match, good_match, perfekt_match + good_match)

perfekt_match, good_match, dict_busco_uniprot, uniprot_ids = parse_blast_out(97, 0.8, path)
print(perfekt_match, good_match, perfekt_match + good_match)

perfekt_match, good_match, dict_busco_uniprot_drome, uniprot_ids = parse_blast_out(97, 0.7, path)
print(perfekt_match, good_match, perfekt_match + good_match)
print(len(uniprot_ids))

perfekt_match, good_match, dict_busco_uniprot_human, uniprot_ids = parse_blast_out(90, 0.7, path_human)
print(perfekt_match, good_match, perfekt_match + good_match)
print(len(uniprot_ids))

889 31 920
889 33 922
889 34 923
923
688 233 921
921


In [8]:
import pickle as pkl
out_path = '../pkl_files/busco_vs_uniprot_ident_98_length_70_drome.pkl'
out = open(out_path,'wb')
pkl.dump(dict_busco_uniprot, out)
out.close()

In [14]:
import pickle as pkl
busco_group_vs_uniprot = {}

for key in dict_busco_uniprot_drome:
    uniprot_id = dict_busco_uniprot_drome[key][0]
    if len(dict_busco_uniprot_drome[key]) > 1:
        print(key, dict_busco_uniprot_drome[key])
    group = geneId_vs_orthoGroup[key]
    busco_group_vs_uniprot[group] = uniprot_id
    
out_path = '../pkl_files/busco_group_vs_uniprot_ident_98_length_70_drome.pkl'
out = open(out_path,'wb')
pkl.dump(busco_group_vs_uniprot, out)
out.close()

7227_0:000908 ['Q9I7L8', 'E1JHM5']
7227_0:00254b ['Q9VEQ1', 'Q9VEQ2']
7227_0:002f60 ['C0HKA0', 'C0HKA1']
7227_0:0033a5 ['Q9VWQ3', 'Q9VWQ6']
7227_0:0034b4 ['A8QI13', 'A8QI32']


In [21]:
import pickle as pkl
busco_group_vs_uniprot = {}

for key in dict_busco_uniprot_human:
    uniprot_id = dict_busco_uniprot_human[key][0]
    if len(dict_busco_uniprot_human[key]) > 1:
        print(key, dict_busco_uniprot_human[key])
    group = geneId_vs_orthoGroup[key]
    busco_group_vs_uniprot[group] = uniprot_id
    
out_path = '../pkl_files/busco_group_vs_uniprot_ident_90_length_70.pkl'
out = open(out_path,'wb')
pkl.dump(busco_group_vs_uniprot, out)
out.close()

9606_0:0004ab ['B7ZAQ6', 'P0CG08']
9606_0:001f84 ['P52435', 'Q9H1A7', 'Q9GZM3']
9606_0:002531 ['A8K0Z3', 'Q6VEQ5', 'C4AMC7', 'A8MWX3', 'Q9NQA3']
9606_0:003213 ['P05388', 'Q8NHW5']
9606_0:00344c ['Q5BKT4', 'Q5I7T1']
9606_0:003c93 ['Q9NYV6', 'Q2M238']
9606_0:0048df ['P40429', 'Q6NVV1']
9606_0:004d68 ['P48553', 'A0A096LPH7']
9606_0:004db7 ['Q15269', 'A0A0B4J2E5']
9606_0:004eec ['Q9Y3A4', 'Q9NSQ0']
9606_0:005039 ['Q9ULC4', 'A0A3B3IRV3']
9606_0:00518f ['P0DI81', 'P0DI82']
9606_0:00523b ['Q5VZM2', 'Q7L523']
9606_0:0052ec ['Q9BVJ6', 'Q5TAP6']
9606_0:005565 ['Q13888', 'Q6P1K8']


## OMA

In [7]:
def get_seed_genes(path, reference_species):
    file = open(path, 'r')
    lines = file.readlines()
    species_set = set()
    gene_vs_oma_dict = {}
    oma_vs_gene_dict = {}
    header = set()
    for i in reference_species:
        species_set.add('ncbi' + str(i))
    for line in lines:
        line = line.rstrip()
        groupId, species, ortho, fasf, fasb = line.split('\t')
        if species in species_set:
            geneId = ortho.split('|')[2]
            protein_set_name = ortho.split('|')[1]
            if protein_set_name.split('@')[2] == '2209':
                oma_vs_gene_dict[groupId] = geneId
                gene_vs_oma_dict[geneId] = groupId
                header.add(ortho)
    return oma_vs_gene_dict, gene_vs_oma_dict, header

In [8]:
species = [9606]
oma_vs_gene_dict, gene_vs_oma_dict, header = get_seed_genes('../augustus/metazoa_core_augustus.phyloprofile', species)

In [9]:
print(len(header))

1372


In [49]:
from Bio import SeqIO
path_all_busco = '../../results/uniprotid_to_group_assignment/oma/metazoa_core_augustus_og.fa'
outpath = '../../results/uniprotid_to_group_assignment/oma/humna_oma_genes.faa'

out_list = []

counter = 0
with open(path_all_busco) as handle:
    for record in SeqIO.parse(handle, "fasta"):
        h = record.id
        if h.split('\t')[0] in header:
            counter += 1
            out_list.append(record)
print(counter)          
with open(outpath, "w") as output_handle:
    SeqIO.write(out_list, output_handle, 'fasta-2line')

1372


In [12]:
def parse_blast_out_oma(min_seqid, cov):
    path = '../../results/uniprotid_to_group_assignment/oma/blastp_out_human_oma_vs_human_qfo22.out'
    blastp_out = open(path, 'r')
    lines = blastp_out.readlines()
    perfekt_match = set()
    good_match = set()
    accepted = set()
    dict_oma_uniprot = {}
    uniprot_ids = set()
    for line in lines:
        line = line.rstrip()
        qseqid, sseqid, qlen, slen, length, pident, nident, mismatch, gaps = line.split('\t')
        sseqid = sseqid.split('|')[1]
        if pident == '100.000' and gaps == '0' and nident == min(qlen, slen):
            perfekt_match.add(sseqid)
            accepted.add(qseqid)
            try:
                dict_oma_uniprot[qseqid].append(sseqid)
            except KeyError:
                dict_oma_uniprot[qseqid] = [sseqid]
            uniprot_ids.add(sseqid)
        elif float(pident) >= min_seqid and int(length)/min(int(qlen), int(slen)) > cov:
            good_match.add(sseqid)
            accepted.add(qseqid)
            uniprot_ids.add(sseqid)
            try:
                dict_oma_uniprot[qseqid].append(sseqid)
            except KeyError:
                dict_oma_uniprot[qseqid] = [sseqid]
    return  len(perfekt_match), (len(good_match)), dict_busco_uniprot, uniprot_ids

In [20]:
perfekt_match, good_match, dict_oma_uniprot, uniprot_ids_oma = parse_blast_out_oma(98, 0.8)
print(perfekt_match, good_match, perfekt_match + good_match)

perfekt_match, good_match, dict_oma_uniprot, uniprot_ids_oma = parse_blast_out_oma(97, 0.8)
print(perfekt_match, good_match, perfekt_match + good_match)

perfekt_match, good_match, dict_oma_uniprot, uniprot_ids_oma = parse_blast_out_oma(98, 0.7)
print(perfekt_match, good_match, perfekt_match + good_match)

print(len(uniprot_ids_oma))

1274 75 1349
1274 92 1366
1274 77 1351
1351


In [18]:
print(len(uniprot_ids_oma.intersection(uniprot_ids)))
print(len(uniprot_ids.intersection(uniprot_ids_oma)))

375
375


In [77]:
print(dict_oma_uniprot['1102883|HUMAN@9606@2209|HUMAN04429'])

O00303


In [21]:
diff = (uniprot_ids_oma - uniprot_ids)
#print(diff)
for key in dict_oma_uniprot:
    if dict_oma_uniprot[key] == 'Q6TCH7':
        print(key)
    elif key:
        print(dict_oma_uniprot[key])

1002344|HUMAN@9606@2209|HUMAN76126


In [12]:
for key in dict_busco_uniprot:
    if dict_busco_uniprot[key] == 'Q658Y4':
        print(key)

In [18]:
diff_busco = (uniprot_ids - uniprot_ids_oma)

In [19]:
print(diff_busco)

{'Q9NYV6', 'Q9H270', 'Q9UJX4', 'Q969J3', 'Q96MW1', 'Q96NB3', 'Q96HA8', 'Q9BYN8', 'Q96B42', 'Q9H2G9', 'Q9NX58', 'Q15814', 'Q9BUE0', 'Q9H9J2', 'P05408', 'P30050', 'Q8IXH7', 'P42766', 'Q5T280', 'Q96BW9', 'Q96GX9', 'Q13952', 'Q5BKX5', 'O60725', 'Q6PJT7', 'Q9BWT6', 'P62917', 'Q96E11', 'Q9BVV7', 'Q99442', 'Q32P41', 'Q99437', 'Q9H501', 'Q9BW27', 'Q15061', 'O43929', 'Q9Y2L5', 'Q9P055', 'O75616', 'Q16514', 'Q9Y2R9', 'Q9P0R6', 'Q12768', 'Q9Y3D3', 'Q9Y3B2', 'Q8TCC3', 'Q13795', 'O00566', 'P51398', 'Q9H7Z7', 'P61513', 'Q9Y324', 'Q9BSF4', 'Q9NVM9', 'Q9Y3U8', 'Q8WVK2', 'Q147X3', 'Q9BVS4', 'Q9BT17', 'Q8N8A6', 'Q9UBZ9', 'P46776', 'Q16585', 'Q9NUL7', 'Q9NPL8', 'Q8WTT2', 'Q7Z4H3', 'Q3ZCQ8', 'Q01831', 'Q8WX92', 'Q9BXY0', 'Q9HD33', 'Q9NRM2', 'Q96L58', 'P57081', 'Q9P0N9', 'Q9Y5Y5', 'Q8N183', 'Q7Z392', 'P08240', 'Q9H0U6', 'Q9Y375', 'Q96SK2', 'P23025', 'Q9NVR5', 'Q8N5U6', 'O43292', 'P05455', 'Q8N5C7', 'P46779', 'P07902', 'Q9Y3C0', 'P61201', 'O00623', 'Q96DC8', 'P17152', 'Q9Y6G5', 'Q8WVT3', 'P78346', 'Q5C9Z4',