Species compared: *Chaetomium thermophilum* (CT) and *Aspergillus niger* (AN). AN = 10609 sequences; CT = 7164 sequences; best bidirectional hits (BBH) = 5340 sequences.

## Preprocessing 
Utility functions for processing the BLASTP results file

In [5]:
# parse the BLASTP results file to get best hits
def parse_blast(blastfile, q_prefix, t_prefix):
    """Collect one best hit per query from a BLASTP text report.

    The file is scanned line by line. A line starting with ``"Query="`` opens
    a query whose identifier is the second whitespace-separated token. Hit
    lines are those starting with ``q_prefix + "XP"`` or ``t_prefix + "XP"``.
    For each query, the first hit line carrying the query-side prefix is
    skipped if it is the first hit line seen (presumably the query matching
    itself in the merged database -- confirm against the report format); the
    next hit line with either prefix is stored as the best hit.

    Parameters
    ----------
    blastfile : str
        Path of the BLASTP report to read.
    q_prefix : str
        Identifier prefix of the proteome the queries come from.
    t_prefix : str
        Identifier prefix of the other proteome.

    Returns
    -------
    dict
        Query identifier -> first token of the hit line chosen as best hit.
        Queries for which no hit line is accepted are absent.
    """
    best_hits = {}
    own_marker = q_prefix + "XP"
    other_marker = t_prefix + "XP"

    with open(blastfile, "r") as infile:
        looking_for_query = True

        for line in infile:
            if looking_for_query:
                if line.startswith("Query="):
                    current_query = line.split()[1]
                    looking_for_query = False
                    target_count = 0

            else:
                if line.startswith("Query="):
                    # The previous query produced no accepted hit. Start the
                    # new query from a clean state: without resetting the
                    # counter, the new query's own first line would be
                    # recorded as its best hit.
                    current_query = line.split()[1]
                    target_count = 0
                if line.startswith((own_marker, other_marker)):
                    if line.startswith(own_marker) and target_count == 0:
                        # First own-prefix line of this query: skip it once.
                        target_count = 1
                    else:
                        best_hits[current_query] = line.split()[0]
                        # Ignore remaining hit lines until the next "Query=".
                        looking_for_query = True
    return best_hits


# find paralogs from hits
def find_paralogs(hits):
    """Return query -> best-hit pairs whose two identifiers share a first character.

    The first character of each identifier is compared to decide whether the
    query and its best hit belong to the same group (the identifiers used in
    this notebook start with a species prefix such as "AN_" or "CT_").
    A pair is not added when its reverse (target -> query) has already been
    recorded, so a reciprocal pair appears once. A query whose best hit is
    the query itself is skipped: a sequence is not its own paralog.

    Parameters
    ----------
    hits : dict
        Query identifier -> best-hit identifier.

    Returns
    -------
    dict
        Subset of ``hits`` as described above.
    """
    outdict = {}
    for query, target in hits.items():
        if query == target:
            # Self-hit: carries no paralogy information.
            continue
        if query[0] == target[0]:
            if target not in outdict or outdict[target] != query:
                outdict[query] = target
    return outdict

# find bidirectional paralogs from hits
def find_BBH_paralogs(hits):
    """Return reciprocal best-hit pairs found within a single hits mapping.

    A pair (query, target) is kept when ``target`` is itself a key of ``hits``
    and its best hit is ``query``. Each reciprocal pair is stored once, under
    whichever member is iterated first. A query whose best hit is the query
    itself is skipped: it would trivially satisfy the reciprocity test but is
    not a pair of distinct sequences.

    Parameters
    ----------
    hits : dict
        Query identifier -> best-hit identifier.

    Returns
    -------
    dict
        One entry per reciprocal pair.
    """
    outdict = {}
    for query, target in hits.items():
        if query == target:
            # Self-hit: not a bidirectional pair of two different proteins.
            continue
        if target in hits and hits[target] == query:
            if target not in outdict:
                outdict[query] = target
    return outdict


# find best bidirectional hits
def BBH(hits_1, hits_2):
    """Return the best bidirectional hits between two best-hit mappings.

    A pair ``query -> target`` from ``hits_1`` is kept when the two
    identifiers differ in their first character and ``hits_2`` maps
    ``target`` back to ``query``.

    Parameters
    ----------
    hits_1 : dict
        Query identifier -> best-hit identifier, first direction.
    hits_2 : dict
        Query identifier -> best-hit identifier, opposite direction.

    Returns
    -------
    dict
        Entries of ``hits_1`` that are confirmed by ``hits_2``.
    """
    return {
        forward_query: forward_hit
        for forward_query, forward_hit in hits_1.items()
        if forward_query[0] != forward_hit[0]
        and forward_hit in hits_2
        and hits_2[forward_hit] == forward_query
    }
                                            

In [8]:
# Locations of the two BLASTP reports, then one best-hit dictionary per report.
blastfile_AN, blastfile_CT = "data/mergedDB_ANquery.txt", "data/mergedDB_CTquery.txt"

# Each call names its own prefix first and the other prefix second.
hits_AN = parse_blast(blastfile=blastfile_AN, q_prefix="AN_", t_prefix="CT_")
hits_CT = parse_blast(blastfile=blastfile_CT, q_prefix="CT_", t_prefix="AN_")

In [9]:
# calculate paralogs (all, and the reciprocal subset) for each hits dictionary
AN_paralogs = find_paralogs(hits_AN)
AN_BBH_paralogs = find_BBH_paralogs(hits_AN)
print("AN paralogs: ", len(AN_paralogs))
print("AN BBH paralogs: ", len(AN_BBH_paralogs))

CT_paralogs = find_paralogs(hits_CT)
CT_BBH_paralogs = find_BBH_paralogs(hits_CT)
print("CT paralogs: ", len(CT_paralogs))
# This count is the reciprocal subset; label it so it is not confused with the line above.
print("CT BBH paralogs: ", len(CT_BBH_paralogs))

AN paralogs:  3635
AN BBH paralogs:  1253
CT paralogs:  937
CT paralogs:  159


In [10]:
# calculate best bidirectional hits
# NOTE(review): the assignment below rebinds the name `BBH` from the function
# to the dict it returns. Re-running this cell without first re-running the
# cell that defines the function therefore fails, because a dict is not
# callable. Cells beyond this point may read `BBH` as the dict -- confirm
# before renaming either the function or this variable.
BBH = BBH(hits_AN, hits_CT)
print("Best bidirectional hits: ", len(BBH))

Best bidirectional hits:  4879
