In [None]:
from Bio import Entrez
import matplotlib.pyplot as plt
import matplotlib._color_data as mcd
import pandas as pd
import math
import random
# Contact address registered on the Biopython Entrez module.
Entrez.email = "lukas.becker@hhu.de"

# Palette of CSS4 colour names the plots draw from at random. "lightgrey" is
# taken out because the scatter panels use it as their background colour.
overlap = list(mcd.CSS4_COLORS)
overlap.remove("lightgrey")

In [None]:
# Input paths for this notebook. They are empty placeholders here and must be
# filled in before the cells below can read anything.
# Table of reciprocal best hits; read with pd.read_table (header row expected).
reciprocal_best_hits_file: str = ""
# Forward BLAST result table; read with pd.read_table(header=None).
blastp_fw_table: str = ""
# Text file whose lines containing ">" supply the query identifiers
# (presumably a FASTA file - confirm).
query_file: str = ""

In [None]:
# Column names of the headerless forward-BLAST table, in file order.
fw_columns = [
    "qseqid", "sseqid", "evalue", "bitscore", "qgi", "sgi",
    "sacc", "staxids", "sscinames", "scomnames", "stitle",
]

rec_prot = pd.read_table(reciprocal_best_hits_file)
fw_res = pd.read_table(blastp_fw_table, header=None)
fw_res.columns = fw_columns

# Keep only the part of each accession in front of the first "." so that both
# tables can be joined on the same identifiers.
for accession_column in ('qseqid', 'sacc'):
    fw_res[accession_column] = fw_res[accession_column].map(lambda line: line.split('.')[0])

# Give the reciprocal-best-hit table the same key column names as the BLAST table.
rec_prot = rec_prot.rename(
    columns={"forward_genome_id": "sacc", "backward_genome_id": "qseqid"}
)

# Inner join on (subject accession, query id), then keep the first row per
# subject accession.
result_data = (
    rec_prot
    .merge(fw_res, how='inner', on=['sacc', 'qseqid'])
    .drop_duplicates('sacc', keep='first')
)

In [None]:
# Preview the first rows of the merged reciprocal-best-hit / forward-BLAST table.
result_data.head()

In [None]:
# Collect the query identifiers from the header lines of `query_file`
# (presumably a FASTA file - confirm).
# The context manager closes the file even when reading raises.
with open(query_file, 'r') as query_handle:
    lines = query_handle.readlines()

queries = []
for line in lines:
    # Only lines that START with ">" are headers; a ">" further inside a line
    # must not be parsed as an identifier.
    if not line.startswith(">"):
        continue
    # Split on any whitespace so that a trailing newline or a tab never ends
    # up inside the identifier (this happened for headers without description
    # and without a ".version" suffix).
    header_fields = line[1:].split()
    if not header_fields:
        # A bare ">" line carries no identifier.
        continue
    # Drop the version suffix: "WP_012345.1" -> "WP_012345".
    queries.append(header_fields[0].split(".")[0])
print(len(queries))

In [None]:
# Map every query id to the rows of `result_data` whose 'qseqid' equals it.
# A query without any matching row is mapped to an empty frame.
accid_taxids = {}
for query in queries:
    accid_taxids[query] = result_data.loc[result_data['qseqid'].eq(query)]

In [None]:
# Preview the rows collected for the first query.
accid_taxids[queries[0]].head()

In [None]:
# One scatter panel per query: e-value of every hit (x) against the running
# index of the hit within that query's table (y).

# Grid size: choose the column count first, then as many rows as are needed to
# hold every query. The former floor(sqrt(n)) x ceil(sqrt(n)) grid had fewer
# axes than queries for counts such as 3, 7 or 14 and failed with IndexError.
n_panels = len(queries)
n_cols = max(1, math.ceil(math.sqrt(n_panels)))
n_rows = max(1, math.ceil(n_panels / n_cols))

# squeeze=False keeps `axs` a 2-D array even for a 1x1 grid, so ravel() works
# for a single query as well.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 6), facecolor='w', edgecolor='k',
                        constrained_layout=True, squeeze=False)

axs = axs.ravel()
# Hide every panel first; only panels that receive data are switched back on,
# which leaves the surplus cells of the grid blank.
for ax in axs:
    ax.set_axis_off()

for i, query in enumerate(queries):
    evalues = list(accid_taxids[query]['evalue'])
    panel = axs[i]
    panel.set_axis_on()
    panel.grid()
    panel.set_facecolor("lightgrey")
    # Random colour from the palette ("lightgrey" was removed from it, so the
    # points never vanish into the background).
    cl = overlap[random.randint(0, len(overlap) - 1)]
    panel.scatter(evalues, range(len(evalues)), color=cl)
    panel.set_title(str(query))
    panel.get_xaxis().set_visible(False)
    panel.get_yaxis().set_visible(False)
    # Every panel shares the same x-range: e-values from 0 to 0.001.
    panel.set_xlim(0, 0.001)

# fig.supxlabel / fig.supylabel need matplotlib >= 3.4.
fig.suptitle("E-Value distribution in target sequences", fontsize=16)
# The labels describe what is actually plotted here; the previous texts were
# copied from the hits-per-organism figure.
fig.supylabel("hit index")
fig.supxlabel("e-value (0 to 0.001)")

#plt.savefig("../data/synechococcus_cellulose_candidates/evalue_distribution.png",dpi=300)

In [None]:
# One bar panel per query: x = number of hits found in a single taxid,
# height = number of distinct taxids with exactly that many hits.

# Grid size: choose the column count first, then as many rows as are needed to
# hold every query. The former floor(sqrt(n)) x ceil(sqrt(n)) grid had fewer
# axes than queries for counts such as 3, 7 or 14 and failed with IndexError.
n_panels = len(queries)
n_cols = max(1, math.ceil(math.sqrt(n_panels)))
n_rows = max(1, math.ceil(n_panels / n_cols))

# squeeze=False keeps `axs` a 2-D array even for a 1x1 grid, so ravel() works
# for a single query as well.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 6), facecolor='w', edgecolor='k',
                        constrained_layout=True, squeeze=False)

axs = axs.ravel()
# Hide every panel first; only panels that receive data are switched back on.
for ax in axs:
    ax.set_axis_off()

for i, query in enumerate(queries):
    # Hits per taxid in a single pass. value_counts() ignores missing taxids,
    # which previously showed up as a bogus "0 hits" bar.
    hits_per_taxid = accid_taxids[query]['staxids'].value_counts()
    # hits-in-one-taxid -> number of taxids with that many hits, ascending.
    hit_distribution = {
        int(n_hits): int(n_taxids)
        for n_hits, n_taxids in hits_per_taxid.value_counts().sort_index().items()
    }
    xvalues = list(hit_distribution.keys())
    yvalues = list(hit_distribution.values())

    panel = axs[i]
    panel.set_axis_on()
    panel.grid()
    panel.bar(x=xvalues, height=yvalues, color=overlap[random.randint(0, len(overlap) - 1)],
              width=0.8, edgecolor="black")
    panel.set_title(str(query))

    # A query without hits has nothing to take the maximum of; leave its
    # ticks alone instead of failing on max() of an empty sequence.
    if hit_distribution:
        panel.set_xticks(range(1, max(hit_distribution) + 1, 1))

    # Many distinct bars: rotate the tick labels so they do not overlap.
    if len(hit_distribution) >= 10:
        panel.tick_params('x', labelrotation=45)

fig.suptitle("amount of hits in distinct organisms per query", fontsize=16)
fig.supylabel("number of distinct taxids")
fig.supxlabel("amount of hits in backward genome")
#plt.savefig("../data/synechococcus_cellulose_candidates/hits_per_organism.png",dpi=300)


In [None]:
#single hits and multiple hits
# Hit-count distribution for one query, looked up by its position in `queries`.
selected_query = queries[13]

# Hits per taxid in a single pass. value_counts() ignores missing taxids,
# which previously produced a bogus "0 hits" entry, and replaces the
# re-filtering of the whole table once per taxid.
hits_per_taxid = accid_taxids[selected_query]['staxids'].value_counts()

# hits-in-one-taxid -> number of taxids with that many hits, ascending.
hit_distribution = {
    int(n_hits): int(n_taxids)
    for n_hits, n_taxids in hits_per_taxid.value_counts().sort_index().items()
}

# Parallel lists for plotting: x = hits per taxid, y = number of such taxids.
xvalues = list(hit_distribution.keys())
yvalues = list(hit_distribution.values())

In [None]:
# Show how many taxids fall into each hits-per-taxid bucket (dict values view).
hit_distribution.values()

In [None]:
# Bar chart of `yvalues` over `xvalues`, drawn in a randomly picked palette colour.
bar_color = overlap[random.randint(0, len(overlap) - 1)]
plt.bar(xvalues, yvalues, color=bar_color)