In [1]:
import json
import plotly.graph_objs as go

In [2]:
def _load_json(path):
    """Parse the JSON file at ``path`` and return the resulting Python object."""
    # Decode explicitly as UTF-8: without an encoding argument open() uses the
    # platform's locale encoding, which is not UTF-8 on every system.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


# output file from preprocess_data.py
substrates_data = _load_json("data/substrates_data.json")

# output file from score_substrate_similarities_biopython.py
substrate_similarities_biopython = _load_json(
    "data/scored_substrates_biopython.json")

# NOTE(review): the original comment on this file also named
# score_substrate_similarities_biopython.py, which looks like a copy-paste
# slip given the "positionwise" file name -- confirm the producing script.
substrate_similarities_positionwise = _load_json(
    "data/scored_substrates_positionwise.json")

In [3]:
def plot_substrates_counts(d):
    """
    Show an interactive bar chart of how many substrate sequences each
    protease has.

    d: mapping whose keys are proteases and whose values are sized
       collections of substrate sequences (only len() of each value is used).

    Bars are ordered from the most to the fewest substrates and the y axis is
    logarithmic. The figure is displayed with fig.show(); nothing is returned.
    """
    # One (protease, substrate count) pair per entry of d
    per_protease = [(protease, len(substrates))
                    for protease, substrates in d.items()]
    # Largest counts first; the sort is stable, so ties keep the order of d
    per_protease.sort(key=lambda pair: pair[1], reverse=True)
    names = [protease for protease, _ in per_protease]
    totals = [total for _, total in per_protease]
    # Build the figure
    bars = go.Bar(x=names, y=totals)
    fig = go.Figure(data=[bars])
    fig.update_layout(
        title="Number of Substrates for each Protease",
        xaxis_title="Protease",
        yaxis_title="Count",
        yaxis_type="log"
    )
    fig.show()


def plot_substrate_similarities(d, title="Similarity score for each protease's substrates"):
    """
    Show an interactive bar chart of the similarity score of each protease's
    substrates.

    d: dictionary with two entries,
       "Proteases with only one substrate" (an iterable of protease names) and
       "Proteases with more than one substrate" (a mapping from protease name
       to score).
    title: text used as the figure title.

    Proteases with a single substrate are drawn first, in red, with a score
    of 1; the remaining proteases follow in blue, from highest to lowest
    score. The figure is displayed with fig.show(); nothing is returned.
    """
    # Pull the two groups out of the input
    single = d["Proteases with only one substrate"]
    multi = d["Proteases with more than one substrate"]
    # Turn the scored group into (protease, score) pairs, best score first
    multi_bars = sorted(multi.items(), key=lambda pair: pair[1], reverse=True)
    # Proteases with only one substrate sequence are given a score of 1
    # (noted in the original code as the largest score possible)
    single_bars = [(protease, 1) for protease in single]
    bars = single_bars + multi_bars
    # Red marks the single-substrate proteases, blue marks all the others
    palette = ["red"] * len(single_bars) + ["blue"] * len(multi_bars)
    # Build the figure
    names = [protease for protease, _ in bars]
    scores = [score for _, score in bars]
    fig = go.Figure(data=[go.Bar(x=names, y=scores, marker_color=palette)])
    fig.update_layout(
        title=title,
        xaxis_title="Protease",
        yaxis_title="Normalized score"
    )
    fig.show()

In [4]:
# Bar chart of the number of substrate sequences per protease
plot_substrates_counts(d=substrates_data)

In [5]:
# Draw the similarity chart once per scoring method, biopython aligner first
similarity_plots = (
    (substrate_similarities_biopython,
     "Similarity score for each protease's substrate using biopython's aligner"),
    (substrate_similarities_positionwise,
     "Similarity score for each protease's substrate using the positionwise aligner"),
)
for scores, plot_title in similarity_plots:
    plot_substrate_similarities(scores, title=plot_title)