# Figure 3: Peptide visualization
## Library imports

In [None]:
import pandas as pd
import re
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

## Figure 3A: Peptide Atlas style figure for peptide visualization
This uses https://www.ebi.ac.uk/pride/archive/projects/PXD012867 as example data.

In [None]:
def peptide_atlas(peptides, gene, gap=1, range_max=None):
    """
    Draw peptide atlas style graph of peptide coverage.
    
    Args:
        - peptides: pandas table with MaxQuant peptide output.
        - gene: gene name to show
        - gap: default=1; minimum distance between peptides to be shown in the same row
        - range_max: default=None; maximum value for x-axis range. If None, end at last identified peptide.
    Returns:
        - plotly express line graph, colors are assigned by number of observations
        - peptide summary table
    """
    pg = peptides.loc[peptides['Gene names'] == gene,
                      [col for col in peptides.columns
                       if col in ['Sequence', 'Proteins', 'Start position', 'End position', 'Gene names']
                       or col.startswith("Intensity ")]].copy()\
    .sort_values(['Start position', 'End position'])
    
    # count quantifications
    pg.insert(0, "% quantified", pg[[col for col in pg.columns if col.startswith("Intensity ")]]\
                 .apply(lambda x: np.round(sum(x>0)/len(x), 2), axis=1))
    pg.drop([col for col in pg.columns if col.startswith("Intensity ")], axis=1, inplace=True)
    
    # assign traces to non-overlapping peptides
    trace_ends, traces = [-gap], []
    for s,e in zip(pg["Start position"], pg["End position"]):
        traced = False
        for t in range(len(trace_ends)):
            if s >= trace_ends[t]+gap:
                traces.append(t)
                trace_ends[t] = e
                traced = True
                break
        if not traced:
            traces.append(len(trace_ends))
            trace_ends.append(e)
    pg.insert(0, "Trace", traces)
    
    # draw and return plot
    colors = [px.colors.find_intermediate_color(
        px.colors.sequential.Reds[2],px.colors.sequential.Reds[-2], n, colortype="rgb"
    ) for n in np.unique(pg["% quantified"])]
    pa = px.line(pd.concat([pg[['Sequence', 'Proteins', 'Start position', 'Gene names', 'Trace',"% quantified"]]\
                                .rename({"Start position": "Sequence position"}, axis=1),
                            pg[['Sequence', 'Proteins', 'End position', 'Gene names', 'Trace', "% quantified"]]\
                                .rename({"End position": "Sequence position"}, axis=1)]).sort_values("% quantified"),
                 x="Sequence position", y="Trace", line_group="Sequence", template="simple_white", color="% quantified",
                 color_discrete_sequence = colors, height=100+30*(max(pg.Trace)+1),
                 range_x=[0,max(pg["End position"]) if range_max is None else range_max], range_y=[-0.5,(max(pg.Trace)+0.5)],
                 title="Peptides identified for {} ({})".format(gene, pg.Proteins.iloc[0]))\
    .update_traces(line_width=20).update_yaxes(visible=False).update_xaxes(showgrid=True)\
    .update_layout(margin={"autoexpand":False,"b":50,"l":20,"r":20,"t":50})
    return pa, pg

In [12]:
yeast_peptides = pd.read_csv("./MaxQuant_Output/MaxQuantOutput/peptides.txt", sep='\t')

pa, pg = peptide_atlas(yeast_peptides, "HSP42", gap=1)
pa.update_layout(font_size=10).show()

## Figure 3B: AlphaMap visualization
For a full guide on how to create these figures please refer to the AlphaMap github repository: https://github.com/MannLabs/alphamap

This also uses https://www.ebi.ac.uk/pride/archive/projects/PXD012867 as example data, but the evidence file was regenerated.

In [None]:
from alphamap.organisms_data import import_fasta
yeast_fasta = import_fasta("Saccharomyces cerevisiae")

from alphamap.organisms_data import import_uniprot_annotation
yeast_uniprot = import_uniprot_annotation('Saccharomyces cerevisiae')

In [None]:
from alphamap.importing import import_data
mq_data_rpn0 = import_data("./MaxQuant_Output/MaxQuantOutput/evidence.txt",
                      sample=["20190124_QX3_JuSc_SA_JS7_1_RPN4_0h_1_190125133334",
                              "20190124_QX3_JuSc_SA_JS7_1_RPN4_0h_2",
                              "20190124_QX3_JuSc_SA_JS7_1_RPN4_0h_3",
                              "20190124_QX3_JuSc_SA_JS7_1_RPN4_0h_4"])
mq_data_rpn0["naked_sequence"] = mq_data_rpn0["naked_sequence"].apply(lambda x: re.sub(r'\(.+?\)', "", x))
mq_data_rpn0.drop([i for i,el in enumerate(mq_data_rpn0["all_protein_ids"]) if el.startswith("CON")], inplace=True)
mq_data_wt0 = import_data("./MaxQuant_Output/MaxQuantOutput/evidence.txt",
                      sample=["20190124_QX3_JuSc_SA_JS7_1_wt_0h_1_190125155225",
                              "20190124_QX3_JuSc_SA_JS7_1_wt_0h_2",
                              "20190124_QX3_JuSc_SA_JS7_1_wt_0h_3",
                              "20190124_QX3_JuSc_SA_JS7_1_wt_0h_4"])
mq_data_wt0["naked_sequence"] = mq_data_wt0["naked_sequence"].apply(lambda x: re.sub(r'\(.+?\)', "", x))
mq_data_wt0.drop([i for i,el in enumerate(mq_data_wt0["all_protein_ids"]) if el.startswith("CON")], inplace=True)

from alphamap.preprocessing import format_input_data
formatted_mq_data_rpn0 = format_input_data(df=mq_data_rpn0, fasta = yeast_fasta, modification_exp = r'\(.+?\)')
formatted_mq_data_wt0 = format_input_data(df=mq_data_wt0, fasta = yeast_fasta, modification_exp = r'\(.+?\)')

In [10]:
from alphamap.sequenceplot import plot_peptide_traces, uniprot_color_dict
from alphamap.uniprot_integration import uniprot_feature_dict
fig3B = plot_peptide_traces([formatted_mq_data_wt0, formatted_mq_data_rpn0],
                          name = ['wildtype', 'mutant'],
                          protein = "Q12329",
                          fasta = yeast_fasta,
                          uniprot=yeast_uniprot,
                          selected_features=['CHAIN','DOMAIN','STRUCTURE'],
                          uniprot_feature_dict=uniprot_feature_dict, 
                          uniprot_color_dict=uniprot_color_dict)
fig3B.update_layout(width=900, height=400, font_size=10)

## Fig3C: PhosphoSitePlus style lollipop plot for PTM visualization
This uses https://www.ebi.ac.uk/pride/archive/projects/PXD010697 as an example data set.

In [None]:
def lollipop(df, proteinid, range_max=None,
             columns={"ids": "Proteins", "pos": "Positions within proteins",
                      "int": "Intensity", "prob": "Localization prob"}):
    """
    This function generates a lollipop plot representing PTM localization, intensity and localization probability.
    
    Args:
        - df (pd.DataFrame): Loaded sites table
        - proteinid (str): Protein identifier
        - range_max (int/None): default = None, optional maximum value for the x-axis.
                                If none the maximum site position+20 is used.
        - columns (dict): default = MaxQuant sites table names, defines the column names for:
            - ids: semicolon separated protein identifiers
            - pos: semicolon separated site positions
            - int: single column containing site intensity
            - prob: single column containing localization probability
    Returns:
        - gene (pd.DataFrame): filtered and transformed dataframe used for plotting
        - plot (plotly.Figure): interactive lollipop plot
    """
    gene = df.loc[[proteinid in str(el) for el in df[columns["ids"]]], list(columns.values())]
    gene[columns["pos"]] = gene[[columns["pos"], columns["ids"]]].apply(
        lambda x: int(x.values[0].split(";")[x.values[1].split(";").index(proteinid)]), axis=1)
    gene[columns["int"]] = gene[columns["int"]].apply(np.log10)
    gene.rename({columns["int"]: "log10("+columns["int"]+")"}, axis=1, inplace=True)
    gene.insert(0, "size", gene[columns["prob"]].apply(
        lambda x: 1 if x<0.5 else 2 if x<0.8 else 3 if x<0.95 else 4 if x<0.98 else 5))
    gene = gene.loc[np.isfinite(gene["log10("+columns["int"]+")"])]
    if len(gene) == 0:
        return gene, None
    plot = px.scatter(gene, x=columns["pos"], y="log10("+columns["int"]+")", size="size",
                      template="simple_white", title=proteinid,
                      hover_data=[columns["pos"], "log10("+columns["int"]+")", columns["prob"]],
                      range_x=[-20,max(gene[columns["pos"]])+20 if range_max is None else range_max])
    for i,el in gene.iterrows():
        plot.add_shape(x0=el[columns["pos"]], x1=el[columns["pos"]],
                    y0=0, y1=el["log10("+columns["int"]+")"], line_width=1, opacity=0.5)
    return gene, plot

In [11]:
fb_phospho = pd.read_csv(".\MaxQuant_Output\Forebrain_Phospho_txt\Phospho (STY)Sites.txt", sep='\t')
_, fig3C = lollipop(fb_phospho, "Q9WTQ5")
fig3C.update_layout(font_size=10, width=900, height=400)


Columns (201,640,641) have mixed types.Specify dtype option on import or set low_memory=False.

