In [None]:
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.

In [None]:
# Figure 2a

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# ---- Input locations for dataset MSV000090552 ----
mq_path = r"Z:/yufe/results/msfragger_ddaplus_paper/MSV000090552/maxquant/combined/txt/evidence.txt"

# AllPeptides.psmtsv holds the nonredundant peptides of all files combined, so the
# per-file peptide tables are read instead to obtain one peptide count per run.
# mm_path = r"Z:/yufe/results/msfragger_ddaplus_paper/MSV000090552/metamorpheus/Task2-SearchTask/Individual File Results/"
mm_path = r"Z:/yufe/results/msfragger_ddaplus_paper/MSV000090552/metamorpheus_106/Task2-SearchTask/Individual File Results/"

fp_path = r"Z:/yufe/results/msfragger_ddaplus_paper/MSV000090552/fragpipe_dda/"
fp_ddaplus_path = r"Z:/yufe/results/msfragger_ddaplus_paper/MSV000090552/fragpipe_ddaplus/"

# One entry per acquisition; also used as sub-folder / file-name stem below.
runs = [
    "2022_09_01_HeLa_500ng_16mzst_DDA_NCE_22",
    "2022_09_01_HeLa_500ng_16mzst_DDA_NCE_27",
    "2022_09_01_HeLa_500ng_16mzst_DDA_NCE_32",
    "2022_09_01_HeLa_500ng_16mzst_DDA_NCE_37",
    "2022_09_01_HeLa_500ng_16mzst_DDA_NCE_42",
]

# Scribe (Prosit) counts are not recomputed here; the numbers are taken from the publication.
scribe_prosit_counts = [32579, 34405, 34459, 31405, 24736]

# ---- MaxQuant ----
# Keep evidence rows whose "Reverse" and "Potential contaminant" cells are empty,
# then count the distinct sequences of each raw file.
evidence = pd.read_csv(
    mq_path,
    sep="\t",
    header=0,
    index_col=False,
    low_memory=False,
    na_values=["", "NaN"],
    usecols=["Sequence", "Raw file", "Reverse", "Potential contaminant"],
)
is_target = evidence["Reverse"].isna() & evidence["Potential contaminant"].isna()
mq_counts = evidence[is_target].groupby("Raw file")["Sequence"].nunique()

# ---- MetaMorpheus ----
# Per run: drop contaminants and decoys, keep PEP_QValue < 0.01, count distinct base sequences.
mm_counts = []
for run in runs:
    mm_peptides = pd.read_csv(
        mm_path + run + "-calib_Peptides.psmtsv",
        sep="\t",
        header=0,
        index_col=False,
        na_values=["", "n/a"],
        usecols=["File Name", "Base Sequence", "Contaminant", "Decoy", "PEP_QValue"],
    )
    passes_filters = (
        (mm_peptides["Contaminant"] == "N")
        & (mm_peptides["Decoy"] == "N")
        & (mm_peptides["PEP_QValue"] < 0.01)
    )
    mm_counts.append(mm_peptides.loc[passes_filters, "Base Sequence"].nunique())

# ---- MSFragger (FragPipe) ----
# Each run folder contains a peptide.tsv; count its distinct "Peptide" values.
fp_counts = [
    pd.read_csv(fp_path + run + r"/peptide.tsv", sep="\t", header=0, index_col=False,
                na_values=["", "0"], usecols=["Peptide"])["Peptide"].nunique()
    for run in runs
]

# ---- MSFragger-DDA+ (FragPipe) ----
fp_ddaplus_counts = [
    pd.read_csv(fp_ddaplus_path + run + r"/peptide.tsv", sep="\t", header=0, index_col=False,
                na_values=["", "0"], usecols=["Peptide"])["Peptide"].nunique()
    for run in runs
]

# Columns are combined by position. The MaxQuant values come in the order produced by
# the groupby above -- assumes that order matches `runs`; TODO confirm against the raw file names.
counts_df = pd.DataFrame({
    "MaxQuant": mq_counts.values,
    "MetaMorpheus": mm_counts,
    "FragPipe": fp_counts,
    "FragPipe_DDA+": fp_ddaplus_counts,
    "Scribe_Prosit": scribe_prosit_counts,
})

# ---- Grouped bar chart ----
legend_names = [
    "MaxQuant",
    "MetaMorpheus",
    "MSFragger, FragPipe",
    "MSFragger-DDA+, FragPipe",
    "Scribe (Prosit)",
]
x_tick_labels = ["NCE 22", "NCE 27", "NCE 32", "NCE 37", "NCE 42"]
ax = counts_df.plot.bar(figsize=(8, 4), width=0.8)
ax.set_ylabel("Number of peptide sequences")
ax.set_xticklabels(x_tick_labels, rotation=0)
# Legend is placed below the axes, in three columns.
ax.legend(legend_names, ncol=3, fancybox=True, loc="upper center", bbox_to_anchor=(0.5, -0.1))
ax.grid(False)

plt.savefig(r"figure2a.pdf", bbox_inches="tight", pad_inches=0.1)

# Relative gain (in percent) of MSFragger-DDA+ over MSFragger, per run and on average.
ddaplus_gain_pct = (counts_df["FragPipe_DDA+"] - counts_df["FragPipe"]) * 100 / counts_df["FragPipe"]
print(ddaplus_gain_pct)
print(np.mean(ddaplus_gain_pct))


In [None]:
# Figure 2b

import pandas as pd
import matplotlib.pyplot as plt

# ---- Input locations for dataset PXD027242 ----
mq_path = r"Z:/yufe/results/msfragger_ddaplus_paper/PXD027242/maxquant/combined/txt/evidence.txt"

# AllPeptides.psmtsv holds the nonredundant peptides of all files combined, so the
# per-file peptide tables are read instead to obtain one peptide count per file.
mm_path = r"Z:/yufe/results/msfragger_ddaplus_paper/PXD027242/metamorpheus/Task2-SearchTask/Individual File Results/"

fp_path = r"Z:/yufe/results/msfragger_ddaplus_paper/PXD027242/fragpipe_dda/"
fp_ddaplus_path = r"Z:/yufe/results/msfragger_ddaplus_paper/PXD027242/fragpipe_ddaplus/"

# FragPipe sub-folder names, one per enzyme / fragmentation combination.
runs = ["trypsin_cid", "trypsin_hcd", "aspn_cid", "aspn_hcd", "gluc_cid", "gluc_hcd"]

# Scribe (Prosit) counts are not recomputed here; the numbers are taken from the publication.
scribe_prosit_counts = [31714, 27413, 12504, 10821, 13206, 11916]

# ---- MaxQuant ----
# Keep evidence rows whose "Reverse" and "Potential contaminant" cells are empty,
# then count the distinct sequences of each raw file.
evidence = pd.read_csv(
    mq_path,
    sep="\t",
    header=0,
    index_col=False,
    low_memory=False,
    na_values=["", "NaN"],
    usecols=["Sequence", "Raw file", "Reverse", "Potential contaminant"],
)
is_target = evidence["Reverse"].isna() & evidence["Potential contaminant"].isna()
mq_counts = evidence[is_target].groupby("Raw file")["Sequence"].nunique()

# ---- MetaMorpheus ----
# Per file: drop contaminants and decoys, keep PEP_QValue < 0.01, count distinct base sequences.
# NOTE(review): files 01..06 are presumed to correspond to `runs` in the same order -- confirm.
mm_files = [
    "Y20210522-01-calib_Peptides.psmtsv",
    "Y20210522-02-calib_Peptides.psmtsv",
    "Y20210522-03-calib_Peptides.psmtsv",
    "Y20210522-04-calib_Peptides.psmtsv",
    "Y20210522-05-calib_Peptides.psmtsv",
    "Y20210522-06-calib_Peptides.psmtsv",
]
mm_counts = []
for mm_file in mm_files:
    mm_peptides = pd.read_csv(
        mm_path + mm_file,
        sep="\t",
        header=0,
        index_col=False,
        na_values=["", "n/a"],
        usecols=["File Name", "Base Sequence", "Contaminant", "Decoy", "PEP_QValue"],
    )
    passes_filters = (
        (mm_peptides["Contaminant"] == "N")
        & (mm_peptides["Decoy"] == "N")
        & (mm_peptides["PEP_QValue"] < 0.01)
    )
    mm_counts.append(mm_peptides.loc[passes_filters, "Base Sequence"].nunique())

# ---- MSFragger (FragPipe) ----
# Each run folder contains a peptide.tsv; count its distinct "Peptide" values.
fp_counts = [
    pd.read_csv(fp_path + run + r"/peptide.tsv", sep="\t", header=0, index_col=False,
                na_values=["", "0"], usecols=["Peptide"])["Peptide"].nunique()
    for run in runs
]

# ---- MSFragger-DDA+ (FragPipe) ----
fp_ddaplus_counts = [
    pd.read_csv(fp_ddaplus_path + run + r"/peptide.tsv", sep="\t", header=0, index_col=False,
                na_values=["", "0"], usecols=["Peptide"])["Peptide"].nunique()
    for run in runs
]

# `mq_counts` is passed as a Series, so the frame is indexed by raw file name and the
# plain lists are attached to those rows by position.
# NOTE(review): assumes the raw-file order of `mq_counts` matches `runs` -- confirm.
counts_df = pd.DataFrame({
    "MaxQuant": mq_counts,
    "MetaMorpheus": mm_counts,
    "FragPipe": fp_counts,
    "FragPipe_DDA+": fp_ddaplus_counts,
    "Scribe_Prosit": scribe_prosit_counts,
})

# ---- Grouped bar chart ----
legend_names = [
    "MaxQuant",
    "MetaMorpheus",
    "MSFragger, FragPipe",
    "MSFragger-DDA+, FragPipe",
    "Scribe (Prosit)",
]
x_tick_labels = ["Trypsin\nCID", "Trypsin\nHCD", "AspN\nCID", "AspN\nHCD", "GluC\nCID", "GluC\nHCD"]
ax = counts_df.plot(kind="bar", figsize=(8, 4), width=0.8)
ax.set_ylabel("Number of peptide sequences")
ax.set_xticklabels(x_tick_labels, rotation=0)
# Clear the x-axis label explicitly; the tick labels above already describe each group.
ax.set_xlabel('')
# Legend is placed below the axes, in three columns.
ax.legend(legend_names, loc="upper center", bbox_to_anchor=(0.5, -0.15), fancybox=True, ncol=3)
ax.grid(False)

plt.savefig(r"figure2b.pdf", bbox_inches="tight", pad_inches=0.1)

# Relative gain (in percent) of MSFragger-DDA+ over MSFragger, per run and on average.
# Series.mean() is used so this cell does not depend on numpy having been imported by
# a different cell (only pandas and matplotlib are imported at the top of this one).
ddaplus_gain_pct = (counts_df["FragPipe_DDA+"] - counts_df["FragPipe"]) * 100 / counts_df["FragPipe"]
print(ddaplus_gain_pct)
print(ddaplus_gain_pct.mean())


In [None]:
# Figure 2c

import numpy as np
import datetime
import matplotlib.pyplot as plt

# Runtimes for dataset MSV000090552, all expressed in minutes.

# MaxQuant search with raw files: runtime is the difference between the two timestamps.
d1_mq_st = datetime.datetime(2024, 4, 30, 23, 5, 52)  # start: 30/04/2024 23:05:52
d1_mq_et = datetime.datetime(2024, 5, 1, 0, 45, 53)   # end:   01/05/2024 00:45:53
d1_mq_elapsed = d1_mq_et - d1_mq_st
d1_mq_t = d1_mq_elapsed.total_seconds() / 60
print(f"MSV000090552 MaxQuant - {d1_mq_t}")

# MetaMorpheus search with mzML files.
# Total time: 01:38:33.5484934 -> 98 whole minutes plus 33 seconds (fraction of a second dropped).
d1_mm_t = 98 + 33 / 60
print(f"MSV000090552 MetaMorpheus - {d1_mm_t}")

# MSFragger search with mzML files: total time, search time, and the remainder.
d1_dda_t = 12.2
d1_dda_search_t = 4.885
d1_dda_rest_t = d1_dda_t - d1_dda_search_t
print(f"MSV000090552 MSFragger - {d1_dda_t}")


# MSFragger-DDA+ search with mzML files: total time, search time, and the remainder.
d1_ddap_t = 24.3
d1_ddap_search_t = 8.552
d1_ddap_rest_t = d1_ddap_t - d1_ddap_search_t
print(f"MSV000090552 MSFragger-DDA+ - {d1_ddap_t}")

# One [first segment, second segment] pair per tool, in plotting order.
# MaxQuant and MetaMorpheus only have a total, so their second segment is 0.
dt = [
    [d1_mq_t, 0],
    [d1_mm_t, 0],
    [d1_dda_search_t, d1_dda_rest_t],
    [d1_ddap_search_t, d1_ddap_rest_t],
]

# Hatch-style pattern strings (not referenced by the visible plotting code).
patterns = ['/', '\\', '|']
def addlabels(x,y,y_pos):
    """Write the entries of ``y`` as centered text labels on the current plot.

    Label ``k`` is drawn with ``plt.text`` at x-coordinate ``k`` and
    y-coordinate ``y_pos[k]``. ``x`` is only used for its length, which
    determines how many labels are written.
    """
    centered = {'ha': 'center', 'va': 'center'}
    n_labels = len(x)
    for k in range(n_labels):
        plt.text(k, y_pos[k], y[k], **centered)

# Stacked runtime bars built from `dt` (one [first segment, second segment] pair per tool).
fig, ax = plt.subplots()
colors = ["limegreen","gold","#36a2eb"]
tool_labels = ['MaxQuant','MetaMorpheus','MSFragger\nFragPipe','MSFragger-DDA+\nFragPipe']
lower_segments = [ x[0] for x in dt ]
upper_segments = [ x[1] for x in dt ]

# Lower segments: colors[2] for the first two tools, colors[0] for the two FragPipe runs.
ax.bar(tool_labels,
        lower_segments,
        width = 0.5,
        color=[colors[2],colors[2],colors[0], colors[0]])

# Upper segments are stacked on top of the lower ones; colors[1] for the FragPipe runs.
ax.bar(tool_labels,
        upper_segments,
        bottom=lower_segments,
        width = 0.5,
        color=[colors[2],colors[2],colors[1], colors[1]])

ax.grid(False)
ax.set_ylabel("Runtime (minutes)")
ax.set_xlabel("")
# Leave 10 minutes of headroom above the tallest *stacked* bar. Using the largest
# single segment instead would clip a bar whose two segments together exceed it.
ax.set_ylim(0, max(sum(x) for x in dt) + 10)

# Legend entries are keyed by their label; kept in a separate dict so the `colors`
# list above is not rebound to a different type.
legend_colors = {'database search':colors[0], 'rescoring, protein grouping,\nFDR filtering, and LFQ':colors[1], 'entire process':colors[2]}
handles = [plt.Rectangle((0,0),1,1, color=legend_colors[label]) for label in legend_colors]
ax.legend(handles, list(legend_colors))

fig.set_size_inches(6,4)
fig.savefig(r"figure2c.pdf", bbox_inches="tight", pad_inches=0.1)


In [None]:
# Figure S1

import os
from subprocess import Popen, PIPE
import pandas as pd
import matplotlib.pyplot as plt


def plot_table(stat_df, font_size, column_names=None, out_path=r"figureS1.pdf"):
    """Draw ``stat_df`` as a table-style figure, save it to ``out_path`` and show it.

    Parameters
    ----------
    stat_df : pandas.DataFrame
        Table to draw. Every cell is rendered as text. Row ``i`` is drawn at
        height ``i + 0.45``, so the first row ends up at the bottom. The layout
        has five fixed column positions, so frames with more than five columns
        are not supported (indexing ``positions`` fails).
    font_size : float
        Font size of the body text and the bold headers; the last two header
        labels use ``font_size - 0.5``.
    column_names : sequence of str, optional
        Header labels, one per column position. Defaults to the sequence-level
        entrapment headers. The first three labels are drawn bold; the
        remaining ones are drawn in normal weight under the "FDP estimation"
        group header.
    out_path : str, optional
        File the figure is saved to. Defaults to ``"figureS1.pdf"``.
    """
    if column_names is None:
        column_names = ['Tool', 'Target\nsequences', 'Entrapment\nsequences', 'upper bound', 'lower bound']

    plt.figure(figsize=(8,3), dpi=300)
    ax = plt.subplot()
    nrows, ncols = stat_df.shape
    ax.set_xlim(0, ncols + 0.5)
    ax.set_ylim(0, nrows + 0.5)

    # x-coordinate of the center of each table column
    positions = [0.7, 2, 3, 4, 5]

    # Table body: one centered text annotation per cell
    for i in range(nrows):
        for j, column in enumerate(stat_df.columns):
            ax.annotate(
                xy=(positions[j], i + .45),
                text=f'{stat_df[column].iloc[i]}',
                ha='center',
                va='center',
                weight='normal',
                size=font_size)

    # Group header spanning the last two columns
    ax.annotate(
        xy=(4.5, nrows + .45),
        text="FDP estimation",
        ha='center',
        va='bottom',
        weight='bold',
        size=font_size)

    # Column headers: the first three are bold and bottom-aligned, the rest sit
    # below the group header in a slightly smaller, normal-weight font.
    for index, label in enumerate(column_names):
        if index < 3:
            ypos = nrows + 0.3 if index == 0 else nrows + .2
            weight = 'bold'
            va = 'bottom'
            size = font_size
        else:
            ypos = nrows + .35
            weight = 'normal'
            va = 'top'
            size = font_size - 0.5
        ax.annotate(
            xy=(positions[index], ypos),
            text=label,
            ha='center',
            va=va,
            weight=weight,
            size=size)

    # Dividing lines: solid black above and below the body, dotted grey between rows
    x_span = [ax.get_xlim()[0], ax.get_xlim()[1]]
    ax.plot(x_span, [nrows, nrows], lw=1, color='black', marker='', zorder=4)
    ax.plot(x_span, [0, 0], lw=1, color='black', marker='', zorder=4)
    for y in range(1, nrows):
        ax.plot(x_span, [y, y], lw=1, color='grey', ls=':', zorder=3 , marker='')

    # Hide the axes and shade the first column
    ax.set_axis_off()
    ax.fill_between(
        x=[0,positions[1]-0.6],
        y1=nrows,
        y2=0,
        color='lightskyblue',
        alpha=0.25,
        ec='None')
    plt.gcf().set_size_inches(6, 2.5)
    plt.savefig(out_path, bbox_inches='tight', pad_inches=0.1)
    plt.show()


def run():
    """Run ``../peptide_entrapment.py`` with ``python`` and return its standard output.

    The script path is resolved relative to the current working directory.
    The decoded stdout and the process return status are printed; when the
    process exits with a non-zero status, its stderr is printed as well.

    Returns
    -------
    str
        The script's stdout decoded as UTF-8.
    """
    script_path = os.path.join(os.getcwd(), "../peptide_entrapment.py")
    # Pass the command as an argument list: splitting a command string on spaces
    # would break the script path apart when the working directory contains a space.
    proc = Popen(["python", script_path], stdout=PIPE, stderr=PIPE)
    (output, error) = proc.communicate()
    output = output.decode('utf-8')
    print(output)
    print('return status: '+str(proc.returncode))
    if proc.returncode != 0 and error:
        # Surface the failure reason instead of silently discarding stderr.
        print(error.decode('utf-8', errors='replace'))
    return(output)


# Run the entrapment script and split its report into lines (Windows line endings normalized first).
qval_stat = run()
lines = qval_stat.replace('\r\n','\n').split('\n')

# Non-empty lines without a colon are taken as tool names. Every other column is
# collected from the lines starting with its prefix, with the prefix text removed.
value_prefixes = {
    "Target": "Target:",
    "Entrapment": "Entrapment:",
    "Upper bound": "combined method:",
    "Lower bound": "lower bound:",
}
table_columns = {"Tool": [ entry for entry in lines if len(entry) > 0 and ":" not in entry ]}
for column_name, prefix in value_prefixes.items():
    table_columns[column_name] = [ entry.replace(prefix, "").strip() for entry in lines if entry.startswith(prefix) ]
ent_stat_df = pd.DataFrame(table_columns)

# Relabel the two MSFragger rows for display; any other tool name is kept as it is.
display_names = {
    'MSFragger-DDA': 'MSFragger\nFragPipe',
    'MSFragger-DDA+': 'MSFragger-DDA+\nFragPipe',
}
ent_stat_df['Tool'] = ent_stat_df['Tool'].apply(lambda name: display_names.get(name, name))

plot_table(stat_df = ent_stat_df, font_size = 8)


In [None]:
# Figure 2d

import os
from subprocess import Popen, PIPE
import pandas as pd
import matplotlib.pyplot as plt


def plot_table(stat_df, font_size, column_names=None, out_path=r"figure2d.pdf"):
    """Draw ``stat_df`` as a table-style figure, save it to ``out_path`` and show it.

    Parameters
    ----------
    stat_df : pandas.DataFrame
        Table to draw. Every cell is rendered as text. Row ``i`` is drawn at
        height ``i + 0.45``, so the first row ends up at the bottom. The layout
        has five fixed column positions, so frames with more than five columns
        are not supported (indexing ``positions`` fails).
    font_size : float
        Font size of the body text and the bold headers; the last two header
        labels use ``font_size - 0.5``.
    column_names : sequence of str, optional
        Header labels, one per column position. Defaults to the protein-level
        entrapment headers. The first three labels are drawn bold; the
        remaining ones are drawn in normal weight under the "FDP estimation"
        group header.
    out_path : str, optional
        File the figure is saved to. Defaults to ``"figure2d.pdf"``.
    """
    if column_names is None:
        column_names = ['Tool', 'Target\nproteins', 'Entrapment\nproteins', 'upper bound', 'lower bound']

    plt.figure(figsize=(8,3), dpi=300)
    ax = plt.subplot()
    nrows, ncols = stat_df.shape
    ax.set_xlim(0, ncols + 0.5)
    ax.set_ylim(0, nrows + 0.5)

    # x-coordinate of the center of each table column
    positions = [0.7, 2, 3, 4, 5]

    # Table body: one centered text annotation per cell
    for i in range(nrows):
        for j, column in enumerate(stat_df.columns):
            ax.annotate(
                xy=(positions[j], i + .45),
                text=f'{stat_df[column].iloc[i]}',
                ha='center',
                va='center',
                weight='normal',
                size=font_size)

    # Group header spanning the last two columns
    ax.annotate(
        xy=(4.5, nrows + .45),
        text="FDP estimation",
        ha='center',
        va='bottom',
        weight='bold',
        size=font_size)

    # Column headers: the first three are bold and bottom-aligned, the rest sit
    # below the group header in a slightly smaller, normal-weight font.
    for index, label in enumerate(column_names):
        if index < 3:
            ypos = nrows + 0.3 if index == 0 else nrows + .2
            weight = 'bold'
            va = 'bottom'
            size = font_size
        else:
            ypos = nrows + .35
            weight = 'normal'
            va = 'top'
            size = font_size - 0.5
        ax.annotate(
            xy=(positions[index], ypos),
            text=label,
            ha='center',
            va=va,
            weight=weight,
            size=size)

    # Dividing lines: solid black above and below the body, dotted grey between rows
    x_span = [ax.get_xlim()[0], ax.get_xlim()[1]]
    ax.plot(x_span, [nrows, nrows], lw=1, color='black', marker='', zorder=4)
    ax.plot(x_span, [0, 0], lw=1, color='black', marker='', zorder=4)
    for y in range(1, nrows):
        ax.plot(x_span, [y, y], lw=1, color='grey', ls=':', zorder=3 , marker='')

    # Hide the axes and shade the first column
    ax.set_axis_off()
    ax.fill_between(
        x=[0,positions[1]-0.6],
        y1=nrows,
        y2=0,
        color='lightskyblue',
        alpha=0.25,
        ec='None')
    plt.gcf().set_size_inches(6, 2.5)
    plt.savefig(out_path, bbox_inches='tight', pad_inches=0.1)
    plt.show()


def run():
    """Run ``../protein_entrapment.py`` with ``python`` and return its standard output.

    The script path is resolved relative to the current working directory.
    The decoded stdout and the process return status are printed; when the
    process exits with a non-zero status, its stderr is printed as well.

    Returns
    -------
    str
        The script's stdout decoded as UTF-8.
    """
    script_path = os.path.join(os.getcwd(), "../protein_entrapment.py")
    # Pass the command as an argument list: splitting a command string on spaces
    # would break the script path apart when the working directory contains a space.
    proc = Popen(["python", script_path], stdout=PIPE, stderr=PIPE)
    (output, error) = proc.communicate()
    output = output.decode('utf-8')
    print(output)
    print('return status: '+str(proc.returncode))
    if proc.returncode != 0 and error:
        # Surface the failure reason instead of silently discarding stderr.
        print(error.decode('utf-8', errors='replace'))
    return(output)


# Run the entrapment script and split its report into lines (Windows line endings normalized first).
qval_stat = run()
lines = qval_stat.replace('\r\n','\n').split('\n')

# Non-empty lines without a colon are taken as tool names. Every other column is
# collected from the lines starting with its prefix, with the prefix text removed.
value_prefixes = {
    "Target": "Target:",
    "Entrapment": "Entrapment:",
    "Upper bound": "combined method:",
    "Lower bound": "lower bound:",
}
table_columns = {"Tool": [ entry for entry in lines if len(entry) > 0 and ":" not in entry ]}
for column_name, prefix in value_prefixes.items():
    table_columns[column_name] = [ entry.replace(prefix, "").strip() for entry in lines if entry.startswith(prefix) ]
ent_stat_df = pd.DataFrame(table_columns)

# Relabel the two MSFragger rows for display; any other tool name is kept as it is.
display_names = {
    'MSFragger-DDA': 'MSFragger\nFragPipe',
    'MSFragger-DDA+': 'MSFragger-DDA+\nFragPipe',
}
ent_stat_df['Tool'] = ent_stat_df['Tool'].apply(lambda name: display_names.get(name, name))

plot_table(stat_df = ent_stat_df, font_size = 8)
