In [None]:
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import csv

def count_pd_peptides(aa, columns):
    """Tally "High" and "Peak Found" entries in two replicate columns.

    Args:
        aa: DataFrame containing the columns named in ``columns``.
        columns: List of column labels; only the first two selected columns
            are inspected (replicate 1, then replicate 2).

    Returns:
        A four-element list
        ``[high_rep1, peak_found_rep1, high_rep2, peak_found_rep2]``;
        a label that never occurs contributes 0.
    """
    # Rows that are missing in every selected column carry no information.
    selected = aa[columns].dropna(how="all", inplace=False)

    tallies = []
    for position in (0, 1):
        occurrences = selected.iloc[:, position].value_counts(dropna=True)
        for label in ("High", "Peak Found"):
            tallies.append(occurrences.get(label, default=0))
    return tallies
    

def process_chimerys_result_optimization(pd_path):
    """Count CHIMERYS peptide identifications for every acquisition setting.

    The tab-separated table at ``pd_path`` is read as text, every cell is
    stripped of surrounding whitespace, and the table is then re-parsed so
    that cells equal to ``""`` or ``"n/a"`` after stripping are treated as
    missing. For each setting and isolation window the two replicate columns
    ``"Found in Sample: F<n>: Sample"`` are passed to ``count_pd_peptides``.

    Args:
        pd_path: Path to the tab-separated peptide table.

    Returns:
        A 4-tuple ``(counts_54_30, counts_86_45, counts_118_60,
        counts_246_120)``. Each element is a list with one entry per
        isolation window (2, 4, 8, 12, 18, 24, 48 mz, in that order), and
        each entry is the four-element list returned by
        ``count_pd_peptides``.

    Raises:
        KeyError: If an expected ``"Found in Sample: F<n>: Sample"`` column
            is absent from the table.
    """
    # Local import: only the in-memory round trip below needs it.
    import io

    windows = ("2mz", "4mz", "8mz", "12mz", "18mz", "24mz", "48mz")

    # File index ("F<n>") of the first replicate of the first window for each
    # setting. Every window uses two consecutive files, so a setting spans 14
    # indices: F5-F18, F19-F32, F35-F48 and F49-F62. F33/F34 are not used.
    first_file_index = {"54_30": 5, "86_45": 19, "118_60": 35, "246_120": 49}

    def sample_columns(setting, window_position):
        start = first_file_index[setting] + 2 * window_position
        return ["Found in Sample: F{}: Sample".format(n) for n in (start, start + 1)]

    pd_table = pd.read_csv(pd_path, sep="\t", index_col=False, na_values=None, dtype=str)
    pd_table = pd_table.map(lambda x: x.strip() if isinstance(x, str) else x)

    # Re-parse the stripped table so missing-value detection runs on the
    # cleaned cells. The round trip goes through an in-memory buffer rather
    # than a file on disk, so no stray "temp" file is left in the working
    # directory.
    buffer = io.StringIO()
    pd_table.to_csv(buffer, sep="\t", index=False, quoting=csv.QUOTE_ALL)
    buffer.seek(0)
    pd_table = pd.read_csv(buffer, sep="\t", index_col=False, na_values=["", "n/a"], header=0)

    counts = {
        setting: [
            count_pd_peptides(pd_table, sample_columns(setting, position))
            for position, _window in enumerate(windows)
        ]
        for setting in first_file_index
    }

    return counts["54_30"], counts["86_45"], counts["118_60"], counts["246_120"]


# CHIMERYS / Proteome Discoverer peptide table; the per-setting results are
# kept both as one list (ordered 54_30, 86_45, 118_60, 246_120) and under
# individual names.
pd_path = r"Z:\yufe\results\msfragger_ddaplus_paper\PXD037527\1.6-48mz_30m\proteomediscoverer/DDA_1.6-48mz_30m_PeptideGroups.txt"
pd_counts = list(process_chimerys_result_optimization(pd_path))
pd_54_30_counts, pd_86_45_counts, pd_118_60_counts, pd_246_120_counts = pd_counts


In [None]:
def count_fp_peptides(aa, columns):
    """Tally "MS/MS" and "MBR" match types for the two replicates of a run.

    Args:
        aa: DataFrame containing the columns ``"<columns>_1 Match Type"``
            and ``"<columns>_2 Match Type"``.
        columns: Run-name prefix shared by the two replicate columns
            (a single string, despite the plural parameter name).

    Returns:
        A four-element list ``[msms_rep1, mbr_rep1, msms_rep2, mbr_rep2]``;
        a match type that never occurs contributes 0.

    Raises:
        KeyError: If either replicate column is absent from ``aa``.
    """
    tallies = []
    for replicate in ("_1", "_2"):
        # Select the column as a Series (single brackets) so value_counts
        # returns a plainly indexed Series, as in count_pd_peptides. Counting
        # a one-column DataFrame yields a MultiIndex, which makes the scalar
        # lookups below depend on pandas' single-level special case.
        occurrences = aa[columns + replicate + " Match Type"].value_counts(dropna=True)
        for match_type in ("MS/MS", "MBR"):
            tallies.append(occurrences.get(match_type, 0))
    return tallies


def process_fp_result(fp_path):
    """Count FragPipe peptide identifications for every acquisition setting.

    Args:
        fp_path: Path to the tab-separated FragPipe peptide table. Empty
            cells and cells equal to ``"0"`` are read as missing values.

    Returns:
        A 4-tuple ``(counts_54_30, counts_86_45, counts_118_60,
        counts_246_120)``. Each element is a list with one entry per
        isolation window (2, 4, 8, 12, 18, 24, 48 mz, in that order), and
        each entry is the four-element list returned by
        ``count_fp_peptides``.
    """
    fp_table = pd.read_csv(fp_path, sep="\t", index_col=False, na_values=["", "0"], header=0)

    windows = ("2mz", "4mz", "8mz", "12mz", "18mz", "24mz", "48mz")

    # Settings are processed in the order 118_60, 246_120, 54_30, 86_45 and
    # re-ordered only when the result tuple is assembled.
    per_setting = {}
    for setting in ("118_60", "246_120", "54_30", "86_45"):
        per_setting[setting] = [
            count_fp_peptides(fp_table, setting + "_" + window)
            for window in windows
        ]

    return (
        per_setting["54_30"],
        per_setting["86_45"],
        per_setting["118_60"],
        per_setting["246_120"],
    )


# FragPipe combined modified-peptide table; the per-setting results are kept
# both as one list (ordered 54_30, 86_45, 118_60, 246_120) and under
# individual names.
fp_path = r"Z:\yufe\results\msfragger_ddaplus_paper\PXD037527\1.6-48mz_30m\fragpipe\combined_modified_peptide.tsv"
fp_counts = list(process_fp_result(fp_path))
fp_54_30_counts, fp_86_45_counts, fp_118_60_counts, fp_246_120_counts = fp_counts


In [None]:
def process_metamorpheus_result(mm_path, mm_design):
    """Count MetaMorpheus peptide detections for every acquisition setting.

    Args:
        mm_path: Path to the tab-separated quantified-peptide table. It must
            contain one ``"Detection Type_<file>"`` column per raw file.
        mm_design: Path to the tab-separated experimental design with the
            columns ``"FileName"`` and ``"Condition"``.

    Returns:
        A 4-tuple ``(counts_54_30, counts_86_45, counts_118_60,
        counts_246_120)``. Each element is a list with one entry per
        isolation window (2, 4, 8, 12, 18, 24, 48 mz, in that order), and
        each entry is ``[msms_rep1, mbr_rep1, msms_rep2, mbr_rep2]``.

    Notes:
        The conditions ``"118_60_1_6mz"`` and ``"lib"`` are skipped. Only the
        first two files of a condition are counted, so every remaining
        condition needs at least two files in the design.
    """
    reader_options = dict(sep="\t", index_col=False, na_values=["", "n/a"], header=0)
    quant_table = pd.read_csv(mm_path, **reader_options)
    design_table = pd.read_csv(mm_design, **reader_options)

    files_by_condition = design_table.groupby('Condition')
    excluded = ("118_60_1_6mz", "lib")

    settings = []        # acquisition settings in first-seen order
    per_condition = {}   # condition name -> four-element count list
    for cond in design_table['Condition'].unique().tolist():
        if cond in excluded:
            continue

        # The setting is the first two "_"-separated tokens, e.g. "54_30".
        setting = '_'.join(cond.split('_')[0:2])
        if setting not in settings:
            settings.append(setting)

        file_names = files_by_condition.get_group(cond)['FileName'].tolist()
        detection_cols = ['Detection Type_' + name.replace('.mzML', '') for name in file_names]

        # "NotDetected" is treated as missing; peptides missing in every
        # replicate of the condition are dropped.
        detections = quant_table[detection_cols].replace('NotDetected', np.nan, inplace=False)
        detections = detections.dropna(how="all", inplace=False)

        first = detections.iloc[:, 0].value_counts(dropna=True)
        second = detections.iloc[:, 1].value_counts(dropna=True)
        per_condition[cond] = [
            occurrences.get(kind, default=0)
            for occurrences in (first, second)
            for kind in ("MSMS", "MBR")
        ]

    windows = ['2mz', '4mz', '8mz', '12mz', '18mz', '24mz', '48mz']
    per_setting = {
        setting: [per_condition[setting + "_" + window] for window in windows]
        for setting in settings
    }

    return per_setting["54_30"], per_setting["86_45"], per_setting["118_60"], per_setting["246_120"]


# Both MBR and MSMS identifications have to be counted, so AllPeptides.psmtsv
# or the individual peptide files cannot be used here.
mm_path = r"Z:\yufe\results\msfragger_ddaplus_paper\PXD037527\1.6-48mz_30m\metamorpheus\Task2-SearchTask\AllQuantifiedPeptides.tsv"
mm_design = r"Z:\yufe\results\msfragger_ddaplus_paper\PXD037527\1.6-48mz_30m\metamorpheus\Task1-CalibrateTask\ExperimentalDesign.tsv"
# Results are kept both as one list (ordered 54_30, 86_45, 118_60, 246_120)
# and under individual names.
mm_counts = list(process_metamorpheus_result(mm_path, mm_design))
mm_54_30_counts, mm_86_45_counts, mm_118_60_counts, mm_246_120_counts = mm_counts


In [None]:
def _replicate_mean_and_sem(rows, first, second):
    """Return per-row means and standard errors of ``row[first]`` and ``row[second]``."""
    means = [np.mean([row[first], row[second]]) for row in rows]
    # Sample standard deviation (ddof=1) divided by sqrt(n), with n = 2 replicates.
    errors = [np.std([row[first], row[second]], ddof=1) / np.sqrt(2) for row in rows]
    return means, errors


def _save_grouped_barplot(series, n_bars, plot_title, out_path):
    """Draw one bar per tool for each isolation window and save the figure to ``out_path``."""
    bar_width = 0.5
    # Groups are spaced two x-units apart, leaving a gap after the three bars.
    group_start = 2 * np.arange(n_bars)

    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    for slot, (label, color, means, errors) in enumerate(series):
        # Bar centres sit 0.5, 1.5 and 2.5 bar widths after the group start.
        ax.bar(group_start + (2 * slot + 1) * bar_width / 2, means, bar_width,
               yerr=errors, alpha=1, color=color, label=label)

    ax.set_ylabel("Number of peptides")
    # Ticks are centred under the middle bar of each group. The seven
    # hard-coded labels assume seven rows per tool.
    ax.set_xticks(group_start + 3 * bar_width / 2)
    ax.set_xticklabels(["2 Th", "4 Th", "8 Th", "12 Th", "18 Th", "24 Th", "48 Th"])
    ax.grid(False)

    # Figure-level legend anchored just below the axes.
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc="lower center", bbox_to_anchor=(0.5, -0.09), ncol=3, fontsize="medium")

    ax.set_title(plot_title)
    fig.savefig(out_path, bbox_inches='tight', pad_inches=0.1)


def barplot_pept_ident(count_dict, plot_title, prefix):
    """Save two grouped bar plots comparing the three search tools.

    Args:
        count_dict: Mapping with the keys ``'CHIMERYS'``, ``'FragPipe'`` and
            ``'MetaMorpheus'``. Each value is a sequence of rows, one per
            isolation window, and each row has at least four counts. The
            number of groups drawn is taken from the ``'CHIMERYS'`` entry.
        plot_title: Title placed on both figures.
        prefix: Text inserted into the two output file names.

    Returns:
        None. Two PDF files are written to the current working directory:
        ``figure4_ab_<prefix>.pdf`` shows the mean of row positions 0 and 2,
        and ``figure_s2_ab_<prefix>.pdf`` shows the mean of row positions
        1 and 3. Error bars are the standard error over the two positions.
    """
    tools = [("CHIMERYS", "tab:orange"), ("FragPipe", "tab:green"), ("MetaMorpheus", "tab:purple")]
    tool_rows = [(label, color, count_dict[label]) for label, color in tools]
    n_bars = len(count_dict['CHIMERYS'])

    # Compute the statistics for both figures before drawing anything, so
    # malformed input fails before a file is written.
    panels = []
    for (first, second), name_template in (((0, 2), "figure4_ab_{}.pdf"),
                                           ((1, 3), "figure_s2_ab_{}.pdf")):
        series = []
        for label, color, rows in tool_rows:
            means, errors = _replicate_mean_and_sem(rows, first, second)
            series.append((label, color, means, errors))
        panels.append((series, name_template.format(prefix)))

    for series, out_path in panels:
        _save_grouped_barplot(series, n_bars, plot_title, out_path)


# Figure titles and output-name prefixes, listed in the same order as the
# per-setting entries of pd_counts, fp_counts and mm_counts.
data_names = [
    "54 ms maximum injection time, 30k MS2 resolution",
    "86 ms maximum injection time, 45k MS2 resolution",
    "118 ms maximum injection time, 60k MS2 resolution",
    "246 ms maximum injection time, 120k MS2 resolution",
]
prefixes = ["54ms_30k", "86ms_45k", "118ms_60k", "246ms_120k"]

for i, (prefix, data_name) in enumerate(zip(prefixes, data_names)):
    count_dict = {
        'CHIMERYS': pd_counts[i],
        "FragPipe": fp_counts[i],
        "MetaMorpheus": mm_counts[i],
    }
    barplot_pept_ident(
        count_dict=count_dict,
        plot_title=data_name,
        prefix="peptide_comparison_{}".format(prefix),
    )
    