In [None]:
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os


In [None]:
# Build the experimental design table from the FragPipe manifests: one row per
# raw file, with its condition, replicate and data type.
fragpipe_root = 'Z:/yufe/results/msfragger_ddaplus_paper/PXD045500/fragpipe/'
manifests = []
for data in ['250pg', '1ng', '10ng', '200ng', '400ng']:
    # Manifests are read as header-less, tab-separated tables.
    d = pd.read_csv(fragpipe_root + data + '/fragpipe-files.fp-manifest', sep="\t", index_col=False, header=None)
    manifests.append(d)

# Concatenate once instead of growing the frame inside the loop.
design = pd.concat(manifests, axis=0, ignore_index=True)
design = design.rename(index=str, columns= {0:"File",1:"condition",2:"replicate",3:"type"})

# Keep only the file stem. The dot is escaped and the pattern anchored so that
# only a trailing ".mzML" extension is removed (the previous pattern '.mzML'
# matched any character followed by "mzML", anywhere in the name).
design['File'] = design['File'].map(lambda x: re.sub(r'\.mzML$', '', os.path.basename(x)))

# Sample label = "<condition>_<replicate>".
design['sample'] = design['condition'] + '_' + design['replicate'].astype(str)

# Tag rows by sample-input class; conditions matching neither pattern keep ''.
design['experiment'] = ''
design.loc[design['condition'].str.contains("1ng_|250pg_"),'experiment'] = 'low_sample_input'
design.loc[design['condition'].str.contains("200ng_|400ng_"),'experiment'] = 'std_sample_input'

# low input samples
design_low_input = design[design['experiment'].str.contains("low_sample_input")]
# standard input samples
design_std_input = design[design['experiment'].str.contains("std_sample_input")]

In [None]:
import csv

# PD CHIMERYS results
#
# For every "*_PeptideGroups.txt" export, count the rows whose first
# "Found in Sample: " column reads "High", then translate the raw file name
# into the sample label stored in `design`.
pd_root = r"Z:/yufe/results/msfragger_ddaplus_paper/PXD045500/proteomediscoverer"
pd_records = []
for cond in ["250pg", "1ng", "10ng", "200ng","400ng"]:
    cond_dir = os.path.join(pd_root, cond)
    for f in os.listdir(cond_dir):
        if not f.endswith("_PeptideGroups.txt"):
            continue
        # Read everything as text so the confidence labels are not coerced.
        d = pd.read_csv(os.path.join(cond_dir, f), sep="\t", index_col=False, dtype=str)
        fcol = d.columns[d.columns.str.startswith("Found in Sample: ")][0]
        # Strip surrounding whitespace in memory (no scratch file is written)
        # and count the "High" rows; a file without any "High" row yields 0
        # instead of raising KeyError.
        high_cnt = int(d[fcol].str.strip().eq("High").sum())
        fname = f.replace("_PeptideGroups.txt","")
        pd_records.append({"data": cond, "File": fname, "count": high_cnt})

# Build the frame once from the collected records.
pd_ids = pd.DataFrame(pd_records, columns=["data", "File", "count"])

# One result file carries an extra numeric suffix compared with its manifest
# entry; map it back to the manifest name.
# NOTE(review): the suffix looks like an acquisition timestamp -- confirm.
file_aliases = {
    "20220719_E4_Neo0_FAIMS_uPAC50cm_neo_HeLa_200ng_iso3_rep3_20220723210932":
        "20220719_E4_Neo0_FAIMS_uPAC50cm_neo_HeLa_200ng_iso3_rep3",
}

# File stem -> sample label; the first manifest entry wins on duplicates.
sample_by_file = design.drop_duplicates(subset='File').set_index('File')['sample'].to_dict()

name_dict = dict()
sample_list = []
for file in pd_ids['File'].tolist():
    file = file_aliases.get(file, file)
    if file in sample_by_file:
        sample = sample_by_file[file]
    else:
        # Fall back to the file name so the row is kept and the gap is visible.
        print("missing sample " + file)
        sample = file

    sample_list.append(sample)
    name_dict[file] = sample

pd_ids['sample'] = sample_list
# Group label = first two underscore-separated tokens of the sample label.
pd_ids['group'] = [ '_'.join(x.split("_")[0:2]) for x in sample_list ]
pd_ids = pd_ids[['sample','count','group']].copy()

In [None]:
# MetaMorpheus results
#
# The AllPeptides.psmtsv contains nonredundant peptides with all files combined.
# Read the individual peptide files to get the peptide counts of each file.
res_dir = r"Z:/yufe/results/msfragger_ddaplus_paper/PXD045500/metamorpheus/Task2-SearchTask/Individual File Results"

mm_records = []
for f in os.listdir(res_dir):
    if not f.endswith("-calib_Peptides.psmtsv"):
        continue
    fname = f.replace("-calib_Peptides.psmtsv","")
    d = pd.read_csv(os.path.join(res_dir, f),sep="\t", index_col=False, low_memory=False)
    # Count rows flagged "T" in 'Decoy/Contaminant/Target' with PEP_QValue < 0.01.
    passing = (d['Decoy/Contaminant/Target']=="T") & (d['PEP_QValue']<0.01)
    high_cnt = int(passing.sum())
    mm_records.append({"File": fname, "count": high_cnt})

# Build the frame once from the collected records instead of growing it with
# pd.concat inside the loop.
mm_ids = pd.DataFrame(mm_records, columns=["File", "count"])

# One result file carries an extra numeric suffix compared with its manifest
# entry; map it back to the manifest name.
# NOTE(review): the suffix looks like an acquisition timestamp -- confirm.
file_aliases = {
    "20220719_E4_Neo0_FAIMS_uPAC50cm_neo_HeLa_200ng_iso3_rep3_20220723210932":
        "20220719_E4_Neo0_FAIMS_uPAC50cm_neo_HeLa_200ng_iso3_rep3",
}

# File stem -> sample label; the first manifest entry wins on duplicates.
sample_by_file = design.drop_duplicates(subset='File').set_index('File')['sample'].to_dict()

sample_list = []
for file in mm_ids['File'].tolist():
    file = file_aliases.get(file, file)
    if file in sample_by_file:
        sample = sample_by_file[file]
    else:
        # Fall back to the file name so the row is kept and the gap is visible.
        print("missing sample " + file)
        sample = file

    sample_list.append(sample)
    # `name_dict` is the mapping created in the PD CHIMERYS cell; it is extended here.
    name_dict[file] = sample

mm_ids['sample'] = sample_list
# Group label = first two underscore-separated tokens of the sample label.
mm_ids['group'] = [ '_'.join(x.split("_")[0:2]) for x in sample_list ]
mm_ids = mm_ids[['sample','count','group']].copy()


In [None]:
# FragPipe results
#
# Count, per sample column of combined_modified_peptide.tsv, how many rows are
# identified. `stat_type` selects which per-sample column family is used:
#   " Match Type"  -> rows whose value equals 'MS/MS'
#   anything else  -> rows whose value is > 0 (e.g. " Spectral Count")
stat_type = " Spectral Count"
fp_records = []
for data in ['250pg', '1ng', '10ng', '200ng', '400ng']:
    d = pd.read_csv('Z:/yufe/results/msfragger_ddaplus_paper/PXD045500/fragpipe/' + data + '/combined_modified_peptide.tsv', sep="\t", index_col=False, low_memory=False)
    # Keep only the per-sample columns of the chosen type and strip the suffix
    # so that the column names become the sample labels.
    d = d[d.columns[d.columns.str.contains(stat_type)].tolist()]
    d.columns = d.columns.str.replace(stat_type,"")

    if stat_type == " Match Type":
        identified = d.eq('MS/MS')
    else:
        # Missing values compare as False, i.e. they are not counted.
        identified = d.gt(0)

    # Column-wise sum of the boolean table = number of identified rows per
    # sample; samples without any hit get 0.
    for sample, high_cnt in identified.sum().items():
        fp_records.append({"sample": sample, "count": int(high_cnt)})

# Build the frame once from the collected records instead of growing it with
# pd.concat inside the loop.
fp_ids = pd.DataFrame(fp_records, columns=["sample", "count"])

# Group label = first two underscore-separated tokens of the sample label.
fp_ids['group'] = [ "_".join(x.split("_")[0:2]) for x in fp_ids['sample'] ]

In [None]:
def barplot_pept_count(pd_counts, fp_counts, mm_counts, data_name, bar_width, xticklabels, outdir, prefix):
    """Draw a grouped bar chart of peptide counts for three tools and save it as PDF.

    For each x-axis position one bar per tool (CHIMERYS, FragPipe,
    MetaMorpheus) shows the mean of that position's counts, and the individual
    counts are overlaid as small black dots.

    Parameters
    ----------
    pd_counts, fp_counts, mm_counts : dict
        Map a data name to a sequence with one entry per x-axis position; each
        entry is itself a sequence of counts (CHIMERYS, FragPipe and
        MetaMorpheus respectively).
    data_name : str
        Key looked up in the three dictionaries; also used as the plot title.
    bar_width : float
        Width of a single bar. Groups are spaced 2.5 units apart.
    xticklabels : sequence of str
        Tick labels, one per x-axis position.
    outdir : str
        Directory the figure is written to.
    prefix : str
        File name without extension; the figure is saved as "<prefix>.pdf".

    Raises
    ------
    KeyError
        If `data_name` is missing from any of the three dictionaries.
    """
    tool_series = [
        ("CHIMERYS", "tab:orange", pd_counts.get(data_name)),
        ("FragPipe", "tab:green", fp_counts.get(data_name)),
        ("MetaMorpheus", "tab:purple", mm_counts.get(data_name)),
    ]
    for label, _, per_position in tool_series:
        if per_position is None:
            # Fail with a clear message instead of a TypeError on len(None).
            raise KeyError("no {} counts for data set '{}'".format(label, data_name))

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.grid(False)
    # The CHIMERYS series defines the number of x-axis positions.
    n_bars = len(tool_series[0][2])
    group_left = 2.5 * np.arange(n_bars)

    for slot, (label, color, per_position) in enumerate(tool_series):
        # Bar centres of consecutive tools are one bar width apart:
        # offsets are 1/2, 3/2 and 5/2 bar widths from the group's left edge.
        x_axis_arr = group_left + (2 * slot + 1) * bar_width / 2
        # An empty position is drawn as a missing bar (NaN) without triggering
        # numpy's "mean of empty slice" warning.
        iso_means = [ np.mean(iso) if len(iso) else np.nan for iso in per_position ]
        ax.bar(x_axis_arr, iso_means, bar_width, alpha=1, color=color, edgecolor="white", label=label)
        # Overlay the individual counts on top of their bar.
        for x, counts in zip(x_axis_arr, per_position):
            ax.plot([x] * len(counts), counts, color="black", linestyle="", linewidth=1.5, marker='o', markersize=1)

    ax.set_xlabel("iso width [m/z]")
    ax.set_title(data_name)
    ax.set_ylabel("Number of peptides")
    # Ticks sit under the middle (second) bar of each group.
    ax.set_xticks(group_left + 3 * bar_width / 2)
    ax.set_xticklabels(xticklabels)
    ax.tick_params(axis='both', color='white')
    for spine in ax.spines.values():
        spine.set_color('lightgrey')

    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc = 'lower center', bbox_to_anchor=(0.5, -0.12), ncol=3, fontsize="large")

    # Join directory and file name properly so `outdir` works with or without
    # a trailing separator.
    fig.savefig(os.path.join(outdir, "{}.pdf".format(prefix)), bbox_inches='tight', pad_inches=0.1)



# Collect, per data set and isolation-window width, the list of peptide counts
# of each tool:  <tool>_counts[data set] -> object array with one entry
# (list of counts) per window width.
pd_counts, fp_counts, mm_counts = dict(), dict(), dict()

## Low input (250pg, 1ng) and Standard input (200ng, 400ng)
low_input_windows = ["1","2","4","8","12","18","24","28","56"]
std_input_windows = ["1","2","3","4","5","6","7","8"]
window_plans = {
    "250pg": low_input_windows,
    "1ng": low_input_windows,
    # NOTE(review): the previous loop had no explicit plan for 10ng and
    # silently reused the one left over from the preceding iteration (1ng).
    # That effective behaviour is kept here explicitly -- confirm it is intended.
    "10ng": low_input_windows,
    "200ng": std_input_windows,
    "400ng": std_input_windows,
}

for pat, window_plan in window_plans.items():
    groups = [ pat + '_iso' + iso for iso in window_plan ]
    for ids, counts_by_data in ((pd_ids, pd_counts), (fp_ids, fp_counts), (mm_ids, mm_counts)):
        # Exact match on the group label; a window without any run yields [].
        per_window = [ ids.loc[ids['group']==grp, 'count'].tolist() for grp in groups ]
        # dtype=object allows windows with different numbers of runs.
        counts_by_data[pat] = np.array(per_window, dtype=object)


bar_width = 0.6

# Isolation-window widths shown on the x axis of each data set.
iso_width_labels = {
    "250pg": ["1","2","4","8","12","18","24","28","56"],
    "400ng": ["1","2","3","4","5","6","7","8"],
    "1ng": ["1","2","4","8","12","18","24","28","56"],
    "200ng": ["1","2","3","4","5","6","7","8"],
}

# Figure 4e and 4f (250pg, 400ng), then Figure S4e and S4f (1ng, 200ng)
for data_name in ["250pg", "400ng", "1ng", "200ng"]:
    xticklabels = iso_width_labels[data_name]

    barplot_pept_count(pd_counts = pd_counts,
                       fp_counts = fp_counts,
                       mm_counts = mm_counts,
                       data_name = data_name,
                       bar_width = bar_width,
                       xticklabels = xticklabels,
                       outdir = './',
                       prefix = '{}_peptide_seperate'.format(data_name))

    # Overall mean count per tool across all windows and runs. The per-window
    # lists are flattened first so the mean also works when windows hold
    # different numbers of runs (np.mean on a ragged object array fails).
    for tool_label, tool_counts in (("PD", pd_counts), ("FragPipe", fp_counts), ("MetaMorpheus", mm_counts)):
        all_runs = [ c for per_window in tool_counts[data_name] for c in per_window ]
        overall = round(float(np.mean(all_runs))) if all_runs else float('nan')
        print(tool_label + " " + data_name + ": ", overall)
