In [None]:
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.

In [None]:
import csv
import io
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Dataset to analyse. Supported values are the keys of pd_export_prefixes below
# ("02ng_15m_12mz" or "02ng_30m_12mz").
data_name = "02ng_30m_12mz"

results_dir = r"Z:/yufe/results/msfragger_ddaplus_paper/PXD037527/" + data_name
fp_mbr_dir = results_dir + "/fragpipe_mbr"
fp_nombr_dir = results_dir + "/fragpipe_nombr"
mm_mbr_dir = results_dir + "/metamorpheus_mbr"
mm_nombr_dir = results_dir + "/metamorpheus_nombr"

# File-name prefix of the Proteome Discoverer exports for each supported dataset.
pd_export_prefixes = {
    "02ng_15m_12mz": "DDA_02ng_15m_12mz",
    "02ng_30m_12mz": "Ryan_Kelly_DDA_0.2ng_30m_12mz",
}
if data_name not in pd_export_prefixes:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError("Unsupported data_name {!r}; expected one of {}".format(data_name, sorted(pd_export_prefixes)))
pd_dir = results_dir + "/proteomediscoverer/" + pd_export_prefixes[data_name] + "_Proteins.txt"
pd_design_dir = results_dir + "/proteomediscoverer/" + pd_export_prefixes[data_name] + "_StudyInformation.txt"

# FragPipe manifest: headerless TSV whose columns are file path, condition, replicate, type.
manifest = pd.read_csv(fp_mbr_dir + "/fragpipe-files.fp-manifest", sep="\t", index_col=False, header=None)
manifest = manifest.rename(index=str, columns={0: "File", 1: "condition", 2: "replicate", 3: "type"})
# Reduce each path to its base name without the extension. The dot is escaped and the
# pattern anchored so that only a trailing ".mzML" is removed.
manifest['File'] = manifest['File'].map(lambda x: re.sub(r'\.mzML$', '', os.path.basename(x)))
manifest['sample'] = manifest['condition'] + '_' + manifest['replicate'].astype(int).astype(str)

# Study information export: maps each 'File ID' to the raw file name ('Sample Identifier').
design = pd.read_csv(pd_design_dir, sep="\t", index_col=False)
design['Sample Identifier'] = design['Sample Identifier'].map(lambda x: re.sub(r'\.raw$', '', x))
design = design[['File ID', 'Sample Identifier']].copy()

# Attach the 'File ID' to every manifest row by matching on the extension-less file name.
manifest = manifest.merge(design, left_on='File', right_on='Sample Identifier')
manifest = manifest.drop(columns=['Sample Identifier'])

# PD (export read from pd_dir; reported below under the "CHIMERYS" group labels)
# First pass: read every cell as text so that padded values can be stripped.
pd_text_df = pd.read_csv(pd_dir, sep="\t", index_col=False, na_values=None, dtype=str)
pd_text_df = pd_text_df.map(lambda x: x.strip() if isinstance(x, str) else x)
# Second pass: re-parse the cleaned table so that dtypes are inferred again and
# "" / "n/a" are read as missing. The round trip goes through an in-memory buffer,
# so no scratch file is left in the working directory.
pd_buffer = io.StringIO()
pd_text_df.to_csv(pd_buffer, sep="\t", index=False, quoting=csv.QUOTE_ALL)
pd_buffer.seek(0)
pd_df = pd.read_csv(pd_buffer, sep="\t", index_col=False, na_values=["", "n/a"], header=0)
pd_df = pd_df[pd_df["Protein FDR Confidence: Combined"].str.lower() == "high"]
# Index by accession once, up front; the 'Accession' column itself is kept.
pd_df = pd_df.set_index(pd_df["Accession"].rename("protein"))
orig_cols = pd_df.columns[pd_df.columns.str.contains(": Sample", regex=False)]

# File ID -> condition lookup; the first manifest row wins if a File ID is listed twice.
pd_file_id_to_condition = manifest.drop_duplicates("File ID").set_index("File ID")["condition"].to_dict()

pd_match_type_columns_k562 = dict()
pd_match_type_columns_hela = dict()
pd_abundance_columns_k562 = dict()
pd_abundance_columns_hela = dict()
pd_abundance_targets = {'02ngK5': pd_abundance_columns_k562, '02ng': pd_abundance_columns_hela}
pd_match_type_targets = {'02ngK5': pd_match_type_columns_k562, '02ng': pd_match_type_columns_hela}
pd_column_file_id = dict()  # per-sample column name -> File ID parsed from it
for col in orig_cols:
    if col.startswith("Abundances (Scaled):"):
        file_id = col.replace('Abundances (Scaled): ', '').replace(': Sample', '')
        targets = pd_abundance_targets
    elif col.startswith("Found in Sample:"):
        file_id = col.replace('Found in Sample: ', '').replace(': Sample', '')
        targets = pd_match_type_targets
    else:
        continue
    # The condition is looked up for this column only, so a column whose File ID is not
    # in the manifest is skipped instead of inheriting the previous column's condition.
    cond = pd_file_id_to_condition.get(file_id)
    if cond in targets:
        targets[cond][col] = cond
        pd_column_file_id[col] = file_id

pd_cv_frames = []
for cond, celltype in zip(["02ngK5", "02ng"], ["K562", "Hela"]):
    abn_cols = list(pd_abundance_targets[cond])
    abn_file_ids = [pd_column_file_id[c] for c in abn_cols]
    found_col_by_file_id = {pd_column_file_id[c]: c for c in pd_match_type_targets[cond]}
    if set(abn_file_ids) != set(found_col_by_file_id):
        raise ValueError("'Abundances (Scaled)' and 'Found in Sample' columns do not cover the same File IDs for condition " + cond)
    # Pair each abundance column with the match-type column of the same File ID,
    # rather than relying on the two column groups sharing the same order.
    mt_cols = [found_col_by_file_id[file_id] for file_id in abn_file_ids]

    # MBR+: all reported abundances; keep proteins with at least two non-missing values.
    pd_abn_mbr = pd_df[abn_cols].dropna(thresh=2)
    # MBR-: blank the abundances whose match type is 'Peak Found' before applying the same filter
    # (presumably these are the values transferred by match-between-runs -- confirm against the PD export).
    peak_matrix = pd_df[mt_cols] == 'Peak Found'
    peak_matrix.columns = abn_cols
    pd_abn_nombr = pd_df[abn_cols].mask(peak_matrix).dropna(thresh=2)

    # Coefficient of variation in percent: row-wise standard deviation over mean.
    pd_cv_nombr = pd_abn_nombr.std(axis=1, skipna=True) * 100 / pd_abn_nombr.mean(axis=1, skipna=True)
    pd_cv_mbr = pd_abn_mbr.std(axis=1, skipna=True) * 100 / pd_abn_mbr.mean(axis=1, skipna=True)
    pd_cv_frames.append(pd.DataFrame({'group': 'CHIMERYS MBR- ' + celltype, 'cv': pd_cv_nombr, 'protein': pd_cv_nombr.index}))
    pd_cv_frames.append(pd.DataFrame({'group': 'CHIMERYS MBR+ ' + celltype, 'cv': pd_cv_mbr, 'protein': pd_cv_mbr.index}))
# Concatenate once instead of growing the frame inside the loop.
pd_combined_cv_df = pd.concat(pd_cv_frames, ignore_index=True)
    
    
# FragPipe
# Both tables are read with the same options; empty cells and "0" are treated as missing.
fp_read_kwargs = dict(sep="\t", index_col=False, na_values=["", "0"], header=0, low_memory=False)
fp_mbr_df = pd.read_csv(fp_mbr_dir + r"/combined_protein.tsv", **fp_read_kwargs)
fp_nombr_df = pd.read_csv(fp_nombr_dir + r"/combined_protein.tsv", **fp_read_kwargs)

# Sample-name prefix -> cell type used in the group label.
fp_cell_types = {"02ngK5_": "K562", "02ng_": "Hela"}
fp_cv_frames = []
for datatype, protein_table in (("FragPipe2 MBR+", fp_mbr_df), ("FragPipe2 MBR-", fp_nombr_df)):
    # Adds a 'protein' column to the loaded table and indexes a new frame by it.
    protein_table['protein'] = protein_table['Protein ID']
    by_protein = protein_table.set_index(['protein'])
    is_maxlfq = by_protein.columns.str.contains("MaxLFQ Intensity")
    intensities = by_protein[by_protein.columns[is_maxlfq]]
    # Drop the suffix so that each column is named after its sample.
    intensities.columns = intensities.columns.str.replace(" MaxLFQ Intensity", "")
    for prefix, celltype in fp_cell_types.items():
        replicate_cols = [name for name in intensities.columns if name.startswith(prefix)]
        # Keep proteins with at least two non-missing intensities in this condition.
        quantified = intensities[replicate_cols].dropna(thresh=2)
        row_sd = quantified.std(axis=1, skipna=True)
        row_mean = quantified.mean(axis=1, skipna=True)
        fp_cv_frames.append(pd.DataFrame({'group': datatype + ' ' + celltype, 'cv': row_sd * 100 / row_mean, 'protein': quantified.index}))
fp_combined_cv_df = pd.concat(fp_cv_frames, ignore_index=True)
        
        
# MetaMorpheus
# Both tables are read with the same options; empty cells and "0" are treated as missing.
mm_read_kwargs = dict(sep="\t", index_col=False, na_values=["", "0"], header=0, low_memory=False)
mm_mbr_df = pd.read_csv(mm_mbr_dir + r"/Task2-SearchTask/AllQuantifiedProteinGroups.tsv", **mm_read_kwargs)
mm_nombr_df = pd.read_csv(mm_nombr_dir + r"/Task2-SearchTask/AllQuantifiedProteinGroups.tsv", **mm_read_kwargs)

# File name -> sample label lookup, built once; the first manifest row wins on duplicates.
mm_file_to_sample = manifest.drop_duplicates('File').set_index('File')['sample'].to_dict()

mm_cv_frames = []
for df, datatype in zip([mm_mbr_df, mm_nombr_df], ["MetaMorpheus MBR+", "MetaMorpheus MBR-"]):
    # Adds a 'protein' column to the loaded table (as before) and indexes a new frame by it.
    df['protein'] = df['Protein Accession']
    df = df.set_index(['protein'])
    # Map each intensity column to its sample label. Columns whose file is not in the
    # manifest are left out, so the renamed frame always has matching column counts
    # (assigning a shorter list of names would raise a length-mismatch error).
    mm_column_to_sample = {}
    for col in df.columns[df.columns.str.contains("Intensity_")]:
        file_name = col.replace("Intensity_", "").replace("-calib", "")
        if file_name in mm_file_to_sample:
            mm_column_to_sample[col] = mm_file_to_sample[file_name]
    df_abn = df[list(mm_column_to_sample)].rename(columns=mm_column_to_sample)
    for cond, celltype in zip(["02ngK5_", "02ng_"], ["K562", "Hela"]):
        sample_cols = [x for x in df_abn.columns if x.startswith(cond)]
        # Keep proteins with at least two non-missing intensities in this condition.
        df_abn_flt = df_abn[sample_cols].dropna(thresh=2)
        # Coefficient of variation in percent: row-wise standard deviation over mean.
        cv_values = df_abn_flt.std(axis=1, skipna=True) * 100 / df_abn_flt.mean(axis=1, skipna=True)
        mm_cv_frames.append(pd.DataFrame({'group': datatype + ' ' + celltype, 'cv': cv_values, 'protein': df_abn_flt.index}))
# Concatenate once instead of growing the frame inside the loop.
mm_combined_cv_df = pd.concat(mm_cv_frames, ignore_index=True)
# Stack the per-tool CV tables. Group labels have the form "<tool> <MBR tag> <cell type>".
cv_all = pd.concat([pd_combined_cv_df, fp_combined_cv_df, mm_combined_cv_df], axis=0)
group_tokens = [label.split(" ") for label in cv_all['group']]
# 'tool' is the first token; 'type' is "<cell type> <MBR tag>".
cv_all['tool'] = [tokens[0] for tokens in group_tokens]
cv_all['type'] = [tokens[2] + " " + tokens[1] for tokens in group_tokens]


def barplot_CV(cv_df, plot_title, is_mbr, prefix, cv_threshold=20):
    """Plot stacked counts of quantified proteins per tool and cell type, and save the figure as a PDF.

    Each bar is split into the proteins whose CV is below ``cv_threshold`` (opaque)
    and the remaining ones (semi-transparent, stacked on top).

    Parameters
    ----------
    cv_df : pandas.DataFrame
        One row per protein and group, with at least the columns 'group' and 'cv'.
        Group labels are expected to look like '<tool> <MBR tag> <cell type>',
        e.g. 'FragPipe2 MBR+ K562'. The frame is not modified.
    plot_title : str
        Title of the figure.
    is_mbr : bool
        True plots the 'MBR+' groups, False the 'MBR-' groups. Rows belonging to
        any other group are ignored.
    prefix : str
        Output path without extension; the figure is written to '<prefix>.pdf'.
    cv_threshold : float, optional
        CV cut-off in percent that separates the two bar segments (default 20).

    Notes
    -----
    Groups that are absent from ``cv_df`` and segments without any protein are drawn
    with height 0. Rows with a missing CV are counted in the upper segment, because
    the comparison ``cv < cv_threshold`` is False for NaN.
    """
    # (token used in the group label, name shown in the legend, bar colour)
    tools = [
        ('CHIMERYS', 'CHIMERYS', 'tab:orange'),
        ('FragPipe2', 'FragPipe', 'tab:green'),
        ('MetaMorpheus', 'MetaMorpheus', 'tab:purple'),
    ]
    # (token used in the group label, x tick label)
    cell_types = [('K562', 'K562'), ('Hela', 'HeLa')]
    mbr_tag = 'MBR+' if is_mbr else 'MBR-'
    sorted_order = ['{} {} {}'.format(tool, mbr_tag, cell) for cell, _ in cell_types for tool, _, _ in tools]

    cv_df = cv_df.copy()
    cv_df['color_group'] = np.where(cv_df['cv'] < cv_threshold, 'dark', 'light')
    count_df = cv_df.groupby(['group', 'color_group']).size().reset_index(name='counts')
    pivot_df = count_df.pivot(index='group', columns='color_group', values='counts')
    # Reindex rows AND columns before filling, so that a missing group or a missing
    # 'dark'/'light' segment becomes 0 instead of a KeyError or a NaN bar height.
    pivot_df = pivot_df.reindex(index=sorted_order, columns=['dark', 'light']).fillna(0)

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.grid(False)
    bar_width = 0.5
    # Left edge position of each cell-type cluster; the tools are placed side by side from there.
    cluster_start = np.arange(len(cell_types)) * 2
    threshold_text = '{:g}'.format(cv_threshold)

    # All 'dark' bars are drawn before the 'light' ones so that the two-column legend
    # lists the below-threshold entries first.
    for segment, alpha, label_template in (('dark', 1, '{} CV<{}%'), ('light', 0.5, r'{} CV$\geq${}%')):
        for slot, (tool, legend_name, color) in enumerate(tools):
            rows = ['{} {} {}'.format(tool, mbr_tag, cell) for cell, _ in cell_types]
            heights = pivot_df.loc[rows, segment].to_numpy()
            bottom = pivot_df.loc[rows, 'dark'].to_numpy() if segment == 'light' else None
            ax.bar(cluster_start + bar_width * slot, heights, bar_width,
                   label=label_template.format(legend_name, threshold_text),
                   bottom=bottom, alpha=alpha, color=color)

    ax.set_xlabel('')
    ax.set_ylabel('Number of quantified proteins')
    ax.set_title(plot_title)
    # One tick per cluster, centred on the middle of its three bars.
    ax.set_xticks(cluster_start + bar_width)
    ax.set_xticklabels([tick_label for _, tick_label in cell_types])
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc="lower center", bbox_to_anchor=(0.5, -0.21), ncol=2, fontsize="large")

    fig.savefig("{}.pdf".format(prefix), bbox_inches='tight', pad_inches=0.1)


# Figure title for each supported dataset.
plot_titles = {"02ng_15m_12mz": '20 min', "02ng_30m_12mz": '40 min'}
if data_name not in plot_titles:
    # Fail with a clear message instead of a NameError on plot_title below.
    raise ValueError("No plot title defined for data_name {!r}; expected one of {}".format(data_name, sorted(plot_titles)))
plot_title = plot_titles[data_name]

# regex=False makes the tags match literally. As a regular expression 'MBR+' means
# "MB followed by one or more R" and would also select the 'MBR-' rows.
not_fragpipe1 = ~cv_all['tool'].str.contains('FragPipe1', regex=False)
for mbr_tag, is_mbr, suffix in (('MBR-', False, "_CV_combined_noMBR"), ('MBR+', True, "_CV_combined_MBR")):
    has_tag = cv_all['group'].str.contains(mbr_tag, regex=False)
    barplot_CV(cv_df=cv_all.loc[not_fragpipe1 & has_tag], plot_title=plot_title, is_mbr=is_mbr, prefix=data_name + suffix)
