In [None]:
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.

In [None]:
import numpy as np
import pandas as pd
import os
from matplotlib_venn import venn2 
from matplotlib import pyplot as plt 
import seaborn as sns

In [None]:
# Experiment manifest: tab-separated, no header row. The positional column
# labels are turned into strings and the first four get descriptive names.
design = (
    pd.read_csv("Z:/yufe/results/msfragger_ddaplus_paper/PXD024427/DDA+/fragpipe-files.fp-manifest", sep="\t", header=None)
    .rename(columns=str)
    .rename(columns={"0": "file", "1": "condition", "2": "replicate", "3": "type"})
)
# 'sample' is "<condition>_<replicate>"; 'file' is reduced to the bare file
# name with the '.mzML' suffix removed.
design = design.assign(
    sample=design['condition'] + "_" + design['replicate'].map(str),
    file=design['file'].map(lambda path: os.path.basename(path).replace('.mzML', '')),
)
# Every cell is stringified and has '-' swapped for '_'.
design = design.apply(lambda column: column.map(lambda cell: str(cell).replace("-", "_")))

cond_list = design['condition'].unique()
cond_sample_dict = design.groupby("condition")['sample'].apply(list)

# Combined protein tables of the DDA and DDA+ analyses.
fp = pd.read_csv("Z:/yufe/results/msfragger_ddaplus_paper/PXD024427/DDA/combined_protein.tsv", sep="\t", low_memory=False)
fp_plus = pd.read_csv("Z:/yufe/results/msfragger_ddaplus_paper/PXD024427/DDA+/combined_protein.tsv", sep="\t", low_memory=False)

# Keep the identifier columns plus every column whose name mentions "MaxLFQ",
# strip the " MaxLFQ Intensity" suffix from the column names, and index the
# table by (Protein ID, Gene).
fp = (
    fp[["Protein ID", "Gene"] + [col for col in fp.columns if "MaxLFQ" in col]]
    .rename(columns=lambda col: col.replace(" MaxLFQ Intensity", ""))
    .set_index(['Protein ID', 'Gene'])
)
fp_plus = (
    fp_plus[["Protein ID", "Gene"] + [col for col in fp_plus.columns if "MaxLFQ" in col]]
    .rename(columns=lambda col: col.replace(" MaxLFQ Intensity", ""))
    .set_index(['Protein ID', 'Gene'])
)

# "Identified" counts every row; "quantified" counts rows with at least one
# value greater than zero.
print("DDA identified: " + str(len(fp)) + " proteins")
print("DDA+ identified: " + str(len(fp_plus)) + " proteins")

print("DDA quantified: " + str(int(fp.gt(0).any(axis=1).sum())) + " proteins")
print("DDA+ quantified: " + str(int(fp_plus.gt(0).any(axis=1).sum())) + " proteins")


In [None]:
# Venn diagram of the gene names present in the DDA+ and DDA protein tables.
plt.rcParams["font.family"] = "Arial"
plt.figure(figsize=(4,4))

# Build each gene set once instead of re-running reset_index() for every
# set operation.
plus_genes = set(fp_plus.reset_index()['Gene'])
dda_genes = set(fp.reset_index()['Gene'])
common_ids = plus_genes & dda_genes
plus_ids = plus_genes - dda_genes
regular_ids = dda_genes - plus_genes

vd = venn2(subsets=(len(plus_ids), len(regular_ids), len(common_ids)),
           set_labels=('DDA+', 'DDA'),
           set_colors=("darkorange", "#4285F4"),
           alpha=0.8)

# Style the two set labels: colour them like their circle, nudge them
# sideways, and mirror them to the opposite vertical side of the diagram.
for text in vd.set_labels:
    if text is None:
        continue
    text.set_fontsize(12)
    label_x, label_y = text.get_position()
    if text.get_text() == 'DDA+':
        text.set_color("darkorange")
        text.set_position((label_x - 0.08, -1 * label_y))
    else:
        text.set_color("#4285F4")
        text.set_position((label_x + 0.12, -1 * label_y))
    text.set_fontweight('bold')

# matplotlib-venn leaves a subset label as None when the corresponding region
# is empty (e.g. one gene set contains the other), so skip those entries
# instead of failing on None.set_fontsize.
for text in vd.subset_labels:
    if text is not None:
        text.set_fontsize(13)

plt.savefig("Fig5_Glioma_Vennplot.pdf", bbox_inches='tight', pad_inches=0.1)

In [None]:
def show_values_on_bars(axs, font_size):
    """Write each bar's data value as an integer text label on its axes.

    Parameters
    ----------
    axs : matplotlib Axes or numpy.ndarray of Axes
        A single axes, or an array of axes (every element is annotated).
    font_size : int or str
        Passed to ``ax.text`` as ``fontsize``.

    Notes
    -----
    Values are gathered container by container from ``ax.containers`` and
    paired, in order, with ``ax.patches``; pairing stops at whichever of the
    two runs out first.  Each label is anchored at the patch's
    ``(get_x(), get_y())`` corner.  An axes without any bar container is left
    untouched, and non-finite values (NaN/inf), which cannot be converted to
    ``int``, are skipped.
    """
    def _show_on_single_plot(ax):
        # Flatten the values of all containers; an empty ``ax.containers``
        # simply yields no labels instead of raising IndexError.
        values = [value for container in ax.containers for value in container.datavalues]
        for patch, value in zip(ax.patches, values):
            if not np.isfinite(value):
                # int(nan) / int(inf) would raise; there is nothing to print.
                continue
            ax.text(patch.get_x(), patch.get_y(), int(value),
                    ha="left", color='black', fontsize=font_size)

    if isinstance(axs, np.ndarray):
        for _, ax in np.ndenumerate(axs):
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)


sample_cols = design['sample'].tolist()

# log2(x + 1) maps a zero intensity to exactly 0, so zeros in the transformed
# tables are masked to NaN and treated as missing from here on.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
fp_log2 = np.log2(fp + 1)
fp_log2 = fp_log2.mask(fp_log2 == 0, np.nan)
fp_plus_log2 = np.log2(fp_plus + 1)
fp_plus_log2 = fp_plus_log2.mask(fp_plus_log2 == 0, np.nan)

# Per-protein bookkeeping for the DDA table:
#   count       - number of columns with a non-missing value
#   missing     - number of columns without one
#   missing_pct - missing / number of columns (a fraction between 0 and 1)
fp_gene_cnt = pd.DataFrame({'count': fp_log2.notna().sum(axis=1)})
fp_gene_cnt['missing'] = fp_log2.shape[1] - fp_gene_cnt['count']
fp_gene_cnt['missing_pct'] = fp_gene_cnt['missing'] / fp_log2.shape[1]
fp_gene_cnt = fp_gene_cnt.reset_index()
fp_gene_cnt = fp_gene_cnt.sort_values(by=['count'], ascending=True)

def safe_nanmean(x):
    """Return the NaN-ignoring mean of ``x``, or NaN when nothing is left.

    ``x`` must provide ``dropna()`` (e.g. a pandas Series).  When no entry
    survives ``dropna()`` the function returns ``np.nan`` directly rather than
    calling ``np.nanmean`` on all-missing input.
    """
    has_values = len(x.dropna()) != 0
    return np.nanmean(x) if has_values else np.nan

# Per-protein bookkeeping for the DDA+ table: number of non-missing columns,
# number of missing columns, and the missing fraction.
fp_plus_gene_cnt = pd.DataFrame({'count': fp_plus_log2.notna().sum(axis=1)})
fp_plus_gene_cnt['missing'] = fp_plus_log2.shape[1] - fp_plus_gene_cnt['count']
fp_plus_gene_cnt['missing_pct'] = fp_plus_gene_cnt['missing'] / fp_plus_log2.shape[1]
fp_plus_gene_cnt = (
    fp_plus_gene_cnt
    .reset_index()
    .sort_values(by=['count'], ascending=True)
)

# DDA: mean log2 intensity per protein (NaN when every value is missing),
# joined with the missing-value statistics.
res = (
    fp_log2
    .apply(safe_nanmean, axis=1)
    .rename("mean_intensity")
    .reset_index()
    .assign(type='DDA')
    .merge(fp_gene_cnt, how='inner', on=['Protein ID', 'Gene'])
    .sort_values(by=['type', 'missing_pct'], ascending=True)
)
# Rows containing any NaN are excluded from the plotting table.
res_plt = res.dropna()

# DDA+: same summary.
res_plus = (
    fp_plus_log2
    .apply(safe_nanmean, axis=1)
    .rename("mean_intensity")
    .reset_index()
    .assign(type='DDA+')
    .merge(fp_plus_gene_cnt, how='inner', on=['Protein ID', 'Gene'])
    .sort_values(by=['type', 'missing_pct'], ascending=True)
)
res_plus_plt = res_plus.dropna()

# Long-format table with the DDA+ rows first, followed by the DDA rows.
combined_df = pd.concat([res_plus_plt, res_plt], axis=0)
# Edges of the mean-intensity bins.
break_vals = [20, 25, 26, 27, 40]

def bin_intensity(v,break_vals):
    """Return the label of the half-open interval ``(left, right]`` holding ``v``.

    Consecutive entries of ``break_vals`` delimit the intervals, which are
    tested in order; the label has the form ``"<left>-<right>"``.  When ``v``
    lies in none of them (at or below the first break, above the last one, or
    NaN) the result is ``None``.
    """
    lower = break_vals[0]
    for upper in break_vals[1:]:
        if lower < v <= upper:
            return '{}-{}'.format(lower, upper)
        lower = upper
    return None

# Tag every protein with the label of the mean-intensity bin it falls in.
# bin_intensity returns None for values that lie in none of the break_vals
# intervals.
combined_df['valrange'] = combined_df['mean_intensity'].apply(lambda v: bin_intensity(v=v, break_vals=break_vals))

# Figure layout: two stacked panels with a 1:6 height ratio.  The top panel
# (ax0) carries the per-bin protein counts, the bottom panel (ax1) the
# box plot of missing percentages.
custom_params = {"axes.spines.right": True, "axes.spines.top": True, "axes.linewidth": 0.75}
sns.set_theme(style="ticks", rc=custom_params, font="Arial")
fig, axes = plt.subplots(ncols=1, nrows=2, figsize=(6, 8), gridspec_kw={'height_ratios': [1, 6]}, constrained_layout = True)
ax0 = axes[0] 
ax1 = axes[1] 

# Shared by the bar plot and the box plot.
bar_width = 0.6

# Number of rows per (type, intensity bin); the count ends up in the
# 'Protein ID' column.
combined_count = combined_df[['type','valrange','Protein ID']].groupby(['type','valrange']).agg(lambda x: len(x)).reset_index()
# The bars are drawn in white and every axis element of ax0 is hidden below,
# so presumably only the numeric labels added by show_values_on_bars remain
# visible -- NOTE(review): confirm against the rendered PDF.
sns.barplot(x = combined_count["valrange"], 
            y = combined_count["Protein ID"], 
            hue = combined_count["type"],
            hue_order = ['DDA', 'DDA+'], 
            palette = {"DDA": "white","DDA+": "white"},
            width = bar_width,
            errorbar = None,
            saturation = 1,
            ax = ax0)
# Clear the legend reference seaborn attached to the count panel.
ax0.legend_ = None
ax0.yaxis.set_visible(False)
ax0.xaxis.set_visible(False)
ax0.spines[['right','left','bottom','top']].set_visible(False)
ax0.set_xlabel('')
ax0.set_ylabel('')
ax0.tick_params(width=0.75, color='black')
show_values_on_bars(ax0, font_size=10)

# Black outlines for all box-plot elements.
# NOTE(review): 'flierprops' presumably has no visible effect because the
# call below passes showfliers=False.
PROPS = {
    'boxprops':{'edgecolor':'black'},
    'medianprops':{'color':'black'},
    'whiskerprops':{'color':'black'},
    'capprops':{'color':'black'},
    'flierprops':{'markersize':2,'marker':"x",'markerfacecolor':'black','color':'black'}
}

# Missing percentage per intensity bin, DDA next to DDA+.  `order` fixes the
# bin sequence on the x axis; its labels must match the strings produced by
# bin_intensity.
sns.boxplot(x = combined_df['valrange'], 
            y = combined_df['missing_pct'], 
            hue = combined_df['type'], 
            hue_order = ['DDA', 'DDA+'], 
            palette = {"DDA": "#4285F4","DDA+": "orange"},
            order = ['20-25','25-26','26-27','27-40'],
            width = bar_width, 
            showfliers = False,
            whis = 1.5, 
            orient = 'v', 
            linewidth = 0.75,
            saturation = 1,
            ax = ax1,
            **PROPS
           )
ax1.legend_.set_title(None)
ax1.set_xlabel('Mean intensity bin (log2-transformed)')
ax1.set_ylabel('Missing percentage')
ax1.tick_params(width=0.75, color='black')

# The pyplot calls below act on the current axes/figure.
# NOTE(review): presumably the current axes is ax1 (the last subplot
# created) -- confirm.  plt.legend() builds a new legend, and
# set_size_inches(6, 5) replaces the figsize=(6, 8) requested in
# plt.subplots above.
plt.grid(False)
plt.legend(loc="lower left")
plt.gcf().set_size_inches(6, 5)
plt.setp(plt.gca().get_legend().get_texts(), fontsize='10') # font size of the legend entry texts
plt.savefig(r"boxplot_misPct_meanInt.pdf", bbox_inches='tight', pad_inches=0.1)


In [None]:
def group_proteins_by_missing_pct(cnt):
    """Return the label of the size bucket that the count ``cnt`` falls in.

    Despite the function's name, the argument is compared against count
    thresholds: exactly 1 gives ``"1"``; values above 1 are matched against
    inclusive upper bounds 10, 20, 30, 40, 200 and 1000; anything larger
    gives ``">1000"``.  Values that are neither 1 nor greater than 1 (zero,
    negatives, NaN) give ``"unknown"``.
    """
    if cnt == 1:
        return "1"
    if not cnt > 1:
        return "unknown"

    # (inclusive upper bound, label), tested from the smallest bound upwards.
    upper_bounds = (
        (10, "2~10"),
        (20, "11~20"),
        (30, "21~30"),
        (40, "31~40"),
        (200, "41~200"),
        (1000, "200~1000"),
    )
    for upper, label in upper_bounds:
        if cnt <= upper:
            return label
    return ">1000"

# Proteins present in both result tables.
common_prots = set(res['Protein ID']) & set(res_plus['Protein ID'])

# Missing fraction of each shared protein, one table per workflow.
res_dda_common = res[res['Protein ID'].isin(common_prots)][['Protein ID', 'missing_pct']]
res_plus_common = res_plus[res_plus['Protein ID'].isin(common_prots)][['Protein ID', 'missing_pct']]

# Side-by-side view: missing_pct_ddaplus / missing_pct_dda per protein.
combined_res = res_plus_common.merge(
    res_dda_common,
    how='inner',
    on='Protein ID',
    suffixes=('_ddaplus', '_dda'),
)

# Number of proteins at every observed (DDA+, DDA) missing-fraction pair, and
# the size bucket of that number.
combined_plt = (
    combined_res
    .groupby(['missing_pct_ddaplus', 'missing_pct_dda'])
    .size()
    .reset_index(name='count')
    .assign(protein_count=lambda pairs: pairs['count'].map(group_proteins_by_missing_pct))
)


# Scatter plot comparing, for every observed pair of missing fractions, how
# many proteins share that pair (DDA+ on x, DDA on y).
custom_params = {"axes.spines.right": True, "axes.spines.top": True, "axes.linewidth": 0.75}
sns.set_theme(style="ticks", rc=custom_params, font="Arial")

fig, ax = plt.subplots()
# Both colour and marker size encode the 'protein_count' bucket.  The order
# lists must use exactly the label strings returned by
# group_proteins_by_missing_pct (including '200~1000').
# NOTE(review): color="white" presumably has no effect because hue/palette
# are given -- confirm.
sns.scatterplot(data=combined_plt, 
                x="missing_pct_ddaplus", 
                y="missing_pct_dda", 
                hue="protein_count",
                hue_order=['>1000', '200~1000', '41~200', '31~40', '21~30', '11~20', '2~10', '1'],
                size="protein_count",
                size_order=['>1000', '200~1000', '41~200', '31~40', '21~30', '11~20', '2~10', '1'],
                sizes=(10, 100), 
                color="white",
                alpha=1, 
                palette='muted',
                ax = ax
               )

# NOTE(review): the title cleared here looks superseded by the ax.legend(...)
# call below, which creates the legend again with title 'Protein count'.
ax.legend_.set_title(None)
ax.set_xlabel('DDA+: Protein missing percentage')
ax.set_ylabel('DDA: Protein missing percentage')
ax.tick_params(width=0.75, color='black')
# bbox_to_anchor starts at x=1.01 in axes coordinates, i.e. just right of the
# plotting area.
ax.legend(title='Protein count', bbox_to_anchor=(1.01, 0.25, 0.05, 0.5), fontsize="small")
# Dashed y = x reference line from (0, 0) to (1, 1).
plt.plot([0,1], [0,1], linestyle="--", color="grey") 
plt.grid(False)
plt.gcf().set_size_inches(6, 6)
# Sets the legend entry texts to font size 10 (after fontsize="small" above).
plt.setp(plt.gca().get_legend().get_texts(), fontsize='10') 
plt.savefig(r"prot_misPct_comparison_scatterplot.pdf", bbox_inches='tight', pad_inches=0.1)


In [None]:
def plot_venn(plusIDs, ddaIDs, DE_type):
    """Draw a two-set Venn diagram of DDA+ versus DDA identifiers and save it.

    Parameters
    ----------
    plusIDs : iterable of hashable
        Identifiers found with DDA+ (left circle, dark orange).
    ddaIDs : iterable of hashable
        Identifiers found with DDA (right circle, blue).
    DE_type : str
        Appended to both set labels (``"DDA+_<DE_type>"``, ``"DDA_<DE_type>"``)
        and used as prefix of the output file ``"<DE_type>_vennplot.pdf"``.

    Returns
    -------
    None

    Side effects: sets the global matplotlib font family to Arial, opens a new
    4x4 inch figure and writes the PDF into the current working directory.
    """
    plt.rcParams["font.family"] = "Arial"
    plt.figure(figsize=(4,4))

    # Materialise each input once; duplicates are irrelevant for a Venn plot.
    plus_set = set(plusIDs)
    dda_set = set(ddaIDs)
    common_ids = plus_set & dda_set
    plus_ids = plus_set - dda_set
    regular_ids = dda_set - plus_set

    vd = venn2(subsets=(len(plus_ids), len(regular_ids), len(common_ids)),
               set_labels=('DDA+_{}'.format(DE_type), 'DDA_{}'.format(DE_type)),
               set_colors=("darkorange", "#4285F4"),
               alpha=0.8)

    # Style the two set labels: colour them like their circle, nudge them
    # sideways, and mirror them to the opposite vertical side of the diagram.
    for text in vd.set_labels:
        if text is None:
            continue
        text.set_fontsize(12)
        label_x, label_y = text.get_position()
        if text.get_text().startswith('DDA+'):
            text.set_color("darkorange")
            text.set_position((label_x - 0.08, -1 * label_y))
        else:
            text.set_color("#4285F4")
            text.set_position((label_x + 0.12, -1 * label_y))
        text.set_fontweight('bold')

    # matplotlib-venn leaves a subset label as None when the corresponding
    # region is empty (e.g. one ID set contains the other); skip those
    # entries instead of failing on None.set_fontsize.
    for text in vd.subset_labels:
        if text is not None:
            text.set_fontsize(13)
    plt.savefig(r"{}_vennplot.pdf".format(DE_type), bbox_inches='tight', pad_inches=0.1)


# Result tables of the IDHwt-vs-IDHmut comparison for both workflows.
dda_gc = pd.read_csv(r"../dda/IDHwt-VS-IDHmut/IDHwt-VS-IDHmut.All.tsv", sep="\t")
plus_gc = pd.read_csv(r"../dda+/IDHwt-VS-IDHmut/IDHwt-VS-IDHmut.All.tsv", sep="\t")

# Split each table by its 'Class' column into the "Up" and "Down" rows.
dda_up = dda_gc[dda_gc['Class'].eq("Up")]
dda_down = dda_gc[dda_gc['Class'].eq("Down")]
plus_up = plus_gc[plus_gc['Class'].eq("Up")]
plus_down = plus_gc[plus_gc['Class'].eq("Down")]

# One Venn diagram of gene names per direction.
plot_venn(plusIDs=plus_up['Gene'], ddaIDs=dda_up['Gene'], DE_type="Up")
plot_venn(plusIDs=plus_down['Gene'], ddaIDs=dda_down['Gene'], DE_type="Down")
