# Figure 4: Simple protein quantification

In [1]:
import pandas as pd
import re
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import datashader as ds
import plotly.io as pio
pio.renderers.default = "notebook"

## Figure 4A: Intensity histograms

In [2]:
yeast_proteins = pd.read_csv("./MaxQuant_Output/MaxQuantOutput/proteinGroups.txt", sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
def plot_histograms(df, data_columns="Intensity .*",
                    columns="Intensity (.*?)_.*", rows="Intensity .*?_(.*)", log10=True,
                    highlight_column="Reverse"):
    data = df[[col for col in df.columns if re.match(data_columns, col) or col == highlight_column]]
    if highlight_column is not None:
        data[highlight_column] = data[highlight_column].apply(str)
        data.set_index(highlight_column, inplace=True)
    data.columns = pd.MultiIndex.from_tuples([
        (re.findall(columns, el)[0], re.findall(rows, el)[0]) for el in data.columns
    ], names=["columns", "rows"])
    data = data.stack(["columns", "rows"])
    if log10:
        data = np.log10(data)
        data = data[np.isfinite(data)]
        data.name = "log10("+data_columns.split(".*")[0].strip()+")"
    else:
        data.name = data_columns.split(".*")[0].strip()
    x = data.name
    data = data.reset_index()
    data.sort_values(highlight_column, inplace=True)
    plot = px.histogram(data, x=x, facet_col="columns", facet_row="rows",
                        opacity=1, template="simple_white",
                        color=highlight_column, color_discrete_sequence=px.colors.qualitative.D3[1::-1])
    plot.update_layout(width=len(set(data["columns"]))*200+100,
                       height=len(set(data["rows"]))*200+100, bargap=0)\
    .for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))\
    .update_traces(marker_line_width=0)
    return plot

In [4]:
fig4A = plot_histograms(yeast_proteins, data_columns="Intensity .*1")
fig4A.update_layout(width=400, height=300, font_size=10, margin_b=10, margin_t=20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
  result = getattr(ufunc, method)(*inputs, **kwargs)


## Fig 4B: Protein rank plot

In [5]:
def format_uniprot_annotations(df, sep=None, regex="[; ]*([^\[]*) \[[^;]+\]"):
    df = df.set_index("Entry").copy()
    df_formatted = []
    # iterate proteins
    for e,row in df.iterrows():
        # iterate annotation types
        for t,a in row.iteritems():
            try:
                if str(a) in ["", "nan"]:
                    continue
                # iterate annotation values
                if sep is not None and regex is None:
                    for v in a.split(sep):
                        df_formatted.append([e,t,v])
                elif regex is not None and sep is None:
                    for v in re.findall(regex, a):
                        df_formatted.append([e,t,v])
            except:
                print(e,t,a)
    df_formatted = pd.DataFrame(df_formatted, columns=["Protein ID", "Annotation type", "Annotation value"])
    return df_formatted

In [6]:
def add_annotation_column(df, annotations, id_col="Majority protein IDs",
                          annotation_type=None, search_term="cytoskeleton", verbose=True):
    if annotation_type != None:
        annotations = annotations[annotations["Annotation type"] == annotation_type].copy()
    terms = [el for el in set(annotations["Annotation value"]) if re.match(search_term, el) or search_term in el]
    if verbose:
        print("Found matching terms:",terms)
    ids = set(annotations.loc[annotations["Annotation value"].isin(terms), "Protein ID"])
    df_out = df.copy()
    df_out.insert(0, annotation_type+": "+search_term if annotation_type != None else search_term,
                  [search_term if any([i in el for i in ids]) else "" for el in df_out[id_col]])
    return df_out

In [7]:
yeast_go = pd.read_csv("./annotations/Saccharomycescerevisiae_SP_GO.tab", sep='\t')
yeast_go = format_uniprot_annotations(yeast_go)
yeast_go

Unnamed: 0,Protein ID,Annotation type,Annotation value
0,P10659,Gene ontology (biological process),methionine metabolic process
1,P10659,Gene ontology (biological process),one-carbon metabolic process
2,P10659,Gene ontology (biological process),S-adenosylmethionine biosynthetic process
3,P10659,Gene ontology (molecular function),ATP binding
4,P10659,Gene ontology (molecular function),metal ion binding
...,...,...,...
48983,P40217,Gene ontology (cellular component),eukaryotic 43S preinitiation complex
48984,P40217,Gene ontology (cellular component),eukaryotic 48S preinitiation complex
48985,P40217,Gene ontology (cellular component),eukaryotic translation initiation factor 3 com...
48986,P40217,Gene ontology (cellular component),eukaryotic translation initiation factor 3 com...


In [8]:
yeast_proteins = yeast_proteins.loc[yeast_proteins["Reverse"] != "+"]
yeast_proteins = yeast_proteins.loc[yeast_proteins["Potential contaminant"] != "+"]
yeast_proteins_ann = add_annotation_column(yeast_proteins, yeast_go,
                                           annotation_type="Gene ontology (molecular function)",
                                           search_term="structural constituent of ribosome")
yeast_proteins_ann = add_annotation_column(yeast_proteins_ann, yeast_go,
                                           annotation_type="Gene ontology (molecular function)",
                                           search_term="transcription factor activity")
yeast_proteins_ann.insert(0, "Annotation", [";".join([i for i in el if i != ""])
                                            if ";".join([i for i in el if i != ""]) != "" else "none"
                                            for _,el in yeast_proteins_ann[[
                                                "Gene ontology (molecular function): structural constituent of ribosome",
                                                "Gene ontology (molecular function): transcription factor activity"
                                            ]].iterrows()])

Found matching terms: ['structural constituent of ribosome']
Found matching terms: ['mitochondrial transcription factor activity', 'DNA-binding transcription factor activity, RNA polymerase II-specific', 'DNA-binding transcription factor activity']


In [23]:
rank_column = "Intensity wt_0h_1"
annotation_columns = ["Gene names", "Protein IDs"]
color = "Annotation"
fig_rank = px.scatter(yeast_proteins_ann.loc[yeast_proteins[rank_column] != 0, annotation_columns+[rank_column, color]]\
           .sort_values(rank_column, ascending=False).reset_index(), color=color,
           y=rank_column, render_mode="svg", log_y=True, hover_data=annotation_columns, template="simple_white")\
.update_xaxes(title_text="Rank").update_yaxes(showgrid=True, exponentformat="e")\
.update_layout(width=400, height=400, font_size=10)\
.update_traces(marker_color="lightgrey", selector=lambda x: x.name == "none")
fig_rank.show()

## Fig 4C, D: Sample correlation plot

In [10]:
def plot_sample_correlations(df, data_columns="Intensity (.*)", correlation_function=lambda x: np.corrcoef(x.T),
                             mode="scatter", log10=True, binning=10):
    # pick and process data
    df_sub = df[[el for el in df.columns if re.match(data_columns, el)]].copy()
    if log10:
        df_sub = df_sub.apply(np.log10)
    df_sub = df_sub.replace([np.inf, -np.inf], np.nan)
    df_sub.columns = [re.findall(data_columns, el)[0] for el in df_sub.columns]
    
    if mode == "scatter":
        # setup subplots and axes
        fig = make_subplots(rows=len(df_sub.columns), cols=len(df_sub.columns), start_cell='bottom-left',
                            shared_yaxes=True, shared_xaxes=True, horizontal_spacing=0.03, vertical_spacing=0.03)
        i_range = (np.floor(np.nanmin(df_sub)), np.ceil(np.nanmax(df_sub))+1/binning)
        j_range = (np.floor(np.nanmin(df_sub)), np.ceil(np.nanmax(df_sub))+1/binning)
        i_width = int((i_range[1]-i_range[0]-1/binning)*binning+1)
        j_width = int((j_range[1]-j_range[0]-1/binning)*binning+1)
        
        # fill plots
        for i,ni in enumerate(df_sub.columns):
            for j,nj in enumerate(df_sub.columns):
                # apply datashader
                dc = ds.Canvas(plot_width=i_width, plot_height=j_width, x_range=i_range, y_range=j_range)
                df_ij = df_sub[[ni,nj]].dropna() if i!=j else pd.DataFrame(df_sub[ni].dropna())
                da = dc.points(df_ij, x=ni, y=nj)
                zero_mask = da.values == 0
                da.values = da.values.astype(float)
                da.values[zero_mask] = np.nan
                
                # add trace
                fig.add_trace(
                    go.Heatmap(z=da,coloraxis="coloraxis1" if i!=j else "coloraxis2"),
                    row=j+1, col=i+1
                )
                
                # add annotations
                if j == 0:
                    fig.update_xaxes(title_text=ni, row=j+1, col=i+1, tickvals=list(range(0,i_width,binning)),
                                     ticktext=np.round(da[nj].values[0:i_width:binning]))
                if i == 0:
                    fig.update_yaxes(title_text=nj, row=j+1, col=i+1, tickvals=list(range(0,j_width,binning)),
                                     ticktext=np.round(da[ni].values[0:j_width:binning]))
                if i!=j:
                    fig.add_annotation(dict(text=str(np.round(np.min(correlation_function(df_sub[[ni,nj]].dropna())),4)),
                                            x=binning, y=j_width, showarrow=False), row=j+1, col=i+1)
        
        # layout figure
        fig.update_layout(template="simple_white", coloraxis2=dict(showscale=False, colorscale=["black", "black"]),
                          width=i*200+100, height=j*200+50, margin_t=0)
    elif mode=="heatmap":
        da = np.ones((len(df_sub.columns), len(df_sub.columns)))
        for i,ni in enumerate(df_sub.columns):
            for j,nj in enumerate(df_sub.columns):
                # filter data and store correlation values
                df_ij = df_sub[[ni,nj]].dropna() if i!=j else pd.DataFrame(df_sub[ni].dropna())
                if i!=j:
                    da[i,j] = np.round(np.min(correlation_function(df_sub[[ni,nj]].dropna())),4)
        # create figure and label axes
        fig = go.Figure(data=go.Heatmap(z=da))
        fig.update_xaxes(tickvals=list(range(0,i+1,1)),
                          ticktext=list(df_sub.columns))
        fig.update_yaxes(tickvals=list(range(0,j+1,1)),
                          ticktext=list(df_sub.columns))
        fig.update_layout(template="simple_white", width=i*50+100, height=j*50+100)
    else:
        raise ValueError
    return fig

In [26]:
cross_corr = plot_sample_correlations(yeast_proteins, data_columns="Intensity (wt_0h_[123])")
cross_corr.update_layout(coloraxis1=dict(showscale=False), font_size=10)

In [12]:
cross_corr2 = plot_sample_correlations(yeast_proteins, data_columns="Intensity (.*)", mode="heatmap")
cross_corr2.update_layout(coloraxis1=dict(showscale=False), font_size=10, width=500, height=500)

## Figure 4E, F: Volcano plots

In [13]:
import easyMLR as emlr
import statsmodels.stats.power as power

In [14]:
def run_ttest(df, c1, c2, cols_ann=["Majority protein IDs", "Gene names", "Annotation"],
              s0=0.05, fdr=0.01, min_fc=None, n_perm=2, plot_fdr_line=True):
    df_in = df[[col for col in df.columns if col in cols_ann or re.match(c1, col) or re.match(c2, col)]].copy()
    df_in.set_index(cols_ann, inplace=True)
    df_in = df_in.apply(np.log2).replace([np.inf, -np.inf], np.nan).dropna().reset_index()
    if s0 and not min_fc:
        res, fig = emlr.perform_ttest_analysis(df_in, id_col=cols_ann[0],
                                               c1 = [col for col in df_in.columns if re.match(c1, col)],
                                               c2 = [col for col in df_in.columns if re.match(c2, col)],
                                               plot_fdr_line=plot_fdr_line, s0=s0, fdr=fdr, n_perm=n_perm)
        fig.update_layout(legend_title_text="FDR {}, s0 {}".format(str(fdr), str(s0)))
    elif not s0 and min_fc:
        res, _ = emlr.perform_ttest_analysis(df_in, id_col=cols_ann[0],
                                             c1 = [col for col in df_in.columns if re.match(c1, col)],
                                             c2 = [col for col in df_in.columns if re.match(c2, col)],
                                             plot_fdr_line=False, s0=0, fdr=fdr, n_perm=n_perm)
        res.insert(res.shape[1], "square_cutoff",
                   ["non_sig" if abs(fc)<min_fc or qval>fdr else "sig" for fc,qval in zip(res.fc, res.qval)])
        sd = max([df_in[[col for col in df_in.columns if re.match(c1, col)]].std(axis=1).mean(),
                  df_in[[col for col in df_in.columns if re.match(c2, col)]].std(axis=1).mean()])
        exp_pow = power.tt_ind_solve_power(effect_size=min_fc/sd, alpha=fdr,
                                           nobs1=len([col for col in df_in.columns if re.match(c1, col)]))
        fig = px.scatter(x=res.fc,
                         y=-np.log10(res.pval),
                         color=res.square_cutoff,
                         template='simple_white', render_mode="svg",
                         labels=dict(x="log2 fold change", y="-log10(p-value)",
                                     color="FDR {}, power {}".format(str(fdr), str(np.round(exp_pow, 2)))
                                    )
                        ).update_layout(width=600, height=700)\
        .add_vline(min_fc, line_width=2).add_vline(-min_fc, line_width=2)\
        .add_hline(-np.log10(res.loc[res.square_cutoff=="sig", "pval"].max()), line_width=2)
    return res, fig

In [29]:
result_volc

Unnamed: 0,Majority protein IDs,Gene names,Intensity RPN4_0h_1,Intensity RPN4_0h_2,Intensity RPN4_0h_3,Intensity RPN4_0h_4,Intensity wt_0h_1,Intensity wt_0h_2,Intensity wt_0h_3,Intensity wt_0h_4,fc,tval,pval,tval_s0,pval_s0,qval,FDR 1%
0,A5Z2X5,YPR010C-A,29.798368,30.127433,29.706389,30.296579,29.627510,29.769260,29.119168,29.787527,-0.406326,-1.945885,0.099626,-1.315767,0.236289,0.233084,non_sig
1,D6VTK4,STE2,27.634434,27.290088,26.803412,26.924790,28.843620,28.835029,28.133202,27.543405,1.175633,3.219143,0.018157,2.527152,0.044852,0.037895,non_sig
2,D6W196,SAL1,23.451865,24.255227,24.728530,23.910313,23.728374,25.238487,23.480252,23.409634,-0.122297,-0.240786,0.817738,-0.201177,0.847207,0.618326,non_sig
3,O13297,CET1,28.394275,28.428981,28.411228,27.669323,28.173790,28.981635,27.753087,27.820190,-0.043776,-0.129651,0.901080,-0.100026,0.923582,0.648719,non_sig
4,O13535,TY1B-H,25.400999,27.159906,26.410734,27.477846,24.160233,24.589551,26.548382,24.381592,-1.692432,-2.357416,0.056483,-2.069195,0.083981,0.073147,non_sig
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3147,Q99383,HRP1,30.382677,30.594326,30.310622,30.147708,30.569642,30.373579,29.984001,30.460999,-0.011778,-0.074744,0.942848,-0.045726,0.965013,0.664122,non_sig
3148,Q99385,VCX1,27.700415,28.565254,28.573729,28.298590,29.013052,28.754299,28.584987,28.640825,0.463794,2.053408,0.085836,1.423268,0.204513,0.201988,non_sig
3149,Q99394,TRS33,24.270207,27.179971,27.331594,27.824809,26.248149,25.876380,25.471627,26.226906,-0.695880,-0.842397,0.431856,-0.751433,0.480817,0.453006,non_sig
3150,Q9P305,IGO2,25.244016,27.076227,25.798823,24.551568,26.084340,27.099990,25.615602,25.363425,0.373181,0.567459,0.590986,0.492560,0.639827,0.548003,non_sig


In [15]:
result_volc, fig_volc = run_ttest(yeast_proteins, cols_ann=["Majority protein IDs", "Gene names"],
                                  c1="Intensity wt_0h.*", c2="Intensity RPN4_0h.*",
                                  s0=0.1, fdr=0.01, n_perm=10)

In [16]:
result_square, fig_square = run_ttest(yeast_proteins, cols_ann=["Majority protein IDs", "Gene names"],
                          c1="Intensity wt_0h.*", c2="Intensity RPN4_0h.*",
                          min_fc=1.5, s0=None, fdr=0.01, n_perm=10)

In [17]:
fig_volc.update_layout(font_size=10, width=400, height=400).show()
fig_square.update_layout(font_size=10, width=400, height=400).show()

## Figure 4G: Enrichment analysis

In [18]:
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

In [19]:
def enrichment_analysis(df, df_ann, annotation_type="Gene ontology (cellular component)",
                        category_column="FDR 1%", id_col="Majority protein IDs", mht="fdr_bh", fdr=0.05):
    df_ann = df_ann[df_ann["Annotation type"] == annotation_type].copy()
    df = df[[category_column, id_col]].copy()
    terms = set(df_ann["Annotation value"])
    test_results = list()
    for i,term in enumerate(terms):
        df_t = add_annotation_column(df, df_ann, id_col=id_col,
                                     search_term="^{}$".format(re.escape(term)), verbose=False)
        df_t.drop(id_col, inplace=True, axis=1)
        df_t.columns = ["term", "category"]
        df_t = df_t.groupby(["term", "category"]).apply(len).unstack("term")
        # reshape the dataframe to fit the requirements for the test
        df_t = df_t.iloc[::-1, ::-1].replace(np.nan, 0)
        #if df_t.max().max() == 0:
        #    continue
        if df_t.shape != (2,2):
            continue
        odds, pval = fisher_exact(df_t)
        test_results.append([term, odds, pval, df_t.iloc[0,0], df_t.iloc[1,0]])
    test_results = pd.DataFrame(test_results, columns=["term", "odds_ratio", "pval", "n significant", "n insignificant"])
    rej,pval_adj,_,_ = multipletests(test_results.pval, alpha=fdr, method=mht)
    test_results.insert(0, "significant", rej)
    test_results.insert(0, "pval_adj", pval_adj)
    return test_results

In [20]:
def plot_enrichment(df, x="pval_adj", y="term", size="n significant", color="odds_ratio", log10=["pval_adj"]):
    df = df.copy()
    if len(log10) != 0:
        for el in log10:
            df.insert(0,"-log10({})".format(el),-np.log10(df[el]))
        if x in log10:
            x = "-log10({})".format(x)
        if y in log10:
            y = "-log10({})".format(y)
        if size in log10:
            size = "-log10({})".format(size)
        if color in log10:
            color = "-log10({})".format(color)
    sizes = list(set(df[size]))
    while len(sizes) > 6:
        new_sizes = sizes[::2]
        if new_sizes[-1] != sizes[-1]:
            new_sizes.append(sizes[-1])
        sizes=new_sizes
    fig = make_subplots(rows=1, cols=2, column_widths=[400, 70])
    fig.add_trace(go.Scatter(x=df[x], y=df[y],
                             marker=dict(size=np.interp(df[size], (df[size].min(), df[size].max()), (10,30)), 
                                         color=df[color], showscale=True,colorbar=dict(title=color)),
                             mode="markers", showlegend=False),
                  row=1, col=1)
    fig.update_layout(xaxis_title=x, yaxis_title=y)
    fig.add_trace(go.Scatter(y=list(range(len(sizes))), x=np.repeat(1, len(sizes)),
                             marker=dict(color=px.colors.qualitative.D3[0], size=np.interp(sizes, (min(sizes), max(sizes)), (10,30))),
                             showlegend=False, mode="markers"),
                  row=1, col=2)
    fig.update_xaxes(visible=False, range=(0,2.5), row=1, col=2)
    fig.update_yaxes(visible=False, range=(-6, len(sizes)), row=1, col=2)
    fig.add_annotation(dict(text=size, x=1, y=len(sizes)-0.4, showarrow=False), row=1, col=2)
    for s in sizes:
        fig.add_annotation(dict(text=str(int(s)), x=2, y=sizes.index(s), showarrow=False), row=1, col=2)
    fig.update_layout(template="simple_white", margin_t=10)
    return fig

In [21]:
fe = enrichment_analysis(result_volc, yeast_go, annotation_type="Gene ontology (biological process)")

In [22]:
fig_enr = plot_enrichment(fe[fe.significant].sort_values("pval_adj", ascending=False))
fig_enr.update_layout(font_size=10, width=850, height=300, margin_l=450).update_yaxes(range=(-2.5,4.1),row=1,col=2)