# Measuring relative protein abundance changes

Relative protein abundance changes are calculated, and the proteins are classified based on the statistical significance of the change.

In [1]:
## logging functions
from icecream import ic as info
import logging
## data functions
import pandas as pd
## system functions
from os.path import dirname, splitext
import sys
## system functions from roux
from roux.lib.io import read_table, to_table
## data functions from roux
import roux.lib.dfs as rd # attributes
## plotting functions from roux
from roux.viz.io import to_plot
sys.path.append('..')

In [40]:
## parameters
input_path=None # path of the input table, read with `read_table`
output_path=None # path of the final output table, written with `to_table`
column_abundance=None # name of the abundance column; must not contain 'log' (validated below)
# columns in the table to groupby with. e.g. gene or gene and replicate
# NOTE: each column is listed once; a repeated key adds nothing to the grouping.
columns_groupby=[
 'pairs',
 'gene symbol query',
 'genes id',
]
cutoff_qvalue=None # cutoff to be applied on the q-values associated with the abundance changes
cutoff_pvalue=None # cutoff to be applied on the p-values associated with the abundance changes
cutoff_log_fold_change=None # cutoff to be applied on the log-fold-change scores

# name given to the column that holds the difference in abundance (DELTA minus WT)
col_value='protein abundance difference (DELTA-WT)'

In [None]:
## parameters inferred
# directory that contains the output table
output_dir_path=dirname(output_path)
# plots go to a directory named after the output table, with the file extension removed
output_path_stem=splitext(output_path)[0]
output_plots_dir_path=f"{output_path_stem}_plots/"
logging.info(output_path)

In [None]:
## validate parameters
# The abundance column is log2-transformed later in this notebook, so a column
# name that already mentions 'log' is rejected here (presumably to avoid
# transforming already log-scaled values twice).
assert 'log' not in column_abundance, f"column_abundance should not be log-scaled, found: {column_abundance!r}"

## Abundance change

In [4]:
## abundance by cell
# read the input table and preview its first row
df01=read_table(input_path)
df01.head(n=1)

In [7]:
# sanity check: grouping by `columns_groupby` must give exactly one group per gene symbol
genes_count=df01['gene symbol query'].nunique()
groups_count=df01.groupby(columns_groupby).ngroups
assert genes_count==groups_count, "the difference should be gene-wise"

In [8]:
from roux.stat.transform import plog
# add a log2-scaled copy of the abundance column
# (`p=1` is presumably the pseudocount used by `plog`; confirm against roux)
column_abundance_log2=column_abundance+' (log2 scale)'
df01[column_abundance_log2]=df01[column_abundance].apply(
    lambda value: plog(value,p=1,base=2)
)

In [9]:
## calculate difference 
## between abundance when partner's status is DELTA and when partner's status is WT 
from roux.stat.diff import get_stats_groupby
df1=get_stats_groupby(
    df1=df01,
    cols_group=columns_groupby, ## one set of statistics per group
    cols_value=[column_abundance+' (log2 scale)'], ## values that are compared
    colsubset='status partner', ## column that holds the subset labels
    subsets=['DELTA','WT'], ## the two subsets that are compared
    colindex=['image id'], ## index for individual values
    coff_p=cutoff_pvalue,
    coff_q=cutoff_qvalue,
)
df1.head(n=1)

In [11]:
## arrange the table
# columns of the statistics table that are dropped from the arranged table
columns_to_drop=[
    'subset1',
    'subset2',
    'variable',
    'change',
    f'change is significant, P (MWU test) < {cutoff_pvalue}',
]
# the generic 'subset1'/'subset2' labels are replaced by the subset names found in the first row
subset_names=df1.iloc[0,:][["subset1","subset2"]].to_dict()
df2=(
    df1
    .drop(columns=columns_to_drop)
    .rd.dropby_patterns('median')
    .rd.renameby_replace(subset_names)
    .rd.renameby_replace({'len ':'n '})
    # errors='raise' makes a missing difference column fail loudly
    .rename(columns={'difference between mean (DELTA-WT)':col_value},errors='raise')
    .sort_values(col_value)
)
# counts per class of the q-value based significance calls
info(df2[f'significant change, Q (MWU test) < {cutoff_qvalue}'].value_counts())
df2.head(n=1)

In [12]:
# write the arranged scores to the output directory, before the classification step
to_table(
    df2,
    f'{output_dir_path}/01_score.tsv',
)

### Classify abundance change

In [16]:
# Classify each row: keep the q-value based call only where the absolute
# difference reaches the log-fold-change cutoff; every other row is set to 'ns'.
# Vectorised with `Series.where` instead of a row-wise `apply`, which gives the
# same labels and also works on an empty table.
col_significant=f'significant change, Q (MWU test) < {cutoff_qvalue}'
passes_cutoff=df2[col_value].abs()>=cutoff_log_fold_change
df2['protein abundance change']=df2[col_significant].where(passes_cutoff,'ns')
logging.info(df2['protein abundance change'].value_counts())

In [17]:
logging.info(f"-> change in classification because of te cutoff on |LFC| at {cutoff_log_fold_change}")
# cross-tabulate the final classes against the q-value based calls
classification_changes=pd.crosstab(
    df2['protein abundance change'],
    df2[f'significant change, Q (MWU test) < {cutoff_qvalue}'],
)
logging.info(classification_changes)

In [18]:
# write the classified table to the final output path
to_table(df2,output_path)

In [19]:
# preview the first row of the classified table
df2.head(n=1)

## Plot
### Distribution

In [23]:
# histogram (30 bins) of the abundance differences
ax=df2[col_value].hist(bins=30)
# hand the axes to `to_plot` together with the output prefix
to_plot(plotp=ax,prefix=output_plots_dir_path+'/hist_')

### Volcano plot

In [2]:
from modules.tools.plot import volcano_abundance_change
# volcano plot of the abundance differences, with points labelled by gene symbol
ax=volcano_abundance_change(
    data=df2,
    colx=col_value,
    col_text='gene symbol query',
    highlight=3,
    palette=['orange','b','gray'],
    verbose=False,
)
# hand the axes to `to_plot` together with the output prefix
to_plot(plotp=ax,prefix=output_plots_dir_path+'/volcano_')