# Single cell protein abundances

Single-cell protein abundances are calculated and compared with a known standard (Ho et al., 2018).

In [2]:
## logging functions
from icecream import ic as info  # aliased so that `info(...)` calls icecream's `ic`
import logging
## system functions
from os.path import dirname
import sys
## input/output functions from roux
from roux.lib.io import backup
## notebook display: `info_nb(...)` builds an IPython Markdown object
from IPython.display import Markdown as info_nb
from roux.lib.io import read_table
## visualization functions from roux
from roux.viz.io import begin_plot
## data functions from roux
import roux.lib.dfs as rd # NOTE(review): presumably registers the `.rd` dataframe accessor used in later cells -- confirm
# NOTE(review): presumably makes the project-level `modules` package
# (imported in a later cell) importable from the parent directory -- confirm
sys.path.append('..')

In [2]:
## parameters
# Path of the metadata file that is read in the 'inferred parameters' cell.
metadata_path = '../config/metadata.yaml'
# Kernel passed to `run_tasks`; when None, metadata['kernels']['default'] is used.
kernel = None
# When True, the output directory is backed up before it is overwritten.
force = False
# When True, the metadata is read with the version number set to 'test'.
test = True

In [3]:
## inferred parameters
# `read_metadata` is called below but is not among the notebook's imports;
# importing it inside the cell keeps the cell runnable in a fresh kernel.
from roux.workflow.io import read_metadata

# In test mode, the version number is overridden with 'test'.
metadata = read_metadata(
    metadata_path,
    inputs={'version': {'number': 'test'}} if test else None,
)
# The dataset configuration is read with the species name and the dataset
# path supplied as its base configuration.
metadata['dataset'] = read_metadata(
    metadata['dataset_config_path'],
    config_base=dict(
        species_name=metadata['species_name'],
        path=metadata['dataset_path'],
    ),
)
### output
output_dir_path = metadata['processed']['abundance'] + 'abundance'
logging.info(f"Output directory: {output_dir_path}")
## backup old files if overwriting (force is True)
if force:
    # Inside this branch `not force` always evaluates to False.
    backup(output_dir_path, dirname(output_dir_path), test=not force)
## misc.
# Fall back to the default kernel from the metadata when none was given.
if kernel is None:
    kernel = metadata['kernels']['default']

In [3]:
## common functions
from modules.tools.query import get_wt_protein_abundance

## common maps
# Mapping built from the 'gene symbol' and 'gene id' columns of the gene ids table.
gene_ids_table = read_table(metadata['ids']['genes'])
to_gene_ids = gene_ids_table.rd.to_dict(['gene symbol', 'gene id'])

## Abundance

### Processing

In [16]:
# One parameter set per gene set: the input path comes from
# metadata['metainfo']['filtered'] and the output path from
# metadata['abundance']['genes'].
parameters_list = [
    {
        'input_path': metadata['metainfo']['filtered'][gene_set],
        'output_path': metadata['abundance']['genes'][gene_set],
    }
    for gene_set in ('paralogs', 'controls')
]
parameters_list

In [17]:
from roux.workflow.task import run_tasks

# Run the abundance-curation notebook with the parameter sets in
# `parameters_list`; the return value is not used.
_ = run_tasks(
    input_notebook_path='31_script_abundance_curation.ipynb',
    parameters_list=parameters_list,
    kernel=kernel,
)

## Correlation with known standard [@Ho2018-fw]

In [4]:
## read data
# Gene-wise abundance table of the paralogs.
paralogs_abundance_path = metadata['abundance']['genes']['paralogs']
df01 = read_table(paralogs_abundance_path)
# Wild-type protein abundances, queried using the gene symbol to gene id map.
df1 = get_wt_protein_abundance(df01, to_gene_ids=to_gene_ids)
df1.head(1)

### Non-expressed genes

In [6]:
from roux.lib.io import read_excel

In [7]:
## read Ho et al data: classifications of the non-expressed genes
ho_table_s5 = read_excel(
    f"{metadata['dataset']['external']['path']}/2018_Ho_et_al/1-s2.0-S240547121730546X-mmc6.xlsx",
    sheet_name='Table S5',
    header=2,
)
# Keep the rows that have a value in 'Qualifier a', then collect their
# unique systematic names.
ho_table_s5_qualified = ho_table_s5.log.dropna(subset=['Qualifier a'])
genes_ids_nonexpressed = ho_table_s5_qualified['Systematic Name'].unique().tolist()
info(len(genes_ids_nonexpressed))

In [8]:
### overlap between the non-expressed and the paralogous genes
# The gene ids of the study must share no element with the non-expressed genes.
assert set(df1['gene id'].tolist()).isdisjoint(genes_ids_nonexpressed)
info_nb('No overlap between the non-expressed and the genes of the study.')

### Molecules per cell

In [9]:
# Molecules-per-cell table stored under the external dataset path.
external_dataset_path = metadata['dataset']['external']['path']
df2 = read_table(external_dataset_path + '/2018_Ho_et_al/cleaned/mmc5_molecules_per_cell.tsv')
df2.head(1)

In [10]:
## genes of the study that are absent from the Ho et al. 2018 table
genes_absent = set(df1['gene id'].tolist()).difference(df2['gene id'].tolist())
# Every gene id of the study must be found in the molecules-per-cell table.
assert not genes_absent
info_nb("All the genes are present in Ho et al., 2018 data.")

In [11]:
## genes filtered out because of the low intensity
# NOTE(review): this is the same set difference that defines `genes_absent`
# (gene ids in df1 that are missing from df2) -- confirm that no other table
# was intended for the low-GFP filter.
genes_filtered = set(df1['gene id'].tolist()).difference(df2['gene id'].tolist())
assert not genes_filtered
info_nb("No genes were filtered out in Ho et al., 2018 due to low GFP.")

In [13]:
## Merge
# Inner join of the study abundances with the Ho et al. table on 'gene id'.
# NOTE(review): `validate` and `validate_equal_length` are presumably checks
# on key uniqueness and on the row count after merging -- confirm against
# roux's `log.merge`.
df3 = df1.log.merge(
    right=df2,
    on=['gene id'],
    how='inner',
    validate="1:1",
    validate_equal_length=True,
)

df3.head(1)

### Plot

In [14]:
## plot data
data = df3.copy()
# No value of the reference abundance may be zero or negative
# (presumably because the plot uses log-scaled axes).
assert not (data['mean molecules per cell'] <= 0).any()
# Rename for the axis label: the source of the data goes on a second line.
data = data.rd.renameby_replace(
    {'mean molecules per cell': 'mean molecules per cell\nHo et al. 2018'}
)
data.head(1)

In [15]:
## plot parameters
# Keyword arguments that are unpacked into `plot_scatter` and also passed to
# `to_plot` in the plotting cell.
kws_plot = {
    'x': metadata['abundance']['column'],
    'y': 'mean molecules per cell\nHo et al. 2018',
    'ci': None,
    'order': 1,
    'scatter_kws': {'alpha': 0.5},
    'stat_kws': {'loc': 0, 'resample': True},
}

In [16]:
# plot
# `Path`, `plt` and `to_plot` are used in this cell but are not among the
# notebook's imports; importing them inside the cell keeps the cell runnable
# in a fresh kernel.
from pathlib import Path

import matplotlib.pyplot as plt
from roux.viz.io import to_plot
from roux.viz.scatter import plot_scatter

begin_plot()
fig, ax = plt.subplots(figsize=[2.5, 2.5])
ax = plot_scatter(
    data=data,
    ax=ax,
    **kws_plot,
)
# Both axes are shown on a log scale.
ax.set(xscale='log', yscale='log')
# save the plot
# The output path is derived from the paralogs abundance table path: its
# suffix is dropped and a '_plots' directory name is appended.
to_plot(
    plotp=str(Path(metadata['abundance']['genes']['paralogs']).with_suffix(''))+'_plots/scatter_protein_abundance_vs_mean molecules per cell Ho et al 2018',
    fmts=['pdf', 'png'],
    data=data,  # source data
    kws_plot=kws_plot,  # plotting parameters
    validate=True,
)