# Redistribution

Redistribution scores and classification.

In [1]:
## data functions
import numpy as np
import pandas as pd
## system functions
import logging
import sys
from os.path import dirname
from pathlib import Path
## system functions from roux
from roux.lib.io import backup
from roux.lib.io import read_dict
from roux.lib.io import read_table
from roux.lib.io import to_table
## workflow functions from roux
from roux.workflow.io import read_metadata
## visualization functions
import matplotlib.pyplot as plt
## visualization functions from roux
from roux.viz.io import begin_plot
from roux.viz.colors import get_colors_default
from roux.viz.io import to_plot
## data functions from roux
import roux.lib.dfs as rd # attributes
## add the parent directory to the import path (a later cell imports `modules.tools.plot`)
sys.path.append('..')

In [2]:
## parameters
# Path of the metadata configuration read in the next cell.
metadata_path = '../config/metadata.yaml'
# Kernel handed to `run_tasks`; when left as None it is taken from the metadata.
kernel = None
# When True, the existing output directory is backed up before it is reused.
force = False
# When True, the metadata is read with the version number overridden to 'test'.
test = True

In [3]:
## inferred parameters
# In test mode the version number is overridden; otherwise no inputs are passed.
_inputs = {'version': {'number': 'test'}} if test else None
metadata = read_metadata(metadata_path, inputs=_inputs)
# Dataset-level metadata, read with the species name and dataset path as the base config.
metadata['dataset'] = read_metadata(
    metadata['dataset_config_path'],
    config_base={
        'species_name': metadata['species_name'],
        'path': metadata['dataset_path'],
    },
)
### output
output_dir_path = metadata['processed']['redistribution']
logging.info(f"Output directory: {output_dir_path}")
## backup old files if overwriting (force is True)
if force:
    # `test=not force` evaluates to False inside this branch.
    backup(
        output_dir_path,
        dirname(output_dir_path),
        test=not force,
    )
## misc.
if kernel is None:
    # Fall back to the default kernel declared in the metadata.
    kernel = metadata['kernels']['default']

## Redistribution

## Distances

In [12]:
# One parameter set per combination of input table (paralogs/controls)
# and unit (replicates/genes), i.e. one run of the distances notebook each.
parameters_list = []
for input_path_key in ['paralogs', 'controls']:
    for unit in ['replicates', 'genes']:
        parameters_list.append({
            ## parameters
            'input_path': metadata['features']['zscore'][input_path_key],
            'output_path': metadata['redistribution']['distances'][unit][input_path_key],
            'input_type': 'features',
            'unit': unit,
            # distance metric; the original note lists "cosine" as the alternative
            'method_distance': "euclidean",
            # number of pcs to use
            'pcs': None,
            'test': False,
        })
len(parameters_list)

In [29]:
from roux.workflow.task import run_tasks

# Run the distances notebook once per entry of `parameters_list`.
_kws_tasks = {
    'input_notebook_path': '41_script_redistribution_scores.ipynb',
    'kernel': kernel,
    'parameters_list': parameters_list,
    'fast': False,
}
_ = run_tasks(**_kws_tasks)

## Redistribution scores

### Combine the paralogs with controls

In [30]:
# NOTE(review): presumably clears any `df*` variables from the namespace (`-f` skips the confirmation prompt) — confirm the quoted pattern matches as intended
%reset_selective -f "^df.*"

In [42]:
# Read every `parameters.yaml` matched by the wildcard path under the
# redistribution output directory; each value becomes one row of the table.
_parameters_by_file = read_dict(
    f"{metadata['processed']['redistribution']}/01_scores/*/*/*_reports/parameters.yaml"
)
df0 = pd.DataFrame(list(_parameters_by_file.values()))
df0

In [43]:
## metadata
import yaml

# Label each task by its input: paths containing 'controls' are controls,
# all others are paralogs. Then map `by` -> `output_path` within each unit.
d0 = (
    df0
    .assign(
        by=lambda df: df['input_path'].apply(
            lambda x: 'controls' if 'controls' in x else 'paralogs'
        ),
    )
    .groupby('unit')
    .apply(lambda df: df.rd.to_dict(['by', 'output_path']))
    .to_dict()
)
# Print as YAML, with the output directory replaced by its config placeholder.
_yaml_text = yaml.dump(d0)
print(
    _yaml_text.replace(
        metadata['processed']['redistribution'],
        "${processed.redistribution}",
    )
)

In [56]:
# Gene-level distance tables of the paralogs and of the controls,
# passed as a list to a single `read_table` call.
_distance_paths = [
    metadata['redistribution']['distances']['genes'][source]
    for source in ('paralogs', 'controls')
]
df01 = read_table(_distance_paths)
df01.head(1)

In [57]:
# Reference sets used to label the rows in a later cell:
# - positive_genes: the 'positive' entry of the visual-inspection file,
# - negative_pairs: the control pairs listed in the metadata.
positive_genes = read_dict(metadata['pre_processed']['visual_inspections']['redistribution'])['positive']
negative_pairs = metadata['data']['subsets']['controls']['pairs']
# `info` is not defined or imported anywhere in this file, so calling it raised
# a NameError. Report through `logging.info`, as is done for the output directory.
logging.info("positive_genes (n=%d): %s", len(positive_genes), positive_genes)
logging.info("negative_pairs (n=%d): %s", len(negative_pairs), negative_pairs)
# Name of the distance column; it is renamed to 'redistribution score' later on.
column_distance = 'euclidean distance'

### Separate redistribution distances

In [60]:
# Keep only the rows whose `distance between` value is listed in the metadata.
_query_expr = f"`distance between` in {metadata['redistribution']['merged']['distance between']}"
df1 = df01.log.query(expr=_query_expr)
# Exactly two kinds of `distance between` must remain after filtering.
assert df1['distance between'].nunique() == 2
# Both constructs of every row must carry the same query gene.
_same_query_gene = df1['gene symbol query construct1'] == df1['gene symbol query construct2']
assert all(_same_query_gene)

In [61]:
def _label_control(row):
    """Return the control label of one row.

    False when the row's pair is among `negative_pairs`; otherwise True when
    its query gene is among `positive_genes`; otherwise NaN (unlabeled).
    """
    if row['pairs'] in negative_pairs:
        return False
    if row['gene symbol query'] in positive_genes:
        return True
    return np.nan


df2 = (
    df1
    # The two construct columns were asserted to be equal, so keep one of them.
    .rename(
        columns={'gene symbol query construct1': 'gene symbol query'},
        errors='raise',
    )
    .drop(
        ['gene symbol for merging', 'gene symbol query construct2'],
        axis=1,
    )
    .assign(control=lambda df: df.apply(_label_control, axis=1))
    .log('gene symbol query', 'control')
)
df2.head(1)

In [62]:
# Save the filtered table, which carries the `control` labels.
_filtered_path = f"{metadata['processed']['redistribution']}/02_merged/01_filtered.tsv"
to_table(df2, _filtered_path)

#### Save table

In [63]:
# Columns left out of the small table.
_columns_to_drop = [
    'label common construct1',
    'label common construct2',
    'distance between',
    "gene symbol partner construct1",
    "gene symbol partner construct2",
]
df3 = (
    df2
    # The distance column becomes the redistribution score.
    .rename(columns={column_distance: 'redistribution score'}, errors='raise')
    .drop(_columns_to_drop, axis=1)
    # Each (query gene, pair) combination must occur only once.
    .rd.assert_no_dups(subset=['gene symbol query', 'pairs'])
)
df3.head(1)

In [64]:
# Save the small table with the redistribution scores.
_small_path = f"{metadata['processed']['redistribution']}/02_merged/02_small.tsv"
to_table(df3, _small_path)

## Classify the redistribution

### Get the threshold

In [4]:
# NOTE(review): presumably clears the `df*` tables built in the previous section (`-f` skips the confirmation prompt) — confirm the quoted pattern matches as intended
%reset_selective -f "^df.*"

In [5]:
from roux.workflow.task import run_tasks

# Single parameter set for the classification notebook: the merged gene-level
# table as input, plus the output path and cutoff method from the metadata.
_classification_parameters = {
    'input_path': metadata['redistribution']['merged']['genes'],
    'output_path': metadata['redistribution']['classified']['genes'],
    "method_cutoff": metadata['redistribution']['classified']['method'],
}
_ = run_tasks(
    input_notebook_path='42_script_redistribution_classification.ipynb',
    kernel=kernel,
    parameters_list=[_classification_parameters],
)

### Plots  

#### ROC plot

In [4]:
# Read the merged gene-level table and keep only the rows that carry a
# `control` label; that column is used as `y_true` for the ROC plot.
data = (
    read_table(metadata['redistribution']['merged']['genes'])
    .log.dropna(subset=['control'])
)
data.head(1)

In [8]:
from roux.stat.binary import get_cutoff
from roux.viz.ax_ import set_equallim

# Settings forwarded to `get_cutoff`, and stored with the plot by `to_plot`.
kws_plot=dict(
    method='roc',
    show_cutoff=dict(maximize='specificity',),
    returns=['data','ax','cutoff'],
    plot_pr=False,
    kws_area={"facecolor":metadata['colors']['default'],
             },
)

begin_plot()
fig,ax=plt.subplots(figsize=[2,2])
# `control` holds the true labels, `redistribution score` the scores.
d1=get_cutoff(
    y_true=data['control'].values,
    y_score=data['redistribution score'].values,
    ax=ax,
    **kws_plot,
    )
ax.set(
    xlim=[ax.get_xlim()[0],1],
    # NOTE(review): the lower y-limit is read from the x-axis; confirm whether
    # `ax.get_ylim()[0]` was intended (limits are equalized right after anyway).
    ylim=[ax.get_xlim()[0],1],
      )
set_equallim(ax=ax)
# `sns` is never imported in this file, so `sns.despine(trim=False, ax=ax)`
# raised a NameError. With its default arguments it only sets the spine
# visibility below, which is done here directly on the matplotlib axes.
for _side,_visible in {'top':False,'right':False,'left':True,'bottom':True}.items():
    ax.spines[_side].set_visible(_visible)
to_plot(
    f"{metadata['redistribution']['merged']['checks']}/{kws_plot['method']}.png",
    data=data,
    kws_plot=kws_plot,
    )

#### Redistribution histogram with the threshold

In [4]:
# NOTE(review): presumably clears any remaining `df*` variables (`-f` skips the confirmation prompt) — confirm the quoted pattern matches as intended
%reset_selective -f "^df.*"

In [5]:
# Input: the merged gene-level table.
_input_path = metadata['redistribution']['merged']['genes']
# Output directory: the input path without its suffix, followed by '_plots/'.
_input_stem = Path(_input_path).with_suffix('').as_posix()
_output_dir_path = f"{_input_stem}_plots/"

In [6]:
# Read the merged gene-level table for the histogram (no rows are dropped here).
_merged_genes_path = metadata['redistribution']['merged']['genes']
data = read_table(_merged_genes_path)
data.head(1)

In [7]:
# Keyword arguments of the histogram; the 'dist' entry is unpacked into
# `plot_redistribution` in the next cell.
kws_plot = {
    'dist': {
        'color_line': get_colors_default()[0],
        'method_threshold': metadata['redistribution']['classified']['method'],
        'color_text': metadata['colors']['redistribution'],
        # threshold value read from the stats file of the classification
        'show_threshold': read_dict(metadata['redistribution']['classified']['stats'])['threshold'],
        'bins': 40,
    },
}

In [8]:
begin_plot()
fig, ax = plt.subplots(figsize=[4, 3])
# Project-local plotting helper (found through the '..' entry on `sys.path`).
from modules.tools.plot import plot_redistribution

_kws_dist = kws_plot['dist']
ax = plot_redistribution(data, ax=ax, **_kws_dist)
# Save the plot together with its data and settings.
_plot_path = f"{_output_dir_path}/hist_redistribution"
to_plot(
    _plot_path,
    kws_plot=kws_plot,
    data=data,
    validate=test,
)