# Combining processed data.

Merging of the processed data and classification of the paralog pairs.

In [1]:
## logging functions
from icecream import ic as info
import logging
## data functions
import numpy as np
## system functions
from os.path import dirname
import sys
## system functions from roux
from roux.lib.io import read_table
from roux.lib.io import to_table
## data functions from roux
import roux.lib.dfs as rd # attributes
sys.path.append('../')

In [33]:
## parameters
## NOTE(review): every default is None; presumably the real values are injected
## when the notebook is executed -- confirm against the workflow that runs it.

## inputs
input_path = None  # redistribution table
abundance_change_path = None  # pair-wise abundance change table
relocalization_paths = None  # mapping of table name -> path; the 'genes' and 'pairs' entries are read below
genes_path = None  # genes from the study

## outputs
output_path = None  # pair-wise table; its directory also receives the gene-wise table

In [3]:
## directory of the pair-wise output; the gene-wise table is written into it as well.
## `dirname` returns '' when `output_path` is a bare file name, and
## f'{output_dir_path}/...' would then point at the filesystem root,
## so fall back to the current directory in that case.
output_dir_path=dirname(output_path) or '.'
logging.info(output_dir_path)

## Inputs

### Abundance change

In [4]:
## abundance change pair-wise
## read the table at `abundance_change_path` and preview its first row
df01=read_table(abundance_change_path)
df01.head(1)

### Redistribution

In [5]:
## read the table at `input_path` (the redistribution input) and preview its first row
df02=read_table(
    input_path,
)
df02.head(1)

### Relocalization

In [6]:
## read every relocalization table, keyed by the names used in `relocalization_paths`
df03s={}
for table_name,table_path in relocalization_paths.items():
    df03s[table_name]=read_table(table_path)
## preview the first row of the gene-wise table
df03s['genes'].head(1)

## Merge the abundance and distribution changes

### Gene-wise

In [7]:
## combine the redistribution table (df02) with the abundance change table (df01).
## both tables are first renamed so that they share the 'gene symbol' column;
## `errors='raise'` makes a missing 'gene symbol query' column fail loudly.
df1=(df02.rename(columns={'gene symbol query':'gene symbol',},errors='raise')
    .log.merge(
        right=df01.rename(columns={'gene symbol query':'gene symbol',},errors='raise'),
        ## each join key is listed once ('pairs' was previously given twice)
        on=[
            'pairs',
            'gene symbol',
        ],
        how='inner',
        ## each (pair, gene) combination must be unique in both tables
        validate="1:1",
    )
     ## map gene ids
    .assign(
     **{'gene id' : lambda df: df['gene symbol'].map(read_table(genes_path).rd.to_dict(['gene symbol','gene id'])),
    },
    )
    )
## every gene symbol must have been found in the table at `genes_path`
assert not df1['gene id'].isnull().any(), "gene symbols without a gene id in the genes table"
info(df1['protein abundance change'].value_counts())
df1.head(1)

#### Map relocalization 

In [8]:
## merge relocalization
## attach the gene-wise relocalization table; the left join keeps every row of df1
## and `validate` requires 'gene symbol' to be unique on both sides
relocalization_genes=df03s['genes']
df1=df1.merge(
    relocalization_genes,
    how='left',
    on='gene symbol',
    validate="1:1",
)
info(df1['relocalization type'].value_counts())
df1.head(1)

In [9]:
## save the gene-wise table in the output directory
genes_table_path=f'{output_dir_path}/00_genes.tsv'
to_table(df1,genes_table_path)

### Classify pairs

In [10]:
def _position_suffix(row):
    """Return 'gene<N>', where N is the 1-based position of the row's gene symbol within its '-'-separated `pairs` id."""
    members=row['pairs'].split('-')
    position=members.index(row['gene symbol'])+1
    return f"gene{position}"

## label each gene by its position within the pair id
df1['suffix']=df1.apply(_position_suffix,axis=1)
logging.warning("data is sorted by the 'pairs' column. i.e. gene1 is 1st gene in the `pairs` id.")

In [11]:
## sanity check: no row is left without a suffix
assert not df1['suffix'].isnull().any()

In [12]:
## reshape to one row per pair: every listed column is spread over the values of `suffix`
columns_to_spread=[
    'gene symbol',
    'protein abundance difference (DELTA-WT)',
    'protein abundance change',
    'redistribution score',
    'redistribution',
]
df2_wide=df1.pivot(
    index=['pairs'],
    columns='suffix',
    values=columns_to_spread,
)
## flatten the two-level column index and bring `pairs` back as a regular column
df2=df2_wide.rd.flatten_columns().reset_index()
df2.head(1)

In [13]:
## classify the pairs
from roux.stat.binary import classify_bools
from roux.stat.paired import get_stats_paired

## pair-level redistribution class, derived row-wise from the two gene-level columns
df2['redistribution']=df2.loc[:,['redistribution gene1','redistribution gene2']].apply(classify_bools,axis=1)

## paired statistics of the redistribution scores of the two genes
df3=get_stats_paired(
    df2,
    cols=['redistribution score gene1','redistribution score gene2'],
    input_logscale=False,
    prefix=None,
    drop_cols=False,
    unidirectional_stats=['min','max'],
    fast=False
)

## pair-level abundance change label: the unique gene-level labels
## (sorted by `np.unique`) joined with ' & '
df3["protein abundance change paired"]=(
    df3
    .loc[:,['protein abundance change gene1','protein abundance change gene2']]
    .apply(lambda x: ' & '.join(np.unique([x['protein abundance change gene1'],
                                           x['protein abundance change gene2']])),axis=1)
)

## the cells below merge, save and summarise `df2`, and the pair-wise stats read
## 'protein abundance change paired' from it. Rebind `df2` to the classified table
## so that this does not depend on `get_stats_paired` modifying its input in place.
df2=df3

In [14]:
## preview the first row of the pair-wise table
df2.head(1)

#### Map relocalization 

In [15]:
## merge relocalization
## attach the pair-wise relocalization table; the left join keeps every row of df2
## and `validate` requires 'pairs' to be unique on both sides
relocalization_pairs=df03s['pairs']
df2=df2.merge(
    relocalization_pairs,
    how='left',
    on='pairs',
    validate="1:1",
)
info(df2['relocalization type'].value_counts())
df2.head(1)

In [16]:
## save the pair-wise table at `output_path`
to_table(df2,output_path)

## Stats

### Gene-wise

In [18]:
## report how often each class occurs in the gene-wise table
gene_wise_columns=[
    'redistribution',
    'protein abundance change',
    'relocalization type',
]
for column in gene_wise_columns:
    counts=df1[column].value_counts()
    info(counts)

### Pair-wise

In [17]:
## report how often each class occurs in the pair-wise table
pair_wise_columns=[
    'redistribution',
    'protein abundance change paired',
    'relocalization type',
]
for column in pair_wise_columns:
    counts=df2[column].value_counts()
    info(counts)