# Calculation of redistribution scores

Distances (e.g. Euclidean or cosine, as set by `method_distance`) between the centroids of the extracted features are calculated.

In [None]:
## logging functions
from icecream import ic as info
import logging
## data functions
import itertools
import numpy as np
import pandas as pd
## file path functions
from os.path import dirname, splitext
## data functions from roux
from roux.lib.set import flatten
from roux.lib.io import read_table, to_table
## system functions
import sys
import roux.lib.dfs as rd # attributes
sys.path.append('../')

In [2]:
## parameters
## placeholders: all are None here and need to be set before the next cells are run
input_path = None
output_path = None  # f'{output_dir_path}/02_mapped_genes.tsv'

input_type = None  # features
unit = None  # replicates
method_distance = None  # "euclidean" # cosine

pcs = None  # number of pcs to use

In [3]:
## validate inputs
## case-insensitive parameters
method_distance=method_distance.lower()
input_type=input_type.lower()

assert input_type in ['features','pcs'], input_type
assert input_type in input_path, f"input_type ({input_type}) not found in input_path ({input_path})."
assert unit in ['replicates','genes'], unit
## infer inputs and read the data
if input_type=='pcs':
    ## `pcs` is None by default; fail here with a clear message rather than
    ## with a TypeError later, where the names of the PC columns are made.
    assert pcs is not None and pcs>=1, f"pcs ({pcs}) should be a positive integer if input_type is 'pcs'."
    logging.warning("Features are favored over PCS for calulating the distances.")
## directory in which the intermediate tables are saved
output_dir_path=dirname(output_path)
print(output_dir_path)
## directory for the plots, named after the output file
output_plots_dir_path=f"{splitext(output_path)[0]}_plots/"

## Redistribution score calculations

### Input data

In [4]:
## read the input table and preview the first row
df01 = read_table(
    input_path,
    drop_index=True,
)
df01.head(1)

In [5]:
## column/s identifying the pairs
columns_index = ['pairs']
## column/s identifying the constructs
columns_construct = ['label common']

In [6]:
## columns containing the values used to calculate the distances
if input_type == 'pcs':
    ## the first `pcs` principal components
    columns_value = [f'PC #{i}' for i in range(1, pcs + 1)]
elif input_type == 'features':
    ## all the columns with 'feature #' in the name
    columns_value = df01.filter(like='feature #').columns.tolist()
## check the required columns (presumably for missing values; see roux's `assert_dense`)
_ = df01.rd.assert_dense(
    subset=columns_index + columns_construct + columns_value,
)

In [7]:
## work on a copy, so that the table as read (`df01`) is not modified
df1 = df01.copy()
# if test: to_table(df1,f'{output_dir_path}/inputs/df1.tsv')

In [8]:
## sanity checks: 4 labels per pair, 4 common labels in total and 4 common labels per pair
assert df1.groupby('pairs')['label'].nunique().eq(4).all()
assert df1['label common'].nunique() == 4
assert df1.groupby('pairs')['label common'].nunique().eq(4).all()
## show the common labels
df1['label common'].unique()

### Centroids

In [9]:
def get_centroids(
    df1,
    cols_groupby,
    ):
    """
    Calculate the centroids i.e. the means of the value columns for each group.

    Parameters:
        df1 (pd.DataFrame): input table, containing the columns in `cols_groupby` and the value columns.
        cols_groupby (list): columns to group the rows by.

    Returns:
        pd.DataFrame: table with `cols_groupby` as columns and the mean of each value column per group.

    Notes:
        The value columns are read from the global variable `columns_value`.
    """
    ## centroids
    return (
        df1
        .log()
        .groupby(cols_groupby).agg(
                                ## 'mean' in place of `np.mean`: passing the numpy function to `agg`
                                ## is deprecated in recent pandas, and it already resolves to the groupby mean.
                                {c:'mean' for c in columns_value}
                            )
        .reset_index()
        ## logging through roux's `log` accessor, for the columns that identify the constructs and the pairs
        .log('label common')
        .log('pairs')
        .log('pairs','label common')
    )
## centroids per pair, genes and construct (and per replicate, if the unit is replicates)
df2 = get_centroids(
    df1,
    cols_groupby=[
        *columns_index,
        'gene symbol query',
        'gene symbol partner',
        *columns_construct,
        *(['replicate'] if unit == 'replicates' else []),
    ],
)
## save tables
to_table(df2, f"{output_dir_path}/00_centroids.tsv")

### Calculate the similarities (distances) between the features of the constructs

### Merge by common construct labels

In [10]:
## comparison types
## one row per unordered pair of the common labels; the labels are sorted within each pair
df0 = (
    pd.DataFrame(
        [
            [*pair, ' and '.join(pair)]
            for pair in map(
                sorted,
                itertools.combinations(df2['label common'].unique().tolist(), 2),
            )
        ],
        columns=[
            'label common construct1',
            'label common construct2',
            'distance between',
        ],
    )
    .sort_values('distance between')
    .reset_index(drop=True)
)
df0

In [11]:
def merge_by_comparison(
    df0_: pd.DataFrame,
    df1: pd.DataFrame,
    right_ons_common,
    test=False,
    )-> pd.DataFrame:
    """
    Pair up the rows of the two constructs of a comparison.

    Parameters:
        df0_ (pd.DataFrame): table of the comparison. The labels of the two constructs are read from the first row of the columns `label common construct1` and `label common construct2`.
        df1 (pd.DataFrame): table containing the columns `label common`, `gene symbol query` and `gene symbol partner`, and the columns in `right_ons_common`.
        right_ons_common (list): columns passed to `rd.merge_paired` as `right_ons_common`. It can include `gene symbol for merging`, which is created here.
        test (bool): if True, show the labels of the constructs being compared.

    Returns:
        pd.DataFrame: the output of `rd.merge_paired`.

    Raises:
        AssertionError: if the rows of exactly two construct labels are not found in `df1`.
    """
    labels_common=df0_[['label common construct1','label common construct2']].values[0].tolist()
    if test: info(labels_common)
    def get_gene_symbol_for_merging(df,label1,label2):
        """Gene symbols by which the rows of the two constructs are matched, for all the rows at once."""
        ## the labels are the same for all the rows, so they are compared only once.
        if label1.split('-')[0]==label2.split('-')[0]:
            ## same queries e.g. gene1 - gene1
            ## redistribution e.g. gene1-GFP gene2-DELTA	gene1-GFP gene2-WT
            return df['gene symbol query']
        ## different queries e.g. gene1 - gene2
        ### for calculating distances from the partner
        if label1.split('-')[-1]!=label2.split('-')[-1]:
            ##  e.g. gene1-GFP gene2-DELTA gene2-GFP gene1-WT
            use_partner=df['label common'].str.endswith('-WT')
        else:
            ##  e.g. gene1-GFP gene2-WT gene2-GFP gene1-WT
            use_partner=df['label common'].str.startswith('gene2-')
        ## partner's symbol where `use_partner` is True, else the query's symbol
        return df['gene symbol partner'].where(use_partner,df['gene symbol query'])
    df1=(df1
        .query("`label common` in @labels_common")
        .assign(
            **{
                ## for calculating distances from the partner 
                'gene symbol for merging':lambda df: get_gene_symbol_for_merging(df,labels_common[0],labels_common[1]),
            }
            )
         .rd.assert_no_dups(['label common']+right_ons_common)
        )

    assert df1['label common'].nunique()==2

    df2=(
        df0_
        .rd.merge_paired(df1,
            left_ons=[
                'label common construct1',
                'label common construct2',
                ],
            right_on=['label common'],
            common=[],
            right_ons_common=right_ons_common,
            how='inner',
            validates=['1:m', '1:1'],
            suffixes=None,
            test=False,
            verb=True,
            # **kws,
        )
        )
    return df2
## merged
## the rows of the two constructs are paired up separately for each comparison
df3 = (
    df0
    .groupby('distance between', as_index=False)
    .apply(
        lambda df_comparison: merge_by_comparison(
            df0_=df_comparison,
            df1=df2,
            right_ons_common=[
                *columns_index,
                'gene symbol for merging',
                *(['replicate'] if unit == 'replicates' else []),
            ],
            test=False,
        )
    )
    .reset_index(drop=True)
    ## one-directional merged i.e. either `label common construct1` or `label common construct2`.
    .log.dropna(subset=['distance between'])
    ## one-directional merged i.e. either `label common construct1` or `label common construct2`.
    .log.dropna(
        subset=[
            f'{s} construct{i}'
            for s in columns_value
            for i in (1, 2)
        ]
    )
    .log('distance between')
    .log('pairs')
    .log('pairs', 'distance between')
)
df3.head(1)

In [12]:
## save the merged table
to_table(
    df3,
    f"{output_dir_path}/01_merged.tsv",
)

### Distances

In [13]:
## distances
from scipy.spatial import distance
## the function of the metric e.g. `distance.euclidean`
get_distance=getattr(distance,method_distance)
## Distance between the paired coordinates of construct1 and construct2, row by row.
## The coordinates are extracted once as arrays, in the order of `columns_value`,
## instead of re-making the column names for each row; this also works if the table is empty.
df3[f'{method_distance} distance']=[
    get_distance(u,v)
    for u,v in zip(
        df3[[f'{s} construct1' for s in columns_value]].to_numpy(dtype=float),
        df3[[f'{s} construct2' for s in columns_value]].to_numpy(dtype=float),
    )
]

In [14]:
## preview the first row, with the distance
df3.head(n=1)

In [15]:
## save the table with the distances
to_table(
    df3,
    f"{output_dir_path}/02_distances.tsv",
)

### Small file

In [16]:
## small file
## without the value columns of the two constructs
df3 = df3.drop(
    columns=[
        f'{s} construct{i}'
        for s in columns_value
        for i in (1, 2)
    ],
)
to_table(df3, output_path)
df3.head(1)