# Pre-processed data

The pre-processed data, including the protein abundance and the features extracted from the deep learning model, are standardized in this notebook.

In [1]:
## logging functions
import logging
## data functions
import numpy as np
import pandas as pd
## data functions from roux
from roux.lib.dict import merge_dicts
## system functions
from os.path import dirname
import sys
## system functions from roux
from IPython.display import Markdown as info_nb
from roux.lib.sys import read_ps
from roux.lib.io import read_table
from roux.lib.io import backup, to_table
from roux.workflow.io import read_metadata
import roux.lib.dfs as rd # attributes
sys.path.append('../')

In [2]:
## input parameters
# Path to the project-level metadata file.
metadata_path = '../config/metadata.yaml'
# When True, the existing output directory is backed up before it is overwritten.
force = False
# When True, the metadata is read with the version number set to 'test'.
test = True

In [3]:
## inferred parameters
# Project-level metadata; the version number is overridden with 'test' when `test` is set.
metadata = read_metadata(
    metadata_path,
    inputs={'version': {'number': 'test'}} if test else None,
)
# Dataset-level metadata; the species name and the dataset path are passed as `config_base`.
metadata['dataset'] = read_metadata(
    metadata['dataset_config_path'],
    config_base={
        'species_name': metadata['species_name'],
        'path': metadata['dataset_path'],
    },
)
### output
output_dir_path = metadata['processed']['ids']
logging.info(f"Output directory: {output_dir_path}")
## backup old files if overwriting (force is True)
if force:
    backup(
        output_dir_path,
        dirname(output_dir_path),
        test=not force,  # always False inside this branch
    )

## Metainfo
### Paralogs
#### Loading the metainfo

In [4]:
# Read the metainfo from every CSV file found in the pre-processed metainfo directory.
df00 = read_table(
    read_ps(
        f"{metadata['pre_processed']['metainfo']['path']}/*.csv"
    )
)
df00.head(1)

In [5]:
## for merging the features
# Running row number (0..n-1) over the whole metainfo table.
df00 = df00.assign(
    **{'cell id per subset': range(len(df00))}
)

In [6]:
# Save the raw paralog metainfo to the path configured in the metadata.
to_table(
    df00,
    metadata['metainfo']['raw']['paralogs'],
)

#### Standardizing columns

In [7]:
from modules.tools.io import read_pre_processed

# Standardize the raw metainfo table with the project helper.
df0 = read_pre_processed(
    df00,
    clean=True,
    rename=True,
    excludes=metadata['data']['exclude'],  # remove controls
)
df0.head(1)

In [8]:
# Save the standardized paralog metainfo to the path configured in the metadata.
to_table(
    df0,
    metadata['metainfo']['combined']['paralogs'],
)

#### Mapping gene symbols to gene IDs

In [9]:
## ensembl
# Reference table of genes, used below to map the gene symbols to gene ids.
df01 = read_table(
    metadata['dataset']['ids']['genes'],
)
df01.head(1)

In [10]:
# Unique gene symbols found in the pair names, excluding the 'control' placeholder.
df1 = (
    df0['pairs']
    .str.split('-', expand=True)   # one column per gene symbol of a pair
    .melt()['value']               # stack all the symbols into a single column
    .drop_duplicates()
    .replace('control', np.nan)    # turn the placeholder into a missing value ...
    .dropna()                      # ... so that it is dropped here
    .to_frame('gene symbol')
    .reset_index(drop=True)
    .log()
)
df1.head(1)

In [11]:
from roux.workflow.io import import_from_file

# Load the id-mapping helpers shipped with the dataset as a module.
ids = import_from_file(
    metadata['dataset']['path'] + '/modules/lib/ids.py'
)
# Map the gene symbols to gene ids.
df2 = ids.gene_symbol_to_id(
    df1,   # input
    df01,  # reference table containing the gene ids/symbols/synonyms
    clean=True,
)
df2.head(1)

In [12]:
logging.info('non-verified genes in the dataset:')
# The mapped gene ids are embedded as a list literal in the query expression.
df01.log.query(
    expr=(
        f"`gene id` in {df2['gene id'].tolist()} "
        "and `gene qualifier` != 'Verified'"
    ),
)

In [13]:
# Save the gene symbol to gene id mapping to the path configured in the metadata.
to_table(
    df2,
    metadata['ids']['genes'],
)

#### Creating pair IDs based on the gene IDs

Usage: for mapping the external paralog features.

In [14]:
# Unique pair names, split on '-' into columns prefixed with 'gene symbol gene'.
df_ = (
    df0[['pairs']]
    .log.drop_duplicates()
    .rd.split_ids(
        col='pairs',
        sep='-',
        prefix='gene symbol gene',
    )
)
df_.head(1)

In [15]:
# Merge the gene id table onto both members of each pair (matched by gene symbol),
# then build the pair id from the two sorted gene ids joined by '--'.
df3 = (
    df_
    .rd.merge_paired(
        df2=df2,
        left_ons=['gene symbol gene1', 'gene symbol gene2'],
        right_on=['gene symbol'],
        how='inner',
        validates=['1:1', '1:1'],
    )
    .assign(
        **{
            'genes id': lambda df: df.rd.make_ids_sorted(
                cols=['gene id gene1', 'gene id gene2'],
                ids_have_equal_length=False,
                sep='--',
            ),
        }
    )
)
df3.head(1)

In [17]:
# Save the pair ids to the path configured in the metadata.
to_table(
    df3,
    metadata['ids']['pairs'],
)

### Controls

#### Metainfo

In [21]:
# Clear every variable whose name starts with 'df' (-f: without asking for confirmation),
# so that the controls section below starts without the tables of the paralogs section.
%reset_selective -f "^df.*"

In [22]:
# Read the metainfo CSV of every subset the controls are taken from, in sorted order.
df01 = read_table(
    [
        f"{metadata['pre_processed']['metainfo']['path']}/{subset}.csv"
        for subset in sorted(metadata['data']['subsets']['controls from']['pairs'])
    ]
)
df01.head(1)

#### Filtering

In [23]:
# Number the rows, then keep the rows labelled 'control' and the rows without a natMX4 value.
df1 = (
    df01
    .assign(**{'cell id per subset': range(len(df01))})
    .log.query(expr="`pairs` == 'control' or `natMX4`.isna()")
)
df1.head(1)

In [24]:
# Distinct (pairs, GFP, natMX4) combinations that remain after the filtering.
(
    df1[['pairs', 'GFP', 'natMX4']]
    .drop_duplicates()
    .sort_values('GFP')
)

In [25]:
# Map every gene of a configured control pair to its partner, in both directions.
rename_partner = merge_dicts([
    dict(s.split('-') for s in metadata['data']['subsets']['controls']['pairs']),
    dict(s.split('-')[::-1] for s in metadata['data']['subsets']['controls']['pairs']),
])
# The configured mapping must agree with the GFP -> natMX4 combinations seen in the data.
assert rename_partner == (
    df1[['pairs', 'GFP', 'natMX4']]
    .drop_duplicates()
    .dropna()
    .set_index('GFP')['natMX4']
    .to_dict()
)
info_nb(f'The partner gene is renamed as: {rename_partner}')

#### Assigning partner genes 

In [26]:
# Annotate every control cell with its partner gene, an order-independent pair id,
# a descriptive label and a running cell index within each pair.
# The label is built with vectorized string operations, so this cell does not depend
# on a `pd` name being available in the notebook namespace.
df2 = df1.assign(
    **{
        # partner of the GFP-tagged gene, looked up in the control pair mapping
        'gene symbol partner': lambda x: x['GFP'].map(rename_partner),
        # both gene symbols, sorted and joined by '-'
        'pairs': lambda x: x.apply(
            lambda y: '-'.join(sorted([y['GFP'], y['gene symbol partner']])),
            axis=1,
        ),
        # '<GFP>-GFP <partner>-WT' when natMX4 is missing, '<GFP>-GFP <partner>-DELTA' otherwise
        'label': lambda x: (
            x['GFP'] + '-GFP ' + x['gene symbol partner'] + '-'
            + x['natMX4'].isna().map({True: 'WT', False: 'DELTA'})
        ),
        # 0-based position of the cell within its pair (uses the 'pairs' column set above)
        'cell id per pair': lambda x: x.groupby('pairs').cumcount(),
    }
)
df2.head(1)

In [27]:
# Pair ids present among the controls.
df2.loc[:, 'pairs'].unique()

In [28]:
# Every pair is expected to have exactly four distinct labels; the labels are shown on failure.
assert (
    df2.groupby('pairs')['label'].nunique().eq(4).all()
), df2.groupby('pairs')['label'].unique()

In [29]:
# Save the raw control metainfo to the path configured in the metadata.
to_table(
    df2,
    metadata['metainfo']['raw']['controls'],
)

#### Standardizing the columns

In [30]:
### map the annotations
from modules.tools.io import read_pre_processed

# Standardize the control metainfo with the same project helper used for the paralogs.
df3 = read_pre_processed(
    p=df2,
    clean=True,
    rename=True,
)
df3.head(1)

In [31]:
# Every pair is expected to have exactly two distinct partner statuses; they are shown on failure.
assert (
    df3.groupby('pairs')['status partner'].nunique().eq(2).all()
), df3.groupby('pairs')['status partner'].unique()

In [33]:
# Save the standardized control metainfo to the path configured in the metadata.
to_table(
    df3,
    metadata['metainfo']['combined']['controls'],
)