# Features

Processing of the features extracted from the deep learning model.

In [1]:
## required libraries
## logging functions
import logging
## data functions
import numpy as np
import pandas as pd
## system functions
from os.path import dirname
import sys
## system functions from roux
from roux.lib.io import to_table
## workflow functions from roux
from roux.workflow.io import read_metadata
## data functions from roux
import roux.lib.dfs as rd # attributes
## add the parent directory to the module search path
## presumably so that the project-local `modules` package (imported in the 'Combined PCA' section) resolves -- TODO confirm
sys.path.append('../')

In [2]:
## input parameters
## path of the metadata file that is read with `read_metadata`
metadata_path='../config/metadata.yaml'
## kernel passed to `run_tasks`; if None, `metadata['kernels']['default']` is used
kernel=None
## if True, the output directory is passed to `backup` before processing
force=False
## if True, the metadata is read with the version number set to 'test'
test=True

In [3]:
## inferred parameters
## in the test mode, the version number in the metadata is set to 'test'
metadata=read_metadata(
    metadata_path,
    inputs={'version':{'number':'test'}} if test else None,
)
## attach the dataset configuration, read with the species name and the dataset path as the base configuration
metadata['dataset']=read_metadata(
    metadata['dataset_config_path'],
    config_base=dict(
        species_name=metadata['species_name'],
        path=metadata['dataset_path'],
    ),
)
### output
output_dir_path=metadata['processed']['processed']
logging.info(f"Output directory: {output_dir_path}")
## backup old files if overwriting (force is True)
if force:
    ## imported here because `backup` is not among the notebook-level imports
    from roux.lib.io import backup
    ## `test` is False because this branch only runs when `force` is True
    ## (presumably `test=True` would make it a dry run -- TODO confirm)
    backup(output_dir_path,dirname(output_dir_path),test=False,)
## misc.
## fall back to the default kernel set in the metadata
if kernel is None:
    kernel=metadata['kernels']['default']

## Filter the features

In [4]:
from roux.workflow.task import run_tasks

In [5]:
## parameters for the runs of the filtering notebook: one set for the paralogs and one for the controls
_features_dir_path=metadata['pre_processed']['features']['path']
parameters_list=[
    ## paralogs: every .csv table in the directory of the pre-processed features
    {
        'input_path': metadata['metainfo']['filtered']['paralogs'],
        'output_path': metadata['features']['filtered']['paralogs'],
        'feature_paths': [f"{_features_dir_path}/*.csv"],
    },
    ## controls: one .csv table per pair listed under 'controls from', in sorted order
    {
        'input_path': metadata['metainfo']['filtered']['controls'],
        'output_path': metadata['features']['filtered']['controls'],
        'feature_paths': [
            f"{_features_dir_path}/{s}.csv"
            for s in sorted(metadata['data']['subsets']['controls from']['pairs'])
        ],
    },
]

In [6]:
## run the filtering notebook with the parameter sets defined above (presumably one run per set -- TODO confirm)
## NOTE(review): `force=True` is hard-coded and does not follow the `force` input parameter -- confirm that this is intended
_=run_tasks(
    input_notebook_path='20_script_features_filtering.ipynb',
    kernel=kernel,
    parameters_list=parameters_list,
    force=True,
)

## Normalise the features by Z-score

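The paralogs are normalised using their own per-feature mean and standard deviation. The controls are normalised separately, using the per-feature mean and standard deviation calculated from the paralogs data.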

### Paralogs

In [7]:
## paralogs
## imported here because `read_table` is not among the notebook-level imports
from roux.lib.io import read_table
## filtered features of the paralogs
df01=read_table(metadata['features']['filtered']['paralogs'])
df01.head(1)

In [8]:
## columns
## groups of columns used in the rest of the notebook
columns_index=['pairs']
columns_construct=['label common']
## feature columns: the ones with 'feature #' in their names
columns_value=[c for c in df01.columns if 'feature #' in str(c)]
## check the selected columns with roux's `assert_dense`
_=df01.rd.assert_dense(
    subset=[*columns_index,*columns_construct,*columns_value],
)

#### Plot the mean and std

In [9]:
## per-feature mean and standard deviation of the paralogs (rows: 'mean' and 'std')
## the functions are given by name because passing `np.mean`/`np.std` to `agg` is deprecated in pandas;
## the names map to `DataFrame.mean` and `DataFrame.std` (sample std, ddof=1), so the values are unchanged
df0=df01.loc[:,columns_value].agg(['mean','std'])

In [10]:
## histograms of the per-feature means and standard deviations (one panel per statistic)
df0.T.hist()

In [11]:
## pre-process
## standardize the features
from sklearn.preprocessing import StandardScaler
## the index is reset so that the scaled values (indexed 0..n-1) line up with the rows in the join
df01=df01.reset_index(drop=True)
scaled_values=StandardScaler().fit_transform(df01.loc[:,columns_value])
## swap the raw feature columns for the scaled ones, keeping all the other columns
df1=(
    df01
    .drop(columns=columns_value)
    .join(pd.DataFrame(scaled_values,columns=columns_value))
)
del scaled_values

In [12]:
## save the standardized features of the paralogs
to_table(df1,metadata['features']['zscore']['paralogs'])

#### Plot the mean and std after normalisation

In [13]:
## histograms of the per-feature means and standard deviations after the standardization
## the functions are given by name because passing `np.mean`/`np.std` to `agg` is deprecated in pandas
df1.loc[:,columns_value].agg(['mean','std']).T.hist()

### Controls

In [14]:
## controls
## imported here because `read_table` is not among the notebook-level imports
from roux.lib.io import read_table
## filtered features of the controls
df02=read_table(metadata['features']['filtered']['controls'])
df02.head(1)

#### Plot the mean and std

In [15]:
## histograms of the per-feature means and standard deviations of the controls, before the normalisation
## the functions are given by name because passing `np.mean`/`np.std` to `agg` is deprecated in pandas
df02.loc[:,columns_value].agg(['mean','std']).T.hist()

In [16]:
## work on a copy, so that the filtered controls (`df02`) are left unchanged
df2=df02.copy()

In [17]:
## z-score the controls using the per-feature mean and std of the paralogs (rows 'mean' and 'std' of `df0`)
## all the feature columns are processed at once: subtracting/dividing a table by a series
## aligns the series on the column labels, so each feature gets its own mean and std
## (this also removes the need for `tqdm`, which is not imported in this notebook)
## NOTE(review): the std in `df0` comes from pandas (sample std, ddof=1), whereas the paralogs are scaled with
## scikit-learn's `StandardScaler` (population std, ddof=0 as per its documentation) -- confirm that this small mismatch is acceptable
df2[columns_value]=(
    (df2[columns_value]-df0.loc['mean',columns_value])
    /df0.loc['std',columns_value]
)

In [18]:
## save the normalised features of the controls
to_table(df2,metadata['features']['zscore']['controls'])

#### Plot the mean and std after normalisation

In [19]:
## histograms of the per-feature means and standard deviations of the controls, after the normalisation
## the functions are given by name because passing `np.mean`/`np.std` to `agg` is deprecated in pandas
df2.loc[:,columns_value].agg(['mean','std']).T.hist()

## PCA
    
Notes:   

1. It does not contain the control samples.

### Get PCs

In [20]:
## IPython magic: remove, without asking for confirmation (-f), the variables whose names match the regular expression,
## i.e. the ones starting with `df` (presumably to free the memory before the PCA -- TODO confirm)
%reset_selective -f "^df.*"

In [21]:
def get_pca_data(
    df1,
    output_dir_path,
    n_components=10,
    ):
    """Calculate the principal components (PCs) of the features in a table and save them.

    The feature columns (the ones with 'feature #' in their names) are standardized
    and reduced by PCA. Two tables are saved: the explained variance per PC, and
    the PCs joined to the non-feature columns of the input table.

    Parameters:
        df1 (pd.DataFrame): input table, containing a 'pairs' column and the feature columns.
            The first unique value in the 'pairs' column is used as the name of the output file.
        output_dir_path (str): path of the directory in which the table of the PCs is saved.
        n_components (int): number of the PCs to calculate (default: 10).

    Returns:
        str: path of the table of the PCs.

    Notes:
        The path of the explained variance table is obtained by replacing 'pcs' with
        'explained_variance' in the path of the table of the PCs.
        NOTE(review): if the path does not contain 'pcs', both the tables would presumably be
        saved at the same path -- confirm against `replace_many`.
    """
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    ## imported here because `replace_many` is not among the notebook-level imports
    from roux.lib.str import replace_many

    output_path=f"{output_dir_path}/{df1['pairs'].unique()[0]}.tsv"

    df1=df1.reset_index(drop=True) ## because join later
    ## the columns are sorted by name, to have a fixed order of the features
    features = df1.filter(like='feature #').sort_index(axis=1).values

    tfm = make_pipeline(StandardScaler(), PCA(n_components=n_components))
    tfm_ = tfm.fit(features)

    ## variance explained
    ### obtain
    df2=pd.Series(tfm_.named_steps['pca'].explained_variance_).to_frame('explained variance')
    ## zero-padded, 1-based numbers of the PCs
    df2['PC #']=[f"{i:02}" for i in range(1,len(df2)+1)]

    ### save
    to_table(df2,replace_many(output_path,{'pcs':'explained_variance'}))

    ## PCs
    ### obtain
    features_tfm = tfm_.transform(features)
    df3=pd.DataFrame(features_tfm).reset_index(drop=True)
    df3.columns=[f"PC #{i+1:02}" for i in df3.columns]
    df3.index.name='cell id per pair'

    assert len(df1)==len(df3)
    ## replace the feature columns with the PCs; the rows match by position because both the indices were reset
    df4=df1.rd.dropby_patterns(['feature #'],verbose=False).join(df3)
    ## checked on the joined table, so that a misaligned join would be detected too
    assert not df4.loc[:,df3.columns].isnull().any().any()

    ### save
    to_table(df4,output_path)
    return output_path
## imported here because `read_table` is not among the notebook-level imports
from roux.lib.io import read_table
## calculate and save the PCs per pair, for the paralogs and then for the controls
for k in ['paralogs','controls']:
    df_zscore=read_table(metadata['features']['zscore'][k]).rd.assert_no_na(subset=['label'])
    ## the groups are iterated over explicitly instead of using `groupby(..).apply`, because
    ## `get_pca_data` reads the grouping column ('pairs') and operating on the grouping columns
    ## within `apply` is deprecated in pandas
    output_paths=pd.Series(
        {
            pair: get_pca_data(
                df_pair,
                dirname(metadata['pcs'][k]),
            )
            for pair,df_pair in df_zscore.groupby('pairs')
        },
        dtype=object,
    ).rename_axis('pairs') ## indexed by the pair, like the output of the `apply`

### Combined PCA

In [None]:
## read the filtered metainfo, from two paths taken from the metadata
## NOTE(review): this cell has no execution count; earlier in this notebook `metadata['metainfo']['filtered']` is
## indexed by 'paralogs'/'controls', so it may not be a single path here -- confirm
## NOTE(review): `read_table` is not among the imports at the top of this notebook -- confirm where it comes from
df00=read_table([metadata['metainfo']['filtered'],metadata['metainfo_controls_filtered']])
df00.head(1)

In [None]:
## read the feature tables matching the glob pattern (presumably combined into a single table -- TODO confirm)
## NOTE(review): the path is hard-coded here, unlike the other paths in this notebook, which are taken from the metadata
df01=read_table("../results/features*_filtered/*.csv")
df01.head(1)

In [None]:
## the metainfo and the features are required to have the same number of unique labels
## (only the counts are compared, not the labels themselves)
assert df00['label'].nunique()==df01['label'].nunique()

In [None]:
## NOTE(review): this import rebinds the name `get_pca_data`, replacing the function with the same name defined earlier in this notebook
from modules.analysis.pca import get_pca_data

In [None]:
## PCA on the combined data, using the metainfo (`df00`) and the features (`df01`) read above
## NOTE(review): the keyword arguments used here do not match the parameters of the `get_pca_data` defined earlier
## in this notebook, so this call relies on the version imported from `modules.analysis.pca` -- confirm its signature
get_pca_data(
    p=None,
    outp='../results/pcs_combined/pcs/combined.pqt',
    force=False,
    features_path=None,
    df00=df00,
    features_pair=df01,
    )