# Test Feature Selector on Simulated Data

In [1]:
import numpy as np
import pandas as pd
from Data import GeneMutationData, GeneExpressionData, ProteinExpressionData, GeneCNVData, MultiOmicsData
from DataProcessor import BinaryDataProcessor, ContinuesDataProcessor
from FeatureSelector import FeatureSelector
from Simulator import simulate_data

In [2]:
def simulate_multi_omics_data(
    n_samples, n_group_feat, n_base, n_interact, n_correlated,
    corr_level, noise_level_X, noise_level_y
):
    """Simulate a classification data set and return it as a processed multi-omics matrix.

    ``simulate_data`` is asked for ``n_group_feat`` features of each of six
    kinds (normal, bimodal normal, count, bimodal count, binary, uniform).
    The simulated columns are then split *by position* into four omics blocks:

    * protein expression -- the first ``2 * n_group_feat`` columns
    * gene expression    -- the next ``2 * n_group_feat`` columns, ``log2(x + 1)`` transformed
    * gene mutation      -- the next ``n_group_feat`` columns
    * gene CNV           -- every remaining column

    NOTE(review): the positional split presumably relies on ``simulate_data``
    emitting its columns in the order of the six kinds listed above --
    confirm against the Simulator module.

    Each block is wrapped in its omics data class together with the labels,
    passed through a processor (``BinaryDataProcessor`` for mutations,
    ``ContinuesDataProcessor`` for the other three), and the "processed"
    layers are merged through ``MultiOmicsData``.

    Parameters
    ----------
    n_samples
        Forwarded to ``simulate_data``.
    n_group_feat
        Number of features requested per feature kind; also the unit used
        to compute the column boundaries between omics blocks.
    n_base, n_interact, n_correlated, corr_level, noise_level_X, noise_level_y
        Forwarded unchanged to ``simulate_data``.

    Returns
    -------
    tuple
        ``(mat, y, coef)``: the combined processed feature matrix, the labels
        converted with ``pd.Series``, and ``coef`` exactly as returned by
        ``simulate_data``.
    """
    # All simulator settings gathered in one mapping; every feature kind
    # receives the same number of features.
    sim_kwargs = {
        "n_samples": n_samples,
        "label": "classification",
        "n_feat_normal": n_group_feat,
        "n_feat_normal_bimodal": n_group_feat,
        "n_feat_count": n_group_feat,
        "n_feat_count_bimodal": n_group_feat,
        "n_feat_binary": n_group_feat,
        "n_feat_uniform": n_group_feat,
        "n_base": n_base,
        "n_interact": n_interact,
        "n_correlated": n_correlated,
        "corr_level": corr_level,
        "noise_level_X": noise_level_X,
        "noise_level_y": noise_level_y,
    }
    X, y, coef = simulate_data(**sim_kwargs)

    # Column positions at which one omics block ends and the next begins.
    prot_end = n_group_feat * 2
    expr_end = n_group_feat * 4
    mut_end = n_group_feat * 5

    prot_raw = X.iloc[:, :prot_end]
    # The gene-expression slice is log2(x + 1) transformed before wrapping.
    expr_raw = np.log2(X.iloc[:, prot_end:expr_end] + 1)
    mut_raw = X.iloc[:, expr_end:mut_end]
    cnv_raw = X.iloc[:, mut_end:]

    y = pd.Series(y)

    # Wrap every block first ...
    prot_data = ProteinExpressionData(prot_raw, y)
    expr_data = GeneExpressionData(expr_raw, y)
    mut_data = GeneMutationData(mut_raw, y)
    cnv_data = GeneCNVData(cnv_raw, y)

    def _processed_features(processor, data):
        # Fit the processor on the data object and pull out its processed layer.
        return processor.fit_transform(data).get_features(layer="processed")

    # ... then process them in the fixed order prot -> expr -> mut -> cnv.
    prot_feat = _processed_features(ContinuesDataProcessor(), prot_data)
    expr_feat = _processed_features(ContinuesDataProcessor(), expr_data)
    mut_feat = _processed_features(BinaryDataProcessor(), mut_data)
    cnv_feat = _processed_features(ContinuesDataProcessor(), cnv_data)

    # Merge the already-processed blocks into a single feature matrix.
    omics = MultiOmicsData(
        gene_mutation=mut_feat,
        gene_expression=expr_feat,
        protein_expression=prot_feat,
        gene_cnv=cnv_feat,
        labels=y,
        if_processed=True,
    )
    mat = omics.get_combined_features(layer="processed")

    return mat, y, coef
    


## 1. Simulate Data

In [3]:
# 200 samples, 100 features per simulated feature kind.
# NOTE(review): no random seed is fixed anywhere in this notebook; unless
# simulate_data seeds itself internally, the outputs below will differ
# between runs -- confirm.
mat, y, coef = simulate_multi_omics_data(
    n_samples=200,
    n_group_feat=100,
    n_base=1,
    n_interact=0,
    n_correlated=20,
    corr_level="medium",
    noise_level_X="low",
    noise_level_y="low",
)

Identifying high quality features ...
Identifying informative features ...
Imputing missing value ...
Identifying and transforming skewed features ...
Identifying feature outliers ...
Scaling features ...
Identifying correlated features ...
Identifying high quality features ...
Identifying informative features ...
Imputing missing value ...
Identifying and transforming skewed features ...
Identifying feature outliers ...
Scaling features ...
Identifying correlated features ...
Identifying high quality features ...
Identifying informative features ...
Imputing missing value ...
Identifying high quality features ...
Identifying informative features ...
Imputing missing value ...
Identifying and transforming skewed features ...
Identifying feature outliers ...
Scaling features ...
Identifying correlated features ...


## 2. Select Features

In [4]:
# Configure the selector for the classification labels produced above.
# The recorded log below shows one "Fitting bootstrap sample" line per
# bootstrap round (30 in total).
fs = FeatureSelector(
    task_type="classification",
    n_features=10,
    n_bootstrap=30,
)
fs.fit(X=mat, y=y)

Fitting bootstrap sample 1 ...




Fitting bootstrap sample 2 ...




Fitting bootstrap sample 3 ...




Fitting bootstrap sample 4 ...




Fitting bootstrap sample 5 ...




Fitting bootstrap sample 6 ...




Fitting bootstrap sample 7 ...




Fitting bootstrap sample 8 ...




Fitting bootstrap sample 9 ...




Fitting bootstrap sample 10 ...




Fitting bootstrap sample 11 ...




Fitting bootstrap sample 12 ...




Fitting bootstrap sample 13 ...




Fitting bootstrap sample 14 ...




Fitting bootstrap sample 15 ...




Fitting bootstrap sample 16 ...




Fitting bootstrap sample 17 ...




Fitting bootstrap sample 18 ...




Fitting bootstrap sample 19 ...




Fitting bootstrap sample 20 ...




Fitting bootstrap sample 21 ...




Fitting bootstrap sample 22 ...




Fitting bootstrap sample 23 ...




Fitting bootstrap sample 24 ...




Fitting bootstrap sample 25 ...




Fitting bootstrap sample 26 ...




Fitting bootstrap sample 27 ...




Fitting bootstrap sample 28 ...




Fitting bootstrap sample 29 ...




Fitting bootstrap sample 30 ...




In [5]:
# Show what the fitted selector picked. In the recorded output this is a
# tuple: a list of "modality|feature" names and an array of matching scores
# listed from highest to lowest.
fs.get_selected_features()

(['protein_expression|feat_normal_bimodal_1',
  'protein_expression|feat_normal_1',
  'protein_expression|feat_normal_bimodal_7',
  'protein_expression|feat_normal_bimodal_14',
  'protein_expression|feat_normal_bimodal_6',
  'protein_expression|feat_normal_20',
  'gene_expression|feat_count_11',
  'gene_expression|feat_count_1',
  'gene_cnv|feat_uniform_66',
  'gene_expression|feat_count_20'],
 array([0.93181818, 0.84090909, 0.67613636, 0.66477273, 0.50568182,
        0.5026738 , 0.46363636, 0.44755245, 0.38502674, 0.38311688]))

In [6]:
# Build a table comparing each ground-truth feature's selection score with
# the absolute value of its simulated coefficient.
feature_score = (
    pd.concat(
        [
            fs.feature_scores_
            .copy()
            # Drop the "modality|" prefix so the labels line up with coef's index.
            .rename(index=lambda full_name: full_name.split("|")[1])
            .to_frame()
            .rename(columns={0: "score"}),
            coef.abs(),
        ],
        axis=1,
    )
    # Keep only the rows that appear in coef, largest |coef| first.
    .loc[coef.index, :]
    .sort_values(by="coef", ascending=False)
)

In [7]:
# Display the comparison table: one row per entry of coef, with the
# selector's "score" next to the absolute "coef", sorted by "coef" descending.
# In the recorded run, feat_binary_1 has a large |coef| (0.90) but a score
# close to zero (0.006).
feature_score

Unnamed: 0,score,coef
feat_normal_bimodal_1,0.931818,0.964716
feat_normal_1,0.840909,0.908825
feat_binary_1,0.006494,0.901939
feat_count_1,0.447552,0.674066
feat_uniform_1,0.05303,0.66182
feat_count_bimodal_1,0.011364,0.594557
