# Feature selection classes

In [1]:
import numpy as np 
import pandas as pd
from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.util import Surv
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectorMixin, VarianceThreshold
from sklearn.preprocessing import StandardScaler, FunctionTransformer, Normalizer, RobustScaler, OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRFClassifier, XGBRFRegressor

class CoxnetSelector(TransformerMixin, BaseEstimator):
    """Keep the columns that a penalised Cox model assigns a large coefficient.

    A ``CoxnetSurvivalAnalysis`` model is fitted on the rows of ``X`` that
    contain no missing values. A column is kept when the absolute value of its
    coefficient in the last column of ``coef_`` is strictly greater than
    ``coef_threshold``.

    Parameters
    ----------
    coef_threshold : float, default=0
        Minimum absolute coefficient (exclusive) for a column to be kept.
    **kwargs
        Forwarded unchanged to ``CoxnetSurvivalAnalysis``.

    Notes
    -----
    ``X`` must be a pandas DataFrame (``.iloc``, ``.loc`` and ``.columns`` are
    used) and ``y`` must support integer-array indexing.
    """

    def __init__(self, coef_threshold=0, **kwargs):
        self.cns = CoxnetSurvivalAnalysis(**kwargs)
        self.coef_threshold = coef_threshold
        self.features_out = None

    def fit(self, X, y, **kwargs):
        """Fit the Cox model on complete rows and record the selected columns.

        Extra keyword arguments are accepted for API compatibility and ignored.
        """
        # DataFrame.notna works for any dtype, whereas np.isnan raises on
        # object/string columns.
        complete_rows = np.flatnonzero(X.notna().all(axis=1).to_numpy())
        self.cns.fit(X.iloc[complete_rows, :], y[complete_rows])
        # coef_[:, -1] holds the coefficients of the last fitted alpha.
        keep = np.abs(self.cns.coef_[:, -1]) > self.coef_threshold
        self.features_out = X.columns[np.flatnonzero(keep)]
        return self

    def transform(self, X):
        """Return the selected columns of ``X``."""
        return X.loc[:, self.features_out]

    def get_feature_names_out(self, input_features=None):
        """Return the selected column names.

        ``input_features`` is accepted (and ignored) because scikit-learn's
        ``Pipeline`` and ``ColumnTransformer`` pass it positionally.
        """
        return np.array(self.features_out)
    


class VarianceSelector(TransformerMixin, BaseEstimator):
    """DataFrame-preserving wrapper around ``VarianceThreshold``.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to ``VarianceThreshold`` (e.g. ``threshold``).
    """

    def __init__(self, **kwargs):
        self.vt = VarianceThreshold(**kwargs)
        self.features_out = None

    def fit(self, X, y=None, **kwargs):
        """Fit the variance filter and remember the surviving column names.

        ``y`` and extra keyword arguments are ignored.
        """
        self.vt.fit(X)
        self.features_out = self.vt.get_feature_names_out()
        return self

    def transform(self, X):
        """Return the surviving columns as a DataFrame sharing ``X``'s index."""
        reduced = self.vt.transform(X)
        return pd.DataFrame(reduced, columns=self.features_out, index=X.index)

    def get_feature_names_out(self, input_features=None):
        """Return the surviving column names.

        ``input_features`` is accepted (and ignored) because scikit-learn's
        ``Pipeline`` and ``ColumnTransformer`` pass it positionally.
        """
        return self.features_out


class StandardTransform(TransformerMixin, BaseEstimator):
    """Standard-scale a selection of DataFrame columns, leaving the rest as is.

    Parameters
    ----------
    cols : list-like of column labels or None, default=None
        Columns to scale. ``None`` means every column seen during ``fit``.
    """

    def __init__(self, cols=None):
        self.scaler = StandardScaler()
        self.cols = cols  # columns to scale

    def fit(self, X, y=None):
        """Fit the scaler on the selected columns of ``X``; ``y`` is ignored."""
        # Resolve the column selection into a fitted attribute rather than
        # overwriting the ``cols`` parameter; otherwise a refit on a frame with
        # different columns would silently reuse the previous selection.
        self.cols_ = X.columns if self.cols is None else self.cols
        self.scaler.fit(X[self.cols_])
        self.feature_names_in = X.columns
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns scaled as floats."""
        # Fall back to ``cols`` for instances fitted before ``cols_`` existed.
        cols = getattr(self, 'cols_', self.cols)
        # astype returns a new frame, so the caller's data is not modified.
        Xt = X.astype({col: 'float' for col in cols})
        Xt.loc[:, cols] = self.scaler.transform(Xt[cols])
        return Xt

    def get_feature_names_out(self, input_features=None):
        """Return all column names seen during ``fit`` (none are dropped).

        ``input_features`` is accepted (and ignored) because scikit-learn's
        ``Pipeline`` and ``ColumnTransformer`` pass it positionally.
        """
        return np.array(self.feature_names_in)


class CorrelationSelector(TransformerMixin, BaseEstimator):
    """Greedily drop columns that are highly correlated with an earlier column.

    Columns are visited in order. A column is dropped when its absolute
    correlation with any earlier column that has not itself been dropped is at
    least ``threshold``.

    Parameters
    ----------
    threshold : float, default=0.95
        Absolute correlation (inclusive) at or above which the later column of
        a pair is dropped.
    """

    def __init__(self, threshold=0.95):
        self.threshold = threshold
        self.features_out = None

    def fit(self, X, y=None):
        """Determine which columns of ``X`` to keep; ``y`` is ignored.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                f"X must be a pandas DataFrame, got {type(X).__name__}"
            )
        corr_matrix = X.corr()
        names = corr_matrix.columns
        # Work on a plain array: scalar ``.iloc`` lookups inside a double
        # Python loop are very slow for wide frames.
        abs_corr = corr_matrix.abs().to_numpy()
        dropped = set()  # names of correlated (deleted) columns
        for i in range(1, len(names)):
            # NaN correlations compare False and therefore never cause a drop.
            partners = np.flatnonzero(abs_corr[i, :i] >= self.threshold)
            # Only a partner that is itself still kept can evict column i.
            if any(names[j] not in dropped for j in partners):
                dropped.add(names[i])
        # Filter the original column index instead of copying the whole frame.
        self.features_out = X.columns[~X.columns.isin(dropped)]
        return self

    def transform(self, X):
        """Return the retained columns of ``X``."""
        return X.loc[:, self.features_out]

    def get_feature_names_out(self, input_features=None):
        """Return the retained column names.

        ``input_features`` is accepted (and ignored) because scikit-learn's
        ``Pipeline`` and ``ColumnTransformer`` pass it positionally.
        """
        return self.features_out


class FrequencySelector(TransformerMixin, BaseEstimator):
    """Keep columns whose column sum is frequent enough.

    For each column the sum of its values (``counts``) and that sum divided by
    the number of non-missing entries (``freqs``) are computed. The count
    cutoff is applied only when ``mincount / n_max < minfreq``, where ``n_max``
    is the largest per-column number of non-missing entries; otherwise the
    frequency cutoff is applied.

    Parameters
    ----------
    minfreq : float, default=0.05
        Minimum frequency (inclusive) for a column to be kept.
    mincount : float, default=inf
        Minimum column sum (inclusive). The default of infinity means the
        frequency cutoff is used.
    """

    # ``np.inf`` rather than ``np.Inf``: the capitalised alias was removed in
    # NumPy 2.0 and would make this class definition raise AttributeError.
    def __init__(self, minfreq=0.05, mincount=np.inf):
        # default is to use frequency cutoff
        self.minfreq = minfreq
        self.mincount = mincount
        self.features_out = None

    def fit(self, X, y=None):
        """Determine which columns of ``X`` pass the cutoff; ``y`` is ignored.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                f"X must be a pandas DataFrame, got {type(X).__name__}"
            )
        n_observed = X.notna().sum(axis=0)  # non-missing entries per column
        counts = X.sum(axis=0)
        freqs = counts / n_observed
        n_max = n_observed.max()
        if self.mincount / n_max < self.minfreq:
            usecols = counts >= self.mincount
        else:
            usecols = freqs >= self.minfreq
        self.features_out = np.array(X.columns[usecols])
        return self

    def transform(self, X):
        """Return the retained columns of ``X``."""
        return X.loc[:, self.features_out]

    def get_feature_names_out(self, input_features=None):
        """Return the retained column names.

        ``input_features`` is accepted (and ignored) because scikit-learn's
        ``Pipeline`` and ``ColumnTransformer`` pass it positionally.
        """
        return self.features_out


class Log1pTransform(TransformerMixin, BaseEstimator):
    """Apply ``log(1 + x)`` element-wise and return a DataFrame."""

    def __init__(self):
        self.features_out = None

    def fit(self, X, y=None):
        """Record the column names of ``X``; nothing else is learned."""
        self.features_out = X.columns
        return self

    def transform(self, X):
        """Return ``np.log1p(X)`` as a DataFrame with the fitted column names."""
        logged = np.log1p(X)
        return pd.DataFrame(logged, index=X.index, columns=self.features_out)

    def get_feature_names_out(self, input_features=None):
        """Return the column names recorded during ``fit``.

        ``input_features`` is accepted (and ignored) because scikit-learn's
        ``Pipeline`` and ``ColumnTransformer`` pass it positionally.
        """
        return self.features_out
    

class PCATransform(TransformerMixin, BaseEstimator):
    """PCA that tolerates rows with missing values.

    Unlike plain ``PCA``, rows containing any NA are skipped when fitting and
    come back as all-NaN rows from ``transform``.

    Parameters
    ----------
    prefix : str or None, default=None
        Prefix prepended to the PCA component names; a falsy value is replaced
        by ``'Unnamed_'``.
    **kwargs
        Forwarded unchanged to ``PCA`` (e.g. ``n_components``).
    """

    def __init__(self, prefix=None, **kwargs):
        self.prefix = prefix if prefix else 'Unnamed_'
        self.features_out = None
        self.pca = PCA(**kwargs)  # e.g. n_components

    def fit(self, X, y=None):
        """Fit PCA on the complete rows of ``X``; ``y`` is ignored."""
        complete = X.dropna()
        self.pca.fit(complete)
        self.features_out = np.array(
            [self.prefix + str(name) for name in self.pca.get_feature_names_out()]
        )
        return self

    def transform(self, X):
        """Project the complete rows of ``X``; incomplete rows become NaN."""
        complete = X.dropna()
        if complete.shape[0] == 0:
            # PCA.transform rejects an empty array; with no complete row every
            # output row is missing anyway.
            return pd.DataFrame(np.nan, index=X.index, columns=self.features_out)
        projected = self.pca.transform(complete)
        scores = pd.DataFrame(
            projected, index=complete.index, columns=self.features_out
        )
        # Reindexing restores the dropped rows as NaN, in the original order.
        return scores.reindex(X.index)

    def get_feature_names_out(self, input_features=None):
        """Return the prefixed component names.

        ``input_features`` is accepted (and ignored) because scikit-learn's
        ``Pipeline`` and ``ColumnTransformer`` pass it positionally.
        """
        return self.features_out
class OrdEncoder(TransformerMixin, BaseEstimator):
    """Ordinal-encode every column against one shared list of allowed values.

    With the default ``values`` this encodes -2:0, -1:1, 0:2, 1:3, 2:4,
    to comply with the 0-based class label requirement of the XGBoost random
    forest classifier. Values outside ``values`` are encoded as NaN.

    Parameters
    ----------
    values : list, default=[-2, -1, 0, 1, 2]
        Allowed category values, used for every column. The list is never
        mutated by this class.
    """

    def __init__(self, values=[-2, -1, 0, 1, 2]):
        self.values = values
        self.encoder = None
        self.feature_names_out = None

    def fit(self, X, y=None):
        """Build and fit the underlying ``OrdinalEncoder``; ``y`` is ignored."""
        # Give each column its own copy of the category list.
        categories = [list(self.values) for _ in range(X.shape[1])]
        self.encoder = OrdinalEncoder(
            categories=categories,
            handle_unknown='use_encoded_value',
            unknown_value=np.nan,
        ).set_output(transform="pandas")
        # Fit here so that ``transform`` never has to refit on the data it is
        # asked to encode.
        self.encoder.fit(X)
        self.feature_names_out = X.columns
        return self

    def transform(self, X):
        """Encode ``X`` with the encoder fitted in ``fit``.

        ``X`` is expected to have the same columns as the frame passed to
        ``fit``.
        """
        return self.encoder.transform(X)

    def get_feature_names_out(self, input_features=None):
        """Return the column names seen during ``fit``.

        ``input_features`` is accepted (and ignored) because scikit-learn's
        ``Pipeline`` and ``ColumnTransformer`` pass it positionally.
        """
        return self.feature_names_out

# Imports from main.py

In [2]:
# from main.py
# Load one cross-validation split (training/validation features and training
# survival labels) and build the structured survival target `y`.
import os
# NOTE(review): hard-coded, user-specific working directory.
os.chdir('/home/users/nus/e1083772/cancer-survival-ml/')
import argparse
import pandas as pd
import numpy as np

# Which split to load and which survival endpoint to model.
shuffle=1
fold=4
endpoint='os'

# Split files are stored under <scratchdir>/<shuffle>/<fold>/.
scratchdir="/scratch/users/nus/e1083772/cancer-survival-ml/data/splits"
features_file=f'{scratchdir}/{shuffle}/{fold}/train_features.parquet'
features = pd.read_parquet(features_file)

valid_features_file=f'{scratchdir}/{shuffle}/{fold}/valid_features.parquet'
valid_features = pd.read_parquet(valid_features_file)

# Read only the time and flag columns of the chosen endpoint and give them
# endpoint-independent names.
survcols = [f'{endpoint}cdy',f'cens{endpoint}']
train_surv_file=f'{scratchdir}/{shuffle}/{fold}/train_labels.parquet'
train_surv = pd.read_parquet(train_surv_file,columns=survcols)
train_surv.rename(columns={f'{endpoint}cdy':'survtime',f'cens{endpoint}':'survflag'},inplace=True)

# Output locations for the processed feature tables (not written in this cell).
train_out_features_file=f'{scratchdir}/{shuffle}/{fold}/features_processed.parquet'
valid_out_features_file=f'{scratchdir}/{shuffle}/{fold}/valid_features_processed.parquet'

event = train_surv['survflag'].values
time = train_surv.survtime
# Some OS times are negative: shift all times so that the minimum is 0.
offset = max(0, -np.min(time)) # some OS is negative
print(offset)
# NOTE(review): in-place add on a Series taken from train_surv; this may also
# modify train_surv['survtime'] depending on the pandas copy semantics in use.
time += offset
# Structured (event, time) array used as the target of the survival models.
y = Surv.from_arrays(event,time)

21


# Feature selection on RNA-Seq

In [3]:
# Split the training features into one frame per data modality, selected by a
# regular expression on the column names.
(
    rna,
    cna,
    gistic,
    fish,
    clin,
    ig,
    sbs,
    apobec,
    chromothripsis,
) = (
    features.filter(regex=pattern)
    for pattern in (
        'Feature_exp_',
        'Feature_CNA_ENSG',
        'Feature_CNA_(Amp|Del)',
        'Feature_fish',
        'Feature_clin',
        'Feature_SeqWGS',
        'Feature_SBS',
        'APOBEC',
        'chromothripsis',
    )
)

# Column labels of the modalities that are referred to by name.
rna_cols, cna_cols, gistic_cols, fish_cols, clin_cols, ig_cols, sbs_cols = (
    frame.columns for frame in (rna, cna, gistic, fish, clin, ig, sbs)
)

For debugging

Remove the gene expression (`Feature_exp`) and gene-level copy number (`Feature_CNA_ENSG`) columns

In [17]:
# Keep only columns containing 'Feature_' that is not immediately followed by
# 'exp' or 'CNA_ENSG', in both the training and the validation features.
features, valid_features = (
    split.filter(regex='Feature_(?!exp|CNA_ENSG)')
    for split in (features, valid_features)
)

# Create pipeline

In [35]:
# Per-modality feature-selection sub-pipelines. Each one is applied to the
# columns chosen by the matching make_column_selector in `transformer` below.
transformer_gene_exp = Pipeline([
    ('Non-zero variance', VarianceSelector(threshold=0)),
    ('Log1p', Log1pTransform()),
    ('Standard scaling', StandardTransform()),
    ('Cox ElasticNet', CoxnetSelector(l1_ratio=0.5, coef_threshold=0.05)),
])

# NOTE(review): the step names below do not match their settings: the variance
# threshold is 1 (not 0), and l1_ratio=0.5 is an elastic net rather than a pure
# LASSO. The names are kept because they are part of the pipeline's interface.
transformer_sbs = Pipeline([
    ('Non-zero variance', VarianceSelector(threshold=1)),
    ('Log1p', Log1pTransform()),
    ('Standard scaling', StandardTransform()),
    ('Cox LASSO', CoxnetSelector(l1_ratio=0.5, coef_threshold=0.1)),
])

transformer_gene_cn = Pipeline([
    ('Non-zero variance', VarianceSelector(threshold=0)),
    ('Coxnet', CoxnetSelector(l1_ratio=0.5, coef_threshold=0.05)),
    ('Uncorrelated', CorrelationSelector(threshold=0.9)),
])

transformer_gistic = Pipeline([
    ('Non-zero variance', VarianceSelector(threshold=0)),
    ('Coxnet', CoxnetSelector(l1_ratio=0.5, coef_threshold=0.2)),
])

transformer_fish = Pipeline([
    ('Non-zero variance', VarianceSelector(threshold=0)),
    ('Coxnet', CoxnetSelector(l1_ratio=0.5, coef_threshold=0.2)),
])

transformer_clin = Pipeline([
    ('Scale age', StandardTransform(cols=['Feature_clin_D_PT_age']))
])

# NOTE(review): transformer_ig is defined but not wired into `transformer`, so
# columns it would have handled fall through to remainder='passthrough'.
transformer_ig = Pipeline([
    ('Frequency', FrequencySelector(minfreq=0.01))
])

# Non-capturing groups (?:...) are used in the selector patterns: pandas'
# str.contains warns when a regex contains capturing groups, and the set of
# matched columns is the same either way.
transformer = ColumnTransformer([
    ('Gene expression', transformer_gene_exp, make_column_selector(pattern='Feature_exp_')),
    ('Gene copy number', transformer_gene_cn, make_column_selector(pattern='Feature_CNA_ENSG')),
    ('Gistic copy number', transformer_gistic, make_column_selector(pattern='Feature_CNA_(?:Amp|Del)')),
    ('FISH copy number', transformer_fish, make_column_selector(pattern='Feature_fish')),
    ('Mutation signatures', transformer_sbs, make_column_selector(pattern='Feature_SBS')),
    ('Clinical', transformer_clin, make_column_selector(pattern='Feature_clin')),
], remainder='passthrough').set_output(transform="pandas")

# Settings shared by the XGBoost random-forest estimators used for imputation.
tree_args = {
    'n_estimators': 1,
    'subsample': 0.632,
    'colsample_bynode': 0.632,
    'n_jobs': 4,
    'tree_method': 'hist',
}

imputer_args = {
    'skip_complete': True
}

ContinuousImputer = IterativeImputer(estimator=XGBRFRegressor(**tree_args), initial_strategy='mean', **imputer_args)
CategoricalImputer = IterativeImputer(estimator=XGBRFClassifier(**tree_args), initial_strategy='most_frequent', **imputer_args)

# Expression, age and SBS columns go to the regressor-based imputer; every
# other 'Feature_' column goes to the classifier-based imputer.
imputer = ColumnTransformer([
    ('Continuous variables', ContinuousImputer, make_column_selector(pattern='Feature_(?:exp|clin_D_PT_age|SBS)')),
    ('Categorical variables', CategoricalImputer, make_column_selector(pattern='Feature_(?!exp|clin_D_PT_age|SBS)'))
], remainder='drop').set_output(transform="pandas")

pipeline = Pipeline([
    ('Feature selection', transformer),
    ('Joint imputation', imputer),
])

In [36]:
# Fit the feature-selection + imputation pipeline on the training features and
# survival target, keeping the transformed training matrix.
out = pipeline.fit_transform(features, y)

  cols = cols[cols.str.contains(self.pattern, regex=True)]
  cols = cols[cols.str.contains(self.pattern, regex=True)]


In [38]:
# Shape of the output columns whose name contains 'SBS'.
out.filter(regex='SBS').shape

(1016, 22)

In [39]:
# Shape of the output columns whose name contains the 'Gistic copy number' step name.
out.filter(regex='Gistic copy number').shape

(1016, 69)

In [40]:
# Shape of the output columns whose name contains the 'Gene copy number' step name.
out.filter(regex='Gene copy number').shape

(1016, 0)

In [41]:
# Shape of the output columns whose name contains the 'Gene expression' step name.
out.filter(regex='Gene expression').shape

(1016, 0)

In [42]:
# Shape of the output columns whose name contains the 'FISH copy number' step name.
out.filter(regex='FISH copy number').shape

(1016, 31)

In [None]:
# scratchdir="/scratch/users/nus/e1083772/cancer-survival-ml/data/splits"

# train_out_features_file=f'{scratchdir}/{shuffle}/{fold}/features_subset.parquet'
# valid_out_features_file=f'{scratchdir}/{shuffle}/{fold}/valid_features_subset.parquet'

# out.to_parquet(train_out_features_file)
# outv.to_parquet(valid_out_features_file)