# Imports for feature selection

In [1]:
from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.util import Surv
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectorMixin, VarianceThreshold
from sklearn.preprocessing import StandardScaler, FunctionTransformer, Normalizer, RobustScaler
from sklearn.decomposition import PCA
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Imports from main.py

In [2]:
# from main.py
import os
os.chdir('/home/users/nus/e1083772/cancer-survival-ml/')
import argparse
import pandas as pd
import numpy as np

shuffle=1
fold=4
endpoint='os'

scratchdir="/scratch/users/nus/e1083772/cancer-survival-ml/data/splits"
features_file=f'{scratchdir}/{shuffle}/{fold}/train_features.parquet'
features = pd.read_parquet(features_file)

valid_features_file=f'{scratchdir}/{shuffle}/{fold}/valid_features.parquet'
valid_features = pd.read_parquet(valid_features_file)

survcols = [f'{endpoint}cdy',f'cens{endpoint}']
train_surv_file=f'{scratchdir}/{shuffle}/{fold}/train_labels.parquet'
train_surv = pd.read_parquet(train_surv_file,columns=survcols)
train_surv.rename(columns={f'{endpoint}cdy':'survtime',f'cens{endpoint}':'survflag'},inplace=True)

train_out_features_file=f'{scratchdir}/{shuffle}/{fold}/features_processed.parquet'
valid_out_features_file=f'{scratchdir}/{shuffle}/{fold}/valid_features_processed.parquet'

In [3]:
features.shape

(1016, 115442)

In [4]:
train_surv.shape

(1016, 2)

In [5]:
valid_features.shape

(127, 115442)

# Adjust negative survival times

In [6]:
event = train_surv['survflag'].values
time = train_surv.survtime
offset = max(0, -np.min(time)) # some OS is negative
print(offset)
time += offset
y = Surv.from_arrays(event,time)

21


# Feature selection on RNA-Seq

In [7]:
rna = features.filter(regex='Feature_exp_')
rna_cols = rna.columns

In [8]:
cna = features.filter(regex='Feature_CNA_ENSG')
cna_cols = cna.columns

In [9]:
gistic = features.filter(regex='Feature_CNA_(Amp|Del)')
gistic_cols = gistic.columns

In [10]:
fish = features.filter(regex='Feature_fish')
fish_cols = fish.columns

In [11]:
clin = features.filter(regex='Feature_clin')
clin_cols = clin.columns

In [12]:
ig = features.filter(regex='Feature_SeqWGS')
ig_cols = ig.columns

In [13]:
sbs = features.filter(regex='Feature_SBS')
sbs_cols = sbs.columns

In [14]:
apobec = features.filter(regex='APOBEC')

In [15]:
chromothripsis = features.filter(regex='chromothripsis')

# TransformerMixin functions

In [131]:
class VarianceSelector(TransformerMixin, BaseEstimator):        
    def __init__(self, **kwargs):
        self.vt = VarianceThreshold(**kwargs)
        self.features_out = None
        
    def fit(self, X, y=None, **kwargs):
        self.vt.fit(X)
        self.features_out = self.vt.get_feature_names_out()
        return self
    
    def transform(self, X):
        Xr = self.vt.transform(X)
        return pd.DataFrame(Xr, columns=self.features_out, index=X.index)

    def get_feature_names_out(self):
        return self.features_out

class CoxnetSelector(TransformerMixin, BaseEstimator):
    def __init__(self,coef_threshold=0,**kwargs):
        self.cns = CoxnetSurvivalAnalysis(**kwargs)
        self.coef_threshold = coef_threshold
        self.features_out = None
        
    def fit(self, X, y, **kwargs):
        nan = np.isnan(X).any(axis=1).values
        notnan = np.where(~nan)[0]
        self.cns.fit(X.iloc[notnan,:], y[notnan])
        _keep = np.abs(self.cns.coef_[:,-1]) > self.coef_threshold
        features_out = np.where(_keep)[0]
        self.features_out = X.columns[features_out]
        return self
    
    def transform(self, X):
        return X.loc[:,self.features_out]

    def get_feature_names_out(self):
        return np.array(self.features_out)

class StandardTransform(TransformerMixin, BaseEstimator):
    # scale a selection of features
    def __init__(self, cols=None):
        self.scaler = StandardScaler()
        self.cols = cols # columns to scale

    def fit(self, X, y=None):
        if self.cols is None:
            self.cols = X.columns
        self.scaler.fit(X[self.cols])
        self.feature_names_in = X.columns
        return self

    def transform(self, X):
        Xt = X.astype({col: 'float' for col in self.cols})
        Xt.loc[:,self.cols] = self.scaler.transform(Xt[self.cols])
        return Xt

    def get_feature_names_out(self):
        return np.array(self.feature_names_in)

class CorrelationSelector(TransformerMixin, BaseEstimator):
    def __init__(self, threshold=0.95):
        self.threshold = threshold
        self.features_out = None

    def fit(self, X, y=None):
        assert isinstance(X, pd.DataFrame)
        X1 = X.copy()
        corr_matrix = X.corr()
        col_corr = set() # correlated (deleted) columns
        for i in range(len(corr_matrix.columns)):
            for j in range(i):
                if (abs(corr_matrix.iloc[i, j]) >= self.threshold) and (corr_matrix.columns[j] not in col_corr):
                    colname = corr_matrix.columns[i]
                    col_corr.add(colname)
                    if colname in X1.columns:
                        del X1[colname]
        self.features_out = X1.columns
        return self
    
    def transform(self, X):
        return X.loc[:, self.features_out]
    
    def get_feature_names_out(self):
        return self.features_out

class FrequencySelector(TransformerMixin, BaseEstimator):
    def __init__(self, minfreq=0.05, mincount=np.Inf):
        # default is to use frequency cutoff
        self.minfreq = minfreq
        self.mincount = mincount
        self.features_out = None

    def fit(self, X, y=None):
        assert isinstance(X, pd.DataFrame)
        counts = X.sum(axis=0)
        freqs = counts / (~X.isna()).sum(axis=0)
        n_max = (~X.isna()).sum(axis=0).max()
        usecols = (counts >= self.mincount) if self.mincount/n_max < self.minfreq else freqs >= self.minfreq
        self.features_out = np.array(X.columns[usecols])
        return self

    def transform(self, X):
        return X.loc[:, self.features_out]

    def get_feature_names_out(self):
        return self.features_out

class Log1pTransform(TransformerMixin, BaseEstimator):
    def __init__(self,):
        self.features_out = None 
        
    def fit(self, X, y=None):
        self.features_out = X.columns
        return self
        
    def transform(self, X):
        logX = np.log1p(X)
        outX = pd.DataFrame(logX, index=X.index, columns=self.features_out)
        return outX 
    
    def get_feature_names_out(self):
        return self.features_out
    

class PCATransform(TransformerMixin, BaseEstimator):
    # accepts NA values unlike normal PCA
    def __init__(self, prefix=None, **kwargs):
        self.prefix = prefix if prefix else 'Unnamed_'
        self.features_out = None
        self.pca = PCA(**kwargs) # e.g. n_components
        
    def fit(self, X, y=None):
        Xfull = X.dropna()
        self.pca.fit(Xfull)
        self.features_out = np.array([self.prefix + str(s) for s in self.pca.get_feature_names_out()])
        return self
    
    def transform(self, X):
        Xfull = X.dropna()
        pcX = self.pca.transform(Xfull)
        outX = pd.DataFrame(pcX, 
                            index=Xfull.index, 
                            columns=self.features_out).reindex(X.index)
        return outX
    
    def get_feature_names_out(self):
        return self.features_out

# Create pipeline

In [136]:
transformer_gene_exp = Pipeline([
    ('Non-zero variance', VarianceSelector(threshold=0)),
    ('Log1p', Log1pTransform()),
    ('Standard scaling', StandardTransform()),
    ('Cox ElasticNet', CoxnetSelector(l1_ratio=0.5, coef_threshold=0.05)),
])

transformer_gene_cn = Pipeline([
    ('Non-zero variance', VarianceSelector(threshold=0)),
    ('Coxnet', CoxnetSelector(l1_ratio=0.5, coef_threshold=0.05)),
    ('Uncorrelated', CorrelationSelector(threshold=0.9)),
])

transformer_gistic = Pipeline([
    ('Non-zero variance', VarianceSelector(threshold=0)),
    ('Coxnet', CoxnetSelector(l1_ratio=0.5, coef_threshold = 0.1)),
])

transformer_sbs = Pipeline([
    ('Non-zero variance', VarianceSelector(threshold=1)),
    ('Log1p', Log1pTransform()),
    ('Standard scaling', StandardTransform()),
    ('Cox LASSO', CoxnetSelector(l1_ratio=1, coef_threshold=0.1)),
])

transformer_fish = Pipeline([
    ('Non-zero variance', VarianceSelector(threshold=0)),
    ('Coxnet', CoxnetSelector(l1_ratio=0.5, coef_threshold = 0.1)),
])

transformer_ig = Pipeline([
    ('Frequency', FrequencySelector(minfreq=0.01))
])

transformer_clin = Pipeline([
    ('Scale age', StandardTransform(cols=['Feature_clin_D_PT_age']))
])

transformer = ColumnTransformer([
    ('Gene expression', transformer_gene_exp, make_column_selector(pattern='Feature_exp_')),
    ('Gene copy number', transformer_gene_cn, make_column_selector(pattern='Feature_CNA_ENSG')),
    ('Gistic copy number', transformer_gistic, make_column_selector(pattern='Feature_CNA_(Amp|Del)')),
    ('FISH copy number', transformer_fish, make_column_selector(pattern='Feature_fish')),
    ('Mutation signatures', transformer_sbs, make_column_selector(pattern='Feature_SBS')),
    ('Clinical', transformer_clin, make_column_selector(pattern='Feature_clin')),
], remainder='passthrough').set_output(transform="pandas")

imputer_params = {
    'n_nearest_features': 10, 
    'max_iter': 100,
    'tol':1e-2,
    'skip_complete':True
}

RegressionImputer = IterativeImputer(estimator=RandomForestRegressor(), initial_strategy='mean', **imputer_params)
ClassificationImputer = IterativeImputer(estimator=RandomForestClassifier(), initial_strategy='most_frequent', **imputer_params)

imputer = ColumnTransformer([
    ('Continuous variables', RegressionImputer, make_column_selector(pattern='(Feature_(exp|clin_D_PT_age|SBS)|pca)')),
    ('Categorical variables', ClassificationImputer, make_column_selector(pattern='Feature_(?!exp|clin_D_PT_age|SBS)'))
], remainder='drop').set_output(transform="pandas")

pca = ColumnTransformer([
    ('Gene expression', PCATransform(prefix='Feature_exp',n_components=10), make_column_selector(pattern='Feature_exp')),
    ('Gene copy number', PCATransform(prefix='Feature_CN_gene',n_components=10), make_column_selector(pattern='Feature_CNA_ENSG')),
    ('Gistic copy number', PCATransform(prefix='Feature_CN_gistic',n_components=10), make_column_selector(pattern='Feature_CNA_(RNASeq|SeqWGS)')),
    ('FISH copy number', PCATransform(prefix='Feature_CN_fish',n_components=10), make_column_selector(pattern='Feature_fish)')),
], remainder='passthrough').set_output(transform="pandas")

pipeline = Pipeline([
    ('Feature selection', transformer),
    ('Joint imputation', imputer),
    ('PCA', pca)
])
pipeline

In [134]:
out = pipeline.fit_transform(features, y)

  cols = cols[cols.str.contains(self.pattern, regex=True)]
  cols = cols[cols.str.contains(self.pattern, regex=True)]


KeyboardInterrupt: 

In [34]:
out.filter(regex='Feature_exp')

Unnamed: 0_level_0,Gene expression__Feature_exp_pca0,Gene expression__Feature_exp_pca1,Gene expression__Feature_exp_pca2,Gene expression__Feature_exp_pca3,Gene expression__Feature_exp_pca4,Gene expression__Feature_exp_pca5,Gene expression__Feature_exp_pca6,Gene expression__Feature_exp_pca7,Gene expression__Feature_exp_pca8,Gene expression__Feature_exp_pca9,...,Gene expression__Feature_exp_pca655,Gene expression__Feature_exp_pca656,Gene expression__Feature_exp_pca657,Gene expression__Feature_exp_pca658,Gene expression__Feature_exp_pca659,Gene expression__Feature_exp_pca660,Gene expression__Feature_exp_pca661,Gene expression__Feature_exp_pca662,Gene expression__Feature_exp_pca663,Gene expression__Feature_exp_pca664
PUBLIC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MMRF_1007,,,,,,,,,,,...,,,,,,,,,,
MMRF_1011,,,,,,,,,,,...,,,,,,,,,,
MMRF_1013,,,,,,,,,,,...,,,,,,,,,,
MMRF_1014,,,,,,,,,,,...,,,,,,,,,,
MMRF_1016,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MMRF_2846,,,,,,,,,,,...,,,,,,,,,,
MMRF_2847,0.115155,-0.262712,9.547609,0.174491,-1.700913,-2.594641,0.453928,2.332060,3.599387,3.843321,...,0.001026,0.007154,-0.035740,-0.027436,0.005234,0.037007,-0.010941,0.129442,-0.037312,1.899866e-15
MMRF_2848,-4.057280,5.160718,7.087989,-0.509473,0.181885,2.476096,-2.930544,2.171186,-0.907540,1.734954,...,-0.010497,-0.012944,0.043653,0.027963,0.014943,0.000232,-0.031268,0.041034,0.000061,7.341321e-16
MMRF_2851,-3.540592,-1.420694,6.074475,1.946850,2.362985,-1.068620,-0.522922,-0.722730,-1.051358,-1.105945,...,-0.010703,-0.005334,0.057544,0.053831,0.067620,0.005861,-0.074586,0.006660,-0.009274,-6.120133e-16


In [None]:
out.filter(regex='Feature_CN_gistic')

Unnamed: 0_level_0,Gistic copy number__Feature_CN_gisticpca0,Gistic copy number__Feature_CN_gisticpca1,Gistic copy number__Feature_CN_gisticpca2,Gistic copy number__Feature_CN_gisticpca3,Gistic copy number__Feature_CN_gisticpca4,Gistic copy number__Feature_CN_gisticpca5,Gistic copy number__Feature_CN_gisticpca6,Gistic copy number__Feature_CN_gisticpca7,Gistic copy number__Feature_CN_gisticpca8,Gistic copy number__Feature_CN_gisticpca9,...,Gistic copy number__Feature_CN_gisticpca87,Gistic copy number__Feature_CN_gisticpca88,Gistic copy number__Feature_CN_gisticpca89,Gistic copy number__Feature_CN_gisticpca90,Gistic copy number__Feature_CN_gisticpca91,Gistic copy number__Feature_CN_gisticpca92,Gistic copy number__Feature_CN_gisticpca93,Gistic copy number__Feature_CN_gisticpca94,Gistic copy number__Feature_CN_gisticpca95,Gistic copy number__Feature_CN_gisticpca96
PUBLIC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MMRF_1007,,,,,,,,,,,...,,,,,,,,,,
MMRF_1011,,,,,,,,,,,...,,,,,,,,,,
MMRF_1013,,,,,,,,,,,...,,,,,,,,,,
MMRF_1014,,,,,,,,,,,...,,,,,,,,,,
MMRF_1016,2.490232,2.648216,-2.598980,0.724026,1.147801,-0.495525,-1.797174,0.024392,-0.472550,-0.614534,...,0.421373,-0.158997,-0.106580,0.130419,0.048369,0.054240,0.036508,-0.003299,-0.010472,-0.033015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MMRF_2846,,,,,,,,,,,...,,,,,,,,,,
MMRF_2847,1.365294,-1.561895,0.678246,0.228827,0.070804,-0.536736,-0.147391,1.997537,-0.253596,-0.849476,...,0.563214,-0.126622,-0.056641,-0.121586,0.091660,-0.116242,0.021429,0.018906,0.026588,-0.041085
MMRF_2848,-1.133501,2.237886,-0.598636,0.017751,0.983006,0.713377,-0.762593,-0.244717,-0.244440,-0.688495,...,0.100512,0.359694,0.006428,-0.147517,0.000310,-0.015194,-0.091654,-0.030962,-0.071680,-0.019823
MMRF_2851,-2.924364,-0.254533,1.515942,0.182573,-0.457697,-0.215411,-0.218937,-0.475667,-0.608096,0.033646,...,0.093671,-0.188176,0.084344,-0.074361,-0.060737,-0.346298,0.295009,-0.267096,-0.033595,-0.033732


In [None]:
%%time
out = pipeline.fit_transform(features, y)
print(out.shape)

In [None]:
out.filter(regex='Gistic copy number').shape

In [None]:
out.filter(regex='Gene copy number').shape

In [None]:
out.filter(regex='Gene expression').shape

In [None]:
out.filter(regex='FISH copy number').shape

In [None]:
scratchdir="/scratch/users/nus/e1083772/cancer-survival-ml/data/splits"

train_out_features_file=f'{scratchdir}/{shuffle}/{fold}/features_subset.parquet'
valid_out_features_file=f'{scratchdir}/{shuffle}/{fold}/valid_features_subset.parquet'

out.to_parquet(train_out_features_file)
outv.to_parquet(valid_out_features_file)