In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from collections import Counter
from sklearn.metrics import accuracy_score, roc_curve, auc,confusion_matrix,ConfusionMatrixDisplay,f1_score,recall_score,precision_score
import pickle
from sklearn.pipeline import Pipeline


In [2]:
# Load the combined table; the first CSV column becomes the row index.
data = pd.read_csv("combo.csv", index_col=0)

In [3]:
# Preview the loaded table: a 'Subgroup' label column followed by the numbered feature columns.
data

Unnamed: 0,Subgroup,1,2,3,4,5,6,7,8,9,...,2825,2826,2827,2828,2829,2830,2831,2832,2833,2834
Array.10,HER2+,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,1,1
Array.100,HR+,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Array.101,HR+,0,0,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Array.102,Triple Neg,0,0,0,0,-1,-1,-1,0,-1,...,1,1,1,1,1,1,1,1,1,1
Array.104,Triple Neg,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Array.93,Triple Neg,0,0,1,1,1,1,1,1,0,...,1,1,1,1,1,1,1,1,1,1
Array.94,HR+,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Array.95,HR+,1,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Array.98,HR+,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


In [4]:
# Separate the feature matrix (every column except 'Subgroup') from the class labels.
x = data.drop(columns='Subgroup')
y = data['Subgroup']

In [5]:
# Stand-alone linear SVM with a fixed seed; dual='auto' lets scikit-learn pick the
# primal or dual formulation.
# NOTE(review): `clf` is not referenced by any later cell visible here - confirm it is still needed.
clf = LinearSVC(dual='auto', random_state=0)

In [16]:
# Training pipeline with two named steps:
#   'feat_sel' - recursive feature elimination (RFE) ranked by a LinearSVC with C=0.05,
#                removing 100 features per elimination round (step=100).
#   'model'    - the LinearSVC classifier fitted on the features that survive selection.
# To use a different selection method (e.g. SelectKBest) or a different scikit-learn
# model, replace the corresponding step; see the scikit-learn documentation for details.
pipe = Pipeline(
    steps=[
        (
            'feat_sel',
            RFE(
                estimator=LinearSVC(C=0.05, dual='auto', random_state=0),
                step=100,
            ),
        ),
        ('model', LinearSVC(dual='auto', random_state=0)),
    ]
)

In [17]:
# Nested cross-validation to estimate the test performance of the training pipeline.
# The inner 5-fold grid search tunes the hyperparameters; the outer 3-fold loop scores
# the tuned pipeline. The whole procedure is repeated N_TRIALS times, each with a
# different shuffle seed, and the mean outer-fold score of every trial is recorded.
N_TRIALS = 10
param_grid = {
    'feat_sel__n_features_to_select': [50, 75, 100],
    'model__C': [0.01, 0.05, 0.1],
}
test_score_nested = []
for i in range(N_TRIALS):
    # Both splitters share the trial index as seed so every trial is reproducible.
    outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=i)
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    # `search` stays bound after the loop, configured with the last trial's inner splitter.
    search = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        cv=inner_cv,
        n_jobs=-1,
    )
    test_score = cross_val_score(search, x, y, cv=outer_cv, n_jobs=-1)
    test_score_nested.append(test_score.mean())

In [18]:
# One entry per nested-CV trial: the mean of that trial's outer-fold scores.
test_score_nested

[0.7596553773024363,
 0.8205585264408795,
 0.7905525846702317,
 0.7706476530005942,
 0.8701723113487819,
 0.7798573975044564,
 0.7905525846702318,
 0.8505644682115271,
 0.8389780154486036,
 0.8110516934046346]

In [19]:
# Search for the best hyperparameters on the whole training set.
# The grid search is built explicitly here instead of reusing the `search` object that
# happens to be left over from the last nested-CV loop iteration; relying on a leaked
# loop variable is hidden state that silently changes if the loop is edited.
# The splitter uses the same settings as that last iteration (5 folds, seed
# N_TRIALS - 1), so the fitted result is the same as before.
final_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=N_TRIALS - 1)
search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=final_cv, n_jobs=-1)
search.fit(x, y)

In [20]:
# Pipeline refit on all of x, y with the best parameter combination found by the grid search
# (GridSearchCV refits by default because no `refit` argument was passed).
search.best_estimator_

In [22]:
# Mean cross-validated score of the best parameter combination.
# NOTE: this score was also used to choose the hyperparameters, so it is an optimistic
# estimate; the nested-CV scores are the less biased measure of test performance.
search.best_score_

0.86

In [23]:
# Extract the fitted feature-selection step (RFE) from the best pipeline.
rfe = search.best_estimator_.named_steps['feat_sel']

In [24]:
# Extract the fitted classifier step from the best pipeline.
model = search.best_estimator_.named_steps['model']

In [30]:
# Boolean mask with one entry per column of x: True where RFE kept the feature.
mask = rfe.get_support(indices=False)
mask

array([False, False, False, ..., False, False, False])

In [25]:
# Column names of the features RFE retained, i.e. the inputs the best model was fitted on.
mask = rfe.get_support(indices=False)
features = x.columns[mask]

In [26]:
# Show the names of the selected feature columns.
features

Index(['38', '40', '42', '119', '178', '190', '191', '193', '195', '227',
       '230', '231', '265', '306', '397', '417', '433', '487', '549', '600',
       '621', '622', '674', '675', '696', '762', '765', '766', '792', '802',
       '853', '855', '856', '986', '1000', '1010', '1080', '1092', '1144',
       '1244', '1307', '1552', '1562', '1605', '1656', '1657', '1658', '1671',
       '1673', '1678', '1679', '1680', '1681', '1818', '1870', '1873', '1882',
       '1901', '1903', '1957', '1972', '1973', '1974', '2022', '2024', '2025',
       '2027', '2028', '2046', '2057', '2126', '2185', '2199', '2207', '2208',
       '2211', '2214', '2215', '2219', '2380', '2384', '2424', '2429', '2492',
       '2496', '2516', '2529', '2549', '2550', '2609', '2642', '2743', '2750',
       '2751', '2761', '2772', '2816', '2817', '2818', '2831'],
      dtype='object')

In [32]:
# Class labels known to the fitted classifier; the rows of model.coef_ follow this order.
model.classes_

array(['HER2+', 'HR+', 'Triple Neg'], dtype=object)

In [31]:
# Linear-model weights: one row per class in model.classes_, one column per selected feature.
# Some models, such as random forests, expose `feature_importances_` instead of `coef_`;
# check the documentation of your model.
model.coef_

array([[-2.48072519e-02, -6.06272163e-02, -3.07125201e-02,
         7.39154930e-02, -4.93605100e-02, -5.40609367e-02,
        -5.61214555e-02,  1.51811050e-02, -4.73076997e-02,
         7.87887505e-03, -1.14384773e-02, -4.61415265e-04,
        -7.14707200e-02, -2.60761778e-02,  1.91006141e-02,
         3.56351421e-02, -2.97913870e-02, -8.01933944e-02,
         6.36495174e-02, -8.67964797e-03,  3.07005324e-02,
         3.83947661e-02, -3.54945856e-02, -1.79105155e-02,
        -1.20852116e-02,  4.96626299e-02,  3.84097779e-02,
         1.72596770e-03,  1.82062209e-02,  2.39836085e-02,
         6.55018809e-02,  8.75115400e-02,  7.04233102e-02,
        -5.59533476e-02, -4.43430190e-02, -6.74990805e-03,
         2.23503648e-02, -8.30488588e-02, -2.23532976e-03,
        -7.40089021e-03,  3.99945204e-02, -1.59155954e-03,
        -3.86889040e-03, -1.24097603e-02,  5.86280717e-03,
         1.46066936e-02, -6.85106455e-03,  1.64976775e-02,
         7.26048598e-03,  7.87977363e-03,  6.75575393e-0

In [28]:
# Persist the fitted classifier on its own (kept for existing consumers of this file).
# NOTE: `model` was fitted on the RFE-selected columns only (see `features`), so it
# cannot score the full feature table by itself.
with open('linearSVC.pickle', 'wb') as f:
    pickle.dump(model, f)

# Also persist the complete fitted pipeline (feature selection + classifier) so that
# predictions can be made directly from a table with the same columns as `x`.
# Only unpickle these files from trusted sources: pickle can execute arbitrary code on load.
with open('linearSVC_pipeline.pickle', 'wb') as f:
    pickle.dump(search.best_estimator_, f)