In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier, RandomForestClassifier

## Note: 
#### Aim 1: Validating the original classifier - Elastic Net
#### Aim 2: Training new classifiers

### For Aim 2:
### Preprocessing:
#### - Missing values in both training and testing datasets are replaced by mean; 
#### - Invalid samples and family member samples are removed
#### - Merge 4 datasets as training and the left 1 as testing (repeated 5 times)
### Data Engineering:
#### - Variable selection method: Logistic Regression / Random Forest;
### Models:
#### Stacking: SVM, LR, RF, GB, MNB
#### Voting: SVM, LR, RF, GB

In [3]:
# Read Datasets

# 1658 * 834
data_ERISK = pd.read_csv('ERisk_data.csv')
# 1464 * 834
data_ERISK = data_ERISK.dropna()

# 614 rows × 834 columns
data_BSGS = pd.read_csv('BSGS_data.csv')
# 358 rows × 834 columns
data_BSGS = data_BSGS[data_BSGS['label']!='0']
data_BSGS['label'] = data_BSGS['label'].replace(['MZ','DZ'],[1,0])
data_BSGS = data_BSGS.fillna(data_BSGS.mean())

# 180 * 834
data_DENMARK = pd.read_csv('DENMARK_data.csv')
data_DENMARK['label'] = data_DENMARK['label'].replace([2],[0])

# 479 * 832
data_AMDTSS = pd.read_csv('AMDTSS_data.csv')
# 264 * 832 - removing family members
data_AMDTSS = data_AMDTSS[data_AMDTSS['label']!='Sister']
data_AMDTSS['label'] = data_AMDTSS['label'].replace(['MZ','DZ'],[1,0])

# 648 * 834
data_EMTAB = pd.read_csv('EMTAB_data.csv')
data_EMTAB['label'] = data_EMTAB['label'].replace(['dizygotic', 'monozygotic'],[0, 1])
data_EMTAB = data_EMTAB.fillna(data_EMTAB.mean())

### Stacking

In [4]:
# get a stacking ensemble of models
def get_stacking():
	# define the base models
	level0 = list()
	level0.append(('lr', LogisticRegression(tol = 0.001, solver = 'sag', penalty = 'l2', C = 30)))
	level0.append(('rf', RandomForestClassifier(n_estimators = 500, min_impurity_decrease = 1e-06, max_depth = 50, criterion = 'gini')))
	level0.append(('gb', GradientBoostingClassifier(n_estimators = 300, max_depth = 5, learning_rate = 0.5)))
	level0.append(('svm', SVC(kernel = 'rbf', gamma = 'scale', degree = 1, decision_function_shape = 'ovr', C = 20)))
	level0.append(('mnb', GaussianNB()))
	# define meta learner model
	level1 = LogisticRegression(tol = 0.001, solver = 'sag', penalty = 'l2', C = 30)
	# define the stacking ensemble
	model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
	return model

# Training: E-Risk, BSGS, Denmark, AMDTSS
# Testing: E-MTAB

## Preprocessing

In [5]:
#2202 * 834
train_data1 = pd.concat([data_ERISK, data_BSGS, data_DENMARK])
train_data1

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.0,0.233895,0.732645,0.750664,0.149491,0.524413,0.618680,0.234315,0.236759,0.890437,...,0.174961,0.906643,0.581697,0.863217,0.798059,0.723334,0.130358,0.177975,0.821848,0.391814
156,0.0,0.247867,0.744726,0.477351,0.211166,0.500541,0.604047,0.259998,0.343025,0.705436,...,0.165785,0.913656,0.615752,0.832152,0.712137,0.582116,0.300884,0.234928,0.620441,0.187204
157,0.0,0.247103,0.768668,0.489964,0.235903,0.515031,0.663927,0.295938,0.331253,0.679972,...,0.180183,0.936972,0.580143,0.852484,0.725882,0.614776,0.291890,0.213946,0.573433,0.174113
158,0.0,0.525080,0.812421,0.533161,0.398285,0.603025,0.545732,0.494836,0.403702,0.749127,...,0.182134,0.935897,0.435156,0.783916,0.682231,0.671497,0.108399,0.149804,0.330827,0.346468


In [6]:
# 2002 * 832
train_data1 = train_data1.loc[:,[i for i in data_AMDTSS.columns]]
train_data1

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.0,0.233895,0.732645,0.750664,0.149491,0.524413,0.618680,0.234315,0.236759,0.890437,...,0.174961,0.906643,0.581697,0.863217,0.798059,0.723334,0.130358,0.177975,0.821848,0.391814
156,0.0,0.247867,0.744726,0.477351,0.211166,0.500541,0.604047,0.259998,0.343025,0.705436,...,0.165785,0.913656,0.615752,0.832152,0.712137,0.582116,0.300884,0.234928,0.620441,0.187204
157,0.0,0.247103,0.768668,0.489964,0.235903,0.515031,0.663927,0.295938,0.331253,0.679972,...,0.180183,0.936972,0.580143,0.852484,0.725882,0.614776,0.291890,0.213946,0.573433,0.174113
158,0.0,0.525080,0.812421,0.533161,0.398285,0.603025,0.545732,0.494836,0.403702,0.749127,...,0.182134,0.935897,0.435156,0.783916,0.682231,0.671497,0.108399,0.149804,0.330827,0.346468


In [7]:
# 2530 * 832
train_data1 = pd.concat([train_data1, data_AMDTSS])
train_data1

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.0,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,...,0.285387,0.953237,0.542180,0.869079,0.713411,0.647941,0.176504,0.168747,0.529537,0.275265
472,1.0,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,...,0.098905,0.945503,0.361169,0.766231,0.652946,0.603185,0.143384,0.172315,0.522733,0.230949
475,1.0,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,...,0.208649,0.962932,0.609414,0.827691,0.850505,0.631754,0.077538,0.129992,0.617378,0.242766
477,0.0,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,...,0.223300,0.917177,0.357777,0.776557,0.607103,0.595563,0.108415,0.338011,0.444886,0.302313


In [8]:
# Training 75% developing 25%
x_train1, x_dev1, y_train1, y_dev1 = train_test_split(train_data1.drop(columns=['label']), train_data1['label'])
x_train1.shape, x_dev1.shape, y_train1.shape, y_dev1.shape

((1684, 831), (562, 831), (1684,), (562,))

## Validate Original Classifier

In [9]:
en1 = LogisticRegression(penalty = "elasticnet", solver = "saga", l1_ratio = 0.5)
en1.fit(x_train1, y_train1)
y_EMTAB = data_EMTAB['label']
x_EMTAB = data_EMTAB.loc[:,[i for i in data_AMDTSS.columns]]
x_EMTAB = x_EMTAB.drop(['label'], axis = 1)
en_auc1 = roc_auc_score(y_EMTAB, en1.predict_proba(x_EMTAB)[:, 1])
en_auc1



0.7223243464052287

## Train New Classifiers

## Variable Selection by RF

In [10]:
# Grid search for rf
# The number of trees in the forest.
n_estimators = [50, 100, 200, 300, 500]
# The function to measure the quality of a split
criterion = ["gini", "entropy"]
# A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
min_impurity_decrease = [0.1, 0.000001, 0.00001]
# The maximum depth of the tree.
max_depth = [20, 50, 100, 500, 1000]

param_distributions = dict(n_estimators = n_estimators, criterion = criterion, min_impurity_decrease = min_impurity_decrease, max_depth = max_depth)
rf = RandomForestClassifier()
grid_rf = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions, scoring = "roc_auc",
                          verbose = 1, n_jobs = -1) 
grid_result_rf = grid_rf.fit(x_train1, y_train1) 
grid_result_rf.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomForestClassifier(max_depth=500, min_impurity_decrease=1e-05,
                       n_estimators=500)

In [11]:
# Variable selection by random forest
rf_selection1 = SelectFromModel(grid_result_rf.best_estimator_)
rf_selection1.fit(x_train1, y_train1)

SelectFromModel(estimator=RandomForestClassifier(max_depth=500,
                                                 min_impurity_decrease=1e-05,
                                                 n_estimators=500))

In [12]:
# selected variables
selected_feat_rf1 = x_train1.columns[(rf_selection1.get_support())]
len(selected_feat_rf1)

280

In [13]:
rf_selected_train_data1 = train_data1.loc[:,[i for i in selected_feat_rf1]]
rf_selected_train_data1['label'] = train_data1['label'] 
rf_selected_train_data1

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg14770527,cg23316599,cg10933186,cg11108474,...,cg09243445,cg14692950,cg23604683,cg09009380,cg09990584,cg07903023,cg07635017,cg08641118,cg17805624,label
0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.597436,0.396608,0.850368,0.479727,...,0.306735,0.672161,0.596974,0.323892,0.342347,0.641294,0.532118,0.081238,0.279428,1.0
1,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.570267,0.419441,0.770310,0.483118,...,0.316390,0.660461,0.648905,0.315845,0.321944,0.668892,0.546033,0.099777,0.360946,1.0
2,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.510173,0.479463,0.741130,0.368669,...,0.335758,0.661402,0.671847,0.313867,0.327677,0.433934,0.536528,0.079384,0.296357,1.0
3,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.481978,0.510254,0.667161,0.360383,...,0.355541,0.620736,0.654326,0.300121,0.303911,0.416025,0.557829,0.067174,0.292479,1.0
4,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.549200,0.506738,0.731597,0.401675,...,0.359560,0.675501,0.630504,0.313593,0.333914,0.593111,0.546425,0.075612,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.759353,0.573460,0.799908,0.401122,...,0.365249,0.789125,0.759704,0.389640,0.385300,0.699565,0.647941,0.176504,0.275265,0.0
472,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.700554,0.475778,0.679567,0.298102,...,0.362257,0.730544,0.710898,0.337200,0.347106,0.598367,0.603185,0.143384,0.230949,1.0
475,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.686412,0.454601,0.703902,0.322749,...,0.403914,0.841245,0.780795,0.410973,0.507703,0.744434,0.631754,0.077538,0.242766,1.0
477,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.764943,0.376560,0.921208,0.445176,...,0.381835,0.819452,0.786505,0.340245,0.452558,0.637866,0.595563,0.108415,0.302313,0.0


In [14]:
# Training 75% developing 25%
x_train_rf1, x_dev_rf1, y_train_rf1, y_dev_rf1 = train_test_split(rf_selected_train_data1.drop(columns=['label']), rf_selected_train_data1['label'])
x_train_rf1.shape, x_dev_rf1.shape, y_train_rf1.shape, y_dev_rf1.shape

((1684, 280), (562, 280), (1684,), (562,))

In [15]:
rf_selected1 = grid_result_rf.best_estimator_
rf_selected1.fit(x_train_rf1, y_train_rf1)
rf_selected_auc1 = roc_auc_score(y_dev_rf1, rf_selected1.predict_proba(x_dev_rf1)[:, 1])
rf_selected_auc1

0.8345204391437465

In [16]:
y_EMTAB_rf_selected = data_EMTAB['label']
x_EMTAB_rf_selected = data_EMTAB.loc[:,[i for i in selected_feat_rf1]]

In [17]:
rf_selected_auc_EMTAB = roc_auc_score(y_EMTAB_rf_selected, rf_selected1.predict_proba(x_EMTAB_rf_selected)[:, 1])
rf_selected_auc_EMTAB

0.641452205882353

In [18]:
stacking_rf_EMTAB = get_stacking()
stacking_rf_EMTAB.fit(x_train_rf1, y_train_rf1)
stacking_rf_EMTAB_auc = roc_auc_score(y_EMTAB_rf_selected, stacking_rf_EMTAB.predict_proba(x_EMTAB_rf_selected)[:, 1])
stacking_rf_EMTAB_auc



0.6255718954248366

In [19]:
clf1 = LogisticRegression(tol = 0.001, solver = 'sag', penalty = 'l2', C = 30)
clf2 = RandomForestClassifier(n_estimators = 500, min_impurity_decrease = 1e-06, max_depth = 50, criterion = 'gini')
clf3 = GradientBoostingClassifier(n_estimators = 300, max_depth = 5, learning_rate = 0.5)
clf4 = SVC(kernel = 'rbf', gamma = 'scale', degree = 1, decision_function_shape = 'ovr', C = 20, probability=True)

In [20]:
voting_rf_EMTAB = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)], voting='soft')
voting_rf_EMTAB.fit(x_train_rf1, y_train_rf1)
voting__rf_EMTAB_auc = roc_auc_score(y_EMTAB_rf_selected, voting_rf_EMTAB.predict_proba(x_EMTAB_rf_selected)[:, 1])
voting__rf_EMTAB_auc



0.6876633986928105

## Variable Selection by LR

In [21]:
# Grid search for lr
penalty = ["l1", "l2", "elasticnet"]
# Tolerance for stopping criteria.
tol = [0.00001, 0.001, 0.0000001]
# Inverse of regularization strength
C = [0,1, 0.5, 1, 10, 20, 30, 50, 100]
# Algorithm to use in the optimization problem
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


param_distributions = dict(penalty = penalty, tol = tol, C = C, solver = solver)
lr = LogisticRegression()
grid_lr = RandomizedSearchCV(estimator = lr, param_distributions = param_distributions, scoring = "roc_auc",
                          verbose = 1, n_jobs = -1) 
grid_result_lr = grid_lr.fit(x_train1, y_train1) 
grid_result_lr.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1314, in fit
    raise ValueError("l1_ratio must be between 0 and 1;"
ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1314, in fit
    raise ValueError("l1_ratio must be between 0 and 1;"
ValueError: l1_ratio must be between 0 and 1; got (l1_ratio

LogisticRegression(C=1, solver='newton-cg', tol=1e-05)

In [22]:
lr_selection1 = SelectFromModel(grid_result_lr.best_estimator_)
lr_selection1.fit(x_train1, y_train1)

SelectFromModel(estimator=LogisticRegression(C=1, solver='newton-cg',
                                             tol=1e-05))

In [23]:
selected_feat_lr1 = x_train1.columns[(lr_selection1.get_support())]
len(selected_feat_lr1)

358

In [24]:
lr_selected_train_data1 = train_data1.loc[:,[i for i in selected_feat_lr1]]
lr_selected_train_data1['label'] = train_data1['label'] 
lr_selected_train_data1

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg26916862,cg03124146,cg14770527,cg23316599,cg11108474,cg16340103,...,cg06495631,cg07903023,cg14466863,cg15236528,cg07291889,cg11174855,cg11359720,cg08641118,cg17805624,label
0,0.233353,0.606732,0.730107,0.240143,0.194961,0.144218,0.597436,0.396608,0.479727,0.556758,...,0.570667,0.641294,0.825537,0.156715,0.706627,0.327951,0.731394,0.081238,0.279428,1.0
1,0.284813,0.599726,0.715363,0.242588,0.192382,0.139766,0.570267,0.419441,0.483118,0.481381,...,0.589975,0.668892,0.823126,0.220170,0.740209,0.195741,0.745175,0.099777,0.360946,1.0
2,0.206618,0.552816,0.572559,0.169127,0.197505,0.193932,0.510173,0.479463,0.368669,0.513161,...,0.507627,0.433934,0.748225,0.178511,0.643731,0.083195,0.773520,0.079384,0.296357,1.0
3,0.203151,0.655871,0.391728,0.224729,0.178132,0.181609,0.481978,0.510254,0.360383,0.432874,...,0.507012,0.416025,0.761062,0.086153,0.709617,0.134511,0.753129,0.067174,0.292479,1.0
4,0.266709,0.493554,0.395203,0.231550,0.194651,0.170328,0.549200,0.506738,0.401675,0.475677,...,0.673096,0.593111,0.805394,0.137047,0.727944,0.069399,0.824286,0.075612,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.421692,0.819136,0.889120,0.274042,0.415842,0.387231,0.759353,0.573460,0.401122,0.677112,...,0.645043,0.699565,0.780383,0.128186,0.855644,0.285387,0.713411,0.176504,0.275265,0.0
472,0.155194,0.650936,0.436741,0.127649,0.214664,0.116075,0.700554,0.475778,0.298102,0.585688,...,0.487461,0.598367,0.734427,0.134853,0.777445,0.098905,0.652946,0.143384,0.230949,1.0
475,0.170119,0.668992,0.532885,0.141059,0.177975,0.141652,0.686412,0.454601,0.322749,0.536903,...,0.561416,0.744434,0.805425,0.146198,0.836003,0.208649,0.850505,0.077538,0.242766,1.0
477,0.316241,0.642261,0.906532,0.303791,0.242378,0.278751,0.764943,0.376560,0.445176,0.485595,...,0.566642,0.637866,0.760405,0.130743,0.857884,0.223300,0.607103,0.108415,0.302313,0.0


In [25]:
# Training 75% developing 25%
x_train_lr1, x_dev_lr1, y_train_lr1, y_dev_lr1 = train_test_split(lr_selected_train_data1.drop(columns=['label']), lr_selected_train_data1['label'])
x_train_lr1.shape, x_dev_lr1.shape, y_train_lr1.shape, y_dev_lr1.shape

((1684, 358), (562, 358), (1684,), (562,))

In [26]:
lr_selected1 = grid_result_lr.best_estimator_
lr_selected1.fit(x_train_lr1, y_train_lr1)
lr_selected_auc1 = roc_auc_score(y_dev_lr1, lr_selected1.predict_proba(x_dev_lr1)[:, 1])
lr_selected_auc1

0.841263808655113

In [27]:
y_EMTAB_lr_selected = data_EMTAB['label']
x_EMTAB_lr_selected = data_EMTAB.loc[:,[i for i in selected_feat_lr1]]

In [28]:
lr_selected_auc_EMTAB = roc_auc_score(y_EMTAB_lr_selected, lr_selected1.predict_proba(x_EMTAB_lr_selected)[:, 1])
lr_selected_auc_EMTAB

0.7087724673202613

In [29]:
stacking_lr_EMTAB = get_stacking()
stacking_lr_EMTAB.fit(x_train_lr1, y_train_lr1)
stacking_lr_EMTAB_auc = roc_auc_score(y_EMTAB_lr_selected, stacking_lr_EMTAB.predict_proba(x_EMTAB_lr_selected)[:, 1])
stacking_lr_EMTAB_auc



0.658935866013072

In [30]:
voting_lr_EMTAB = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)], voting='soft')
voting_lr_EMTAB.fit(x_train_lr1, y_train_lr1)
voting__lr_EMTAB_auc = roc_auc_score(y_EMTAB_lr_selected, voting_lr_EMTAB.predict_proba(x_EMTAB_lr_selected)[:, 1])
voting__lr_EMTAB_auc



0.6998672385620915

# Training: E-Risk, BSGS, Denmark, E-MTAB
# Testing: AMDTSS

In [31]:
# 2650 * 834
train_data2 = pd.concat([data_ERISK, data_BSGS, data_DENMARK, data_EMTAB])
train_data2

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1.0,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1.0,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0.0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0.0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [32]:
# 2650 * 832
train_data2 = train_data2.loc[:,[i for i in data_AMDTSS.columns]]
train_data2

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1.0,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1.0,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0.0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0.0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [33]:
# Training 75% developing 25%
x_train2, x_dev2, y_train2, y_dev2 = train_test_split(train_data2.drop(columns=['label']), train_data2['label'])
x_train2.shape, x_dev2.shape, y_train2.shape, y_dev2.shape

((1972, 831), (658, 831), (1972,), (658,))

## Validate Original Classifier

In [34]:
en2 = LogisticRegression(penalty = "elasticnet", solver = "saga", l1_ratio = 0.5)
en2.fit(x_train2, y_train2)
y_AMDTSS = data_AMDTSS['label']
x_AMDTSS = data_AMDTSS.drop(['label'], axis = 1)
en_auc2 = roc_auc_score(y_AMDTSS, en2.predict_proba(x_AMDTSS)[:, 1])
en_auc2



0.7252640036730945

## Train New Clssifiers

## Variable Selection by RF

In [35]:
# Grid search for rf
# The number of trees in the forest.
n_estimators = [50, 100, 200, 300, 500]
# The function to measure the quality of a split
criterion = ["gini", "entropy"]
# A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
min_impurity_decrease = [0.1, 0.000001, 0.00001]
# The maximum depth of the tree.
max_depth = [20, 50, 100, 500, 1000]

param_distributions = dict(n_estimators = n_estimators, criterion = criterion, min_impurity_decrease = min_impurity_decrease, max_depth = max_depth)
rf = RandomForestClassifier()
grid_rf2 = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions, scoring = "roc_auc",
                          verbose = 1, n_jobs = -1) 
grid_result_rf2 = grid_rf2.fit(x_train2, y_train2) 

grid_result_rf2.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomForestClassifier(max_depth=50, min_impurity_decrease=1e-05,
                       n_estimators=300)

In [36]:
# Variable selection by random forest
rf_selection2 = SelectFromModel(grid_result_rf2.best_estimator_)
rf_selection2.fit(x_train2, y_train2)

SelectFromModel(estimator=RandomForestClassifier(max_depth=50,
                                                 min_impurity_decrease=1e-05,
                                                 n_estimators=300))

In [37]:
# selected variables
selected_feat_rf2 = x_train2.columns[(rf_selection2.get_support())]
len(selected_feat_rf2)

280

In [38]:
rf_selected_train_data2 = train_data2.loc[:,[i for i in selected_feat_rf2]]
rf_selected_train_data2['label'] = train_data2['label'] 
rf_selected_train_data2

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,cg23316599,...,cg25787956,cg04182912,cg09009380,cg09990584,cg07291889,cg21808635,cg07635017,cg08641118,cg17805624,label
0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,0.396608,...,0.503054,0.588122,0.323892,0.342347,0.706627,0.780742,0.532118,0.081238,0.279428,1.0
1,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,0.419441,...,0.466924,0.645375,0.315845,0.321944,0.740209,0.696424,0.546033,0.099777,0.360946,1.0
2,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,0.479463,...,0.441580,0.689910,0.313867,0.327677,0.643731,0.606163,0.536528,0.079384,0.296357,1.0
3,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,0.510254,...,0.476676,0.677442,0.300121,0.303911,0.709617,0.680310,0.557829,0.067174,0.292479,1.0
4,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,0.506738,...,0.493368,0.565591,0.313593,0.333914,0.727944,0.722998,0.546425,0.075612,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,0.427100,...,0.556560,0.634700,0.281210,0.423670,0.846850,0.811640,0.426310,0.106370,0.148490,1.0
644,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,0.470970,...,0.517510,0.617870,0.277860,0.421900,0.806160,0.824450,0.465400,0.121100,0.114360,1.0
645,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,0.356980,...,0.493220,0.592040,0.311790,0.427700,0.797160,0.820790,0.438150,0.134000,0.127200,0.0
646,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,0.273990,...,0.479540,0.627130,0.294900,0.414230,0.841330,0.798780,0.394220,0.143910,0.156560,0.0


In [39]:
# Training 75% developing 25%
x_train_rf2, x_dev_rf2, y_train_rf2, y_dev_rf2 = train_test_split(rf_selected_train_data2.drop(columns=['label']), rf_selected_train_data2['label'])
x_train_rf2.shape, x_dev_rf2.shape, y_train_rf2.shape, y_dev_rf2.shape

((1972, 280), (658, 280), (1972,), (658,))

In [40]:
rf_selected2 = grid_result_rf2.best_estimator_
rf_selected2.fit(x_train_rf2, y_train_rf2)
rf_selected_auc2 = roc_auc_score(y_dev_rf2, rf_selected2.predict_proba(x_dev_rf2)[:, 1])
rf_selected_auc2

0.8438077548606491

In [41]:
y_AMDTSS_rf_selected = data_AMDTSS['label']
x_AMDTSS_rf_selected = data_AMDTSS.loc[:,[i for i in selected_feat_rf2]]

In [42]:
rf_selected_auc_AMDTSS = roc_auc_score(y_AMDTSS_rf_selected, rf_selected2.predict_proba(x_AMDTSS_rf_selected)[:, 1])
rf_selected_auc_AMDTSS

0.6046544995408633

In [43]:
stacking_rf_AMDTSS = get_stacking()
stacking_rf_AMDTSS.fit(x_train_rf2, y_train_rf2)
stacking_rf_AMDTSS_auc = roc_auc_score(y_AMDTSS_rf_selected, stacking_rf_AMDTSS.predict_proba(x_AMDTSS_rf_selected)[:, 1])
stacking_rf_AMDTSS_auc



0.7148186409550047

In [44]:
voting_rf_AMDTSS = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)], voting='soft')
voting_rf_AMDTSS.fit(x_train_rf2, y_train_rf2)
voting__rf_AMDTSS_auc = roc_auc_score(y_AMDTSS_rf_selected, voting_rf_AMDTSS.predict_proba(x_AMDTSS_rf_selected)[:, 1])
voting__rf_AMDTSS_auc



0.6966253443526171

## Variable Selection by LR

In [45]:
# Grid search for lr
penalty = ["l1", "l2", "elasticnet"]
# Tolerance for stopping criteria.
tol = [0.00001, 0.001, 0.0000001]
# Inverse of regularization strength
C = [0,1, 0.5, 1, 10, 20, 30, 50, 100]
# Algorithm to use in the optimization problem
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


param_distributions = dict(penalty = penalty, tol = tol, C = C, solver = solver)
lr = LogisticRegression()
grid_lr2 = RandomizedSearchCV(estimator = lr, param_distributions = param_distributions, scoring = "roc_auc",
                          verbose = 1, n_jobs = -1) 
grid_result_lr2 = grid_lr2.fit(x_train2, y_train2) 

grid_result_lr2.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 450, in _check_solver
    raise ValueError("Only 'saga' solver supports elasticnet penalty,"
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Lib

LogisticRegression(C=0.5, solver='newton-cg', tol=1e-07)

In [46]:
lr_selection2 = SelectFromModel(grid_result_lr2.best_estimator_)
lr_selection2.fit(x_train2, y_train2)

SelectFromModel(estimator=LogisticRegression(C=0.5, solver='newton-cg',
                                             tol=1e-07))

In [47]:
selected_feat_lr2 = x_train2.columns[(lr_selection2.get_support())]
len(selected_feat_lr2)

351

In [48]:
lr_selected_train_data2 = train_data2.loc[:,[i for i in selected_feat_lr2]]
lr_selected_train_data2['label'] = train_data2['label'] 
lr_selected_train_data2

Unnamed: 0,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg03124146,cg14770527,cg23316599,cg11108474,cg26262573,...,cg09990584,cg06495631,cg15236528,cg07291889,cg21808635,cg07635017,cg08641118,cg22034735,cg17805624,label
0,0.606732,0.730107,0.240143,0.561082,0.533287,0.144218,0.597436,0.396608,0.479727,0.367165,...,0.342347,0.570667,0.156715,0.706627,0.780742,0.532118,0.081238,0.508394,0.279428,1.0
1,0.599726,0.715363,0.242588,0.564277,0.578224,0.139766,0.570267,0.419441,0.483118,0.364813,...,0.321944,0.589975,0.220170,0.740209,0.696424,0.546033,0.099777,0.498426,0.360946,1.0
2,0.552816,0.572559,0.169127,0.541453,0.509944,0.193932,0.510173,0.479463,0.368669,0.334221,...,0.327677,0.507627,0.178511,0.643731,0.606163,0.536528,0.079384,0.204013,0.296357,1.0
3,0.655871,0.391728,0.224729,0.480992,0.421599,0.181609,0.481978,0.510254,0.360383,0.352629,...,0.303911,0.507012,0.086153,0.709617,0.680310,0.557829,0.067174,0.297127,0.292479,1.0
4,0.493554,0.395203,0.231550,0.474545,0.381759,0.170328,0.549200,0.506738,0.401675,0.441518,...,0.333914,0.673096,0.137047,0.727944,0.722998,0.546425,0.075612,0.526409,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,0.668810,0.538530,0.221750,0.450940,0.409590,0.202900,0.703690,0.427100,0.379050,0.481300,...,0.423670,0.362890,0.032750,0.846850,0.811640,0.426310,0.106370,0.381660,0.148490,1.0
644,0.673100,0.552430,0.211480,0.484940,0.468880,0.240040,0.703450,0.470970,0.357540,0.524900,...,0.421900,0.285190,0.034740,0.806160,0.824450,0.465400,0.121100,0.315100,0.114360,1.0
645,0.586160,0.691140,0.240640,0.481040,0.579180,0.234960,0.633500,0.356980,0.222150,0.374090,...,0.427700,0.339370,0.025840,0.797160,0.820790,0.438150,0.134000,0.261410,0.127200,0.0
646,0.679960,0.553150,0.286020,0.315550,0.421130,0.297830,0.680370,0.273990,0.159820,0.345570,...,0.414230,0.327810,0.025380,0.841330,0.798780,0.394220,0.143910,0.566440,0.156560,0.0


In [49]:
# Training 75% developing 25%
x_train_lr2, x_dev_lr2, y_train_lr2, y_dev_lr2 = train_test_split(lr_selected_train_data2.drop(columns=['label']), lr_selected_train_data2['label'])
x_train_lr2.shape, x_dev_lr2.shape, y_train_lr2.shape, y_dev_lr2.shape

((1972, 351), (658, 351), (1972,), (658,))

In [50]:
lr_selected2 = grid_result_lr2.best_estimator_
lr_selected2.fit(x_train_lr2, y_train_lr2)
lr_selected_auc2 = roc_auc_score(y_dev_lr2, lr_selected2.predict_proba(x_dev_lr2)[:, 1])
lr_selected_auc2

0.8356017556017555

In [51]:
y_AMDTSS_lr_selected = data_AMDTSS['label']
x_AMDTSS_lr_selected = data_AMDTSS.loc[:,[i for i in selected_feat_lr2]]

In [52]:
lr_selected_auc_AMDTSS = roc_auc_score(y_AMDTSS_lr_selected, lr_selected2.predict_proba(x_AMDTSS_lr_selected)[:, 1])
lr_selected_auc_AMDTSS

0.6904269972451791

In [53]:
stacking_lr_AMDTSS = get_stacking()
stacking_lr_AMDTSS.fit(x_train_lr2, y_train_lr2)
stacking_lr_AMDTSS_auc = roc_auc_score(y_AMDTSS_lr_selected, stacking_lr_AMDTSS.predict_proba(x_AMDTSS_lr_selected)[:, 1])
stacking_lr_AMDTSS_auc



0.6887052341597796

In [54]:
voting_lr_AMDTSS = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)], voting='soft')
voting_lr_AMDTSS.fit(x_train_lr2, y_train_lr2)
voting__lr_AMDTSS_auc = roc_auc_score(y_AMDTSS_lr_selected, voting_lr_AMDTSS.predict_proba(x_AMDTSS_lr_selected)[:, 1])
voting__lr_AMDTSS_auc



0.6836547291092746

# Training: E-Risk, BSGS, AMDTSS, E-MTAB
# Testing: Denmark

In [55]:
#2470 * 834
train_data3 = pd.concat([data_ERISK, data_BSGS, data_EMTAB])
train_data3

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1.0,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1.0,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0.0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0.0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [56]:
# 2470 * 832
train_data3 = train_data3.loc[:,[i for i in data_AMDTSS.columns]]
train_data3

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1.0,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1.0,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0.0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0.0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [57]:
# 2734 * 832
train_data3 = pd.concat([train_data3, data_AMDTSS])
train_data3

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.0,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,...,0.285387,0.953237,0.542180,0.869079,0.713411,0.647941,0.176504,0.168747,0.529537,0.275265
472,1.0,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,...,0.098905,0.945503,0.361169,0.766231,0.652946,0.603185,0.143384,0.172315,0.522733,0.230949
475,1.0,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,...,0.208649,0.962932,0.609414,0.827691,0.850505,0.631754,0.077538,0.129992,0.617378,0.242766
477,0.0,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,...,0.223300,0.917177,0.357777,0.776557,0.607103,0.595563,0.108415,0.338011,0.444886,0.302313


In [58]:
# Training 75% developing 25%
x_train3, x_dev3, y_train3, y_dev3 = train_test_split(train_data3.drop(columns=['label']), train_data3['label'])
x_train3.shape, x_dev3.shape, y_train3.shape, y_dev3.shape

((2050, 831), (684, 831), (2050,), (684,))

## Validate Original Classifier

In [59]:
en3 = LogisticRegression(penalty = "elasticnet", solver = "saga", l1_ratio = 0.5)
en3.fit(x_train3, y_train3)
y_DENMARK = data_DENMARK['label']
x_DENMARK = data_DENMARK.loc[:,[i for i in data_AMDTSS.columns]]
x_DENMARK = x_DENMARK.drop(['label'], axis = 1)
en_auc3 = roc_auc_score(y_DENMARK, en3.predict_proba(x_DENMARK)[:, 1])
en_auc3



0.6893796992481203

## Train New Classifiers

## Variable Selection by RF

In [60]:
# Grid search for rf
# The number of trees in the forest.
n_estimators = [50, 100, 200, 300, 500]
# The function to measure the quality of a split
criterion = ["gini", "entropy"]
# A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
min_impurity_decrease = [0.1, 0.000001, 0.00001]
# The maximum depth of the tree.
max_depth = [20, 50, 100, 500, 1000]

param_distributions = dict(n_estimators = n_estimators, criterion = criterion, min_impurity_decrease = min_impurity_decrease, max_depth = max_depth)
rf = RandomForestClassifier()
grid_rf3 = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions, scoring = "roc_auc",
                          verbose = 1, n_jobs = -1) 
grid_result_rf3 = grid_rf3.fit(x_train3, y_train3) 

grid_result_rf3.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomForestClassifier(criterion='entropy', max_depth=20,
                       min_impurity_decrease=1e-05, n_estimators=300)

In [61]:
# Variable selection by random forest
rf_selection3 = SelectFromModel(grid_result_rf3.best_estimator_)
rf_selection3.fit(x_train3, y_train3)

SelectFromModel(estimator=RandomForestClassifier(criterion='entropy',
                                                 max_depth=20,
                                                 min_impurity_decrease=1e-05,
                                                 n_estimators=300))

In [62]:
# selected variables
selected_feat_rf3 = x_train3.columns[(rf_selection3.get_support())]
len(selected_feat_rf3)

285

In [63]:
rf_selected_train_data3 = train_data3.loc[:,[i for i in selected_feat_rf3]]
rf_selected_train_data3['label'] = train_data3['label'] 
rf_selected_train_data3

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg14770527,cg23316599,cg10933186,...,cg07430760,cg01929855,cg09243445,cg27132152,cg15489799,cg09990584,cg11174855,cg21808635,cg08641118,label
0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.597436,0.396608,0.850368,...,0.090786,0.865708,0.306735,0.363816,0.434138,0.342347,0.327951,0.780742,0.081238,1.0
1,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.570267,0.419441,0.770310,...,0.086232,0.889347,0.316390,0.454734,0.456402,0.321944,0.195741,0.696424,0.099777,1.0
2,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.510173,0.479463,0.741130,...,0.062408,0.880054,0.335758,0.479652,0.476032,0.327677,0.083195,0.606163,0.079384,1.0
3,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.481978,0.510254,0.667161,...,0.119200,0.849659,0.355541,0.414647,0.461083,0.303911,0.134511,0.680310,0.067174,1.0
4,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.549200,0.506738,0.731597,...,0.129792,0.886471,0.359560,0.443079,0.502171,0.333914,0.069399,0.722998,0.075612,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.759353,0.573460,0.799908,...,0.163088,0.900970,0.365249,0.557590,0.546019,0.385300,0.285387,0.869079,0.176504,0.0
472,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.700554,0.475778,0.679567,...,0.092700,0.859307,0.362257,0.488550,0.478794,0.347106,0.098905,0.766231,0.143384,1.0
475,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.686412,0.454601,0.703902,...,0.106060,0.922259,0.403914,0.515793,0.603469,0.507703,0.208649,0.827691,0.077538,1.0
477,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.764943,0.376560,0.921208,...,0.107320,0.909179,0.381835,0.512241,0.594803,0.452558,0.223300,0.776557,0.108415,0.0


In [64]:
# Training 75% developing 25%
x_train_rf3, x_dev_rf3, y_train_rf3, y_dev_rf3 = train_test_split(rf_selected_train_data3.drop(columns=['label']), rf_selected_train_data3['label'])
x_train_rf3.shape, x_dev_rf3.shape, y_train_rf3.shape, y_dev_rf3.shape

((2050, 285), (684, 285), (2050,), (684,))

In [65]:
rf_selected3 = grid_result_rf3.best_estimator_
rf_selected3.fit(x_train_rf3, y_train_rf3)
rf_selected_auc3 = roc_auc_score(y_dev_rf3, rf_selected3.predict_proba(x_dev_rf3)[:, 1])
rf_selected_auc3

0.793661699660755

In [66]:
y_DENMARK_rf_selected = data_DENMARK['label']
x_DENMARK_rf_selected = data_DENMARK.loc[:,[i for i in selected_feat_rf3]]

In [67]:
rf_selected_auc_DENMARK = roc_auc_score(y_DENMARK_rf_selected, rf_selected3.predict_proba(x_DENMARK_rf_selected)[:, 1])
rf_selected_auc_DENMARK

0.6067512531328321

In [68]:
stacking_rf_DENMARK = get_stacking()
stacking_rf_DENMARK.fit(x_train_rf3, y_train_rf3)
stacking_rf_DENMARK_auc = roc_auc_score(y_DENMARK_rf_selected, stacking_rf_DENMARK.predict_proba(x_DENMARK_rf_selected)[:, 1])
stacking_rf_DENMARK_auc



0.6076127819548872

In [69]:
voting_rf_DENMARK = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)], voting='soft')
voting_rf_DENMARK.fit(x_train_rf3, y_train_rf3)
voting_rf_DENMARK_auc = roc_auc_score(y_DENMARK_rf_selected, voting_rf_DENMARK.predict_proba(x_DENMARK_rf_selected)[:, 1])
voting_rf_DENMARK_auc



0.6915726817042607

## Variable Selection by LR

In [71]:
# Grid search for lr
penalty = ["l1", "l2", "elasticnet"]
# Tolerance for stopping criteria.
tol = [0.00001, 0.001, 0.0000001]
# Inverse of regularization strength
C = [0,1, 0.5, 1, 10, 20, 30, 50, 100]
# Algorithm to use in the optimization problem
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


param_distributions = dict(penalty = penalty, tol = tol, C = C, solver = solver)
lr = LogisticRegression()
grid_lr3 = RandomizedSearchCV(estimator = lr, param_distributions = param_distributions, scoring = "roc_auc",
                          verbose = 1, n_jobs = -1) 
grid_result_lr3 = grid_lr3.fit(x_train3, y_train3) 

grid_result_lr3.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 450, in _check_solver
    raise ValueError("Only 'saga' solver supports elasticnet penalty,"
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Lib

LogisticRegression(C=0.5, solver='liblinear', tol=1e-05)

In [72]:
lr_selection3 = SelectFromModel(grid_result_lr3.best_estimator_)
lr_selection3.fit(x_train3, y_train3)

SelectFromModel(estimator=LogisticRegression(C=0.5, solver='liblinear',
                                             tol=1e-05))

In [73]:
selected_feat_lr3 = x_train3.columns[(lr_selection3.get_support())]
len(selected_feat_lr3)

347

In [74]:
lr_selected_train_data3 = train_data3.loc[:,[i for i in selected_feat_lr3]]
lr_selected_train_data3['label'] = train_data3['label'] 
lr_selected_train_data3

Unnamed: 0,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg03124146,cg14770527,cg11108474,cg16340103,cg04838249,...,cg09990584,cg14466863,cg11174855,cg04524933,cg11359720,cg07635017,cg08641118,cg22034735,cg17805624,label
0,0.606732,0.730107,0.240143,0.561082,0.533287,0.144218,0.597436,0.479727,0.556758,0.709753,...,0.342347,0.825537,0.327951,0.919347,0.731394,0.532118,0.081238,0.508394,0.279428,1.0
1,0.599726,0.715363,0.242588,0.564277,0.578224,0.139766,0.570267,0.483118,0.481381,0.727749,...,0.321944,0.823126,0.195741,0.945236,0.745175,0.546033,0.099777,0.498426,0.360946,1.0
2,0.552816,0.572559,0.169127,0.541453,0.509944,0.193932,0.510173,0.368669,0.513161,0.668243,...,0.327677,0.748225,0.083195,0.906838,0.773520,0.536528,0.079384,0.204013,0.296357,1.0
3,0.655871,0.391728,0.224729,0.480992,0.421599,0.181609,0.481978,0.360383,0.432874,0.745117,...,0.303911,0.761062,0.134511,0.956986,0.753129,0.557829,0.067174,0.297127,0.292479,1.0
4,0.493554,0.395203,0.231550,0.474545,0.381759,0.170328,0.549200,0.401675,0.475677,0.783492,...,0.333914,0.805394,0.069399,0.921778,0.824286,0.546425,0.075612,0.526409,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.819136,0.889120,0.274042,0.525204,0.698463,0.387231,0.759353,0.401122,0.677112,0.805374,...,0.385300,0.780383,0.285387,0.953237,0.713411,0.647941,0.176504,0.529537,0.275265,0.0
472,0.650936,0.436741,0.127649,0.458996,0.414456,0.116075,0.700554,0.298102,0.585688,0.670181,...,0.347106,0.734427,0.098905,0.945503,0.652946,0.603185,0.143384,0.522733,0.230949,1.0
475,0.668992,0.532885,0.141059,0.522190,0.469892,0.141652,0.686412,0.322749,0.536903,0.624104,...,0.507703,0.805425,0.208649,0.962932,0.850505,0.631754,0.077538,0.617378,0.242766,1.0
477,0.642261,0.906532,0.303791,0.403003,0.830418,0.278751,0.764943,0.445176,0.485595,0.663237,...,0.452558,0.760405,0.223300,0.917177,0.607103,0.595563,0.108415,0.444886,0.302313,0.0


In [75]:
# Training 75% developing 25%
x_train_lr3, x_dev_lr3, y_train_lr3, y_dev_lr3 = train_test_split(lr_selected_train_data3.drop(columns=['label']), lr_selected_train_data3['label'])
x_train_lr3.shape, x_dev_lr3.shape, y_train_lr3.shape, y_dev_lr3.shape

((2050, 347), (684, 347), (2050,), (684,))

In [76]:
lr_selected3 = grid_result_lr3.best_estimator_
lr_selected3.fit(x_train_lr3, y_train_lr3)
lr_selected_auc3 = roc_auc_score(y_dev_lr3, lr_selected3.predict_proba(x_dev_lr3)[:, 1])
lr_selected_auc3

0.8164935231499293

In [77]:
y_DENMARK_lr_selected = data_DENMARK['label']
x_DENMARK_lr_selected = data_DENMARK.loc[:,[i for i in selected_feat_lr3]]

In [78]:
lr_selected_auc_DENMARK = roc_auc_score(y_DENMARK_lr_selected, lr_selected3.predict_proba(x_DENMARK_lr_selected)[:, 1])
lr_selected_auc_DENMARK

0.6920426065162907

In [79]:
stacking_lr_DENMARK = get_stacking()
stacking_lr_DENMARK.fit(x_train_lr3, y_train_lr3)
stacking_lr_DENMARK_auc = roc_auc_score(y_DENMARK_lr_selected, stacking_lr_DENMARK.predict_proba(x_DENMARK_lr_selected)[:, 1])
stacking_lr_DENMARK_auc



0.5366541353383458

In [80]:
voting_lr_DENMARK = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)], voting='soft')
voting_lr_DENMARK.fit(x_train_lr3, y_train_lr3)
voting_lr_DENMARK_auc = roc_auc_score(y_DENMARK_lr_selected, voting_lr_DENMARK.predict_proba(x_DENMARK_lr_selected)[:, 1])
voting_lr_DENMARK_auc



0.6099624060150376

# Training: E-Risk, AMDTSS, E-MTAB, Denmark
# Testing: BSGS

In [81]:
#2292 * 834
train_data4 = pd.concat([data_ERISK, data_DENMARK, data_EMTAB])
train_data4

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1.0,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1.0,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0.0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0.0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [82]:
# 2292 * 832
train_data4 = train_data4.loc[:,[i for i in data_AMDTSS.columns]]
train_data4

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1.0,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1.0,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0.0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0.0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [83]:
# 2556 * 832
train_data4 = pd.concat([train_data4, data_AMDTSS])
train_data4

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.0,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,...,0.285387,0.953237,0.542180,0.869079,0.713411,0.647941,0.176504,0.168747,0.529537,0.275265
472,1.0,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,...,0.098905,0.945503,0.361169,0.766231,0.652946,0.603185,0.143384,0.172315,0.522733,0.230949
475,1.0,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,...,0.208649,0.962932,0.609414,0.827691,0.850505,0.631754,0.077538,0.129992,0.617378,0.242766
477,0.0,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,...,0.223300,0.917177,0.357777,0.776557,0.607103,0.595563,0.108415,0.338011,0.444886,0.302313


In [84]:
# Training 75% developing 25%
x_train4, x_dev4, y_train4, y_dev4 = train_test_split(train_data4.drop(columns=['label']), train_data4['label'])
x_train4.shape, x_dev4.shape, y_train4.shape, y_dev4.shape

((1902, 831), (634, 831), (1902,), (634,))

## Validate Original Classifier

In [85]:
en4 = LogisticRegression(penalty = "elasticnet", solver = "saga", l1_ratio = 0.5)
en4.fit(x_train4, y_train4)
y_BSGS = data_BSGS['label']
x_BSGS = data_BSGS.loc[:,[i for i in data_AMDTSS.columns]]
x_BSGS = x_BSGS.drop(['label'], axis = 1)
en_auc4 = roc_auc_score(y_BSGS, en4.predict_proba(x_BSGS)[:, 1])
en_auc4



0.8167414050822123

## Train New Classifiers

## Variable Selection by RF

In [86]:
# Grid search for rf
# The number of trees in the forest.
n_estimators = [50, 100, 200, 300, 500]
# The function to measure the quality of a split
criterion = ["gini", "entropy"]
# A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
min_impurity_decrease = [0.1, 0.000001, 0.00001]
# The maximum depth of the tree.
max_depth = [20, 50, 100, 500, 1000]

param_distributions = dict(n_estimators = n_estimators, criterion = criterion, min_impurity_decrease = min_impurity_decrease, max_depth = max_depth)
rf = RandomForestClassifier()
grid_rf4 = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions, scoring = "roc_auc",
                          verbose = 1, n_jobs = -1) 
grid_result_rf4 = grid_rf4.fit(x_train4, y_train4) 

grid_result_rf4.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomForestClassifier(max_depth=1000, min_impurity_decrease=1e-06,
                       n_estimators=500)

In [87]:
# Variable selection by random forest
rf_selection4 = SelectFromModel(grid_result_rf4.best_estimator_)
rf_selection4.fit(x_train4, y_train4)

SelectFromModel(estimator=RandomForestClassifier(max_depth=1000,
                                                 min_impurity_decrease=1e-06,
                                                 n_estimators=500))

In [88]:
# selected variables
selected_feat_rf4 = x_train4.columns[(rf_selection4.get_support())]
len(selected_feat_rf4)

285

In [89]:
rf_selected_train_data4 = train_data4.loc[:,[i for i in selected_feat_rf4]]
rf_selected_train_data4['label'] = train_data4['label'] 
rf_selected_train_data4

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,cg23316599,...,cg22381068,cg24686497,cg06427838,cg06100807,cg15489799,cg09990584,cg15236528,cg03556669,cg21808635,label
0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,0.396608,...,0.267603,0.065910,0.089080,0.084025,0.434138,0.342347,0.156715,0.305896,0.780742,1.0
1,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,0.419441,...,0.290622,0.068113,0.078382,0.062799,0.456402,0.321944,0.220170,0.313702,0.696424,1.0
2,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,0.479463,...,0.240754,0.055055,0.044524,0.052757,0.476032,0.327677,0.178511,0.230778,0.606163,1.0
3,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,0.510254,...,0.281359,0.059226,0.088794,0.061023,0.461083,0.303911,0.086153,0.257855,0.680310,1.0
4,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,0.506738,...,0.269913,0.205009,0.073908,0.051648,0.502171,0.333914,0.137047,0.380755,0.722998,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,0.573460,...,0.300424,0.079161,0.037285,0.094577,0.546019,0.385300,0.128186,0.425339,0.869079,0.0
472,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,0.475778,...,0.330803,0.041471,0.033304,0.041635,0.478794,0.347106,0.134853,0.290931,0.766231,1.0
475,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,0.454601,...,0.336278,0.044382,0.207561,0.024130,0.603469,0.507703,0.146198,0.502986,0.827691,1.0
477,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,0.376560,...,0.353845,0.067505,0.070588,0.045483,0.594803,0.452558,0.130743,0.225473,0.776557,0.0


In [90]:
# Training 75% developing 25%
x_train_rf4, x_dev_rf4, y_train_rf4, y_dev_rf4 = train_test_split(rf_selected_train_data4.drop(columns=['label']), rf_selected_train_data4['label'])
x_train_rf4.shape, x_dev_rf4.shape, y_train_rf4.shape, y_dev_rf4.shape

((1902, 285), (634, 285), (1902,), (634,))

In [91]:
rf_selected4 = grid_result_rf4.best_estimator_
rf_selected4.fit(x_train_rf4, y_train_rf4)
rf_selected_auc4 = roc_auc_score(y_dev_rf4, rf_selected4.predict_proba(x_dev_rf4)[:, 1])
rf_selected_auc4

0.8177638984350876

In [92]:
y_BSGS_rf_selected = data_BSGS['label']
x_BSGS_rf_selected = data_BSGS.loc[:,[i for i in selected_feat_rf4]]

In [93]:
rf_selected_auc_BSGS = roc_auc_score(y_BSGS_rf_selected, rf_selected4.predict_proba(x_BSGS_rf_selected)[:, 1])
rf_selected_auc_BSGS

0.7798206278026906

In [94]:
stacking_rf_BSGS = get_stacking()
stacking_rf_BSGS.fit(x_train_rf4, y_train_rf4)
stacking_rf_BSGS_auc = roc_auc_score(y_BSGS_rf_selected, stacking_rf_BSGS.predict_proba(x_BSGS_rf_selected)[:, 1])
stacking_rf_BSGS_auc



0.8097326025577146

In [95]:
voting_rf_BSGS = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)], voting='soft')
voting_rf_BSGS.fit(x_train_rf4, y_train_rf4)
voting_rf_BSGS_auc = roc_auc_score(y_BSGS_rf_selected, voting_rf_BSGS.predict_proba(x_BSGS_rf_selected)[:, 1])
voting_rf_BSGS_auc



0.8137186513868129

## Variable Selection by LR

In [96]:
# Grid search for lr
penalty = ["l1", "l2", "elasticnet"]
# Tolerance for stopping criteria.
tol = [0.00001, 0.001, 0.0000001]
# Inverse of regularization strength
C = [0,1, 0.5, 1, 10, 20, 30, 50, 100]
# Algorithm to use in the optimization problem
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


param_distributions = dict(penalty = penalty, tol = tol, C = C, solver = solver)
lr = LogisticRegression()
grid_lr4 = RandomizedSearchCV(estimator = lr, param_distributions = param_distributions, scoring = "roc_auc",
                          verbose = 1, n_jobs = -1) 
grid_result_lr4 = grid_lr4.fit(x_train4, y_train4) 

grid_result_lr4.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  

LogisticRegression(C=20, solver='sag', tol=1e-05)

In [97]:
lr_selection4 = SelectFromModel(grid_result_lr4.best_estimator_)
lr_selection4.fit(x_train4, y_train4)



SelectFromModel(estimator=LogisticRegression(C=20, solver='sag', tol=1e-05))

In [98]:
selected_feat_lr4 = x_train4.columns[(lr_selection4.get_support())]
len(selected_feat_lr4)

342

In [99]:
lr_selected_train_data4 = train_data4.loc[:,[i for i in selected_feat_lr4]]
lr_selected_train_data4['label'] = train_data4['label'] 
lr_selected_train_data4

Unnamed: 0,cg01193368,cg06098368,cg08690094,cg03124146,cg14770527,cg23316599,cg11108474,cg16340103,cg04838249,cg26262573,...,cg27132152,cg09990584,cg07903023,cg15236528,cg21808635,cg07635017,cg08641118,cg22034735,cg17805624,label
0,0.606732,0.240143,0.561082,0.144218,0.597436,0.396608,0.479727,0.556758,0.709753,0.367165,...,0.363816,0.342347,0.641294,0.156715,0.780742,0.532118,0.081238,0.508394,0.279428,1.0
1,0.599726,0.242588,0.564277,0.139766,0.570267,0.419441,0.483118,0.481381,0.727749,0.364813,...,0.454734,0.321944,0.668892,0.220170,0.696424,0.546033,0.099777,0.498426,0.360946,1.0
2,0.552816,0.169127,0.541453,0.193932,0.510173,0.479463,0.368669,0.513161,0.668243,0.334221,...,0.479652,0.327677,0.433934,0.178511,0.606163,0.536528,0.079384,0.204013,0.296357,1.0
3,0.655871,0.224729,0.480992,0.181609,0.481978,0.510254,0.360383,0.432874,0.745117,0.352629,...,0.414647,0.303911,0.416025,0.086153,0.680310,0.557829,0.067174,0.297127,0.292479,1.0
4,0.493554,0.231550,0.474545,0.170328,0.549200,0.506738,0.401675,0.475677,0.783492,0.441518,...,0.443079,0.333914,0.593111,0.137047,0.722998,0.546425,0.075612,0.526409,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.819136,0.274042,0.525204,0.387231,0.759353,0.573460,0.401122,0.677112,0.805374,0.666337,...,0.557590,0.385300,0.699565,0.128186,0.869079,0.647941,0.176504,0.529537,0.275265,0.0
472,0.650936,0.127649,0.458996,0.116075,0.700554,0.475778,0.298102,0.585688,0.670181,0.339098,...,0.488550,0.347106,0.598367,0.134853,0.766231,0.603185,0.143384,0.522733,0.230949,1.0
475,0.668992,0.141059,0.522190,0.141652,0.686412,0.454601,0.322749,0.536903,0.624104,0.516098,...,0.515793,0.507703,0.744434,0.146198,0.827691,0.631754,0.077538,0.617378,0.242766,1.0
477,0.642261,0.303791,0.403003,0.278751,0.764943,0.376560,0.445176,0.485595,0.663237,0.521479,...,0.512241,0.452558,0.637866,0.130743,0.776557,0.595563,0.108415,0.444886,0.302313,0.0


In [100]:
# Training 75% developing 25%
x_train_lr4, x_dev_lr4, y_train_lr4, y_dev_lr4 = train_test_split(lr_selected_train_data4.drop(columns=['label']), lr_selected_train_data4['label'])
x_train_lr4.shape, x_dev_lr4.shape, y_train_lr4.shape, y_dev_lr4.shape

((1902, 342), (634, 342), (1902,), (634,))

In [101]:
lr_selected4 = grid_result_lr4.best_estimator_
lr_selected4.fit(x_train_lr4, y_train_lr4)
lr_selected_auc4 = roc_auc_score(y_dev_lr4, lr_selected4.predict_proba(x_dev_lr4)[:, 1])
lr_selected_auc4



0.8228367689357622

In [102]:
y_BSGS_lr_selected = data_BSGS['label']
x_BSGS_lr_selected = data_BSGS.loc[:,[i for i in selected_feat_lr4]]

In [103]:
lr_selected_auc_BSGS = roc_auc_score(y_BSGS_lr_selected, lr_selected4.predict_proba(x_BSGS_lr_selected)[:, 1])
lr_selected_auc_BSGS

0.7776449094834745

In [104]:
stacking_lr_BSGS = get_stacking()
stacking_lr_BSGS.fit(x_train_lr4, y_train_lr4)
stacking_lr_BSGS_auc = roc_auc_score(y_BSGS_lr_selected, stacking_lr_BSGS.predict_proba(x_BSGS_lr_selected)[:, 1])
stacking_lr_BSGS_auc



0.8109948513535958

In [105]:
voting_lr_BSGS = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)], voting='soft')
voting_lr_BSGS.fit(x_train_lr4, y_train_lr4)
voting_lr_BSGS_auc = roc_auc_score(y_BSGS_lr_selected, voting_lr_BSGS.predict_proba(x_BSGS_lr_selected)[:, 1])
voting_lr_BSGS_auc



0.8100315562198971

# Training: BSGS, AMDTSS, E-MTAB, Denmark
# Testing: E-Risk

In [106]:
# 1186 * 834
train_data5 = pd.concat([data_BSGS, data_DENMARK, data_EMTAB])
train_data5

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,0,0.282164,0.613574,0.670013,0.269499,0.516672,0.459066,0.254743,0.224724,0.617466,...,0.071795,0.919869,0.448765,0.810244,0.686240,0.548766,0.113030,0.141956,0.334091,0.243362
2,0,0.225027,0.619136,0.530781,0.219685,0.441304,0.504256,0.231294,0.169521,0.594098,...,0.080001,0.920140,0.458319,0.790811,0.622287,0.548240,0.182175,0.357434,0.660001,0.286040
4,1,0.289924,0.691534,0.771177,0.314880,0.489395,0.531181,0.283582,0.206316,0.605732,...,0.073612,0.950902,0.275012,0.837924,0.744836,0.619499,0.121501,0.141186,0.444107,0.277873
7,0,0.333657,0.675944,0.679786,0.276872,0.506406,0.511713,0.353252,0.310907,0.625395,...,0.081113,0.946354,0.463502,0.834654,0.788736,0.613200,0.215362,0.117529,0.576769,0.308074
10,0,0.206107,0.472510,0.588723,0.137395,0.380327,0.392145,0.165503,0.170393,0.567658,...,0.117231,0.918648,0.225151,0.808565,0.739517,0.601517,0.150539,0.321008,0.673163,0.351189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [107]:
# 1186 * 832
train_data5 = train_data5.loc[:,[i for i in data_AMDTSS.columns]]
train_data5

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,0,0.282164,0.613574,0.670013,0.269499,0.516672,0.459066,0.254743,0.224724,0.617466,...,0.071795,0.919869,0.448765,0.810244,0.686240,0.548766,0.113030,0.141956,0.334091,0.243362
2,0,0.225027,0.619136,0.530781,0.219685,0.441304,0.504256,0.231294,0.169521,0.594098,...,0.080001,0.920140,0.458319,0.790811,0.622287,0.548240,0.182175,0.357434,0.660001,0.286040
4,1,0.289924,0.691534,0.771177,0.314880,0.489395,0.531181,0.283582,0.206316,0.605732,...,0.073612,0.950902,0.275012,0.837924,0.744836,0.619499,0.121501,0.141186,0.444107,0.277873
7,0,0.333657,0.675944,0.679786,0.276872,0.506406,0.511713,0.353252,0.310907,0.625395,...,0.081113,0.946354,0.463502,0.834654,0.788736,0.613200,0.215362,0.117529,0.576769,0.308074
10,0,0.206107,0.472510,0.588723,0.137395,0.380327,0.392145,0.165503,0.170393,0.567658,...,0.117231,0.918648,0.225151,0.808565,0.739517,0.601517,0.150539,0.321008,0.673163,0.351189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [108]:
# 1450 * 832
train_data5 = pd.concat([train_data5, data_AMDTSS])
train_data5

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,0,0.282164,0.613574,0.670013,0.269499,0.516672,0.459066,0.254743,0.224724,0.617466,...,0.071795,0.919869,0.448765,0.810244,0.686240,0.548766,0.113030,0.141956,0.334091,0.243362
2,0,0.225027,0.619136,0.530781,0.219685,0.441304,0.504256,0.231294,0.169521,0.594098,...,0.080001,0.920140,0.458319,0.790811,0.622287,0.548240,0.182175,0.357434,0.660001,0.286040
4,1,0.289924,0.691534,0.771177,0.314880,0.489395,0.531181,0.283582,0.206316,0.605732,...,0.073612,0.950902,0.275012,0.837924,0.744836,0.619499,0.121501,0.141186,0.444107,0.277873
7,0,0.333657,0.675944,0.679786,0.276872,0.506406,0.511713,0.353252,0.310907,0.625395,...,0.081113,0.946354,0.463502,0.834654,0.788736,0.613200,0.215362,0.117529,0.576769,0.308074
10,0,0.206107,0.472510,0.588723,0.137395,0.380327,0.392145,0.165503,0.170393,0.567658,...,0.117231,0.918648,0.225151,0.808565,0.739517,0.601517,0.150539,0.321008,0.673163,0.351189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,...,0.285387,0.953237,0.542180,0.869079,0.713411,0.647941,0.176504,0.168747,0.529537,0.275265
472,1,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,...,0.098905,0.945503,0.361169,0.766231,0.652946,0.603185,0.143384,0.172315,0.522733,0.230949
475,1,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,...,0.208649,0.962932,0.609414,0.827691,0.850505,0.631754,0.077538,0.129992,0.617378,0.242766
477,0,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,...,0.223300,0.917177,0.357777,0.776557,0.607103,0.595563,0.108415,0.338011,0.444886,0.302313


In [109]:
# Training 75% developing 25%
x_train5, x_dev5, y_train5, y_dev5 = train_test_split(train_data5.drop(columns=['label']), train_data5['label'])
x_train5.shape, x_dev5.shape, y_train5.shape, y_dev5.shape

((1072, 831), (358, 831), (1072,), (358,))

## Validate Original Classifier

In [110]:
en5 = LogisticRegression(penalty = "elasticnet", solver = "saga", l1_ratio = 0.5)
en5.fit(x_train5, y_train5)
y_ERISK = data_ERISK['label']
x_ERISK = data_ERISK.loc[:,[i for i in data_AMDTSS.columns]]
x_ERISK = x_ERISK.drop(['label'], axis = 1)
en_auc5 = roc_auc_score(y_ERISK, en5.predict_proba(x_ERISK)[:, 1])
en_auc5



0.7316751818098132

## Train New Classifier

## Variable Selection by RF

In [111]:
# Grid search for rf
# The number of trees in the forest.
n_estimators = [50, 100, 200, 300, 500]
# The function to measure the quality of a split
criterion = ["gini", "entropy"]
# A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
min_impurity_decrease = [0.1, 0.000001, 0.00001]
# The maximum depth of the tree.
max_depth = [20, 50, 100, 500, 1000]

param_distributions = dict(n_estimators = n_estimators, criterion = criterion, min_impurity_decrease = min_impurity_decrease, max_depth = max_depth)
rf = RandomForestClassifier()
grid_rf5 = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions, scoring = "roc_auc",
                          verbose = 1, n_jobs = -1) 
grid_result_rf5 = grid_rf5.fit(x_train5, y_train5) 

grid_result_rf5.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomForestClassifier(criterion='entropy', max_depth=20,
                       min_impurity_decrease=1e-05, n_estimators=300)

In [112]:
# Variable selection by random forest
rf_selection5 = SelectFromModel(grid_result_rf5.best_estimator_)
rf_selection5.fit(x_train5, y_train5)

SelectFromModel(estimator=RandomForestClassifier(criterion='entropy',
                                                 max_depth=20,
                                                 min_impurity_decrease=1e-05,
                                                 n_estimators=300))

In [113]:
# selected variables
selected_feat_rf5 = x_train5.columns[(rf_selection5.get_support())]
len(selected_feat_rf5)

287

In [114]:
rf_selected_train_data5 = train_data5.loc[:,[i for i in selected_feat_rf5]]
rf_selected_train_data5['label'] = train_data5['label'] 
rf_selected_train_data5

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg14770527,cg23316599,cg11108474,...,cg20999932,cg06427838,cg15694117,cg25787956,cg13665998,cg15236528,cg19418458,cg21808635,cg09166085,label
0,0.282164,0.613574,0.670013,0.269499,0.516672,0.459066,0.254743,0.617466,0.462327,0.345466,...,0.292063,0.027239,0.891736,0.562426,0.110296,0.169342,0.448765,0.810244,0.141956,0
2,0.225027,0.619136,0.530781,0.219685,0.441304,0.504256,0.231294,0.594098,0.432864,0.322264,...,0.279418,0.062395,0.991601,0.546175,0.355201,0.178129,0.458319,0.790811,0.357434,0
4,0.289924,0.691534,0.771177,0.314880,0.489395,0.531181,0.283582,0.605732,0.452428,0.369445,...,0.294543,0.032160,0.985825,0.557975,0.082696,0.190076,0.275012,0.837924,0.141186,1
7,0.333657,0.675944,0.679786,0.276872,0.506406,0.511713,0.353252,0.625395,0.477888,0.363694,...,0.282708,0.011340,0.979381,0.533838,0.084307,0.150414,0.463502,0.834654,0.117529,0
10,0.206107,0.472510,0.588723,0.137395,0.380327,0.392145,0.165503,0.567658,0.282244,0.324882,...,0.375531,0.038941,0.977677,0.553008,0.298719,0.246285,0.225151,0.808565,0.321008,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.759353,0.573460,0.401122,...,0.223322,0.037285,0.982111,0.641851,0.088521,0.128186,0.542180,0.869079,0.168747,0
472,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.700554,0.475778,0.298102,...,0.220185,0.033304,0.945254,0.513343,0.091538,0.134853,0.361169,0.766231,0.172315,1
475,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.686412,0.454601,0.322749,...,0.362009,0.207561,0.997431,0.595221,0.078325,0.146198,0.609414,0.827691,0.129992,1
477,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.764943,0.376560,0.445176,...,0.337030,0.070588,0.901474,0.579367,0.329829,0.130743,0.357777,0.776557,0.338011,0


In [115]:
# Training 75% developing 25%
x_train_rf5, x_dev_rf5, y_train_rf5, y_dev_rf5 = train_test_split(rf_selected_train_data5.drop(columns=['label']), rf_selected_train_data5['label'])
x_train_rf5.shape, x_dev_rf5.shape, y_train_rf5.shape, y_dev_rf5.shape

((1072, 287), (358, 287), (1072,), (358,))

In [116]:
rf_selected5 = grid_result_rf5.best_estimator_
rf_selected5.fit(x_train_rf5, y_train_rf5)
rf_selected_auc5 = roc_auc_score(y_dev_rf5, rf_selected5.predict_proba(x_dev_rf5)[:, 1])
rf_selected_auc5

0.7901511376847701

In [117]:
y_ERISK_rf_selected = data_ERISK['label']
x_ERISK_rf_selected = data_ERISK.loc[:,[i for i in selected_feat_rf5]]

In [118]:
rf_selected_auc_ERISK = roc_auc_score(y_ERISK_rf_selected, rf_selected5.predict_proba(x_ERISK_rf_selected)[:, 1])
rf_selected_auc_ERISK

0.7183491745681058

In [119]:
stacking_rf_ERISK = get_stacking()
stacking_rf_ERISK.fit(x_train_rf5, y_train_rf5)
stacking_rf_ERISK_auc = roc_auc_score(y_ERISK_rf_selected, stacking_rf_ERISK.predict_proba(x_ERISK_rf_selected)[:, 1])
stacking_rf_ERISK_auc



0.7101456779895057

In [120]:
voting_rf_ERISK = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)], voting='soft')
voting_rf_ERISK.fit(x_train_rf5, y_train_rf5)
voting_rf_ERISK_auc = roc_auc_score(y_ERISK_rf_selected, voting_rf_ERISK.predict_proba(x_ERISK_rf_selected)[:, 1])
voting_rf_ERISK_auc



0.7205479609684249

## Variable Selection by LR

In [121]:
# Grid search for lr
penalty = ["l1", "l2", "elasticnet"]
# Tolerance for stopping criteria.
tol = [0.00001, 0.001, 0.0000001]
# Inverse of regularization strength
C = [0,1, 0.5, 1, 10, 20, 30, 50, 100]
# Algorithm to use in the optimization problem
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


param_distributions = dict(penalty = penalty, tol = tol, C = C, solver = solver)
lr = LogisticRegression()
grid_lr5 = RandomizedSearchCV(estimator = lr, param_distributions = param_distributions, scoring = "roc_auc",
                          verbose = 1, n_jobs = -1) 
grid_result_lr5 = grid_lr5.fit(x_train5, y_train5) 

grid_result_lr5.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/L

LogisticRegression(C=30, solver='saga', tol=0.001)

In [122]:
lr_selection5 = SelectFromModel(grid_result_lr5.best_estimator_)
lr_selection5.fit(x_train5, y_train5)



SelectFromModel(estimator=LogisticRegression(C=30, solver='saga', tol=0.001))

In [123]:
selected_feat_lr5 = x_train5.columns[(lr_selection5.get_support())]
len(selected_feat_lr5)

343

In [124]:
lr_selected_train_data5 = train_data5.loc[:,[i for i in selected_feat_lr5]]
lr_selected_train_data5['label'] = train_data5['label'] 
lr_selected_train_data5

Unnamed: 0,cg06098368,cg08690094,cg03124146,cg10933186,cg11108474,cg04838249,cg11430197,cg15509177,cg01418385,cg03203197,...,cg04600077,cg09009380,cg13665998,cg07903023,cg19418458,cg21808635,cg08641118,cg09166085,cg17805624,label
0,0.269499,0.516672,0.224724,0.796146,0.345466,0.812046,0.313824,0.418919,0.351138,0.594579,...,0.636356,0.323738,0.110296,0.560400,0.448765,0.810244,0.113030,0.141956,0.243362,0
2,0.219685,0.441304,0.169521,0.737150,0.322264,0.539041,0.188154,0.407961,0.269543,0.476736,...,0.724144,0.357260,0.355201,0.627009,0.458319,0.790811,0.182175,0.357434,0.286040,0
4,0.314880,0.489395,0.206316,0.822910,0.369445,0.724093,0.286106,0.481554,0.353168,0.484432,...,0.728159,0.368134,0.082696,0.635566,0.275012,0.837924,0.121501,0.141186,0.277873,1
7,0.276872,0.506406,0.310907,0.845854,0.363694,0.783814,0.372262,0.493866,0.302054,0.487117,...,0.741501,0.394190,0.084307,0.630104,0.463502,0.834654,0.215362,0.117529,0.308074,0
10,0.137395,0.380327,0.170393,0.737851,0.324882,0.581293,0.305695,0.416754,0.264194,0.419088,...,0.783412,0.375530,0.298719,0.619235,0.225151,0.808565,0.150539,0.321008,0.351189,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.274042,0.525204,0.387231,0.799908,0.401122,0.805374,0.427378,0.505403,0.493238,0.402383,...,0.718748,0.389640,0.088521,0.699565,0.542180,0.869079,0.176504,0.168747,0.275265,0
472,0.127649,0.458996,0.116075,0.679567,0.298102,0.670181,0.358344,0.404168,0.257487,0.525479,...,0.712071,0.337200,0.091538,0.598367,0.361169,0.766231,0.143384,0.172315,0.230949,1
475,0.141059,0.522190,0.141652,0.703902,0.322749,0.624104,0.327654,0.492028,0.322215,0.444669,...,0.748548,0.410973,0.078325,0.744434,0.609414,0.827691,0.077538,0.129992,0.242766,1
477,0.303791,0.403003,0.278751,0.921208,0.445176,0.663237,0.503726,0.609263,0.517012,0.516163,...,0.830315,0.340245,0.329829,0.637866,0.357777,0.776557,0.108415,0.338011,0.302313,0


In [125]:
# Training 75% developing 25%
x_train_lr5, x_dev_lr5, y_train_lr5, y_dev_lr5 = train_test_split(lr_selected_train_data5.drop(columns=['label']), lr_selected_train_data5['label'])
x_train_lr5.shape, x_dev_lr5.shape, y_train_lr5.shape, y_dev_lr5.shape

((1072, 343), (358, 343), (1072,), (358,))

In [126]:
lr_selected5 = grid_result_lr5.best_estimator_
lr_selected5.fit(x_train_lr5, y_train_lr5)
lr_selected_auc5 = roc_auc_score(y_dev_lr5, lr_selected5.predict_proba(x_dev_lr5)[:, 1])
lr_selected_auc5



0.8351923076923077

In [127]:
y_ERISK_lr_selected = data_ERISK['label']
x_ERISK_lr_selected = data_ERISK.loc[:,[i for i in selected_feat_lr5]]

In [128]:
lr_selected_auc_ERISK = roc_auc_score(y_ERISK_lr_selected, lr_selected5.predict_proba(x_ERISK_lr_selected)[:, 1])
lr_selected_auc_ERISK

0.7152796956028109

In [129]:
stacking_lr_ERISK = get_stacking()
stacking_lr_ERISK.fit(x_train_lr5, y_train_lr5)
stacking_lr_ERISK_auc = roc_auc_score(y_ERISK_lr_selected, stacking_lr_ERISK.predict_proba(x_ERISK_lr_selected)[:, 1])
stacking_lr_ERISK_auc



0.7148385958452238

In [130]:
voting_lr_ERISK = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)], voting='soft')
voting_lr_ERISK.fit(x_train_lr5, y_train_lr5)
voting_lr_ERISK_auc = roc_auc_score(y_ERISK_lr_selected, voting_lr_ERISK.predict_proba(x_ERISK_lr_selected)[:, 1])
voting_lr_ERISK_auc



0.7158665500629047