In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

### Note: 
#### - Missing values in both training and testing datasets are replaced by mean; 
#### - Haven't removed incorrect sample in the Denmark dataset

In [3]:
# Read Datasets

# 1658 * 834
data_ERISK = pd.read_csv('ERisk_data.csv')
# 1464 * 834
data_ERISK = data_ERISK.dropna()

# 614 rows × 834 columns
data_BSGS = pd.read_csv('BSGS_data.csv')
# 358 rows × 834 columns
data_BSGS = data_BSGS[data_BSGS['label']!='0']
data_BSGS['label'] = data_BSGS['label'].replace(['MZ','DZ'],[1,0])
data_BSGS = data_BSGS.fillna(data_BSGS.mean())

# 180 * 834
data_DENMARK = pd.read_csv('DENMARK_data.csv')
data_DENMARK['label'] = data_DENMARK['label'].replace([2],[0])

# 479 * 832
data_AMDTSS = pd.read_csv('AMDTSS_data.csv')
# 264 * 832 - removing family members
data_AMDTSS = data_AMDTSS[data_AMDTSS['label']!='Sister']
data_AMDTSS['label'] = data_AMDTSS['label'].replace(['MZ','DZ'],[1,0])

# 648 * 834
data_EMTAB = pd.read_csv('EMTAB_data.csv')
data_EMTAB['label'] = data_EMTAB['label'].replace(['dizygotic', 'monozygotic'],[0, 1])
data_EMTAB = data_EMTAB.fillna(data_EMTAB.mean())

# Training: E-Risk, BSGS, Denmark, AMDTSS
# Testing: E-MTAB

## Preprocessing

In [4]:
#2202 * 834
train_data1 = pd.concat([data_ERISK, data_BSGS, data_DENMARK])
train_data1

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,0.0,0.233895,0.732645,0.750664,0.149491,0.524413,0.618680,0.234315,0.236759,0.890437,...,0.174961,0.906643,0.581697,0.863217,0.798059,0.723334,0.130358,0.177975,0.821848,0.391814
176,0.0,0.247867,0.744726,0.477351,0.211166,0.500541,0.604047,0.259998,0.343025,0.705436,...,0.165785,0.913656,0.615752,0.832152,0.712137,0.582116,0.300884,0.234928,0.620441,0.187204
177,0.0,0.247103,0.768668,0.489964,0.235903,0.515031,0.663927,0.295938,0.331253,0.679972,...,0.180183,0.936972,0.580143,0.852484,0.725882,0.614776,0.291890,0.213946,0.573433,0.174113
178,0.0,0.525080,0.812421,0.533161,0.398285,0.603025,0.545732,0.494836,0.403702,0.749127,...,0.182134,0.935897,0.435156,0.783916,0.682231,0.671497,0.108399,0.149804,0.330827,0.346468


In [5]:
# 2002 * 832
train_data1 = train_data1.loc[:,[i for i in data_AMDTSS.columns]]
train_data1

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,0.0,0.233895,0.732645,0.750664,0.149491,0.524413,0.618680,0.234315,0.236759,0.890437,...,0.174961,0.906643,0.581697,0.863217,0.798059,0.723334,0.130358,0.177975,0.821848,0.391814
176,0.0,0.247867,0.744726,0.477351,0.211166,0.500541,0.604047,0.259998,0.343025,0.705436,...,0.165785,0.913656,0.615752,0.832152,0.712137,0.582116,0.300884,0.234928,0.620441,0.187204
177,0.0,0.247103,0.768668,0.489964,0.235903,0.515031,0.663927,0.295938,0.331253,0.679972,...,0.180183,0.936972,0.580143,0.852484,0.725882,0.614776,0.291890,0.213946,0.573433,0.174113
178,0.0,0.525080,0.812421,0.533161,0.398285,0.603025,0.545732,0.494836,0.403702,0.749127,...,0.182134,0.935897,0.435156,0.783916,0.682231,0.671497,0.108399,0.149804,0.330827,0.346468


In [6]:
# 2530 * 832
train_data1 = pd.concat([train_data1, data_AMDTSS])
train_data1

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.0,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,...,0.285387,0.953237,0.542180,0.869079,0.713411,0.647941,0.176504,0.168747,0.529537,0.275265
472,1.0,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,...,0.098905,0.945503,0.361169,0.766231,0.652946,0.603185,0.143384,0.172315,0.522733,0.230949
475,1.0,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,...,0.208649,0.962932,0.609414,0.827691,0.850505,0.631754,0.077538,0.129992,0.617378,0.242766
477,0.0,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,...,0.223300,0.917177,0.357777,0.776557,0.607103,0.595563,0.108415,0.338011,0.444886,0.302313


In [7]:
# Training 75% developing 25%
x_train1, x_dev1, y_train1, y_dev1 = train_test_split(train_data1.drop(columns=['label']), train_data1['label'])
x_train1.shape, x_dev1.shape, y_train1.shape, y_dev1.shape

((1699, 831), (567, 831), (1699,), (567,))

## Variable Selection by RF

In [8]:
# # Grid search for rf
# # The number of trees in the forest.
# n_estimators = [50, 100, 200, 300, 500]
# # The function to measure the quality of a split
# criterion = ["gini", "entropy"]
# # A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
# min_impurity_decrease = [0.1, 0.000001, 0.00001]
# # The maximum depth of the tree.
# max_depth = [20, 50, 100, 500, 1000]

# param_distributions = dict(n_estimators = n_estimators, criterion = criterion, min_impurity_decrease = min_impurity_decrease, max_depth = max_depth)
# rf = RandomForestClassifier()
# grid = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train1, y_train1) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [9]:
# Variable selection by random forest
rf_selection1 = SelectFromModel(RandomForestClassifier(n_estimators = 500, min_impurity_decrease = 1e-06, max_depth =  100, criterion = 'entropy'))
rf_selection1.fit(x_train1, y_train1)

SelectFromModel(estimator=RandomForestClassifier(criterion='entropy',
                                                 max_depth=100,
                                                 min_impurity_decrease=1e-06,
                                                 n_estimators=500))

In [10]:
# selected variables
selected_feat_rf1 = x_train1.columns[(rf_selection1.get_support())]
len(selected_feat_rf1)

297

In [11]:
rf_selected_train_data1 = train_data1.loc[:,[i for i in selected_feat_rf1]]
rf_selected_train_data1['label'] = train_data1['label'] 
rf_selected_train_data1

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg03124146,cg14770527,cg23316599,cg10933186,...,cg26046072,cg15694117,cg15489799,cg04600077,cg09009380,cg07903023,cg21808635,cg07635017,cg17805624,label
0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.144218,0.597436,0.396608,0.850368,...,0.856189,0.961428,0.434138,0.657943,0.323892,0.641294,0.780742,0.532118,0.279428,1.0
1,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.139766,0.570267,0.419441,0.770310,...,0.888501,0.936653,0.456402,0.667529,0.315845,0.668892,0.696424,0.546033,0.360946,1.0
2,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.193932,0.510173,0.479463,0.741130,...,0.875376,0.934546,0.476032,0.579968,0.313867,0.433934,0.606163,0.536528,0.296357,1.0
3,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.181609,0.481978,0.510254,0.667161,...,0.875556,0.942972,0.461083,0.638226,0.300121,0.416025,0.680310,0.557829,0.292479,1.0
4,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.170328,0.549200,0.506738,0.731597,...,0.906104,0.938093,0.502171,0.629674,0.313593,0.593111,0.722998,0.546425,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.387231,0.759353,0.573460,0.799908,...,0.865380,0.982111,0.546019,0.718748,0.389640,0.699565,0.869079,0.647941,0.275265,0.0
472,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.116075,0.700554,0.475778,0.679567,...,0.859387,0.945254,0.478794,0.712071,0.337200,0.598367,0.766231,0.603185,0.230949,1.0
475,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.141652,0.686412,0.454601,0.703902,...,0.785703,0.997431,0.603469,0.748548,0.410973,0.744434,0.827691,0.631754,0.242766,1.0
477,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.278751,0.764943,0.376560,0.921208,...,0.912417,0.901474,0.594803,0.830315,0.340245,0.637866,0.776557,0.595563,0.302313,0.0


In [12]:
# Training 75% developing 25%
x_train_rf1, x_dev_rf1, y_train_rf1, y_dev_rf1 = train_test_split(rf_selected_train_data1.drop(columns=['label']), rf_selected_train_data1['label'])
x_train_rf1.shape, x_dev_rf1.shape, y_train_rf1.shape, y_dev_rf1.shape

((1699, 297), (567, 297), (1699,), (567,))

In [13]:
rf_selected1 = RandomForestClassifier(n_estimators = 500, min_impurity_decrease = 1e-06, max_depth =  100, criterion = 'entropy')
rf_selected1.fit(x_train_rf1, y_train_rf1)
rf_selected_auc1 = roc_auc_score(y_dev_rf1, rf_selected1.predict_proba(x_dev_rf1)[:, 1])
rf_selected_auc1

0.7966056782334385

In [14]:
y_EMTAB_rf_selected = data_EMTAB['label']
x_EMTAB_rf_selected = data_EMTAB.loc[:,[i for i in selected_feat_rf1]]

In [15]:
rf_selected_auc_EMTAB = roc_auc_score(y_EMTAB_rf_selected, rf_selected1.predict_proba(x_EMTAB_rf_selected)[:, 1])
rf_selected_auc_EMTAB

0.6710886437908496

## Variable Selection by LR

In [16]:
# # Grid search for lr
# penalty = ["l1", "l2", "elasticnet"]
# # Tolerance for stopping criteria.
# tol = [0.00001, 0.001, 0.0000001]
# # Inverse of regularization strength
# C = [0,1, 0.5, 1, 10, 20, 30, 50, 100]
# # Algorithm to use in the optimization problem
# solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


# param_distributions = dict(penalty = penalty, tol = tol, C = C, solver = solver)
# lr = LogisticRegression()
# grid = RandomizedSearchCV(estimator = lr, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train1, y_train1) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [17]:
lr_selection1 = SelectFromModel(LogisticRegression(tol = 0.1e-05, solver = 'newton-cg', penalty = 'l2', C = 20))
lr_selection1.fit(x_train1, y_train1)

SelectFromModel(estimator=LogisticRegression(C=20, solver='newton-cg',
                                             tol=1e-06))

In [18]:
selected_feat_lr1 = x_train1.columns[(lr_selection1.get_support())]
len(selected_feat_lr1)

336

In [19]:
lr_selected_train_data1 = train_data1.loc[:,[i for i in selected_feat_lr1]]
lr_selected_train_data1['label'] = train_data1['label'] 
lr_selected_train_data1

Unnamed: 0,cg22695986,cg01193368,cg06098368,cg26916862,cg23316599,cg26262573,cg14285533,cg15509177,cg19710323,cg09785512,...,cg09990584,cg06495631,cg13665998,cg14466863,cg15236528,cg07635017,cg08641118,cg22034735,cg17805624,label
0,0.233353,0.606732,0.240143,0.194961,0.396608,0.367165,0.103790,0.575823,0.716404,0.806746,...,0.342347,0.570667,0.062450,0.825537,0.156715,0.532118,0.081238,0.508394,0.279428,1.0
1,0.284813,0.599726,0.242588,0.192382,0.419441,0.364813,0.123373,0.528539,0.749239,0.792259,...,0.321944,0.589975,0.113487,0.823126,0.220170,0.546033,0.099777,0.498426,0.360946,1.0
2,0.206618,0.552816,0.169127,0.197505,0.479463,0.334221,0.053817,0.457388,0.726021,0.837832,...,0.327677,0.507627,0.073325,0.748225,0.178511,0.536528,0.079384,0.204013,0.296357,1.0
3,0.203151,0.655871,0.224729,0.178132,0.510254,0.352629,0.121969,0.439880,0.734998,0.867621,...,0.303911,0.507012,0.148140,0.761062,0.086153,0.557829,0.067174,0.297127,0.292479,1.0
4,0.266709,0.493554,0.231550,0.194651,0.506738,0.441518,0.131623,0.496573,0.682946,0.831587,...,0.333914,0.673096,0.183865,0.805394,0.137047,0.546425,0.075612,0.526409,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.421692,0.819136,0.274042,0.415842,0.573460,0.666337,0.321696,0.505403,0.821339,0.883085,...,0.385300,0.645043,0.088521,0.780383,0.128186,0.647941,0.176504,0.529537,0.275265,0.0
472,0.155194,0.650936,0.127649,0.214664,0.475778,0.339098,0.095610,0.404168,0.843162,0.874351,...,0.347106,0.487461,0.091538,0.734427,0.134853,0.603185,0.143384,0.522733,0.230949,1.0
475,0.170119,0.668992,0.141059,0.177975,0.454601,0.516098,0.063631,0.492028,0.839909,0.820734,...,0.507703,0.561416,0.078325,0.805425,0.146198,0.631754,0.077538,0.617378,0.242766,1.0
477,0.316241,0.642261,0.303791,0.242378,0.376560,0.521479,0.158589,0.609263,0.798974,0.887907,...,0.452558,0.566642,0.329829,0.760405,0.130743,0.595563,0.108415,0.444886,0.302313,0.0


In [20]:
# Training 75% developing 25%
x_train_lr1, x_dev_lr1, y_train_lr1, y_dev_lr1 = train_test_split(lr_selected_train_data1.drop(columns=['label']), lr_selected_train_data1['label'])
x_train_lr1.shape, x_dev_lr1.shape, y_train_lr1.shape, y_dev_lr1.shape

((1699, 336), (567, 336), (1699,), (567,))

In [21]:
lr_selected1 = LogisticRegression(tol = 0.1e-05, solver = 'newton-cg', penalty = 'l2', C = 20)
lr_selected1.fit(x_train_lr1, y_train_lr1)
lr_selected_auc1 = roc_auc_score(y_dev_lr1, lr_selected1.predict_proba(x_dev_lr1)[:, 1])
lr_selected_auc1

0.8072807570977918

In [22]:
y_EMTAB_lr_selected = data_EMTAB['label']
x_EMTAB_lr_selected = data_EMTAB.loc[:,[i for i in selected_feat_lr1]]

In [23]:
lr_selected_auc_EMTAB = roc_auc_score(y_EMTAB_lr_selected, lr_selected1.predict_proba(x_EMTAB_lr_selected)[:, 1])
lr_selected_auc_EMTAB

0.6775224673202614

# Training: E-Risk, BSGS, Denmark, E-MTAB
# Testing: AMDTSS

In [24]:
# 2650 * 834
train_data2 = pd.concat([data_ERISK, data_BSGS, data_DENMARK, data_EMTAB])
train_data2

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1.0,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1.0,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0.0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0.0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [25]:
# 2650 * 832
train_data2 = train_data2.loc[:,[i for i in data_AMDTSS.columns]]
train_data2

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1.0,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1.0,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0.0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0.0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [26]:
# Training 75% developing 25%
x_train2, x_dev2, y_train2, y_dev2 = train_test_split(train_data2.drop(columns=['label']), train_data2['label'])
x_train2.shape, x_dev2.shape, y_train2.shape, y_dev2.shape

((1987, 831), (663, 831), (1987,), (663,))

## Variable Selection by RF

In [27]:
# # Grid search for rf
# # The number of trees in the forest.
# n_estimators = [50, 100, 200, 300, 500]
# # The function to measure the quality of a split
# criterion = ["gini", "entropy"]
# # A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
# min_impurity_decrease = [0.1, 0.000001, 0.00001]
# # The maximum depth of the tree.
# max_depth = [20, 50, 100, 500, 1000]

# param_distributions = dict(n_estimators = n_estimators, criterion = criterion, min_impurity_decrease = min_impurity_decrease, max_depth = max_depth)
# rf = RandomForestClassifier()
# grid = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train2, y_train2) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [28]:
# Variable selection by random forest
rf_selection2 = SelectFromModel(RandomForestClassifier(n_estimators = 500, min_impurity_decrease = 1e-05, max_depth =  500, criterion = 'entropy'))
rf_selection2.fit(x_train2, y_train2)

SelectFromModel(estimator=RandomForestClassifier(criterion='entropy',
                                                 max_depth=500,
                                                 min_impurity_decrease=1e-05,
                                                 n_estimators=500))

In [29]:
# selected variables
selected_feat_rf2 = x_train2.columns[(rf_selection2.get_support())]
len(selected_feat_rf2)

281

In [30]:
rf_selected_train_data2 = train_data2.loc[:,[i for i in selected_feat_rf2]]
rf_selected_train_data2['label'] = train_data2['label'] 
rf_selected_train_data2

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,cg23316599,...,cg24686497,cg09243445,cg15694117,cg25787956,cg09990584,cg13665998,cg21808635,cg07635017,cg17805624,label
0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,0.396608,...,0.065910,0.306735,0.961428,0.503054,0.342347,0.062450,0.780742,0.532118,0.279428,1.0
1,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,0.419441,...,0.068113,0.316390,0.936653,0.466924,0.321944,0.113487,0.696424,0.546033,0.360946,1.0
2,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,0.479463,...,0.055055,0.335758,0.934546,0.441580,0.327677,0.073325,0.606163,0.536528,0.296357,1.0
3,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,0.510254,...,0.059226,0.355541,0.942972,0.476676,0.303911,0.148140,0.680310,0.557829,0.292479,1.0
4,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,0.506738,...,0.205009,0.359560,0.938093,0.493368,0.333914,0.183865,0.722998,0.546425,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,0.427100,...,0.047630,0.322980,0.993380,0.556560,0.423670,0.093090,0.811640,0.426310,0.148490,1.0
644,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,0.470970,...,0.062740,0.348430,0.981290,0.517510,0.421900,0.074520,0.824450,0.465400,0.114360,1.0
645,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,0.356980,...,0.061010,0.393820,0.936080,0.493220,0.427700,0.142270,0.820790,0.438150,0.127200,0.0
646,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,0.273990,...,0.056650,0.330950,0.982680,0.479540,0.414230,0.118890,0.798780,0.394220,0.156560,0.0


In [31]:
# Training 75% developing 25%
x_train_rf2, x_dev_rf2, y_train_rf2, y_dev_rf2 = train_test_split(rf_selected_train_data2.drop(columns=['label']), rf_selected_train_data2['label'])
x_train_rf2.shape, x_dev_rf2.shape, y_train_rf2.shape, y_dev_rf2.shape

((1987, 281), (663, 281), (1987,), (663,))

In [32]:
rf_selected2 = RandomForestClassifier(n_estimators = 500, min_impurity_decrease = 1e-05, max_depth =  500, criterion = 'entropy')
rf_selected2.fit(x_train_rf2, y_train_rf2)
rf_selected_auc2 = roc_auc_score(y_dev_rf2, rf_selected2.predict_proba(x_dev_rf2)[:, 1])
rf_selected_auc2

0.8117146239300674

In [33]:
y_AMDTSS_rf_selected = data_AMDTSS['label']
x_AMDTSS_rf_selected = data_AMDTSS.loc[:,[i for i in selected_feat_rf2]]

In [34]:
rf_selected_auc_AMDTSS = roc_auc_score(y_AMDTSS_rf_selected, rf_selected2.predict_proba(x_AMDTSS_rf_selected)[:, 1])
rf_selected_auc_AMDTSS

0.6372532139577594

## Variable Selection by LR

In [35]:
# # Grid search for lr
# penalty = ["l1", "l2", "elasticnet"]
# # Tolerance for stopping criteria.
# tol = [0.00001, 0.001, 0.0000001]
# # Inverse of regularization strength
# C = [0,1, 0.5, 1, 10, 20, 30, 50, 100]
# # Algorithm to use in the optimization problem
# solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


# param_distributions = dict(penalty = penalty, tol = tol, C = C, solver = solver)
# lr = LogisticRegression()
# grid = RandomizedSearchCV(estimator = lr, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train2, y_train2) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [36]:
lr_selection2 = SelectFromModel(LogisticRegression(tol = 0.001, solver = 'newton-cg', penalty = 'l2', C = 1))
lr_selection2.fit(x_train2, y_train2)

SelectFromModel(estimator=LogisticRegression(C=1, solver='newton-cg',
                                             tol=0.001))

In [37]:
selected_feat_lr2 = x_train2.columns[(lr_selection2.get_support())]
len(selected_feat_lr2)

349

In [38]:
lr_selected_train_data2 = train_data2.loc[:,[i for i in selected_feat_lr2]]
lr_selected_train_data2['label'] = train_data2['label'] 
lr_selected_train_data2

Unnamed: 0,cg01193368,cg06098368,cg08690094,cg26916862,cg14770527,cg23316599,cg11108474,cg04838249,cg26262573,cg19689427,...,cg04600077,cg09990584,cg15236528,cg07291889,cg11174855,cg21808635,cg07635017,cg22034735,cg17805624,label
0,0.606732,0.240143,0.561082,0.194961,0.597436,0.396608,0.479727,0.709753,0.367165,0.487860,...,0.657943,0.342347,0.156715,0.706627,0.327951,0.780742,0.532118,0.508394,0.279428,1.0
1,0.599726,0.242588,0.564277,0.192382,0.570267,0.419441,0.483118,0.727749,0.364813,0.457657,...,0.667529,0.321944,0.220170,0.740209,0.195741,0.696424,0.546033,0.498426,0.360946,1.0
2,0.552816,0.169127,0.541453,0.197505,0.510173,0.479463,0.368669,0.668243,0.334221,0.429852,...,0.579968,0.327677,0.178511,0.643731,0.083195,0.606163,0.536528,0.204013,0.296357,1.0
3,0.655871,0.224729,0.480992,0.178132,0.481978,0.510254,0.360383,0.745117,0.352629,0.380312,...,0.638226,0.303911,0.086153,0.709617,0.134511,0.680310,0.557829,0.297127,0.292479,1.0
4,0.493554,0.231550,0.474545,0.194651,0.549200,0.506738,0.401675,0.783492,0.441518,0.441566,...,0.629674,0.333914,0.137047,0.727944,0.069399,0.722998,0.546425,0.526409,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,0.668810,0.221750,0.450940,0.261340,0.703690,0.427100,0.379050,0.746530,0.481300,0.447700,...,0.647500,0.423670,0.032750,0.846850,0.074460,0.811640,0.426310,0.381660,0.148490,1.0
644,0.673100,0.211480,0.484940,0.259820,0.703450,0.470970,0.357540,0.788110,0.524900,0.504330,...,0.661330,0.421900,0.034740,0.806160,0.033430,0.824450,0.465400,0.315100,0.114360,1.0
645,0.586160,0.240640,0.481040,0.275920,0.633500,0.356980,0.222150,0.727720,0.374090,0.575540,...,0.662250,0.427700,0.025840,0.797160,0.117130,0.820790,0.438150,0.261410,0.127200,0.0
646,0.679960,0.286020,0.315550,0.231630,0.680370,0.273990,0.159820,0.671960,0.345570,0.409810,...,0.674750,0.414230,0.025380,0.841330,0.043240,0.798780,0.394220,0.566440,0.156560,0.0


In [39]:
# Training 75% developing 25%
x_train_lr2, x_dev_lr2, y_train_lr2, y_dev_lr2 = train_test_split(lr_selected_train_data2.drop(columns=['label']), lr_selected_train_data2['label'])
x_train_lr2.shape, x_dev_lr2.shape, y_train_lr2.shape, y_dev_lr2.shape

((1987, 349), (663, 349), (1987,), (663,))

In [40]:
lr_selected2 = LogisticRegression(tol = 0.001, solver = 'newton-cg', penalty = 'l2', C = 1)
lr_selected2.fit(x_train_lr2, y_train_lr2)
lr_selected_auc2 = roc_auc_score(y_dev_lr2, lr_selected2.predict_proba(x_dev_lr2)[:, 1])
lr_selected_auc2

0.8425211922340717

In [41]:
y_AMDTSS_lr_selected = data_AMDTSS['label']
x_AMDTSS_lr_selected = data_AMDTSS.loc[:,[i for i in selected_feat_lr2]]

In [42]:
lr_selected_auc_AMDTSS = roc_auc_score(y_AMDTSS_lr_selected, lr_selected2.predict_proba(x_AMDTSS_lr_selected)[:, 1])
lr_selected_auc_AMDTSS

0.6912878787878789

# Training: E-Risk, BSGS, AMDTSS, E-MTAB
# Testing: Denmark

In [43]:
#2470 * 834
train_data3 = pd.concat([data_ERISK, data_BSGS, data_EMTAB])
train_data3

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1.0,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1.0,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0.0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0.0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [44]:
# 2470 * 832
train_data3 = train_data3.loc[:,[i for i in data_AMDTSS.columns]]
train_data3

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1.0,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1.0,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0.0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0.0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [45]:
# 2734 * 832
train_data3 = pd.concat([train_data3, data_AMDTSS])
train_data3

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.0,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,...,0.285387,0.953237,0.542180,0.869079,0.713411,0.647941,0.176504,0.168747,0.529537,0.275265
472,1.0,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,...,0.098905,0.945503,0.361169,0.766231,0.652946,0.603185,0.143384,0.172315,0.522733,0.230949
475,1.0,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,...,0.208649,0.962932,0.609414,0.827691,0.850505,0.631754,0.077538,0.129992,0.617378,0.242766
477,0.0,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,...,0.223300,0.917177,0.357777,0.776557,0.607103,0.595563,0.108415,0.338011,0.444886,0.302313


In [46]:
# Training 75% developing 25%
x_train3, x_dev3, y_train3, y_dev3 = train_test_split(train_data3.drop(columns=['label']), train_data3['label'])
x_train3.shape, x_dev3.shape, y_train3.shape, y_dev3.shape

((2050, 831), (684, 831), (2050,), (684,))

## Variable Selection by RF

In [47]:
# # Grid search for rf
# # The number of trees in the forest.
# n_estimators = [50, 100, 200, 300, 500]
# # The function to measure the quality of a split
# criterion = ["gini", "entropy"]
# # A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
# min_impurity_decrease = [0.1, 0.000001, 0.00001]
# # The maximum depth of the tree.
# max_depth = [20, 50, 100, 500, 1000]

# param_distributions = dict(n_estimators = n_estimators, criterion = criterion, min_impurity_decrease = min_impurity_decrease, max_depth = max_depth)
# rf = RandomForestClassifier()
# grid = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train3, y_train3) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [48]:
# Variable selection by random forest
rf_selection3 = SelectFromModel(RandomForestClassifier(n_estimators = 300, min_impurity_decrease = 1e-05, max_depth =  100, criterion = 'entropy'))
rf_selection3.fit(x_train3, y_train3)

SelectFromModel(estimator=RandomForestClassifier(criterion='entropy',
                                                 max_depth=100,
                                                 min_impurity_decrease=1e-05,
                                                 n_estimators=300))

In [49]:
# selected variables
selected_feat_rf3 = x_train3.columns[(rf_selection3.get_support())]
len(selected_feat_rf3)

295

In [50]:
rf_selected_train_data3 = train_data3.loc[:,[i for i in selected_feat_rf3]]
rf_selected_train_data3['label'] = train_data3['label'] 
rf_selected_train_data3

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,cg23316599,...,cg15489799,cg09990584,cg13665998,cg15236528,cg04524933,cg21808635,cg07635017,cg08641118,cg17805624,label
0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,0.396608,...,0.434138,0.342347,0.062450,0.156715,0.919347,0.780742,0.532118,0.081238,0.279428,1.0
1,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,0.419441,...,0.456402,0.321944,0.113487,0.220170,0.945236,0.696424,0.546033,0.099777,0.360946,1.0
2,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,0.479463,...,0.476032,0.327677,0.073325,0.178511,0.906838,0.606163,0.536528,0.079384,0.296357,1.0
3,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,0.510254,...,0.461083,0.303911,0.148140,0.086153,0.956986,0.680310,0.557829,0.067174,0.292479,1.0
4,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,0.506738,...,0.502171,0.333914,0.183865,0.137047,0.921778,0.722998,0.546425,0.075612,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,0.573460,...,0.546019,0.385300,0.088521,0.128186,0.953237,0.869079,0.647941,0.176504,0.275265,0.0
472,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,0.475778,...,0.478794,0.347106,0.091538,0.134853,0.945503,0.766231,0.603185,0.143384,0.230949,1.0
475,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,0.454601,...,0.603469,0.507703,0.078325,0.146198,0.962932,0.827691,0.631754,0.077538,0.242766,1.0
477,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,0.376560,...,0.594803,0.452558,0.329829,0.130743,0.917177,0.776557,0.595563,0.108415,0.302313,0.0


In [51]:
# Training 75% developing 25%
x_train_rf3, x_dev_rf3, y_train_rf3, y_dev_rf3 = train_test_split(rf_selected_train_data3.drop(columns=['label']), rf_selected_train_data3['label'])
x_train_rf3.shape, x_dev_rf3.shape, y_train_rf3.shape, y_dev_rf3.shape

((2050, 295), (684, 295), (2050,), (684,))

In [52]:
rf_selected3 = RandomForestClassifier(n_estimators = 300, min_impurity_decrease = 1e-05, max_depth =  100, criterion = 'entropy')
rf_selected3.fit(x_train_rf3, y_train_rf3)
rf_selected_auc3 = roc_auc_score(y_dev_rf3, rf_selected3.predict_proba(x_dev_rf3)[:, 1])
rf_selected_auc3

0.7989652781938156

In [53]:
y_DENMARK_rf_selected = data_DENMARK['label']
x_DENMARK_rf_selected = data_DENMARK.loc[:,[i for i in selected_feat_rf3]]

In [54]:
rf_selected_auc_DENMARK = roc_auc_score(y_DENMARK_rf_selected, rf_selected3.predict_proba(x_DENMARK_rf_selected)[:, 1])
rf_selected_auc_DENMARK

0.6391637803067788

## Variable Selection by LR

In [55]:
# # Grid search for lr
# penalty = ["l1", "l2", "elasticnet"]
# # Tolerance for stopping criteria.
# tol = [0.00001, 0.001, 0.0000001]
# # Inverse of regularization strength
# C = [0,1, 0.5, 1, 10, 20, 30, 50, 100]
# # Algorithm to use in the optimization problem
# solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


# param_distributions = dict(penalty = penalty, tol = tol, C = C, solver = solver)
# lr = LogisticRegression()
# grid = RandomizedSearchCV(estimator = lr, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train3, y_train3) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [56]:
lr_selection3 = SelectFromModel(LogisticRegression(tol = 1e-07, solver = 'saga', penalty = 'l2', C = 20))
lr_selection3.fit(x_train3, y_train3)



SelectFromModel(estimator=LogisticRegression(C=20, solver='saga', tol=1e-07))

In [57]:
selected_feat_lr3 = x_train3.columns[(lr_selection3.get_support())]
len(selected_feat_lr3)

346

In [58]:
lr_selected_train_data3 = train_data3.loc[:,[i for i in selected_feat_lr3]]
lr_selected_train_data3['label'] = train_data3['label'] 
lr_selected_train_data3

Unnamed: 0,cg01193368,cg06098368,cg08690094,cg11236452,cg14770527,cg23316599,cg11108474,cg16340103,cg19689427,cg14285533,...,cg19370715,cg04182912,cg09990584,cg07291889,cg04524933,cg07635017,cg08641118,cg22034735,cg17805624,label
0,0.606732,0.240143,0.561082,0.533287,0.597436,0.396608,0.479727,0.556758,0.487860,0.103790,...,0.801903,0.588122,0.342347,0.706627,0.919347,0.532118,0.081238,0.508394,0.279428,1.0
1,0.599726,0.242588,0.564277,0.578224,0.570267,0.419441,0.483118,0.481381,0.457657,0.123373,...,0.774984,0.645375,0.321944,0.740209,0.945236,0.546033,0.099777,0.498426,0.360946,1.0
2,0.552816,0.169127,0.541453,0.509944,0.510173,0.479463,0.368669,0.513161,0.429852,0.053817,...,0.743938,0.689910,0.327677,0.643731,0.906838,0.536528,0.079384,0.204013,0.296357,1.0
3,0.655871,0.224729,0.480992,0.421599,0.481978,0.510254,0.360383,0.432874,0.380312,0.121969,...,0.752980,0.677442,0.303911,0.709617,0.956986,0.557829,0.067174,0.297127,0.292479,1.0
4,0.493554,0.231550,0.474545,0.381759,0.549200,0.506738,0.401675,0.475677,0.441566,0.131623,...,0.773852,0.565591,0.333914,0.727944,0.921778,0.546425,0.075612,0.526409,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.819136,0.274042,0.525204,0.698463,0.759353,0.573460,0.401122,0.677112,0.658904,0.321696,...,0.865159,0.553463,0.385300,0.855644,0.953237,0.647941,0.176504,0.529537,0.275265,0.0
472,0.650936,0.127649,0.458996,0.414456,0.700554,0.475778,0.298102,0.585688,0.460595,0.095610,...,0.848273,0.558968,0.347106,0.777445,0.945503,0.603185,0.143384,0.522733,0.230949,1.0
475,0.668992,0.141059,0.522190,0.469892,0.686412,0.454601,0.322749,0.536903,0.541866,0.063631,...,0.876952,0.592279,0.507703,0.836003,0.962932,0.631754,0.077538,0.617378,0.242766,1.0
477,0.642261,0.303791,0.403003,0.830418,0.764943,0.376560,0.445176,0.485595,0.721145,0.158589,...,0.803013,0.611860,0.452558,0.857884,0.917177,0.595563,0.108415,0.444886,0.302313,0.0


In [59]:
# Training 75% developing 25%
x_train_lr3, x_dev_lr3, y_train_lr3, y_dev_lr3 = train_test_split(lr_selected_train_data3.drop(columns=['label']), lr_selected_train_data3['label'])
x_train_lr3.shape, x_dev_lr3.shape, y_train_lr3.shape, y_dev_lr3.shape

((2050, 346), (684, 346), (2050,), (684,))

In [60]:
lr_selected3 = LogisticRegression(tol = 1e-07, solver = 'saga', penalty = 'l2', C = 20)
lr_selected3.fit(x_train_lr3, y_train_lr3)
lr_selected_auc3 = roc_auc_score(y_dev_lr3, lr_selected3.predict_proba(x_dev_lr3)[:, 1])
lr_selected_auc3



0.8538468774318234

In [61]:
y_DENMARK_lr_selected = data_DENMARK['label']
x_DENMARK_lr_selected = data_DENMARK.loc[:,[i for i in selected_feat_lr3]]

In [62]:
lr_selected_auc_DENMARK = roc_auc_score(y_DENMARK_lr_selected, lr_selected3.predict_proba(x_DENMARK_lr_selected)[:, 1])
lr_selected_auc_DENMARK

0.6144235526966848

# Training: E-Risk, AMDTSS, E-MTAB, Denmark
# Testing: BSGS

In [63]:
#2292 * 834
train_data4 = pd.concat([data_ERISK, data_DENMARK, data_EMTAB])
train_data4

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1.0,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1.0,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0.0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0.0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [64]:
# 2292 * 832
train_data4 = train_data4.loc[:,[i for i in data_AMDTSS.columns]]
train_data4

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1.0,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1.0,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0.0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0.0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [65]:
# 2556 * 832
train_data4 = pd.concat([train_data4, data_AMDTSS])
train_data4

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.0,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,...,0.285387,0.953237,0.542180,0.869079,0.713411,0.647941,0.176504,0.168747,0.529537,0.275265
472,1.0,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,...,0.098905,0.945503,0.361169,0.766231,0.652946,0.603185,0.143384,0.172315,0.522733,0.230949
475,1.0,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,...,0.208649,0.962932,0.609414,0.827691,0.850505,0.631754,0.077538,0.129992,0.617378,0.242766
477,0.0,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,...,0.223300,0.917177,0.357777,0.776557,0.607103,0.595563,0.108415,0.338011,0.444886,0.302313


In [66]:
# Training 75% developing 25%
x_train4, x_dev4, y_train4, y_dev4 = train_test_split(train_data4.drop(columns=['label']), train_data4['label'])
x_train4.shape, x_dev4.shape, y_train4.shape, y_dev4.shape

((1917, 831), (639, 831), (1917,), (639,))

## Variable Selection by RF

In [67]:
# # Grid search for rf
# # The number of trees in the forest.
# n_estimators = [50, 100, 200, 300, 500]
# # The function to measure the quality of a split
# criterion = ["gini", "entropy"]
# # A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
# min_impurity_decrease = [0.1, 0.000001, 0.00001]
# # The maximum depth of the tree.
# max_depth = [20, 50, 100, 500, 1000]

# param_distributions = dict(n_estimators = n_estimators, criterion = criterion, min_impurity_decrease = min_impurity_decrease, max_depth = max_depth)
# rf = RandomForestClassifier()
# grid = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train4, y_train4) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [68]:
# Variable selection by random forest
rf_selection4 = SelectFromModel(RandomForestClassifier(n_estimators = 500, min_impurity_decrease = 1e-05, max_depth =  20, criterion = 'gini'))
rf_selection4.fit(x_train4, y_train4)

SelectFromModel(estimator=RandomForestClassifier(max_depth=20,
                                                 min_impurity_decrease=1e-05,
                                                 n_estimators=500))

In [69]:
# selected variables
selected_feat_rf4 = x_train4.columns[(rf_selection4.get_support())]
len(selected_feat_rf4)

295

In [70]:
rf_selected_train_data4 = train_data4.loc[:,[i for i in selected_feat_rf4]]
rf_selected_train_data4['label'] = train_data4['label'] 
rf_selected_train_data4

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,cg23316599,...,cg24567421,cg00413089,cg24686497,cg06427838,cg15694117,cg03556669,cg04524933,cg21808635,cg17805624,label
0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,0.396608,...,0.852106,0.383746,0.065910,0.089080,0.961428,0.305896,0.919347,0.780742,0.279428,1.0
1,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,0.419441,...,0.891110,0.352384,0.068113,0.078382,0.936653,0.313702,0.945236,0.696424,0.360946,1.0
2,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,0.479463,...,0.805776,0.114592,0.055055,0.044524,0.934546,0.230778,0.906838,0.606163,0.296357,1.0
3,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,0.510254,...,0.803325,0.102988,0.059226,0.088794,0.942972,0.257855,0.956986,0.680310,0.292479,1.0
4,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,0.506738,...,0.817501,0.521092,0.205009,0.073908,0.938093,0.380755,0.921778,0.722998,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,0.573460,...,0.908296,0.265921,0.079161,0.037285,0.982111,0.425339,0.953237,0.869079,0.275265,0.0
472,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,0.475778,...,0.912584,0.174020,0.041471,0.033304,0.945254,0.290931,0.945503,0.766231,0.230949,1.0
475,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,0.454601,...,0.907858,0.306328,0.044382,0.207561,0.997431,0.502986,0.962932,0.827691,0.242766,1.0
477,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,0.376560,...,0.944586,0.582603,0.067505,0.070588,0.901474,0.225473,0.917177,0.776557,0.302313,0.0


In [71]:
# Training 75% developing 25%
x_train_rf4, x_dev_rf4, y_train_rf4, y_dev_rf4 = train_test_split(rf_selected_train_data4.drop(columns=['label']), rf_selected_train_data4['label'])
x_train_rf4.shape, x_dev_rf4.shape, y_train_rf4.shape, y_dev_rf4.shape

((1917, 295), (639, 295), (1917,), (639,))

In [72]:
rf_selected4 = RandomForestClassifier(n_estimators = 500, min_impurity_decrease = 1e-05, max_depth =  20, criterion = 'gini')
rf_selected4.fit(x_train_rf4, y_train_rf4)
rf_selected_auc4 = roc_auc_score(y_dev_rf4, rf_selected4.predict_proba(x_dev_rf4)[:, 1])
rf_selected_auc4

0.7976632484677038

In [73]:
y_BSGS_rf_selected = data_BSGS['label']
x_BSGS_rf_selected = data_BSGS.loc[:,[i for i in selected_feat_rf4]]

In [74]:
rf_selected_auc_BSGS = roc_auc_score(y_BSGS_rf_selected, rf_selected4.predict_proba(x_BSGS_rf_selected)[:, 1])
rf_selected_auc_BSGS

0.7869456900847034

## Variable Selection by LR

In [75]:
# # Grid search for lr
# penalty = ["l1", "l2", "elasticnet"]
# # Tolerance for stopping criteria.
# tol = [0.00001, 0.001, 0.0000001]
# # Inverse of regularization strength
# C = [0,1, 0.5, 1, 10, 20, 30, 50, 100]
# # Algorithm to use in the optimization problem
# solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


# param_distributions = dict(penalty = penalty, tol = tol, C = C, solver = solver)
# lr = LogisticRegression()
# grid = RandomizedSearchCV(estimator = lr, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train4, y_train4) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [76]:
lr_selection4 = SelectFromModel(LogisticRegression(tol = 1e-05, solver = 'newton-cg', penalty = 'l2', C = 0.5))
lr_selection4.fit(x_train4, y_train4)

SelectFromModel(estimator=LogisticRegression(C=0.5, solver='newton-cg',
                                             tol=1e-05))

In [77]:
selected_feat_lr4 = x_train4.columns[(lr_selection4.get_support())]
len(selected_feat_lr4)

358

In [78]:
lr_selected_train_data4 = train_data4.loc[:,[i for i in selected_feat_lr4]]
lr_selected_train_data4['label'] = train_data4['label'] 
lr_selected_train_data4

Unnamed: 0,cg06098368,cg08690094,cg11236452,cg14770527,cg23316599,cg11108474,cg04838249,cg26262573,cg14285533,cg15509177,...,cg06495631,cg13665998,cg07903023,cg15236528,cg03556669,cg21808635,cg09166085,cg22034735,cg17805624,label
0,0.240143,0.561082,0.533287,0.597436,0.396608,0.479727,0.709753,0.367165,0.103790,0.575823,...,0.570667,0.062450,0.641294,0.156715,0.305896,0.780742,0.108676,0.508394,0.279428,1.0
1,0.242588,0.564277,0.578224,0.570267,0.419441,0.483118,0.727749,0.364813,0.123373,0.528539,...,0.589975,0.113487,0.668892,0.220170,0.313702,0.696424,0.128168,0.498426,0.360946,1.0
2,0.169127,0.541453,0.509944,0.510173,0.479463,0.368669,0.668243,0.334221,0.053817,0.457388,...,0.507627,0.073325,0.433934,0.178511,0.230778,0.606163,0.091236,0.204013,0.296357,1.0
3,0.224729,0.480992,0.421599,0.481978,0.510254,0.360383,0.745117,0.352629,0.121969,0.439880,...,0.507012,0.148140,0.416025,0.086153,0.257855,0.680310,0.158550,0.297127,0.292479,1.0
4,0.231550,0.474545,0.381759,0.549200,0.506738,0.401675,0.783492,0.441518,0.131623,0.496573,...,0.673096,0.183865,0.593111,0.137047,0.380755,0.722998,0.178922,0.526409,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.274042,0.525204,0.698463,0.759353,0.573460,0.401122,0.805374,0.666337,0.321696,0.505403,...,0.645043,0.088521,0.699565,0.128186,0.425339,0.869079,0.168747,0.529537,0.275265,0.0
472,0.127649,0.458996,0.414456,0.700554,0.475778,0.298102,0.670181,0.339098,0.095610,0.404168,...,0.487461,0.091538,0.598367,0.134853,0.290931,0.766231,0.172315,0.522733,0.230949,1.0
475,0.141059,0.522190,0.469892,0.686412,0.454601,0.322749,0.624104,0.516098,0.063631,0.492028,...,0.561416,0.078325,0.744434,0.146198,0.502986,0.827691,0.129992,0.617378,0.242766,1.0
477,0.303791,0.403003,0.830418,0.764943,0.376560,0.445176,0.663237,0.521479,0.158589,0.609263,...,0.566642,0.329829,0.637866,0.130743,0.225473,0.776557,0.338011,0.444886,0.302313,0.0


In [79]:
# Training 75% developing 25%
x_train_lr4, x_dev_lr4, y_train_lr4, y_dev_lr4 = train_test_split(lr_selected_train_data4.drop(columns=['label']), lr_selected_train_data4['label'])
x_train_lr4.shape, x_dev_lr4.shape, y_train_lr4.shape, y_dev_lr4.shape

((1917, 358), (639, 358), (1917,), (639,))

In [80]:
lr_selected4 = LogisticRegression(tol = 1e-05, solver = 'newton-cg', penalty = 'l2', C = 0.5)
lr_selected4.fit(x_train_lr4, y_train_lr4)
lr_selected_auc4 = roc_auc_score(y_dev_lr4, lr_selected4.predict_proba(x_dev_lr4)[:, 1])
lr_selected_auc4

0.8341660298987048

In [81]:
y_BSGS_lr_selected = data_BSGS['label']
x_BSGS_lr_selected = data_BSGS.loc[:,[i for i in selected_feat_lr4]]

In [82]:
lr_selected_auc_BSGS = roc_auc_score(y_BSGS_lr_selected, lr_selected4.predict_proba(x_BSGS_lr_selected)[:, 1])
lr_selected_auc_BSGS

0.8229197807673144

# Training: BSGS, AMDTSS, E-MTAB, Denmark
# Testing: E-Risk

In [83]:
# 1186 * 834
train_data5 = pd.concat([data_BSGS, data_DENMARK, data_EMTAB])
train_data5

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,0,0.282164,0.613574,0.670013,0.269499,0.516672,0.459066,0.254743,0.224724,0.617466,...,0.071795,0.919869,0.448765,0.810244,0.686240,0.548766,0.113030,0.141956,0.334091,0.243362
2,0,0.225027,0.619136,0.530781,0.219685,0.441304,0.504256,0.231294,0.169521,0.594098,...,0.080001,0.920140,0.458319,0.790811,0.622287,0.548240,0.182175,0.357434,0.660001,0.286040
4,1,0.289924,0.691534,0.771177,0.314880,0.489395,0.531181,0.283582,0.206316,0.605732,...,0.073612,0.950902,0.275012,0.837924,0.744836,0.619499,0.121501,0.141186,0.444107,0.277873
7,0,0.333657,0.675944,0.679786,0.276872,0.506406,0.511713,0.353252,0.310907,0.625395,...,0.081113,0.946354,0.463502,0.834654,0.788736,0.613200,0.215362,0.117529,0.576769,0.308074
10,0,0.206107,0.472510,0.588723,0.137395,0.380327,0.392145,0.165503,0.170393,0.567658,...,0.117231,0.918648,0.225151,0.808565,0.739517,0.601517,0.150539,0.321008,0.673163,0.351189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [84]:
# 1186 * 832
train_data5 = train_data5.loc[:,[i for i in data_AMDTSS.columns]]
train_data5

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,0,0.282164,0.613574,0.670013,0.269499,0.516672,0.459066,0.254743,0.224724,0.617466,...,0.071795,0.919869,0.448765,0.810244,0.686240,0.548766,0.113030,0.141956,0.334091,0.243362
2,0,0.225027,0.619136,0.530781,0.219685,0.441304,0.504256,0.231294,0.169521,0.594098,...,0.080001,0.920140,0.458319,0.790811,0.622287,0.548240,0.182175,0.357434,0.660001,0.286040
4,1,0.289924,0.691534,0.771177,0.314880,0.489395,0.531181,0.283582,0.206316,0.605732,...,0.073612,0.950902,0.275012,0.837924,0.744836,0.619499,0.121501,0.141186,0.444107,0.277873
7,0,0.333657,0.675944,0.679786,0.276872,0.506406,0.511713,0.353252,0.310907,0.625395,...,0.081113,0.946354,0.463502,0.834654,0.788736,0.613200,0.215362,0.117529,0.576769,0.308074
10,0,0.206107,0.472510,0.588723,0.137395,0.380327,0.392145,0.165503,0.170393,0.567658,...,0.117231,0.918648,0.225151,0.808565,0.739517,0.601517,0.150539,0.321008,0.673163,0.351189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1,0.265920,0.668810,0.538530,0.221750,0.450940,0.409590,0.261340,0.202900,0.703690,...,0.074460,0.932110,0.309700,0.811640,0.651310,0.426310,0.106370,0.128560,0.381660,0.148490
644,1,0.264220,0.673100,0.552430,0.211480,0.484940,0.468880,0.259820,0.240040,0.703450,...,0.033430,0.918230,0.273310,0.824450,0.609600,0.465400,0.121100,0.089490,0.315100,0.114360
645,0,0.300860,0.586160,0.691140,0.240640,0.481040,0.579180,0.275920,0.234960,0.633500,...,0.117130,0.936460,0.436270,0.820790,0.597970,0.438150,0.134000,0.214790,0.261410,0.127200
646,0,0.289540,0.679960,0.553150,0.286020,0.315550,0.421130,0.231630,0.297830,0.680370,...,0.043240,0.931800,0.296000,0.798780,0.609940,0.394220,0.143910,0.143810,0.566440,0.156560


In [85]:
# 1450 * 832
train_data5 = pd.concat([train_data5, data_AMDTSS])
train_data5

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,0,0.282164,0.613574,0.670013,0.269499,0.516672,0.459066,0.254743,0.224724,0.617466,...,0.071795,0.919869,0.448765,0.810244,0.686240,0.548766,0.113030,0.141956,0.334091,0.243362
2,0,0.225027,0.619136,0.530781,0.219685,0.441304,0.504256,0.231294,0.169521,0.594098,...,0.080001,0.920140,0.458319,0.790811,0.622287,0.548240,0.182175,0.357434,0.660001,0.286040
4,1,0.289924,0.691534,0.771177,0.314880,0.489395,0.531181,0.283582,0.206316,0.605732,...,0.073612,0.950902,0.275012,0.837924,0.744836,0.619499,0.121501,0.141186,0.444107,0.277873
7,0,0.333657,0.675944,0.679786,0.276872,0.506406,0.511713,0.353252,0.310907,0.625395,...,0.081113,0.946354,0.463502,0.834654,0.788736,0.613200,0.215362,0.117529,0.576769,0.308074
10,0,0.206107,0.472510,0.588723,0.137395,0.380327,0.392145,0.165503,0.170393,0.567658,...,0.117231,0.918648,0.225151,0.808565,0.739517,0.601517,0.150539,0.321008,0.673163,0.351189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,...,0.285387,0.953237,0.542180,0.869079,0.713411,0.647941,0.176504,0.168747,0.529537,0.275265
472,1,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,...,0.098905,0.945503,0.361169,0.766231,0.652946,0.603185,0.143384,0.172315,0.522733,0.230949
475,1,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,...,0.208649,0.962932,0.609414,0.827691,0.850505,0.631754,0.077538,0.129992,0.617378,0.242766
477,0,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,...,0.223300,0.917177,0.357777,0.776557,0.607103,0.595563,0.108415,0.338011,0.444886,0.302313


In [86]:
# Training 75% developing 25%
x_train5, x_dev5, y_train5, y_dev5 = train_test_split(train_data5.drop(columns=['label']), train_data5['label'])
x_train5.shape, x_dev5.shape, y_train5.shape, y_dev5.shape

((1087, 831), (363, 831), (1087,), (363,))

## Variable Selection by RF

In [87]:
# # Grid search for rf
# # The number of trees in the forest.
# n_estimators = [50, 100, 200, 300, 500]
# # The function to measure the quality of a split
# criterion = ["gini", "entropy"]
# # A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
# min_impurity_decrease = [0.1, 0.000001, 0.00001]
# # The maximum depth of the tree.
# max_depth = [20, 50, 100, 500, 1000]

# param_distributions = dict(n_estimators = n_estimators, criterion = criterion, min_impurity_decrease = min_impurity_decrease, max_depth = max_depth)
# rf = RandomForestClassifier()
# grid = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train5, y_train5) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [88]:
# Variable selection by random forest
rf_selection5 = SelectFromModel(RandomForestClassifier(n_estimators = 500, min_impurity_decrease = 1e-05, max_depth =  20, criterion = 'gini'))
rf_selection5.fit(x_train5, y_train5)

SelectFromModel(estimator=RandomForestClassifier(max_depth=20,
                                                 min_impurity_decrease=1e-05,
                                                 n_estimators=500))

In [89]:
# selected variables
selected_feat_rf5 = x_train5.columns[(rf_selection5.get_support())]
len(selected_feat_rf5)

287

In [90]:
rf_selected_train_data5 = train_data5.loc[:,[i for i in selected_feat_rf5]]
rf_selected_train_data5['label'] = train_data5['label'] 
rf_selected_train_data5

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,cg23316599,...,cg02170386,cg01929855,cg09243445,cg20999932,cg06427838,cg06100807,cg13665998,cg19418458,cg21808635,label
0,0.282164,0.613574,0.670013,0.269499,0.516672,0.459066,0.254743,0.224724,0.617466,0.462327,...,0.666886,0.897981,0.389843,0.292063,0.027239,0.010423,0.110296,0.448765,0.810244,0
2,0.225027,0.619136,0.530781,0.219685,0.441304,0.504256,0.231294,0.169521,0.594098,0.432864,...,0.629995,0.903937,0.363355,0.279418,0.062395,0.116715,0.355201,0.458319,0.790811,0
4,0.289924,0.691534,0.771177,0.314880,0.489395,0.531181,0.283582,0.206316,0.605732,0.452428,...,0.664255,0.947291,0.399124,0.294543,0.032160,0.026043,0.082696,0.275012,0.837924,1
7,0.333657,0.675944,0.679786,0.276872,0.506406,0.511713,0.353252,0.310907,0.625395,0.477888,...,0.644917,0.952266,0.383109,0.282708,0.011340,0.195488,0.084307,0.463502,0.834654,0
10,0.206107,0.472510,0.588723,0.137395,0.380327,0.392145,0.165503,0.170393,0.567658,0.282244,...,0.581981,0.950120,0.415980,0.375531,0.038941,0.059711,0.298719,0.225151,0.808565,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,0.573460,...,0.568126,0.900970,0.365249,0.223322,0.037285,0.094577,0.088521,0.542180,0.869079,0
472,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,0.475778,...,0.565258,0.859307,0.362257,0.220185,0.033304,0.041635,0.091538,0.361169,0.766231,1
475,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,0.454601,...,0.651315,0.922259,0.403914,0.362009,0.207561,0.024130,0.078325,0.609414,0.827691,1
477,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,0.376560,...,0.710537,0.909179,0.381835,0.337030,0.070588,0.045483,0.329829,0.357777,0.776557,0


In [91]:
# Training 75% developing 25%
x_train_rf5, x_dev_rf5, y_train_rf5, y_dev_rf5 = train_test_split(rf_selected_train_data5.drop(columns=['label']), rf_selected_train_data5['label'])
x_train_rf5.shape, x_dev_rf5.shape, y_train_rf5.shape, y_dev_rf5.shape

((1087, 287), (363, 287), (1087,), (363,))

In [92]:
rf_selected5 = RandomForestClassifier(n_estimators = 500, min_impurity_decrease = 1e-05, max_depth =  20, criterion = 'gini')
rf_selected5.fit(x_train_rf5, y_train_rf5)
rf_selected_auc5 = roc_auc_score(y_dev_rf5, rf_selected5.predict_proba(x_dev_rf5)[:, 1])
rf_selected_auc5

0.804855436832181

In [93]:
y_ERISK_rf_selected = data_ERISK['label']
x_ERISK_rf_selected = data_ERISK.loc[:,[i for i in selected_feat_rf5]]

In [94]:
rf_selected_auc_ERISK = roc_auc_score(y_ERISK_rf_selected, rf_selected5.predict_proba(x_ERISK_rf_selected)[:, 1])
rf_selected_auc_ERISK

0.6823170778483538

## Variable Selection by LR

In [95]:
# # Grid search for lr
# penalty = ["l1", "l2", "elasticnet"]
# # Tolerance for stopping criteria.
# tol = [0.00001, 0.001, 0.0000001]
# # Inverse of regularization strength
# C = [0,1, 0.5, 1, 10, 20, 30, 50, 100]
# # Algorithm to use in the optimization problem
# solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


# param_distributions = dict(penalty = penalty, tol = tol, C = C, solver = solver)
# lr = LogisticRegression()
# grid = RandomizedSearchCV(estimator = lr, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train5, y_train5) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [96]:
lr_selection5 = SelectFromModel(LogisticRegression(tol = 1e-07, solver = 'newton-cg', penalty = 'l2', C = 1))
lr_selection5.fit(x_train5, y_train5)

SelectFromModel(estimator=LogisticRegression(C=1, solver='newton-cg',
                                             tol=1e-07))

In [97]:
selected_feat_lr5 = x_train5.columns[(lr_selection5.get_support())]
len(selected_feat_lr5)

349

In [98]:
lr_selected_train_data5 = train_data5.loc[:,[i for i in selected_feat_lr5]]
lr_selected_train_data5['label'] = train_data5['label'] 
lr_selected_train_data5

Unnamed: 0,cg01193368,cg06098368,cg08690094,cg03124146,cg14770527,cg10933186,cg11108474,cg16340103,cg04838249,cg19689427,...,cg07903023,cg14466863,cg11174855,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg17805624,label
0,0.613574,0.269499,0.516672,0.224724,0.617466,0.796146,0.345466,0.506898,0.812046,0.419325,...,0.560400,0.778983,0.071795,0.448765,0.810244,0.686240,0.548766,0.113030,0.243362,0
2,0.619136,0.219685,0.441304,0.169521,0.594098,0.737150,0.322264,0.441999,0.539041,0.391477,...,0.627009,0.787670,0.080001,0.458319,0.790811,0.622287,0.548240,0.182175,0.286040,0
4,0.691534,0.314880,0.489395,0.206316,0.605732,0.822910,0.369445,0.524547,0.724093,0.513728,...,0.635566,0.790188,0.073612,0.275012,0.837924,0.744836,0.619499,0.121501,0.277873,1
7,0.675944,0.276872,0.506406,0.310907,0.625395,0.845854,0.363694,0.564390,0.783814,0.547045,...,0.630104,0.763763,0.081113,0.463502,0.834654,0.788736,0.613200,0.215362,0.308074,0
10,0.472510,0.137395,0.380327,0.170393,0.567658,0.737851,0.324882,0.438916,0.581293,0.459371,...,0.619235,0.744077,0.117231,0.225151,0.808565,0.739517,0.601517,0.150539,0.351189,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.819136,0.274042,0.525204,0.387231,0.759353,0.799908,0.401122,0.677112,0.805374,0.658904,...,0.699565,0.780383,0.285387,0.542180,0.869079,0.713411,0.647941,0.176504,0.275265,0
472,0.650936,0.127649,0.458996,0.116075,0.700554,0.679567,0.298102,0.585688,0.670181,0.460595,...,0.598367,0.734427,0.098905,0.361169,0.766231,0.652946,0.603185,0.143384,0.230949,1
475,0.668992,0.141059,0.522190,0.141652,0.686412,0.703902,0.322749,0.536903,0.624104,0.541866,...,0.744434,0.805425,0.208649,0.609414,0.827691,0.850505,0.631754,0.077538,0.242766,1
477,0.642261,0.303791,0.403003,0.278751,0.764943,0.921208,0.445176,0.485595,0.663237,0.721145,...,0.637866,0.760405,0.223300,0.357777,0.776557,0.607103,0.595563,0.108415,0.302313,0


In [99]:
# Training 75% developing 25%
x_train_lr5, x_dev_lr5, y_train_lr5, y_dev_lr5 = train_test_split(lr_selected_train_data5.drop(columns=['label']), lr_selected_train_data5['label'])
x_train_lr5.shape, x_dev_lr5.shape, y_train_lr5.shape, y_dev_lr5.shape

((1087, 349), (363, 349), (1087,), (363,))

In [100]:
lr_selected5 = LogisticRegression(tol = 1e-07, solver = 'newton-cg', penalty = 'l2', C = 1)
lr_selected5.fit(x_train_lr5, y_train_lr5)
lr_selected_auc5 = roc_auc_score(y_dev_lr5, lr_selected5.predict_proba(x_dev_lr5)[:, 1])
lr_selected_auc5

0.8764172335600908

In [101]:
y_ERISK_lr_selected = data_ERISK['label']
x_ERISK_lr_selected = data_ERISK.loc[:,[i for i in selected_feat_lr5]]

In [102]:
lr_selected_auc_ERISK = roc_auc_score(y_ERISK_lr_selected, lr_selected5.predict_proba(x_ERISK_lr_selected)[:, 1])
lr_selected_auc_ERISK

0.7146257172665622