In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

## Preprocessing

In [77]:
train_data = pd.read_csv('ERisk_data.csv')
train_data = train_data.dropna()
train_data

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1652,1.0,0.346429,0.760069,0.688908,0.340425,0.465195,0.363101,0.258523,0.293546,0.575740,...,0.169065,0.939218,0.609925,0.757884,0.759516,0.489253,0.144548,0.169750,0.517466,0.238090
1653,1.0,0.246526,0.659649,0.701803,0.211376,0.444763,0.433245,0.196325,0.148841,0.579781,...,0.104019,0.921664,0.705905,0.763958,0.861796,0.486682,0.077902,0.141081,0.447169,0.199716
1654,1.0,0.224520,0.563713,0.712559,0.237203,0.424596,0.454003,0.163925,0.224361,0.613307,...,0.088733,0.894338,0.642370,0.799785,0.851035,0.522340,0.088133,0.119238,0.461302,0.242302
1655,1.0,0.210534,0.639052,0.773019,0.208191,0.618487,0.509763,0.185071,0.166923,0.614580,...,0.261384,0.883677,0.465794,0.783359,0.767986,0.490514,0.206327,0.232657,0.471418,0.226292


In [3]:
train_data.dtypes

label         float64
cg22695986    float64
cg01193368    float64
cg22056094    float64
cg06098368    float64
               ...   
cg07635017    float64
cg08641118    float64
cg09166085    float64
cg22034735    float64
cg17805624    float64
Length: 834, dtype: object

In [4]:
# Training 75% developing 25%
from sklearn.model_selection import train_test_split
x_train, x_dev, y_train, y_dev = train_test_split(train_data.drop(columns=['label']), train_data['label'])
x_train.shape, x_dev.shape, y_train.shape, y_dev.shape

((1098, 833), (366, 833), (1098,), (366,))

In [5]:
y_train

1394    0.0
554     1.0
1367    1.0
1240    1.0
1238    0.0
       ... 
399     1.0
870     0.0
348     0.0
571     1.0
1247    1.0
Name: label, Length: 1098, dtype: float64

## Baseline Model

### SVM (SVC)

In [6]:
from sklearn.svm import SVC
svc = SVC(probability = True)
svc.fit(x_train, y_train)
svc_auc_baseline = roc_auc_score(y_dev, svc.predict_proba(x_dev)[:, 1])
svc_auc_baseline

0.7758945968515141

### Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr_auc_baseline = roc_auc_score(y_dev, lr.predict_proba(x_dev)[:, 1])
lr_auc_baseline

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7931215785729749

### Adaboost

In [8]:
from sklearn.ensemble import AdaBoostClassifier
ab = AdaBoostClassifier()
ab.fit(x_train, y_train)
ab_auc_baseline = roc_auc_score(y_dev, ab.predict_proba(x_dev)[:, 1])
ab_auc_baseline

0.7292858689264837

### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
rf_auc_baseline = roc_auc_score(y_dev, rf.predict_proba(x_dev)[:, 1])
rf_auc_baseline

0.7868431633315808

### Naive Bayes

In [10]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
mnb_auc_baseline = roc_auc_score(y_dev, mnb.predict_proba(x_dev)[:, 1])
mnb_auc_baseline

0.7217084712213527

### Stochastic Gradient Boosting

In [11]:
# from sklearn.ensemble import GradientBoostingClassifier
# gb = GradientBoostingClassifier()
# gb.fit(x_train, y_train)
# gb_auc_baseline = roc_auc_score(y_dev, gb.predict_proba(x_dev)[:, 1])
# gb_auc_baseline

## Tuning

### Logistic Regression

In [12]:
# # Grid search for lr
# penalty = ["l1", "l2", "elasticnet"]
# # Tolerance for stopping criteria.
# tol = [0.00001, 0.001, 0.0000001]
# # Inverse of regularization strength
# C = [0,1, 0.5, 1, 10, 20, 30, 50, 100]
# # Algorithm to use in the optimization problem
# solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']


# param_distributions = dict(penalty = penalty, tol = tol, C = C, solver = solver)
# lr = LogisticRegression()
# grid = RandomizedSearchCV(estimator = lr, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train, y_train) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [13]:
# # L2 penalty => Ridge Regression
# lr = LogisticRegression(tol = 1e-07, solver = 'sag', penalty = 'l2', C = 1, max_iter = 5000)
# lr.fit(x_train, y_train)
# lr_auc = roc_auc_score(y_dev, lr.predict_proba(x_dev)[:, 1])
# lr_auc

### SVM

In [14]:
# # Grid search for svc
# # Regularization parameter
# C = [0.1, 1, 5, 10, 20, 30, 50, 100]
# # Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’
# gamma = ["scale", "auto"]
# # one vs one or one vs rest
# decision_function_shape = ["ovo", "ovr"]
# # Specifies the kernel type to be used in the algorithm
# kernel = ["linear", "poly", "rbf", "sigmoid"]
# # Degree of the polynomial kernel function (‘poly’)
# degree = [1, 2, 3, 4]

# param_distributions = dict(C = C, gamma = gamma, decision_function_shape = decision_function_shape, kernel = kernel, degree = degree)
# svc = SVC(probability = True)
# grid = RandomizedSearchCV(estimator = svc, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train, y_train) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [15]:
# svc = SVC(C=20, gamma = 'scale', decision_function_shape = 'ovr', probability = True)
# svc.fit(x_train, y_train)
# svc_auc = roc_auc_score(y_dev, svc.predict_proba(x_dev)[:, 1])
# svc_auc

### Random Forest

In [16]:
# # Grid search for rf
# # The number of trees in the forest.
# n_estimators = [50, 100, 200, 300, 500]
# # The function to measure the quality of a split
# criterion = ["gini", "entropy"]
# # A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
# min_impurity_decrease = [0.1, 0.000001, 0.00001]
# # The maximum depth of the tree.
# max_depth = [20, 50, 100, 500, 1000]

# param_distributions = dict(n_estimators = n_estimators, criterion = criterion, min_impurity_decrease = min_impurity_decrease, max_depth = max_depth)
# rf = RandomForestClassifier()
# grid = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train, y_train) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

In [17]:
# rf = RandomForestClassifier(n_estimators = 500, min_samples_split = 4, min_impurity_decrease = 1e-06, criterion ='entropy')
# rf.fit(x_train, y_train)
# rf_auc = roc_auc_score(y_dev, rf.predict_proba(x_dev)[:, 1])
# rf_auc

### Gradient Boosting

In [18]:
# # Grid search for gb
# # The number of boosting stages to perform
# n_estimators = [50, 100, 200, 300, 500]
# # Learning rate shrinks the contribution of each tree by learning_rate
# learning_rate = [0.5, 0.1, 0.01, 0.001]
# # The maximum depth of the individual regression estimators
# max_depth = [1, 3, 5, 10, 20, 50]

# param_distributions = dict(n_estimators = n_estimators, learning_rate = learning_rate, max_depth = max_depth)
# gb = GradientBoostingClassifier()
# grid = RandomizedSearchCV(estimator = gb, param_distributions = param_distributions, scoring = "roc_auc",
#                           verbose = 1, n_jobs = -1) 
# grid_result = grid.fit(x_train, y_train) 

# print('Best Score: ', grid_result.best_score_) 
# print('Best Params: ', grid_result.best_params_) 

## Stacking Model

In [19]:
# get a stacking ensemble of models
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
def get_stacking():
	# define the base models
	level0 = list()
	level0.append(('lr', LogisticRegression(tol = 0.001, solver = 'sag', penalty = 'l2', C = 30)))
	level0.append(('rf', RandomForestClassifier(n_estimators = 500, min_impurity_decrease = 1e-06, max_depth = 50, criterion = 'gini')))
	level0.append(('gb', GradientBoostingClassifier(n_estimators = 300, max_depth = 5, learning_rate = 0.5)))
	level0.append(('svm', SVC(kernel = 'rbf', gamma = 'scale', degree = 1, decision_function_shape = 'ovr', C = 20)))
	level0.append(('mnb', MultinomialNB()))
	# define meta learner model
	level1 = LogisticRegression(tol = 0.001, solver = 'sag', penalty = 'l2', C = 30)
	# define the stacking ensemble
	model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
	return model

In [20]:
stacking = get_stacking()
stacking.fit(x_train, y_train)
stacking_auc = roc_auc_score(y_dev, stacking.predict_proba(x_dev)[:, 1])
stacking_auc



0.8225342529304427

## Voting

In [21]:
from sklearn.ensemble import VotingClassifier
clf1 = LogisticRegression(tol = 0.001, solver = 'sag', penalty = 'l2', C = 30)
clf2 = RandomForestClassifier(n_estimators = 500, min_impurity_decrease = 1e-06, max_depth = 50, criterion = 'gini')
clf3 = GradientBoostingClassifier(n_estimators = 300, max_depth = 5, learning_rate = 0.5)
clf4 = SVC(kernel = 'rbf', gamma = 'scale', degree = 1, decision_function_shape = 'ovr', C = 20, probability=True)
voting = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)], voting='soft')
voting.fit(x_train, y_train)
voting_auc = roc_auc_score(y_dev, voting.predict_proba(x_dev)[:, 1])
voting_auc



0.8135032319920824

# Testing on Other Datasets

## BSGS

In [22]:
# 614 rows × 834 columns
test_data_BSGS = pd.read_csv('BSGS_data.csv')
# 358 rows × 834 columns
test_data_BSGS = test_data_BSGS[test_data_BSGS['label']!='0']
test_data_BSGS['label'] = test_data_BSGS['label'].replace(['MZ','DZ'],[1,0])
# 130 rows × 834 columns
test_data_BSGS = test_data_BSGS.dropna()
test_data_BSGS

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
4,1,0.289924,0.691534,0.771177,0.314880,0.489395,0.531181,0.283582,0.206316,0.605732,...,0.073612,0.950902,0.275012,0.837924,0.744836,0.619499,0.121501,0.141186,0.444107,0.277873
7,0,0.333657,0.675944,0.679786,0.276872,0.506406,0.511713,0.353252,0.310907,0.625395,...,0.081113,0.946354,0.463502,0.834654,0.788736,0.613200,0.215362,0.117529,0.576769,0.308074
14,0,0.180019,0.575762,0.724558,0.170102,0.369675,0.461744,0.196268,0.160012,0.590556,...,0.163308,0.938841,0.575904,0.814739,0.683545,0.592769,0.115443,0.117083,0.422950,0.203808
16,1,0.134007,0.530812,0.758351,0.122100,0.350734,0.491976,0.184403,0.129332,0.524629,...,0.083213,0.899943,0.694841,0.761936,0.783042,0.550367,0.107730,0.273156,0.373479,0.288127
17,0,0.209093,0.556030,0.675430,0.135953,0.443692,0.463429,0.189666,0.202709,0.561508,...,0.082195,0.898694,0.586915,0.769028,0.664412,0.588547,0.109294,0.254661,0.444578,0.229604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,0,0.406125,0.664470,0.542610,0.354006,0.458286,0.420171,0.366680,0.338191,0.580177,...,0.153973,0.907672,0.596836,0.785778,0.726060,0.583695,0.110733,0.148202,0.625301,0.328636
603,0,0.264799,0.581194,0.651861,0.308223,0.440172,0.477597,0.240341,0.187257,0.573341,...,0.124853,0.907669,0.608435,0.751178,0.735682,0.565897,0.122811,0.124485,0.478227,0.209178
606,1,0.284547,0.662292,0.410788,0.264832,0.491906,0.389545,0.241194,0.144790,0.663934,...,0.107929,0.877304,0.514845,0.750081,0.701610,0.591128,0.129420,0.156055,0.478576,0.281654
609,0,0.448274,0.745615,0.574799,0.390312,0.557549,0.441380,0.372235,0.331771,0.734097,...,0.059985,0.921337,0.260785,0.770643,0.697953,0.587946,0.109968,0.103298,0.426438,0.335474


In [23]:
test_data_BSGS.dtypes

label           int64
cg22695986    float64
cg01193368    float64
cg22056094    float64
cg06098368    float64
               ...   
cg07635017    float64
cg08641118    float64
cg09166085    float64
cg22034735    float64
cg17805624    float64
Length: 834, dtype: object

In [24]:
y_BSGS = test_data_BSGS['label']
x_BSGS = test_data_BSGS.iloc[:,1:834]

In [25]:
x_BSGS

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,cg23316599,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
4,0.289924,0.691534,0.771177,0.314880,0.489395,0.531181,0.283582,0.206316,0.605732,0.452428,...,0.073612,0.950902,0.275012,0.837924,0.744836,0.619499,0.121501,0.141186,0.444107,0.277873
7,0.333657,0.675944,0.679786,0.276872,0.506406,0.511713,0.353252,0.310907,0.625395,0.477888,...,0.081113,0.946354,0.463502,0.834654,0.788736,0.613200,0.215362,0.117529,0.576769,0.308074
14,0.180019,0.575762,0.724558,0.170102,0.369675,0.461744,0.196268,0.160012,0.590556,0.361467,...,0.163308,0.938841,0.575904,0.814739,0.683545,0.592769,0.115443,0.117083,0.422950,0.203808
16,0.134007,0.530812,0.758351,0.122100,0.350734,0.491976,0.184403,0.129332,0.524629,0.321403,...,0.083213,0.899943,0.694841,0.761936,0.783042,0.550367,0.107730,0.273156,0.373479,0.288127
17,0.209093,0.556030,0.675430,0.135953,0.443692,0.463429,0.189666,0.202709,0.561508,0.347430,...,0.082195,0.898694,0.586915,0.769028,0.664412,0.588547,0.109294,0.254661,0.444578,0.229604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,0.406125,0.664470,0.542610,0.354006,0.458286,0.420171,0.366680,0.338191,0.580177,0.411739,...,0.153973,0.907672,0.596836,0.785778,0.726060,0.583695,0.110733,0.148202,0.625301,0.328636
603,0.264799,0.581194,0.651861,0.308223,0.440172,0.477597,0.240341,0.187257,0.573341,0.369483,...,0.124853,0.907669,0.608435,0.751178,0.735682,0.565897,0.122811,0.124485,0.478227,0.209178
606,0.284547,0.662292,0.410788,0.264832,0.491906,0.389545,0.241194,0.144790,0.663934,0.494841,...,0.107929,0.877304,0.514845,0.750081,0.701610,0.591128,0.129420,0.156055,0.478576,0.281654
609,0.448274,0.745615,0.574799,0.390312,0.557549,0.441380,0.372235,0.331771,0.734097,0.551261,...,0.059985,0.921337,0.260785,0.770643,0.697953,0.587946,0.109968,0.103298,0.426438,0.335474


In [26]:
# Stacking
stacking_auc_BSGS = roc_auc_score(y_BSGS, stacking.predict_proba(x_BSGS)[:, 1])
stacking_auc_BSGS

0.7999512670565302

In [27]:
# Voting
voting_auc_BSGS = roc_auc_score(y_BSGS, voting.predict_proba(x_BSGS)[:, 1])
voting_auc_BSGS

0.8121345029239766

## DENMARK

In [28]:
# 180 * 834
test_data_DENMARK = pd.read_csv('DENMARK_data.csv')
test_data_DENMARK['label'] = test_data_DENMARK['label'].replace([2],[0])
test_data_DENMARK

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1,0.404577,0.710879,0.704475,0.367051,0.535664,0.542295,0.333757,0.357504,0.693868,...,0.116305,0.939407,0.541084,0.875237,0.701400,0.669378,0.192394,0.088736,0.395090,0.391863
1,1,0.387200,0.743547,0.726618,0.358810,0.567206,0.555995,0.326876,0.317552,0.718015,...,0.141656,0.933621,0.520708,0.819701,0.707200,0.655214,0.152987,0.114516,0.435208,0.346165
2,0,0.505156,0.778775,0.591729,0.351014,0.526898,0.461550,0.383719,0.489494,0.598017,...,0.226380,0.939324,0.723762,0.837103,0.745728,0.721464,0.142051,0.168248,0.590381,0.353974
3,0,0.481427,0.701932,0.525088,0.312909,0.561675,0.587513,0.359970,0.456664,0.579250,...,0.198025,0.938652,0.812533,0.764826,0.647063,0.741899,0.141380,0.204437,0.523984,0.334411
4,1,0.322591,0.838889,0.883568,0.249061,0.714588,0.872372,0.321996,0.230222,0.902682,...,0.440983,0.932829,0.589026,0.736002,0.753208,0.801552,0.124071,0.233582,0.860372,0.341245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,0,0.233895,0.732645,0.750664,0.149491,0.524413,0.618680,0.234315,0.236759,0.890437,...,0.174961,0.906643,0.581697,0.863217,0.798059,0.723334,0.130358,0.177975,0.821848,0.391814
176,0,0.247867,0.744726,0.477351,0.211166,0.500541,0.604047,0.259998,0.343025,0.705436,...,0.165785,0.913656,0.615752,0.832152,0.712137,0.582116,0.300884,0.234928,0.620441,0.187204
177,0,0.247103,0.768668,0.489964,0.235903,0.515031,0.663927,0.295938,0.331253,0.679972,...,0.180183,0.936972,0.580143,0.852484,0.725882,0.614776,0.291890,0.213946,0.573433,0.174113
178,0,0.525080,0.812421,0.533161,0.398285,0.603025,0.545732,0.494836,0.403702,0.749127,...,0.182134,0.935897,0.435156,0.783916,0.682231,0.671497,0.108399,0.149804,0.330827,0.346468


In [29]:
test_data_DENMARK.dtypes

label           int64
cg22695986    float64
cg01193368    float64
cg22056094    float64
cg06098368    float64
               ...   
cg07635017    float64
cg08641118    float64
cg09166085    float64
cg22034735    float64
cg17805624    float64
Length: 834, dtype: object

In [30]:
y_DENMARK = test_data_DENMARK['label']
x_DENMARK = test_data_DENMARK.iloc[:,1:834]

In [31]:
x_DENMARK

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,cg23316599,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,0.404577,0.710879,0.704475,0.367051,0.535664,0.542295,0.333757,0.357504,0.693868,0.596717,...,0.116305,0.939407,0.541084,0.875237,0.701400,0.669378,0.192394,0.088736,0.395090,0.391863
1,0.387200,0.743547,0.726618,0.358810,0.567206,0.555995,0.326876,0.317552,0.718015,0.598141,...,0.141656,0.933621,0.520708,0.819701,0.707200,0.655214,0.152987,0.114516,0.435208,0.346165
2,0.505156,0.778775,0.591729,0.351014,0.526898,0.461550,0.383719,0.489494,0.598017,0.568213,...,0.226380,0.939324,0.723762,0.837103,0.745728,0.721464,0.142051,0.168248,0.590381,0.353974
3,0.481427,0.701932,0.525088,0.312909,0.561675,0.587513,0.359970,0.456664,0.579250,0.600658,...,0.198025,0.938652,0.812533,0.764826,0.647063,0.741899,0.141380,0.204437,0.523984,0.334411
4,0.322591,0.838889,0.883568,0.249061,0.714588,0.872372,0.321996,0.230222,0.902682,0.628645,...,0.440983,0.932829,0.589026,0.736002,0.753208,0.801552,0.124071,0.233582,0.860372,0.341245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,0.233895,0.732645,0.750664,0.149491,0.524413,0.618680,0.234315,0.236759,0.890437,0.430755,...,0.174961,0.906643,0.581697,0.863217,0.798059,0.723334,0.130358,0.177975,0.821848,0.391814
176,0.247867,0.744726,0.477351,0.211166,0.500541,0.604047,0.259998,0.343025,0.705436,0.478920,...,0.165785,0.913656,0.615752,0.832152,0.712137,0.582116,0.300884,0.234928,0.620441,0.187204
177,0.247103,0.768668,0.489964,0.235903,0.515031,0.663927,0.295938,0.331253,0.679972,0.467238,...,0.180183,0.936972,0.580143,0.852484,0.725882,0.614776,0.291890,0.213946,0.573433,0.174113
178,0.525080,0.812421,0.533161,0.398285,0.603025,0.545732,0.494836,0.403702,0.749127,0.582402,...,0.182134,0.935897,0.435156,0.783916,0.682231,0.671497,0.108399,0.149804,0.330827,0.346468


In [32]:
# Stacking
stacking_auc_DENMARK = roc_auc_score(y_DENMARK, stacking.predict_proba(x_DENMARK)[:, 1])
stacking_auc_DENMARK

0.6458436417615042

In [33]:
# Voting
voting_auc_DENMARK = roc_auc_score(y_DENMARK, voting.predict_proba(x_DENMARK)[:, 1])
voting_auc_DENMARK

0.6558634339435924

## E_MTAB

In [34]:
# 648 * 834
test_data_EMTAB = pd.read_csv('EMTAB_data.csv')
test_data_EMTAB['label'] = test_data_EMTAB['label'].replace(['dizygotic', 'monozygotic'],[0, 1])
# 625 * 834
test_data_EMTAB = test_data_EMTAB.dropna()
test_data_EMTAB

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
2,0,0.18147,0.52642,0.51051,0.17722,0.44619,0.31620,0.24945,0.21956,0.59546,...,0.04834,0.93177,0.24853,0.87475,0.64115,0.40425,0.09932,0.10101,0.62191,0.18875
3,0,0.37041,0.59926,0.51959,0.29605,0.58181,0.46066,0.31533,0.27114,0.63982,...,0.01787,0.77425,0.20376,0.71660,0.63255,0.46512,0.15459,0.22849,0.33602,0.16824
4,0,0.23357,0.64785,0.57637,0.24968,0.49752,0.48834,0.18397,0.24959,0.69647,...,0.02198,0.91712,0.55349,0.67776,0.53092,0.39199,0.11892,0.12957,0.52550,0.10711
5,0,0.39143,0.71338,0.73148,0.31372,0.56135,0.52500,0.23553,0.29818,0.63982,...,0.03289,0.89765,0.25491,0.66921,0.59947,0.38596,0.11157,0.09025,0.56395,0.13625
6,1,0.31255,0.58115,0.48299,0.26661,0.38945,0.51235,0.27991,0.22731,0.64388,...,0.04816,0.93349,0.17547,0.82791,0.65820,0.42091,0.14940,0.08639,0.46054,0.18571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,1,0.26592,0.66881,0.53853,0.22175,0.45094,0.40959,0.26134,0.20290,0.70369,...,0.07446,0.93211,0.30970,0.81164,0.65131,0.42631,0.10637,0.12856,0.38166,0.14849
644,1,0.26422,0.67310,0.55243,0.21148,0.48494,0.46888,0.25982,0.24004,0.70345,...,0.03343,0.91823,0.27331,0.82445,0.60960,0.46540,0.12110,0.08949,0.31510,0.11436
645,0,0.30086,0.58616,0.69114,0.24064,0.48104,0.57918,0.27592,0.23496,0.63350,...,0.11713,0.93646,0.43627,0.82079,0.59797,0.43815,0.13400,0.21479,0.26141,0.12720
646,0,0.28954,0.67996,0.55315,0.28602,0.31555,0.42113,0.23163,0.29783,0.68037,...,0.04324,0.93180,0.29600,0.79878,0.60994,0.39422,0.14391,0.14381,0.56644,0.15656


In [35]:
test_data_EMTAB.dtypes

label           int64
cg22695986    float64
cg01193368    float64
cg22056094    float64
cg06098368    float64
               ...   
cg07635017    float64
cg08641118    float64
cg09166085    float64
cg22034735    float64
cg17805624    float64
Length: 834, dtype: object

In [36]:
y_EMTAB = test_data_EMTAB['label']
x_EMTAB = test_data_EMTAB.iloc[:,1:834]

In [37]:
x_EMTAB

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,cg23316599,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
2,0.18147,0.52642,0.51051,0.17722,0.44619,0.31620,0.24945,0.21956,0.59546,0.36583,...,0.04834,0.93177,0.24853,0.87475,0.64115,0.40425,0.09932,0.10101,0.62191,0.18875
3,0.37041,0.59926,0.51959,0.29605,0.58181,0.46066,0.31533,0.27114,0.63982,0.51987,...,0.01787,0.77425,0.20376,0.71660,0.63255,0.46512,0.15459,0.22849,0.33602,0.16824
4,0.23357,0.64785,0.57637,0.24968,0.49752,0.48834,0.18397,0.24959,0.69647,0.40352,...,0.02198,0.91712,0.55349,0.67776,0.53092,0.39199,0.11892,0.12957,0.52550,0.10711
5,0.39143,0.71338,0.73148,0.31372,0.56135,0.52500,0.23553,0.29818,0.63982,0.51164,...,0.03289,0.89765,0.25491,0.66921,0.59947,0.38596,0.11157,0.09025,0.56395,0.13625
6,0.31255,0.58115,0.48299,0.26661,0.38945,0.51235,0.27991,0.22731,0.64388,0.42755,...,0.04816,0.93349,0.17547,0.82791,0.65820,0.42091,0.14940,0.08639,0.46054,0.18571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,0.26592,0.66881,0.53853,0.22175,0.45094,0.40959,0.26134,0.20290,0.70369,0.42710,...,0.07446,0.93211,0.30970,0.81164,0.65131,0.42631,0.10637,0.12856,0.38166,0.14849
644,0.26422,0.67310,0.55243,0.21148,0.48494,0.46888,0.25982,0.24004,0.70345,0.47097,...,0.03343,0.91823,0.27331,0.82445,0.60960,0.46540,0.12110,0.08949,0.31510,0.11436
645,0.30086,0.58616,0.69114,0.24064,0.48104,0.57918,0.27592,0.23496,0.63350,0.35698,...,0.11713,0.93646,0.43627,0.82079,0.59797,0.43815,0.13400,0.21479,0.26141,0.12720
646,0.28954,0.67996,0.55315,0.28602,0.31555,0.42113,0.23163,0.29783,0.68037,0.27399,...,0.04324,0.93180,0.29600,0.79878,0.60994,0.39422,0.14391,0.14381,0.56644,0.15656


In [38]:
# Stacking
stacking_auc_EMTAB = roc_auc_score(y_EMTAB, stacking.predict_proba(x_EMTAB)[:, 1])
stacking_auc_EMTAB

0.6447420373782575

In [39]:
# Voting
voting_auc_EMTAB = roc_auc_score(y_EMTAB, voting.predict_proba(x_EMTAB)[:, 1])
voting_auc_EMTAB

0.6791918926033166

## AMDTSS

In [40]:
# 479 * 832
test_data_AMDTSS = pd.read_csv('AMDTSS_data.csv')
# 264 * 832 - removing family members
test_data_AMDTSS = test_data_AMDTSS[test_data_AMDTSS['label']!='Sister']
test_data_AMDTSS['label'] = test_data_AMDTSS['label'].replace(['MZ','DZ'],[1,0])
test_data_AMDTSS

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1,0.296226,0.661803,0.778489,0.212081,0.391185,0.650822,0.217513,0.244809,0.800621,...,0.359521,0.944748,0.595299,0.834836,0.770880,0.617535,0.132494,0.123293,0.678801,0.291940
1,0,0.339908,0.727956,0.679741,0.222514,0.564275,0.638797,0.270549,0.323845,0.713418,...,0.258551,0.926220,0.333009,0.766588,0.874771,0.576381,0.140619,0.146732,0.582626,0.266561
4,1,0.176177,0.750467,0.525574,0.123028,0.466311,0.558968,0.193404,0.153496,0.779365,...,0.214934,0.908169,0.672585,0.795238,0.638907,0.655187,0.136268,0.167424,0.648410,0.246543
9,1,0.151620,0.656230,0.646716,0.168091,0.453862,0.638478,0.191635,0.165740,0.778070,...,0.177048,0.947600,0.483259,0.812570,0.798659,0.599329,0.107737,0.151618,0.627207,0.269449
10,1,0.370611,0.796783,0.700574,0.323435,0.530269,0.609531,0.330050,0.265046,0.613107,...,0.222631,0.907124,0.448094,0.769004,0.754325,0.598490,0.080826,0.111948,0.580623,0.240740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,...,0.285387,0.953237,0.542180,0.869079,0.713411,0.647941,0.176504,0.168747,0.529537,0.275265
472,1,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,...,0.098905,0.945503,0.361169,0.766231,0.652946,0.603185,0.143384,0.172315,0.522733,0.230949
475,1,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,...,0.208649,0.962932,0.609414,0.827691,0.850505,0.631754,0.077538,0.129992,0.617378,0.242766
477,0,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,...,0.223300,0.917177,0.357777,0.776557,0.607103,0.595563,0.108415,0.338011,0.444886,0.302313


In [41]:
test_data_AMDTSS.astype({"label": int})

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1,0.296226,0.661803,0.778489,0.212081,0.391185,0.650822,0.217513,0.244809,0.800621,...,0.359521,0.944748,0.595299,0.834836,0.770880,0.617535,0.132494,0.123293,0.678801,0.291940
1,0,0.339908,0.727956,0.679741,0.222514,0.564275,0.638797,0.270549,0.323845,0.713418,...,0.258551,0.926220,0.333009,0.766588,0.874771,0.576381,0.140619,0.146732,0.582626,0.266561
4,1,0.176177,0.750467,0.525574,0.123028,0.466311,0.558968,0.193404,0.153496,0.779365,...,0.214934,0.908169,0.672585,0.795238,0.638907,0.655187,0.136268,0.167424,0.648410,0.246543
9,1,0.151620,0.656230,0.646716,0.168091,0.453862,0.638478,0.191635,0.165740,0.778070,...,0.177048,0.947600,0.483259,0.812570,0.798659,0.599329,0.107737,0.151618,0.627207,0.269449
10,1,0.370611,0.796783,0.700574,0.323435,0.530269,0.609531,0.330050,0.265046,0.613107,...,0.222631,0.907124,0.448094,0.769004,0.754325,0.598490,0.080826,0.111948,0.580623,0.240740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,...,0.285387,0.953237,0.542180,0.869079,0.713411,0.647941,0.176504,0.168747,0.529537,0.275265
472,1,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,...,0.098905,0.945503,0.361169,0.766231,0.652946,0.603185,0.143384,0.172315,0.522733,0.230949
475,1,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,...,0.208649,0.962932,0.609414,0.827691,0.850505,0.631754,0.077538,0.129992,0.617378,0.242766
477,0,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,...,0.223300,0.917177,0.357777,0.776557,0.607103,0.595563,0.108415,0.338011,0.444886,0.302313


In [42]:
test_data_AMDTSS.dtypes

label           int64
cg22695986    float64
cg01193368    float64
cg22056094    float64
cg06098368    float64
               ...   
cg07635017    float64
cg08641118    float64
cg09166085    float64
cg22034735    float64
cg17805624    float64
Length: 832, dtype: object

In [43]:
train_data_831 = train_data.loc[:,[i for i in test_data_AMDTSS.columns]]
train_data_831

Unnamed: 0,label,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,1.0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,...,0.327951,0.919347,0.365084,0.780742,0.731394,0.532118,0.081238,0.108676,0.508394,0.279428
1,1.0,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,...,0.195741,0.945236,0.337817,0.696424,0.745175,0.546033,0.099777,0.128168,0.498426,0.360946
2,1.0,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,...,0.083195,0.906838,0.357911,0.606163,0.773520,0.536528,0.079384,0.091236,0.204013,0.296357
3,1.0,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,...,0.134511,0.956986,0.583261,0.680310,0.753129,0.557829,0.067174,0.158550,0.297127,0.292479
4,0.0,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,...,0.069399,0.921778,0.333275,0.722998,0.824286,0.546425,0.075612,0.178922,0.526409,0.278850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1652,1.0,0.346429,0.760069,0.688908,0.340425,0.465195,0.363101,0.258523,0.293546,0.575740,...,0.169065,0.939218,0.609925,0.757884,0.759516,0.489253,0.144548,0.169750,0.517466,0.238090
1653,1.0,0.246526,0.659649,0.701803,0.211376,0.444763,0.433245,0.196325,0.148841,0.579781,...,0.104019,0.921664,0.705905,0.763958,0.861796,0.486682,0.077902,0.141081,0.447169,0.199716
1654,1.0,0.224520,0.563713,0.712559,0.237203,0.424596,0.454003,0.163925,0.224361,0.613307,...,0.088733,0.894338,0.642370,0.799785,0.851035,0.522340,0.088133,0.119238,0.461302,0.242302
1655,1.0,0.210534,0.639052,0.773019,0.208191,0.618487,0.509763,0.185071,0.166923,0.614580,...,0.261384,0.883677,0.465794,0.783359,0.767986,0.490514,0.206327,0.232657,0.471418,0.226292


In [44]:
# Training 75% developing 25%
x_train_831, x_dev_831, y_train_831, y_dev_831 = train_test_split(train_data_831.drop(columns=['label']), train_data_831['label'])
x_train_831.shape, x_dev_831.shape, y_train_831.shape, y_dev_831.shape

((1098, 831), (366, 831), (1098,), (366,))

In [45]:
# Stacking for E-Risk containing 831 predictors
stacking_831 = get_stacking()
stacking_831.fit(x_train_831, y_train_831)
stacking_831_auc = roc_auc_score(y_dev_831, stacking_831.predict_proba(x_dev_831)[:, 1])
stacking_831_auc



0.8119027199215878

In [46]:
# Voting for E-Risk containing 831 predictors
voting_831 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)], voting='soft')
voting_831.fit(x_train_831, y_train_831)
voting_831_auc = roc_auc_score(y_dev_831, voting_831.predict_proba(x_dev_831)[:, 1])
voting_831_auc



0.8047659887282528

In [47]:
y_AMDTSS = test_data_AMDTSS['label']
x_AMDTSS = test_data_AMDTSS.iloc[:,1:832]

In [48]:
x_AMDTSS

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,cg23316599,...,cg11174855,cg04524933,cg19418458,cg21808635,cg11359720,cg07635017,cg08641118,cg09166085,cg22034735,cg17805624
0,0.296226,0.661803,0.778489,0.212081,0.391185,0.650822,0.217513,0.244809,0.800621,0.391962,...,0.359521,0.944748,0.595299,0.834836,0.770880,0.617535,0.132494,0.123293,0.678801,0.291940
1,0.339908,0.727956,0.679741,0.222514,0.564275,0.638797,0.270549,0.323845,0.713418,0.444374,...,0.258551,0.926220,0.333009,0.766588,0.874771,0.576381,0.140619,0.146732,0.582626,0.266561
4,0.176177,0.750467,0.525574,0.123028,0.466311,0.558968,0.193404,0.153496,0.779365,0.551556,...,0.214934,0.908169,0.672585,0.795238,0.638907,0.655187,0.136268,0.167424,0.648410,0.246543
9,0.151620,0.656230,0.646716,0.168091,0.453862,0.638478,0.191635,0.165740,0.778070,0.407078,...,0.177048,0.947600,0.483259,0.812570,0.798659,0.599329,0.107737,0.151618,0.627207,0.269449
10,0.370611,0.796783,0.700574,0.323435,0.530269,0.609531,0.330050,0.265046,0.613107,0.516575,...,0.222631,0.907124,0.448094,0.769004,0.754325,0.598490,0.080826,0.111948,0.580623,0.240740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.421692,0.819136,0.889120,0.274042,0.525204,0.698463,0.415842,0.387231,0.759353,0.573460,...,0.285387,0.953237,0.542180,0.869079,0.713411,0.647941,0.176504,0.168747,0.529537,0.275265
472,0.155194,0.650936,0.436741,0.127649,0.458996,0.414456,0.214664,0.116075,0.700554,0.475778,...,0.098905,0.945503,0.361169,0.766231,0.652946,0.603185,0.143384,0.172315,0.522733,0.230949
475,0.170119,0.668992,0.532885,0.141059,0.522190,0.469892,0.177975,0.141652,0.686412,0.454601,...,0.208649,0.962932,0.609414,0.827691,0.850505,0.631754,0.077538,0.129992,0.617378,0.242766
477,0.316241,0.642261,0.906532,0.303791,0.403003,0.830418,0.242378,0.278751,0.764943,0.376560,...,0.223300,0.917177,0.357777,0.776557,0.607103,0.595563,0.108415,0.338011,0.444886,0.302313


In [49]:
# Stacking
stacking_auc_AMDTSS = roc_auc_score(y_AMDTSS, stacking_831.predict_proba(x_AMDTSS)[:, 1])
stacking_auc_AMDTSS

0.6903122130394858

In [50]:
# Voting
voting_auc_AMDTSS = roc_auc_score(y_AMDTSS, voting_831.predict_proba(x_AMDTSS)[:, 1])
voting_auc_AMDTSS

0.691574839302112

# Sparse Matrix & Variable Selection

In [51]:
svc_sparse = SVC(C=20, gamma = 'scale', decision_function_shape = 'ovr', probability = True)
svc_sparse.fit(x_train, y_train)
svc_sparse_auc = roc_auc_score(y_dev, svc_sparse.predict_proba(x_dev)[:, 1])
svc_sparse_auc

0.8102557758327406

## Random Forest

### Round 1 - from 833 to 319

In [52]:
from sklearn.feature_selection import SelectFromModel
rf_selection = SelectFromModel(RandomForestClassifier(n_estimators = 500, min_samples_split = 4, min_impurity_decrease = 1e-06, criterion ='entropy'))
rf_selection.fit(x_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(criterion='entropy',
                                                 min_impurity_decrease=1e-06,
                                                 min_samples_split=4,
                                                 n_estimators=500))

In [53]:
selected_feat = x_train.columns[(rf_selection.get_support())]
len(selected_feat)

305

In [54]:
print(selected_feat)

Index(['cg22695986', 'cg01193368', 'cg22056094', 'cg06098368', 'cg08690094',
       'cg11236452', 'cg26916862', 'cg03124146', 'cg14770527', 'cg10933186',
       ...
       'cg15489799', 'cg09009380', 'cg09990584', 'cg06495631', 'cg07903023',
       'cg03556669', 'cg11174855', 'cg21808635', 'cg07635017', 'cg17805624'],
      dtype='object', length=305)


In [55]:
np.savetxt('rf_selected.txt', selected_feat,fmt='%s')

In [56]:
rf_selected_train_data = train_data.loc[:,[i for i in selected_feat]]
rf_selected_train_data['label'] = train_data['label'] 
rf_selected_train_data

Unnamed: 0,cg22695986,cg01193368,cg22056094,cg06098368,cg08690094,cg11236452,cg26916862,cg03124146,cg14770527,cg10933186,...,cg09009380,cg09990584,cg06495631,cg07903023,cg03556669,cg11174855,cg21808635,cg07635017,cg17805624,label
0,0.233353,0.606732,0.730107,0.240143,0.561082,0.533287,0.194961,0.144218,0.597436,0.850368,...,0.323892,0.342347,0.570667,0.641294,0.305896,0.327951,0.780742,0.532118,0.279428,1.0
1,0.284813,0.599726,0.715363,0.242588,0.564277,0.578224,0.192382,0.139766,0.570267,0.770310,...,0.315845,0.321944,0.589975,0.668892,0.313702,0.195741,0.696424,0.546033,0.360946,1.0
2,0.206618,0.552816,0.572559,0.169127,0.541453,0.509944,0.197505,0.193932,0.510173,0.741130,...,0.313867,0.327677,0.507627,0.433934,0.230778,0.083195,0.606163,0.536528,0.296357,1.0
3,0.203151,0.655871,0.391728,0.224729,0.480992,0.421599,0.178132,0.181609,0.481978,0.667161,...,0.300121,0.303911,0.507012,0.416025,0.257855,0.134511,0.680310,0.557829,0.292479,1.0
4,0.266709,0.493554,0.395203,0.231550,0.474545,0.381759,0.194651,0.170328,0.549200,0.731597,...,0.313593,0.333914,0.673096,0.593111,0.380755,0.069399,0.722998,0.546425,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1652,0.346429,0.760069,0.688908,0.340425,0.465195,0.363101,0.258523,0.293546,0.575740,0.809944,...,0.332918,0.325615,0.631574,0.545113,0.293381,0.169065,0.757884,0.489253,0.238090,1.0
1653,0.246526,0.659649,0.701803,0.211376,0.444763,0.433245,0.196325,0.148841,0.579781,0.838906,...,0.295150,0.300664,0.632603,0.541257,0.296495,0.104019,0.763958,0.486682,0.199716,1.0
1654,0.224520,0.563713,0.712559,0.237203,0.424596,0.454003,0.163925,0.224361,0.613307,0.801801,...,0.297558,0.312066,0.636855,0.549802,0.313466,0.088733,0.799785,0.522340,0.242302,1.0
1655,0.210534,0.639052,0.773019,0.208191,0.618487,0.509763,0.185071,0.166923,0.614580,0.825470,...,0.330735,0.403701,0.525376,0.516929,0.265253,0.261384,0.783359,0.490514,0.226292,1.0


In [57]:
# Training 75% developing 25%
x_train_rf, x_dev_rf, y_train_rf, y_dev_rf = train_test_split(rf_selected_train_data.drop(columns=['label']), rf_selected_train_data['label'])
x_train_rf.shape, x_dev_rf.shape, y_train_rf.shape, y_dev_rf.shape

((1098, 305), (366, 305), (1098,), (366,))

In [58]:
rf_selected = RandomForestClassifier(n_estimators = 500, min_samples_split = 4, min_impurity_decrease = 1e-06, criterion ='entropy')
rf_selected.fit(x_train_rf, y_train_rf)
rf_selected_auc = roc_auc_score(y_dev_rf, rf_selected.predict_proba(x_dev_rf)[:, 1])
rf_selected_auc

0.8065553286807058

### Round 2 - From 319 to 121

In [59]:
rf_selection2 = SelectFromModel(RandomForestClassifier(n_estimators = 500, min_samples_split = 4, min_impurity_decrease = 1e-06, criterion ='entropy'))
rf_selection2.fit(x_train_rf, y_train_rf)

SelectFromModel(estimator=RandomForestClassifier(criterion='entropy',
                                                 min_impurity_decrease=1e-06,
                                                 min_samples_split=4,
                                                 n_estimators=500))

In [60]:
selected_feat2 = x_train_rf.columns[(rf_selection2.get_support())]
len(selected_feat2)

122

In [61]:
print(selected_feat2)

Index(['cg01193368', 'cg22056094', 'cg08690094', 'cg11236452', 'cg03124146',
       'cg14770527', 'cg10933186', 'cg11108474', 'cg16340103', 'cg19689427',
       ...
       'cg14003931', 'cg14364186', 'cg25293896', 'cg04388244', 'cg25690958',
       'cg14320530', 'cg27132152', 'cg06495631', 'cg07635017', 'cg17805624'],
      dtype='object', length=122)


In [62]:
np.savetxt('rf_selected2.txt', selected_feat2,fmt='%s')

In [63]:
rf_selected_train_data2 = train_data.loc[:,[i for i in selected_feat2]]
rf_selected_train_data2['label'] = train_data['label'] 
rf_selected_train_data2

Unnamed: 0,cg01193368,cg22056094,cg08690094,cg11236452,cg03124146,cg14770527,cg10933186,cg11108474,cg16340103,cg19689427,...,cg14364186,cg25293896,cg04388244,cg25690958,cg14320530,cg27132152,cg06495631,cg07635017,cg17805624,label
0,0.606732,0.730107,0.561082,0.533287,0.144218,0.597436,0.850368,0.479727,0.556758,0.487860,...,0.746484,0.333828,0.766931,0.387041,0.286565,0.363816,0.570667,0.532118,0.279428,1.0
1,0.599726,0.715363,0.564277,0.578224,0.139766,0.570267,0.770310,0.483118,0.481381,0.457657,...,0.742400,0.353813,0.725599,0.460505,0.281765,0.454734,0.589975,0.546033,0.360946,1.0
2,0.552816,0.572559,0.541453,0.509944,0.193932,0.510173,0.741130,0.368669,0.513161,0.429852,...,0.733459,0.481367,0.758648,0.430895,0.222950,0.479652,0.507627,0.536528,0.296357,1.0
3,0.655871,0.391728,0.480992,0.421599,0.181609,0.481978,0.667161,0.360383,0.432874,0.380312,...,0.748932,0.366710,0.791484,0.386071,0.257250,0.414647,0.507012,0.557829,0.292479,1.0
4,0.493554,0.395203,0.474545,0.381759,0.170328,0.549200,0.731597,0.401675,0.475677,0.441566,...,0.673719,0.407009,0.746721,0.397046,0.282481,0.443079,0.673096,0.546425,0.278850,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1652,0.760069,0.688908,0.465195,0.363101,0.293546,0.575740,0.809944,0.401068,0.424275,0.512185,...,0.777092,0.346160,0.690089,0.470675,0.338159,0.445846,0.631574,0.489253,0.238090,1.0
1653,0.659649,0.701803,0.444763,0.433245,0.148841,0.579781,0.838906,0.229552,0.509571,0.478364,...,0.728032,0.215664,0.698779,0.410059,0.388064,0.425642,0.632603,0.486682,0.199716,1.0
1654,0.563713,0.712559,0.424596,0.454003,0.224361,0.613307,0.801801,0.226769,0.501054,0.474891,...,0.742944,0.283641,0.686101,0.428814,0.260629,0.448000,0.636855,0.522340,0.242302,1.0
1655,0.639052,0.773019,0.618487,0.509763,0.166923,0.614580,0.825470,0.378100,0.481624,0.489605,...,0.735591,0.245011,0.633607,0.413434,0.414899,0.496993,0.525376,0.490514,0.226292,1.0


In [64]:
# Training 75% developing 25%
x_train_rf2, x_dev_rf2, y_train_rf2, y_dev_rf2 = train_test_split(rf_selected_train_data2.drop(columns=['label']), rf_selected_train_data2['label'])
x_train_rf2.shape, x_dev_rf2.shape, y_train_rf2.shape, y_dev_rf2.shape

((1098, 122), (366, 122), (1098,), (366,))

In [65]:
rf_selected2 = RandomForestClassifier(n_estimators = 500, min_samples_split = 4, min_impurity_decrease = 1e-06, criterion ='entropy')
rf_selected2.fit(x_train_rf2, y_train_rf2)
rf_selected2_auc = roc_auc_score(y_dev_rf2, rf_selected2.predict_proba(x_dev_rf2)[:, 1])
rf_selected2_auc

0.8117292326247549

#### Testing BSGS - Round 1 selection

In [66]:
y_BSGS_rf_selected = test_data_BSGS['label']
x_BSGS_rf_selected = test_data_BSGS.loc[:,[i for i in selected_feat]]

In [67]:
rf_selected_auc_BSGS = roc_auc_score(y_BSGS_rf_selected, rf_selected.predict_proba(x_BSGS_rf_selected)[:, 1])
rf_selected_auc_BSGS

0.824317738791423

#### Testing BSGS - Round 2 selection

In [68]:
y_BSGS_rf_selected2 = test_data_BSGS['label']
x_BSGS_rf_selected2 = test_data_BSGS.loc[:,[i for i in selected_feat2]]

In [69]:
# Round 2 selection
rf_selected2_auc_BSGS = roc_auc_score(y_BSGS_rf_selected2, rf_selected2.predict_proba(x_BSGS_rf_selected2)[:, 1])
rf_selected2_auc_BSGS

0.7833820662768031

In [70]:
# Round 2 selection
rf_auc_BSGS = roc_auc_score(y_BSGS, rf.predict_proba(x_BSGS)[:, 1])
rf_auc_BSGS

0.7319688109161793