In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier

### Note: 
#### - Missing values in both the training and testing datasets are replaced by the mean

In [3]:
# Read Datasets


def _read_without_first_column(csv_name):
    """Read a CSV file and return it with its first column removed.

    NOTE(review): the dropped column is presumably a saved row index — confirm.
    """
    return pd.read_csv(csv_name).iloc[:, 1:]


# train data (one table per held-out cohort)
train_EMTAB = _read_without_first_column('train_EMTAB.csv')
train_AMDTSS = _read_without_first_column('train_AMDTSS.csv')
train_DENMARK = _read_without_first_column('train_DENMARK.csv')
train_BSGS = _read_without_first_column('train_BSGS.csv')
train_ERISK = _read_without_first_column('train_ERISK.csv')

# test data (the matching held-out cohorts)
test_EMTAB = _read_without_first_column('test_EMTAB.csv')
test_AMDTSS = _read_without_first_column('test_AMDTSS.csv')
test_DENMARK = _read_without_first_column('test_DENMARK.csv')
test_BSGS = _read_without_first_column('test_BSGS.csv')
test_ERISK = _read_without_first_column('test_ERISK.csv')

### Stacking & Voting

In [4]:
def get_stacking(random_state=None):
	"""Build an unfitted stacking ensemble of five base classifiers.

	Level 0 holds logistic regression, random forest, gradient boosting, an
	RBF-kernel SVM and multinomial naive Bayes. Their 5-fold cross-validated
	predictions are the inputs of a logistic-regression meta-learner.

	Parameters
	----------
	random_state : int, RandomState instance or None, default=None
		Seed forwarded to every estimator that accepts one, so that repeated
		fits can be made reproducible. The default ``None`` keeps the
		previous, unseeded behaviour.

	Returns
	-------
	StackingClassifier
		The configured ensemble, not yet fitted.
	"""
	# define the base models
	level0 = [
		('lr', LogisticRegression(tol=0.001, solver='sag', penalty='l2', C=30, random_state=random_state)),
		('rf', RandomForestClassifier(n_estimators=500, min_impurity_decrease=1e-06, max_depth=50,
		                              criterion='gini', random_state=random_state)),
		('gb', GradientBoostingClassifier(n_estimators=300, max_depth=5, learning_rate=0.5,
		                                  random_state=random_state)),
		# `degree` only affects the 'poly' kernel, so it is inert here. `probability` is left off;
		# StackingClassifier's default stack_method='auto' then uses decision_function for this model.
		('svm', SVC(kernel='rbf', gamma='scale', degree=1, decision_function_shape='ovr', C=20,
		            random_state=random_state)),
		# MultinomialNB takes no seed. NOTE(review): it assumes non-negative feature values — confirm.
		('mnb', MultinomialNB()),
	]
	# define meta learner model
	level1 = LogisticRegression(tol=0.001, solver='sag', penalty='l2', C=30, random_state=random_state)
	# define the stacking ensemble
	return StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

# Training: E-Risk, BSGS, Denmark, AMDTSS
# Testing: E-MTAB

In [5]:
# Training 75% / development 25% (train_test_split's default test_size).
# The seed is fixed so that Restart & Run All reproduces the same partition,
# and therefore the same downstream AUC, instead of a new random split each run.
# NOTE(review): x_dev1 / y_dev1 are not used by any later cell shown in this notebook.
x_train1, x_dev1, y_train1, y_dev1 = train_test_split(
    train_EMTAB.drop(columns=['label']),
    train_EMTAB['label'],
    random_state=42,
)
x_train1.shape, x_dev1.shape, y_train1.shape, y_dev1.shape

((1684, 7524), (562, 7524), (1684,), (562,))

In [6]:
# Held-out E-MTAB cohort: feature matrix (every column except 'label') and target.
x_EMTAB = test_EMTAB.drop(columns="label")
y_EMTAB = test_EMTAB["label"]

In [7]:
# Fit the stacking ensemble on the training split, then score it by ROC AUC on the
# held-out E-MTAB cohort using the second predict_proba column.
stacking_EMTAB = get_stacking().fit(x_train1, y_train1)
stacking_EMTAB_auc = roc_auc_score(
    y_EMTAB,
    stacking_EMTAB.predict_proba(x_EMTAB)[:, 1],
)
stacking_EMTAB_auc



0.9711891643709826

In [8]:
# Base estimators reused by every soft-voting ensemble below. VotingClassifier fits
# clones of the estimators it is given, so sharing these unfitted instances is safe.
# All four are stochastic ('sag' solver, bootstrapped trees, SVC's internal
# cross-validation for probability=True), so they are seeded for reproducible AUCs.
BASE_MODEL_SEED = 42
clf1 = LogisticRegression(tol=0.001, solver='sag', penalty='l2', C=30, random_state=BASE_MODEL_SEED)
clf2 = RandomForestClassifier(n_estimators=500, min_impurity_decrease=1e-06, max_depth=50,
                              criterion='gini', random_state=BASE_MODEL_SEED)
clf3 = GradientBoostingClassifier(n_estimators=300, max_depth=5, learning_rate=0.5,
                                  random_state=BASE_MODEL_SEED)
# probability=True is required so the SVM exposes predict_proba for soft voting;
# `degree` only affects the 'poly' kernel and is inert with 'rbf'.
clf4 = SVC(kernel='rbf', gamma='scale', degree=1, decision_function_shape='ovr', C=20,
           probability=True, random_state=BASE_MODEL_SEED)

In [9]:
# Soft-voting ensemble over the four base models, scored by ROC AUC on the
# held-out E-MTAB cohort using the second predict_proba column.
voting_EMTAB = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)],
    voting='soft',
).fit(x_train1, y_train1)
voting_EMTAB_auc = roc_auc_score(
    y_EMTAB,
    voting_EMTAB.predict_proba(x_EMTAB)[:, 1],
)
voting_EMTAB_auc



0.9619490358126722

# Training: E-Risk, BSGS, Denmark, E-MTAB
# Testing: AMDTSS

In [10]:
# Training 75% / development 25% (train_test_split's default test_size).
# The seed is fixed so that Restart & Run All reproduces the same partition,
# and therefore the same downstream AUC, instead of a new random split each run.
# NOTE(review): x_dev2 / y_dev2 are not used by any later cell shown in this notebook.
x_train2, x_dev2, y_train2, y_dev2 = train_test_split(
    train_AMDTSS.drop(columns=['label']),
    train_AMDTSS['label'],
    random_state=42,
)
x_train2.shape, x_dev2.shape, y_train2.shape, y_dev2.shape

((1971, 6064), (658, 6064), (1971,), (658,))

In [11]:
# Held-out AMDTSS cohort: feature matrix (every column except 'label') and target.
x_AMDTSS = test_AMDTSS.drop(columns="label")
y_AMDTSS = test_AMDTSS["label"]

In [12]:
# NOTE(review): the stacking run for the AMDTSS fold is disabled (commented out) and no
# result is recorded for it. The notebook does not say why — TODO confirm whether to
# re-enable it or delete this cell.
# stacking_AMDTSS = get_stacking()
# stacking_AMDTSS.fit(x_train2, y_train2)
# stacking_AMDTSS_auc = roc_auc_score(y_AMDTSS, stacking_AMDTSS.predict_proba(x_AMDTSS)[:, 1])
# stacking_AMDTSS_auc

In [14]:
# Soft-voting ensemble over the four base models, scored by ROC AUC on the
# held-out AMDTSS cohort using the second predict_proba column.
voting_AMDTSS = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)],
    voting='soft',
).fit(x_train2, y_train2)
voting_AMDTSS_auc = roc_auc_score(
    y_AMDTSS,
    voting_AMDTSS.predict_proba(x_AMDTSS)[:, 1],
)
voting_AMDTSS_auc



0.621728650137741

# Training: E-Risk, BSGS, AMDTSS, E-MTAB
# Testing: Denmark

In [15]:
# Training 75% / development 25% (train_test_split's default test_size).
# The seed is fixed so that Restart & Run All reproduces the same partition,
# and therefore the same downstream AUC, instead of a new random split each run.
# NOTE(review): x_dev3 / y_dev3 are not used by any later cell shown in this notebook.
x_train3, x_dev3, y_train3, y_dev3 = train_test_split(
    train_DENMARK.drop(columns=['label']),
    train_DENMARK['label'],
    random_state=42,
)
x_train3.shape, x_dev3.shape, y_train3.shape, y_dev3.shape

((2049, 6028), (684, 6028), (2049,), (684,))

In [16]:
# Held-out Denmark cohort: feature matrix (every column except 'label') and target.
x_DENMARK = test_DENMARK.drop(columns="label")
y_DENMARK = test_DENMARK["label"]

In [17]:
# NOTE(review): the stacking run for the Denmark fold is disabled (commented out) and no
# result is recorded for it. The notebook does not say why — TODO confirm whether to
# re-enable it or delete this cell.
# stacking_DENMARK = get_stacking()
# stacking_DENMARK.fit(x_train3, y_train3)
# stacking_DENMARK_auc = roc_auc_score(y_DENMARK, stacking_DENMARK.predict_proba(x_DENMARK)[:, 1])
# stacking_DENMARK_auc

In [18]:
# Soft-voting ensemble over the four base models, scored by ROC AUC on the
# held-out Denmark cohort using the second predict_proba column.
voting_DENMARK = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)],
    voting='soft',
).fit(x_train3, y_train3)
voting_DENMARK_auc = roc_auc_score(
    y_DENMARK,
    voting_DENMARK.predict_proba(x_DENMARK)[:, 1],
)
voting_DENMARK_auc



0.6574248120300752

# Training: E-Risk, AMDTSS, E-MTAB, Denmark
# Testing: BSGS

In [19]:
# Training 75% / development 25% (train_test_split's default test_size).
# The seed is fixed so that Restart & Run All reproduces the same partition,
# and therefore the same downstream AUC, instead of a new random split each run.
# NOTE(review): x_dev4 / y_dev4 are not used by any later cell shown in this notebook.
x_train4, x_dev4, y_train4, y_dev4 = train_test_split(
    train_BSGS.drop(columns=['label']),
    train_BSGS['label'],
    random_state=42,
)
x_train4.shape, x_dev4.shape, y_train4.shape, y_dev4.shape

((1901, 6498), (634, 6498), (1901,), (634,))

In [20]:
# Held-out BSGS cohort: feature matrix (every column except 'label') and target.
x_BSGS = test_BSGS.drop(columns="label")
y_BSGS = test_BSGS["label"]

In [21]:
# NOTE(review): the stacking run for the BSGS fold is disabled (commented out) and no
# result is recorded for it. The notebook does not say why — TODO confirm whether to
# re-enable it or delete this cell.
# stacking_BSGS = get_stacking()
# stacking_BSGS.fit(x_train4, y_train4)
# stacking_BSGS_auc = roc_auc_score(y_BSGS, stacking_BSGS.predict_proba(x_BSGS)[:, 1])
# stacking_BSGS_auc

In [22]:
# Soft-voting ensemble over the four base models, scored by ROC AUC on the
# held-out BSGS cohort using the second predict_proba column.
voting_BSGS = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)],
    voting='soft',
).fit(x_train4, y_train4)
voting_BSGS_auc = roc_auc_score(
    y_BSGS,
    voting_BSGS.predict_proba(x_BSGS)[:, 1],
)
voting_BSGS_auc



0.7956817804351438

# Training: BSGS, AMDTSS, E-MTAB, Denmark
# Testing: E-Risk

In [23]:
# Training 75% / development 25% (train_test_split's default test_size).
# The seed is fixed so that Restart & Run All reproduces the same partition,
# and therefore the same downstream AUC, instead of a new random split each run.
# NOTE(review): x_dev5 / y_dev5 are not used by any later cell shown in this notebook.
x_train5, x_dev5, y_train5, y_dev5 = train_test_split(
    train_ERISK.drop(columns=['label']),
    train_ERISK['label'],
    random_state=42,
)
x_train5.shape, x_dev5.shape, y_train5.shape, y_dev5.shape

((1071, 14641), (358, 14641), (1071,), (358,))

In [24]:
# Held-out E-Risk cohort: feature matrix (every column except 'label') and target.
x_ERISK = test_ERISK.drop(columns="label")
y_ERISK = test_ERISK["label"]

In [25]:
# NOTE(review): the stacking run for the E-Risk fold is disabled (commented out) and no
# result is recorded for it. The notebook does not say why — TODO confirm whether to
# re-enable it or delete this cell.
# stacking_ERISK = get_stacking()
# stacking_ERISK.fit(x_train5, y_train5)
# stacking_ERISK_auc = roc_auc_score(y_ERISK, stacking_ERISK.predict_proba(x_ERISK)[:, 1])
# stacking_ERISK_auc

In [26]:
# Soft-voting ensemble over the four base models, scored by ROC AUC on the
# held-out E-Risk cohort using the second predict_proba column.
voting_ERISK = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gb', clf3), ('svc', clf4)],
    voting='soft',
).fit(x_train5, y_train5)
voting_ERISK_auc = roc_auc_score(
    y_ERISK,
    voting_ERISK.predict_proba(x_ERISK)[:, 1],
)
voting_ERISK_auc



0.6700343674245912