In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn import model_selection
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier, RandomForestClassifier

# Read Datasets

In [None]:
# Read Datasets
# Shape comments are the (rows x columns) counts recorded by the author.

# E-Risk: 1658 x 834 as read
data_ERISK = pd.read_csv('ERisk_data.csv')
# 1464 x 834 once every row containing a missing value is dropped
data_ERISK = data_ERISK.dropna()

# BSGS: 614 rows x 834 columns as read
data_BSGS = pd.read_csv('BSGS_data.csv')
# 358 rows x 834 columns after discarding rows labelled '0'.
# .copy() detaches the filtered frame from the one it was sliced from, so the
# column assignment below writes to an independent frame instead of raising
# pandas' SettingWithCopyWarning.
data_BSGS = data_BSGS[data_BSGS['label']!='0'].copy()
# Encode zygosity numerically: MZ -> 1, DZ -> 0
data_BSGS['label'] = data_BSGS['label'].replace(['MZ','DZ'],[1,0])
# Fill remaining missing values with the per-column mean
data_BSGS = data_BSGS.fillna(data_BSGS.mean())

# Denmark: 180 x 834
data_DENMARK = pd.read_csv('DENMARK_data.csv')
# Recode label 2 as 0 (presumably so the negative class is 0 as in the other
# cohorts - TODO confirm what 2 denotes in this file)
data_DENMARK['label'] = data_DENMARK['label'].replace([2],[0])

# AMDTSS: 479 x 832 as read (two columns fewer than the other cohorts)
data_AMDTSS = pd.read_csv('AMDTSS_data.csv')
# 264 x 832 after removing family members (rows labelled 'Sister');
# .copy() for the same reason as for BSGS above.
data_AMDTSS = data_AMDTSS[data_AMDTSS['label']!='Sister'].copy()
# Encode zygosity numerically: MZ -> 1, DZ -> 0
data_AMDTSS['label'] = data_AMDTSS['label'].replace(['MZ','DZ'],[1,0])

# E-MTAB: 648 x 834
data_EMTAB = pd.read_csv('EMTAB_data.csv')
# Encode zygosity numerically: dizygotic -> 0, monozygotic -> 1
data_EMTAB['label'] = data_EMTAB['label'].replace(['dizygotic', 'monozygotic'],[0, 1])
# Fill missing values with the per-column mean
data_EMTAB = data_EMTAB.fillna(data_EMTAB.mean())

In [None]:
# Candidate classifiers, each with default hyper-parameters.
# Every estimator is listed exactly once: a repeated SVC() would only rerun and
# reprint the same evaluation. The randomised ensembles get a fixed
# random_state so repeated runs fit the same models.
clfs = [
    RandomForestClassifier(random_state=42),
    LogisticRegression(),
    SVC(),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    GradientBoostingClassifier(random_state=42),
]

In [None]:
def do_multiple_10foldcrossvalidation(clfs,data,classifications):
    """Run 10-fold cross-validation for each classifier and print its metrics.

    For every estimator in ``clfs`` this prints the estimator, its ROC AUC and
    a classification report, all computed from out-of-fold predictions on
    ``data`` / ``classifications``.

    Parameters
    ----------
    clfs : iterable of scikit-learn classifiers
        Unfitted estimators; each is evaluated independently.
    data : array-like of shape (n_samples, n_features)
        Feature matrix.
    classifications : array-like of shape (n_samples,)
        Binary target labels.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for clf in clfs:
        # Hard out-of-fold labels feed the per-class precision/recall report.
        predictions = model_selection.cross_val_predict(clf, data, classifications, cv=10)

        # roc_auc_score is defined over continuous scores; feeding it hard
        # labels collapses the ROC curve to a single threshold. Obtain
        # out-of-fold decision values or positive-class probabilities instead.
        # This costs a second cross-validation pass per classifier.
        if hasattr(clf, "decision_function"):
            scores = model_selection.cross_val_predict(
                clf, data, classifications, cv=10, method="decision_function"
            )
        elif hasattr(clf, "predict_proba"):
            # Columns follow the sorted class labels, so column 1 is the
            # larger label, which roc_auc_score treats as the positive class.
            scores = model_selection.cross_val_predict(
                clf, data, classifications, cv=10, method="predict_proba"
            )[:, 1]
        else:
            # No continuous output available: fall back to the hard labels.
            scores = predictions

        print(clf)
        print("AUC: ")
        print(roc_auc_score(classifications, scores))
        print(classification_report(classifications, predictions))

# Training: E-Risk, BSGS, Denmark, AMDTSS
# Testing: E-MTAB

In [1]:
# Training pool when E-MTAB is held out: E-Risk + BSGS + Denmark
# (expected 2002 x 834 from the load-time row counts, 1464 + 358 + 180),
# cut down to AMDTSS's columns (832 per the load-time note) in AMDTSS's order,
# then AMDTSS appended last (expected 2266 x 832, i.e. 2002 + 264).
train_data1 = pd.concat(
    [
        pd.concat([data_ERISK, data_BSGS, data_DENMARK]).loc[:, data_AMDTSS.columns.tolist()],
        data_AMDTSS,
    ]
)
train_data1

NameError: name 'pd' is not defined

In [None]:
# Hold out a development set: 75% training / 25% development
# (train_test_split's default proportions).
# random_state pins the shuffle so the split, and every metric computed from
# it, is the same on each run.
x_train1, x_dev1, y_train1, y_dev1 = train_test_split(
    train_data1.drop(columns=['label']),
    train_data1['label'],
    random_state=42,
)
x_train1.shape, x_dev1.shape, y_train1.shape, y_dev1.shape

In [None]:
# Cross-validate every classifier in `clfs` on the training portion only;
# the development split is not passed in here.
do_multiple_10foldcrossvalidation(clfs=clfs, data=x_train1, classifications=y_train1)

# Training: E-Risk, BSGS, Denmark, E-MTAB
# Testing: AMDTSS

In [None]:
# Training pool when AMDTSS is held out: E-Risk + BSGS + Denmark + E-MTAB
# (2650 x 834 per the author's count), reduced to AMDTSS's columns in
# AMDTSS's order (2650 x 832).
train_data2 = (
    pd.concat([data_ERISK, data_BSGS, data_DENMARK, data_EMTAB])
    .loc[:, data_AMDTSS.columns.tolist()]
)
train_data2

In [None]:
# Hold out a development set: 75% training / 25% development
# (train_test_split's default proportions).
# random_state pins the shuffle so the split, and every metric computed from
# it, is the same on each run.
x_train2, x_dev2, y_train2, y_dev2 = train_test_split(
    train_data2.drop(columns=['label']),
    train_data2['label'],
    random_state=42,
)
x_train2.shape, x_dev2.shape, y_train2.shape, y_dev2.shape

In [None]:
# Cross-validate every classifier in `clfs` on the training portion only;
# the development split is not passed in here.
do_multiple_10foldcrossvalidation(clfs=clfs, data=x_train2, classifications=y_train2)

# Training: E-Risk, BSGS, AMDTSS, E-MTAB
# Testing: Denmark

In [None]:
# Training pool when Denmark is held out: E-Risk + BSGS + E-MTAB
# (2470 x 834 per the author's count), reduced to AMDTSS's columns in
# AMDTSS's order (2470 x 832), then AMDTSS appended last (2734 x 832).
train_data3 = pd.concat(
    [
        pd.concat([data_ERISK, data_BSGS, data_EMTAB]).loc[:, data_AMDTSS.columns.tolist()],
        data_AMDTSS,
    ]
)
train_data3

In [None]:
# Hold out a development set: 75% training / 25% development
# (train_test_split's default proportions).
# random_state pins the shuffle so the split, and every metric computed from
# it, is the same on each run.
x_train3, x_dev3, y_train3, y_dev3 = train_test_split(
    train_data3.drop(columns=['label']),
    train_data3['label'],
    random_state=42,
)
x_train3.shape, x_dev3.shape, y_train3.shape, y_dev3.shape

In [None]:
# Cross-validate every classifier in `clfs` on the training portion only;
# the development split is not passed in here.
do_multiple_10foldcrossvalidation(clfs=clfs, data=x_train3, classifications=y_train3)

# Training: E-Risk, AMDTSS, E-MTAB, Denmark
# Testing: BSGS

In [None]:
# Training pool when BSGS is held out: E-Risk + Denmark + E-MTAB
# (2292 x 834 per the author's count), reduced to AMDTSS's columns in
# AMDTSS's order (2292 x 832), then AMDTSS appended last (2556 x 832).
train_data4 = pd.concat(
    [
        pd.concat([data_ERISK, data_DENMARK, data_EMTAB]).loc[:, data_AMDTSS.columns.tolist()],
        data_AMDTSS,
    ]
)
train_data4

In [None]:
# Hold out a development set: 75% training / 25% development
# (train_test_split's default proportions).
# random_state pins the shuffle so the split, and every metric computed from
# it, is the same on each run.
x_train4, x_dev4, y_train4, y_dev4 = train_test_split(
    train_data4.drop(columns=['label']),
    train_data4['label'],
    random_state=42,
)
x_train4.shape, x_dev4.shape, y_train4.shape, y_dev4.shape

In [None]:
# Cross-validate every classifier in `clfs` on the training portion only;
# the development split is not passed in here.
do_multiple_10foldcrossvalidation(clfs=clfs, data=x_train4, classifications=y_train4)

# Training: BSGS, AMDTSS, E-MTAB, Denmark
# Testing: E-Risk

In [None]:
# Training pool when E-Risk is held out: BSGS + Denmark + E-MTAB
# (1186 x 834 per the author's count), reduced to AMDTSS's columns in
# AMDTSS's order (1186 x 832), then AMDTSS appended last (1450 x 832).
train_data5 = pd.concat(
    [
        pd.concat([data_BSGS, data_DENMARK, data_EMTAB]).loc[:, data_AMDTSS.columns.tolist()],
        data_AMDTSS,
    ]
)
train_data5

In [None]:
# Hold out a development set: 75% training / 25% development
# (train_test_split's default proportions).
# random_state pins the shuffle so the split, and every metric computed from
# it, is the same on each run.
x_train5, x_dev5, y_train5, y_dev5 = train_test_split(
    train_data5.drop(columns=['label']),
    train_data5['label'],
    random_state=42,
)
x_train5.shape, x_dev5.shape, y_train5.shape, y_dev5.shape

In [None]:
# Cross-validate every classifier in `clfs` on the training portion only;
# the development split is not passed in here.
do_multiple_10foldcrossvalidation(clfs=clfs, data=x_train5, classifications=y_train5)