In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
import warnings
# NumPy 1.25 introduced numpy.exceptions and NumPy 2.0 removed the top-level
# np.VisibleDeprecationWarning alias, so resolve the class from whichever
# location this NumPy version provides instead of failing with AttributeError.
try:
    from numpy.exceptions import VisibleDeprecationWarning
except ImportError:  # NumPy < 1.25 has no numpy.exceptions module
    VisibleDeprecationWarning = np.VisibleDeprecationWarning
warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [32]:
def variable_selection(train_data, target_col='zygosity', random_state=None, n_jobs=None):
    """Select features by random-forest importance.

    The label column is separated from the predictors, the rows are split
    into a training and a development part, a random forest is fitted on the
    training part, and every predictor whose importance reaches three times
    the mean importance is kept.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Table holding the predictor columns plus the label column.
    target_col : str, default 'zygosity'
        Name of the label column in ``train_data``.
    random_state : int, numpy RandomState or None, default None
        Seed forwarded to both the train/dev split and the random forest.
        With ``None`` (the default) every call draws a fresh split and forest,
        so the selected set can differ between runs; pass an integer to make
        the selection reproducible.
    n_jobs : int or None, default None
        Forwarded to ``RandomForestClassifier`` to fit the trees in parallel.

    Returns
    -------
    pandas.Index
        Names of the selected predictor columns.
    """
    features = train_data.drop(columns=[target_col])
    labels = train_data[target_col]

    # Default split of train_test_split: 75% training, 25% development.
    # NOTE(review): the development part is not used anywhere in this
    # function, so a quarter of the rows never contributes to the selection
    # -- confirm that this is intended.
    x_train, _x_dev, y_train, _y_dev = train_test_split(
        features, labels, random_state=random_state
    )

    # The hyperparameters below all lie inside the grid of a
    # RandomizedSearchCV experiment (scoring="roc_auc") that used to sit here
    # as commented-out code:
    #   n_estimators          in [50, 100, 200, 300, 500]
    #   criterion             in ["gini", "entropy"]
    #   min_impurity_decrease in [0.1, 0.000001, 0.00001]
    #   max_depth             in [20, 50, 100, 500, 1000]
    # Presumably they are the best parameters reported by that search.
    forest = RandomForestClassifier(
        n_estimators=500,
        min_impurity_decrease=1e-06,
        max_depth=100,
        criterion='entropy',
        random_state=random_state,
        n_jobs=n_jobs,
    )

    # Variable selection by random forest: keep the features whose importance
    # is at least 3x the mean importance.
    rf_selection = SelectFromModel(forest, threshold="3*mean")
    rf_selection.fit(x_train, y_train)

    # get_support() is a boolean mask aligned with x_train's columns.
    selected_feat_rf = x_train.columns[rf_selection.get_support()]

    return selected_feat_rf

In [26]:
# Read Datasets
# All cohort tables live in the same project directory; keeping it in one
# constant means the location only has to be changed in a single place.
DATA_DIR = '/data/gpfs/projects/punim1257/Group31/hzx'

data_ERISK = pd.read_csv(DATA_DIR + '/ERISK_ALL.csv')
data_BSGS = pd.read_csv(DATA_DIR + '/BSGS_ALL.csv')
# Fill missing values with the column mean. numeric_only=True restricts the
# means to numeric columns, so a non-numeric column no longer makes
# DataFrame.mean() raise on pandas >= 2.0; such columns are left unfilled.
data_BSGS = data_BSGS.fillna(data_BSGS.mean(numeric_only=True))
data_DENMARK = pd.read_csv(DATA_DIR + '/DENMARK_ALL.csv')
data_AMDTSS = pd.read_csv(DATA_DIR + '/AMDTSS_ALL.csv')
data_EMTAB = pd.read_csv(DATA_DIR + '/EMTAB_ALL.csv')
# Same mean imputation as for BSGS.
# NOTE(review): only BSGS and E-MTAB are imputed -- confirm that the other
# three cohorts contain no missing values.
data_EMTAB = data_EMTAB.fillna(data_EMTAB.mean(numeric_only=True))

## Training: E-Risk, BSGS, Denmark, AMDTSS
## Testing: E-MTAB

In [34]:
# Fold with E-MTAB left out: stack the other four cohorts row-wise and run
# the random-forest feature selection on the pooled table.
train_data_EMTAB = pd.concat(
    objs=[data_ERISK, data_BSGS, data_DENMARK, data_AMDTSS]
)
selected_feat_rf_EMTAB = variable_selection(train_data_EMTAB)
# Write the selected column names to a text file, one name per line.
np.savetxt(
    fname='selected_feat_rf_EMTAB.txt',
    X=selected_feat_rf_EMTAB,
    fmt='%s',
)

## Training: E-Risk, BSGS, Denmark, E-MTAB
## Testing: AMDTSS

In [None]:
# Fold with AMDTSS left out: stack the other four cohorts row-wise and run
# the random-forest feature selection on the pooled table.
train_data_AMDTSS = pd.concat(
    objs=[data_ERISK, data_BSGS, data_DENMARK, data_EMTAB]
)
selected_feat_rf_AMDTSS = variable_selection(train_data_AMDTSS)
# Write the selected column names to a text file, one name per line.
np.savetxt(
    fname='selected_feat_rf_AMDTSS.txt',
    X=selected_feat_rf_AMDTSS,
    fmt='%s',
)

## Training: E-Risk, BSGS, AMDTSS, E-MTAB
## Testing: Denmark

In [None]:
# Fold with Denmark left out: stack the other four cohorts row-wise and run
# the random-forest feature selection on the pooled table.
train_data_DENMARK = pd.concat(
    objs=[data_ERISK, data_BSGS, data_AMDTSS, data_EMTAB]
)
selected_feat_rf_DENMARK = variable_selection(train_data_DENMARK)
# Write the selected column names to a text file, one name per line.
np.savetxt(
    fname='selected_feat_rf_DENMARK.txt',
    X=selected_feat_rf_DENMARK,
    fmt='%s',
)

## Training: E-Risk, AMDTSS, E-MTAB, Denmark
## Testing: BSGS

In [None]:
# Fold with BSGS left out: stack the other four cohorts row-wise and run
# the random-forest feature selection on the pooled table.
train_data_BSGS = pd.concat(
    objs=[data_ERISK, data_DENMARK, data_AMDTSS, data_EMTAB]
)
selected_feat_rf_BSGS = variable_selection(train_data_BSGS)
# Write the selected column names to a text file, one name per line.
np.savetxt(
    fname='selected_feat_rf_BSGS.txt',
    X=selected_feat_rf_BSGS,
    fmt='%s',
)

## Training: BSGS, AMDTSS, E-MTAB, Denmark
## Testing: E-Risk

In [None]:
# Fold with E-Risk left out: stack the other four cohorts row-wise and run
# the random-forest feature selection on the pooled table.
train_data_ERISK = pd.concat(
    objs=[data_BSGS, data_DENMARK, data_AMDTSS, data_EMTAB]
)
selected_feat_rf_ERISK = variable_selection(train_data_ERISK)
# Write the selected column names to a text file, one name per line.
np.savetxt(
    fname='selected_feat_rf_ERISK.txt',
    X=selected_feat_rf_ERISK,
    fmt='%s',
)