In [None]:
import numpy as np
import pandas as pd
import matplotlib
from tqdm import tqdm 
from sklearn import metrics 
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost.sklearn import XGBClassifier

# Styling: ggplot base style with seaborn's "deep" colour palette on top.
import seaborn as sns

plt.style.use('ggplot')
# Applied after the style so the palette is the colour cycle that ends up active.
sns.set_palette("deep")

# Render figures at 300 dpi.
matplotlib.rcParams.update({"figure.dpi": 300})


In [None]:
# Ensemble data set-up.
# Short keys for the individual score sources; later cells use a subset of
# these keys to select columns of ensemble_df.
algo_names = [
    'op', 'inf', 'grnboost', 'genie3', 'grisli', 'grnvbem', 'leap',
    'pidc', 'ppcor', 'scode', 'scribe', 'sincerities', 'ss_cor', 'ss_ranked',
]

# Scaler instance only; nothing is fitted or transformed in this cell.
min_max_scaler = preprocessing.MinMaxScaler()

In [None]:
# Load the TF name table; the first column of the file becomes the index.
# NOTE(review): the file carries a .tsv extension but is parsed with the
# default comma separator - confirm that sep='\t' is not required.
tf_df = pd.read_csv('tf_names.tsv', index_col=0)

# Cell output: number of entries in the index.
len(tf_df.index)

In [None]:
# Load the ensemble table and create an empty frame that shares its columns.
ensemble_df = pd.read_csv('ensemble.csv')
all_data = pd.DataFrame(columns=ensemble_df.columns)

# Seeded, in-place shuffle of a copy of the tf_df index values, followed by
# a fixed split: the first 66 shuffled entries are the training set and
# whatever remains is the test set.
np.random.seed(4)
tf_set = np.array(tf_df.index)
np.random.shuffle(tf_set)
train_tf_list, test_tf_list = np.split(tf_set, [66])



In [None]:
# Directory list; at the moment it only holds the current working directory.
# How it is consumed is not shown in this part of the notebook.
set_dirs = [
    './',
]

In [None]:
# Human-readable labels for the individual algorithms; these are also the
# leading entries of the result-table column list built in a later cell.
# NOTE(review): the order appears to mirror the short keys in algo_names
# ('op' -> 'OutPredict', 'inf' -> 'Inferelator', ...) - confirm.
algo_print_names = [
    'OutPredict',
    'Inferelator',
    'GRNBoost',
    'Genie3',
    'GRISLI',
    'GRNVBEM',
    'LEAP',
    'PIDC',
    'PPCOR',
    'SCODE',
    'SCRIBE',
    'SINCERITIES',
]


In [None]:
# Candidate ensemble classifiers. The order matters: a later cell zips this
# list with grid_list and ensemble_names, pairing entries by position.
# NOTE(review): SGDClassifier's loss='log' spelling depends on the installed
# scikit-learn version (newer releases use 'log_loss') - confirm before upgrading.
ensemble_models = [
    LogisticRegression(random_state=42, n_jobs=-1),
    GaussianNB(),
    SGDClassifier(loss='log', random_state=42, n_jobs=-1),
    SVC(random_state=42, probability=True),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=42, n_jobs=-1),
    AdaBoostClassifier(random_state=42),
    XGBClassifier(random_state=42, n_jobs=-1),
]

In [None]:
# Hyper-parameter search spaces, one dict per candidate ensemble model.
# Keys are estimator parameter names and values are the candidates to try;
# single-element lists pin a parameter to one value instead of searching it.

# Logistic regression
LR_grid = dict(
    random_state=[42],
    n_jobs=[-1],
    solver=['newton-cg', 'lbfgs', 'liblinear'],
    penalty=['l2'],
    C=[100, 10, 1.0, 0.1, 0.01],
)

# Gaussian naive Bayes: 100 log-spaced smoothing values from 1 down to 1e-9
NB_grid = dict(var_smoothing=np.logspace(0, -9, num=100))

# SGD classifier
SGD_grid = dict(
    random_state=[42],
    n_jobs=[-1],
    alpha=[1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
    loss=['log'],
    penalty=['l2'],
)

# Support vector classifier
SVC_grid = dict(
    random_state=[42],
    kernel=['poly', 'rbf', 'sigmoid'],
    C=[50, 10, 1.0, 0.1, 0.01],
    gamma=['scale'],
)

# k-nearest neighbours: odd neighbour counts 1, 3, ..., 19
KNN_grid = dict(
    n_neighbors=range(1, 21, 2),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'minkowski'],
)

# Random forest: depths 2, 4, 6, 8
RF_grid = dict(
    random_state=[42],
    n_jobs=[-1],
    n_estimators=[10, 50, 100, 500],
    max_features=['sqrt', 'log2'],
    max_depth=range(2, 10, 2),
    bootstrap=[True, False],
)

# AdaBoost
AB_grid = dict(
    random_state=[42],
    n_estimators=[10, 50, 100, 500],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1.0],
)

# XGBoost: depths 2, 4, 6, 8
XGB_grid = dict(
    random_state=[42],
    n_jobs=[-1],
    max_depth=range(2, 10, 2),
    n_estimators=[10, 50, 100, 500],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1.0],
)

# Grids in the same positional order as the model list they are zipped with.
grid_list = [
    LR_grid,
    NB_grid,
    SGD_grid,
    SVC_grid,
    KNN_grid,
    RF_grid,
    AB_grid,
    XGB_grid,
]

# Short labels for the ensembles. There is one more label ('AVG') than there
# are grids in grid_list.
ensemble_names = ['LR', 'NB', 'SGD', 'SVM', 'KNN', 'RF', 'AB', 'XGB', 'AVG']


In [None]:
# Column layout of the results table: the individual algorithm labels first,
# then one column per ensemble, then the best-algorithm summary columns.
# The list is a fresh object; it holds a copy of the labels that are in
# algo_print_names at the moment this cell runs.
df_columns = [
    *algo_print_names,
    'Ensemble_LR',
    'Ensemble_NB',
    'Ensemble_SGD',
    'Ensemble_SVM',
    'Ensemble_KNN',
    'Ensemble_RF',
    'Ensemble_AB',
    'Ensemble_XGB',
    'Ensemble_AVG',
    'best_train_algo',
    'best_test_algo',
    'best_train_score',
    'best_test_score',
]

In [None]:
from sklearn.model_selection import GridSearchCV

# Feature columns used for training. This rebinds algo_names to a shorter
# list than the one defined during set-up.
algo_names = ['op', 'inf', 'grnboost', 'genie3', 'grisli', 'grnvbem', 'leap',  'pidc', 'ppcor', 'scode', 'scribe']

# Build a NEW list rather than aliasing algo_print_names. Appending through an
# alias would add 'Ensemble' to algo_print_names itself, and would add it
# again every time this cell is re-run.
algo_list = algo_print_names + ['Ensemble']

# Design matrix (plain ndarray) and the 'edge_exist' target column.
X_train = ensemble_df[algo_names].values
y_train = ensemble_df['edge_exist']

# Exhaustive 5-fold grid search for the XGBoost ensemble only, scored by ROC AUC.
model = XGBClassifier(random_state=42, n_jobs=-1)
grid = XGB_grid
name = 'XGB'

grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=True
)
grid_search.fit(X_train, y_train)
print('best parameters for {} ensemble is: '.format(name))
print(grid_search.best_params_)
    

In [None]:
from sklearn.model_selection import GridSearchCV

# Feature columns used for training. This rebinds algo_names to a shorter
# list than the one defined during set-up.
algo_names = ['op', 'inf', 'grnboost', 'genie3', 'grisli', 'grnvbem', 'leap',  'pidc', 'ppcor', 'scode', 'scribe']

# Build a NEW list rather than aliasing algo_print_names. Appending through an
# alias would add 'Ensemble' to algo_print_names itself, and would add it
# again every time this cell is re-run.
algo_list = algo_print_names + ['Ensemble']

# Design matrix (plain ndarray) and the 'edge_exist' target column.
X_train = ensemble_df[algo_names].values
y_train = ensemble_df['edge_exist']

# Exhaustive 5-fold grid search per model, scored by ROC AUC.
# zip() stops at the shortest input, so the trailing 'AVG' entry of
# ensemble_names (which has no model or grid) is never visited.
for model, grid, name in zip(ensemble_models, grid_list, ensemble_names):
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=grid,
        scoring='roc_auc',
        n_jobs=-1,
        cv=5,
        verbose=False
    )
    grid_search.fit(X_train, y_train)
    print('best parameters for {} ensemble is: '.format(name))
    print(grid_search.best_params_)
    

best parameters for LR ensemble is: 
{'C': 100, 'n_jobs': -1, 'penalty': 'l2', 'random_state': 42, 'solver': 'newton-cg'}
best parameters for NB ensemble is: 
{'var_smoothing': 2.848035868435805e-09}
best parameters for SGD ensemble is: 
{'alpha': 0.1, 'loss': 'log', 'n_jobs': -1, 'penalty': 'l2', 'random_state': 42}
best parameters for SVM ensemble is: 
{'C': 50, 'gamma': 'scale', 'kernel': 'poly', 'random_state': 42}
best parameters for KNN ensemble is: 
{'metric': 'manhattan', 'n_neighbors': 1, 'weights': 'uniform'}
best parameters for RF ensemble is: 
{'bootstrap': False, 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 500, 'n_jobs': -1, 'random_state': 42}
best parameters for AB ensemble is: 
{'learning_rate': 0.1, 'n_estimators': 100, 'random_state': 42}