In [16]:
import functools
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from tqdm import tqdm
from xgboost.sklearn import XGBClassifier

%matplotlib inline


# Global plot appearance: ggplot base style with seaborn's "deep" colour cycle.
import seaborn as sns

plt.style.use('ggplot')
sns.set_palette("deep")

# Render figures at 300 dpi.
matplotlib.rcParams.update({"figure.dpi": 300})


In [2]:
# Discover the dataset directories: every directory exactly two levels below
# the working directory. os.walk('.') reports such a directory as
# '.<sep><a><sep><b>', i.e. a path containing two separators.
#
# os.sep is used instead of a hard-coded '/' because os.walk joins path
# components with the platform separator. The loop variable that used to
# rebind the builtin `dir` is gone.
all_dir = [dirpath for dirpath, _subdirs, _files in os.walk('.')]
set_dirs = sorted(dirpath for dirpath in all_dir if dirpath.count(os.sep) == 2)

In [3]:
# Ensemble data set up
# Names of the per-method score columns that are selected from the pooled
# data as model features. 'sincerities' is left out of this list (an earlier
# version had it between 'scribe' and 'ss_cor').
algo_names = [
    'op', 'inf', 'grnboost', 'genie3', 'grisli', 'grnvbem',
    'leap', 'pidc', 'ppcor', 'scode', 'scribe',
    'ss_cor', 'ss_ranked',
]
# Min-max feature scaler. NOTE(review): in the visible cells its only use
# (scaling all_X) is commented out - confirm whether it is still needed.
min_max_scaler = preprocessing.MinMaxScaler()

In [4]:
# Read the first dataset's ensemble table to learn its column layout, then
# start `all_data` as an empty frame carrying those columns.
ensemble_df = pd.read_csv(f"{set_dirs[0]}/ensemble.csv")
all_data = pd.DataFrame(columns=ensemble_df.columns)

def _shuffled_gene_names(n_genes, seed=4):
    """Return the names 'G1'..'G<n_genes>' in shuffled order.

    NumPy's global RNG is re-seeded with `seed` before the in-place shuffle,
    so the order is reproducible and the global RNG state is reset each call.
    """
    names = np.array([f'G{i}' for i in range(1, n_genes + 1)])
    np.random.seed(seed)
    np.random.shuffle(names)
    return names


# TF lists for the 100-gene naming scheme (G1..G100).
# NOTE(review): the test slice [20:40] lies inside the train slice [:66], so
# every "test" TF is also a training TF - confirm this overlap is intended.
tf_set = _shuffled_gene_names(100)
train_tf_list = tf_set[:66]
test_tf_list = tf_set[20:40]

# TF lists for the 50-gene naming scheme (G1..G50); the same overlap applies
# ([10:20] lies inside [:33]). `tf_set` ends up bound to this shuffled array.
tf_set = _shuffled_gene_names(50)
train_tf_list_s = tf_set[:33]
test_tf_list_s = tf_set[10:20]



In [5]:

# Split every dataset's ensemble table by transcription factor (TF).
#
# The TF of an edge is the part of `edge_name` before the first underscore.
# For each dataset directory:
#   * rows whose TF is in the train list are collected in `train_df_list`;
#   * rows whose TF is in the test list are pooled into `all_data`;
#   * the table with only the train rows removed is kept in `ensemble_df_list`
#     (test rows stay in it).
# Tables with fewer than SMALL_NETWORK_MAX_ROWS rows use the *_s TF lists,
# larger tables use the full-size lists.
SMALL_NETWORK_MAX_ROWS = 5000

ensemble_df_list = []
train_df_list = []
test_df_list = []

for set_dir in set_dirs:
    ensemble_df = pd.read_csv(set_dir + '/ensemble.csv').replace(np.nan, 0)

    # The size check depends on the table only, so do it once per table
    # rather than once per row.
    if len(ensemble_df) < SMALL_NETWORK_MAX_ROWS:
        train_tfs, test_tfs = train_tf_list_s, test_tf_list_s
    else:
        train_tfs, test_tfs = train_tf_list, test_tf_list

    # Vectorised membership masks replace the row-by-row iterrows() scan.
    tf_names = ensemble_df['edge_name'].str.split('_').str[0]
    train_mask = tf_names.isin(list(train_tfs))
    test_mask = tf_names.isin(list(test_tfs))

    test_df = ensemble_df[test_mask]
    train_df = ensemble_df[train_mask]

    test_df_list.append(test_df)
    train_df_list.append(train_df)
    ensemble_df_list.append(ensemble_df[~train_mask])

# Pool the test rows with a single concat. Building `all_data` from scratch
# here (instead of appending to a frame created in another cell) keeps this
# cell idempotent on re-run and avoids concatenating onto an empty
# object-dtype frame. Original row indices are preserved.
all_data = pd.concat(test_df_list).astype({'edge_exist': 'int8'})

# Features are used unscaled (min-max scaling of all_X is disabled).
all_X = all_data[algo_names]
all_y = all_data['edge_exist']


In [6]:
# Dataset labels: '<family>#<k>' for k = 1..5 within each of three families,
# listed family by family.
set_names = [
    f'{family}#{k}'
    for family in ('dream3_100', 'dream3_50', 'dream4_100')
    for k in range(1, 6)
]

In [7]:
# Display names of the base inference methods. They are listed in the same
# order as the first eleven entries of `algo_names` (presumably one display
# name per score column - confirm). 'SINCERITIES' is not included (an earlier
# version had it as the last entry).
algo_print_names = [
    'OutPredict', 'Inferelator', 'GRNBoost', 'Genie3',
    'GRISLI', 'GRNVBEM', 'LEAP', 'PIDC',
    'PPCOR', 'SCODE', 'SCRIBE',
]


In [8]:
# Candidate ensemble classifiers, one per line. The order appears to line up
# with the 'Ensemble_LR' ... 'Ensemble_XGB' labels used in `df_columns`.
# Estimators that accept them get random_state=42 and n_jobs=-1.
# NOTE(review): SGDClassifier's loss='log' was renamed to 'log_loss' in
# scikit-learn 1.1 and the old spelling was removed in 1.3 - update this
# argument when the environment is upgraded.
ensemble_models = [
    LogisticRegression(n_jobs=-1, random_state=42),
    GaussianNB(),
    SGDClassifier(loss='log', n_jobs=-1, random_state=42),
    SVC(probability=True, random_state=42),
    KNeighborsClassifier(),
    RandomForestClassifier(n_jobs=-1, random_state=42),
    AdaBoostClassifier(random_state=42),
    XGBClassifier(n_jobs=-1, random_state=42),
]

In [9]:
# Column layout of the results table: base-method display names, then one
# column per ensemble variant, then four best-algorithm summary columns.
df_columns = [
    *algo_print_names,
    'Ensemble_LR', 'Ensemble_NB', 'Ensemble_SGD', 'Ensemble_SVM',
    'Ensemble_KNN', 'Ensemble_RF', 'Ensemble_AB', 'Ensemble_XGB',
    'Ensemble_AVG',
    'best_train_algo', 'best_test_algo', 'best_train_score', 'best_test_score',
]

In [140]:
# Rank-average ensemble baseline.
#
# For each scored dataset, every base method's edge scores are converted to
# ranks (scipy.stats.rankdata, ties share their average rank), the ranks are
# averaged across methods, and the area under the precision-recall curve of
# that mean rank against `edge_exist` is appended to `avg_pr_scores`.

# NOTE: this rebinds `algo_names` to the eleven base methods only (without
# 'ss_cor' / 'ss_ranked'); cells that select features with `algo_names` give
# different results depending on whether they run before or after this one.
algo_names = ['op', 'inf', 'grnboost', 'genie3', 'grisli', 'grnvbem', 'leap',  'pidc', 'ppcor', 'scode', 'scribe']

# Build a new list rather than aliasing `algo_print_names`: appending to an
# alias also grew `algo_print_names` by one 'Ensemble' on every run.
algo_list = algo_print_names + ['Ensemble']

avg_pr_scores = []
# Only the first 10 entries of `ensemble_df_list` are scored.
# NOTE(review): `set_names` lists 15 datasets - confirm the cut-off is intended.
for i in range(10):
    test_ensemble = ensemble_df_list[i]
    y_test = test_ensemble['edge_exist']
    # Mean rank over the methods; np.mean divides by the actual number of
    # methods instead of a hard-coded 11.
    avg_score = np.mean([rankdata(test_ensemble[algo]) for algo in algo_names], axis=0)
    # Scale the scores into (0, 1] before computing the curve.
    precision, recall, _ = precision_recall_curve(y_test, avg_score / np.max(avg_score))
    avg_pr_scores.append(auc(recall, precision))

In [145]:
# Store the rank-average AUPR values as the 'Ensemble_AVG' column of both
# results tables, rewriting each CSV in place (without the index column).
for results_csv in ('aupr_all.csv', 'aupr_gaussian_only.csv'):
    df = pd.read_csv(results_csv)
    df['Ensemble_AVG'] = avg_pr_scores
    df.to_csv(results_csv, index=False)