In [1]:
import numpy as np
import pandas as pd
import matplotlib
from tqdm import tqdm 
%matplotlib inline
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import math
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

# styling:
import seaborn as sns
# Plot appearance: ggplot style sheet, seaborn "deep" colour palette,
# and 300-dpi figures.
plt.style.use('ggplot')
sns.set_palette("deep")
matplotlib.rcParams.update({"figure.dpi": 300})


In [5]:
# Names shared by the cells below.

# Module-level min-max scaler instance.
min_max_scaler = preprocessing.MinMaxScaler()

# Column keys of the base-algorithm scores and their display names
# (the two lists appear to be in the same order).
algos = ['op', 'inf', 'grnboost', 'genie', 'pidc', 'leap', 'scode', 'scribe', 'sincerities', 'ppcor']
algo_names = ["OutPredict", 'Inferelator', 'GRNBoost2', 'Genie3', 'PIDC', 'LEAP', 'SCODE', 'SCRIBE', 'SINCERITIES', 'PPCOR']

# Columns of the result table: every base algorithm followed by the four ensembles.
ensemble_algo_names = algo_names + ['Ensemble A', 'Ensemble B', 'Ensemble C', 'Ensemble D']

In [13]:
# stacking run on a particular split and a particular ensemble classifier
def evaluate_run(set_serial, clf):
    """Score the base algorithms and four stacked ensembles on one TF split.

    Reads ``ensemble_train_by_tf_genes.csv`` and
    ``ensemble_test_by_tf_genes.csv`` from ``tf_split_sets/<set_serial>/``.
    For each of the ten base-algorithm score columns, and for ``clf`` stacked
    on four feature subsets, it computes the area under the precision-recall
    curve (AUPR) on the test split and divides it by the random baseline
    (the mean of ``edge_exist`` in the test split, i.e. the positive fraction
    when labels are 0/1).

    The 14 ratios are appended as a new row to the module-level ``aupr_df``,
    which must already exist with 14 columns, and are also returned.

    Parameters
    ----------
    set_serial : int or str
        Name of the sub-directory of ``tf_split_sets/`` that holds the split.
    clf : classifier
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is
        re-fitted once per feature subset.

    Returns
    -------
    list of float
        AUPR / random-baseline ratios: the ten base algorithms in column
        order, then the ensembles on (all ten), (seven-column subset),
        (seven-column subset + ``lagged_cor``), (all ten + ``lagged_cor``).
    """
    base_cols = ['op', 'inf', 'grnboost', 'genie', 'pidc', 'leap', 'scode', 'scribe', 'sincerities', 'ppcor']
    reduced_cols = ['op', 'grnboost', 'genie', 'pidc', 'leap', 'sincerities', 'ppcor']
    # Order matters: it defines the order of the four ensemble columns.
    feature_sets = [
        base_cols,
        reduced_cols,
        reduced_cols + ['lagged_cor'],
        base_cols + ['lagged_cor'],
    ]

    set_dir = 'tf_split_sets/{}/'.format(set_serial)
    train_df = pd.read_csv(set_dir + 'ensemble_train_by_tf_genes.csv', index_col=0)
    test_df = pd.read_csv(set_dir + 'ensemble_test_by_tf_genes.csv', index_col=0)
    y_test = test_df['edge_exist']

    rand_aupr = y_test.sum() / len(y_test)

    pr_ratios = []

    # Individual algorithms: rank test edges by the raw score column. A PR
    # curve depends only on the ordering of the scores, so min-max scaling is
    # not needed here, and selecting by name avoids relying on column position.
    for col in base_cols:
        precision, recall, _ = precision_recall_curve(y_test, test_df[col].values)
        pr_ratios.append(auc(recall, precision) / rand_aupr)

    # Stacked ensembles, one per feature subset.
    for feature_cols in feature_sets:
        precision, recall = _stacked_pr_curve(clf, train_df, test_df, feature_cols)
        pr_ratios.append(auc(recall, precision) / rand_aupr)

    # Side effect kept for existing callers: grow the global result table.
    aupr_df.loc[len(aupr_df.index)] = pr_ratios
    return pr_ratios


def _stacked_pr_curve(clf, train_df, test_df, feature_cols):
    """Fit ``clf`` on ``feature_cols`` and return (precision, recall) on the test split."""
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(train_df[feature_cols].values)
    # Apply the *training* min/max to the test features. Re-fitting the scaler
    # on the test split would map train and test through different transforms
    # and let test-set statistics leak into preprocessing.
    X_test = scaler.transform(test_df[feature_cols].values)

    clf.fit(X_train, train_df['edge_exist'])
    test_scores = clf.predict_proba(X_test)[:, 1]
    precision, recall, _ = precision_recall_curve(test_df['edge_exist'], test_scores)
    return precision, recall

In [14]:
# Stacking with a Gaussian naive Bayes classifier: evaluate splits 42..61
# and save one row of AUPR ratios per split.
aupr_df = pd.DataFrame(columns=ensemble_algo_names)
clf = GaussianNB()
for i in tqdm(range(42, 62)):
    evaluate_run(set_serial=i, clf=clf)
aupr_df.to_csv('bsubtilis_auprc_nb.csv', index=False)

100%|██████████| 20/20 [00:56<00:00,  2.83s/it]


In [15]:
# Stacking with a random forest classifier (fixed seed, all CPU cores):
# evaluate splits 42..61 and save one row of AUPR ratios per split.
aupr_df = pd.DataFrame(columns=ensemble_algo_names)
clf = RandomForestClassifier(random_state=42, n_jobs=-1)
for i in tqdm(range(42, 62)):
    evaluate_run(set_serial=i, clf=clf)
aupr_df.to_csv('bsubtilis_auprc_rf.csv', index=False)

100%|██████████| 20/20 [34:49<00:00, 104.47s/it]
