In [3]:
import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import random
from collections import defaultdict
import pickle
import seaborn as sns
from CGRtools import RDFRead
from CGRtools.files import RDFWrite
from scipy.stats import rankdata

In [4]:
# Load the dataset selection produced earlier in the pipeline; `code()` below
# reads this mapping by key.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('results/selected_datasets.pickle', mode='rb') as datasets_file:
    selected_datasets = pickle.load(datasets_file)

In [5]:
def AUC_10(dataset, method):
    """Run an active-learning simulation on one dataset and track the top-10% ROC AUC.

    The function starts from 10 randomly chosen training rows, then repeatedly
    fits a random forest classifier, moves up to 5 rows from the remaining pool
    into the training set according to ``method``, and scores the fitted model
    on the external test set. The external test labels are redefined for the
    scoring: rows whose ``pKi`` is among the top 10% of the external test set
    are positives, all others are negatives.

    Parameters
    ----------
    dataset : str
        Base name of the pickle file ``x_and_y/{dataset}.pickle``. The file
        starts with a pickled object count followed by that many pickled
        objects; objects 0, 1, 3, 4 and 5 are used (object 2 is ignored).
    method : str
        Selection strategy for the rows moved into the training set:
        ``'exploration'`` picks the smallest |P(0) - P(1)|,
        ``'exploitation'`` picks the largest P(1),
        ``'bernulli'`` picks the largest P(1) * (1 - P(1)),
        any other value picks rows at random.

    Returns
    -------
    list
        One ``[iteration, AUC]`` pair per fitted model, in iteration order.
    """
    initial_size = 10  # rows in the starting training set
    batch_size = 5     # rows moved from the pool per iteration

    def pick(labels, scores, descending):
        # Stable sort, so ties keep their pool order.
        ranked = sorted(zip(labels, scores), key=lambda pair: pair[1], reverse=descending)
        return [label for label, _ in ranked[:batch_size]]

    data = []
    # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
    with open(f'x_and_y/{dataset}.pickle', "rb") as f:
        for _ in range(pickle.load(f)):
            data.append(pickle.load(f))
    x_train_ex, x_test_ex, y_test_ex_regr, y_train_ex, y_test_ex = data[0], data[1], data[3], data[4], data[5]
    # No random_state here: every call draws a different starting set.
    x_train_in, x_test_in, y_train_in, y_test_in = train_test_split(
        x_train_ex, y_train_ex, test_size=len(y_train_ex) - initial_size)

    # The evaluation target does not depend on the loop, so build it once.
    # A vectorised membership test replaces chained assignment
    # (`df.loc[i]['activity'] = ...`), which may write to a temporary copy and
    # leave the labels unchanged.
    number_of_points = round(y_test_ex.shape[0] * 0.1)
    top_index = y_test_ex_regr.sort_values(by='pKi', ascending=False).index[:number_of_points]
    y_true_top = np.where(y_test_ex.index.isin(top_index), 1, 0)

    results = []
    iteration = 0
    while y_train_in.shape[0] < y_train_ex.shape[0]:

        rfc = RandomForestClassifier(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
        rfc.fit(x_train_in.values, y_train_in['activity'].values)
        classes = set(y_train_in['activity'].values)

        # Score the remaining pool. predict_proba returns a single column when
        # the training set contains only one class.
        probs = rfc.predict_proba(x_test_in.values)
        if probs.shape[1] > 1:
            prob_of_label_1 = probs[:, 1]
            probs_difference = np.abs(probs[:, 0] - probs[:, 1])
            prob_bernulli = probs[:, 1] * (1 - probs[:, 1])
        else:
            pool_size = probs.shape[0]
            prob_of_label_1 = np.full(pool_size, 1 if 1 in classes else 0)
            probs_difference = np.ones(pool_size)
            prob_bernulli = np.zeros(pool_size)

        pool = list(x_test_in.index)
        if method == 'exploration':
            adding_points = pick(pool, probs_difference, descending=False)
        elif method == 'exploitation':
            adding_points = pick(pool, prob_of_label_1, descending=True)
        elif method == 'bernulli':
            adding_points = pick(pool, prob_bernulli, descending=True)
        elif len(pool) >= batch_size:
            adding_points = random.sample(pool, batch_size)
        else:
            # Fewer rows left than a full batch: take everything.
            adding_points = pool

        # Move the selected rows from the pool to the training set in one step.
        # DataFrame.append was removed in pandas 2.0; pd.concat also keeps the
        # column dtypes intact.
        x_train_in = pd.concat([x_train_in, x_test_in.loc[adding_points]])
        y_train_in = pd.concat([y_train_in, y_test_in.loc[adding_points]])
        x_test_in = x_test_in.drop(adding_points)
        y_test_in = y_test_in.drop(adding_points)

        # Evaluate the model fitted at the start of this iteration (i.e. before
        # the rows above were added).
        proba_ex = rfc.predict_proba(x_test_ex.values)
        if len(classes) == 1:
            # Pad the missing class column with zeros so column 1 is always P(1).
            if 1 in classes:
                proba_ex = np.insert(proba_ex, 0, 0, axis=1)
            else:
                proba_ex = np.insert(proba_ex, 1, 0, axis=1)

        fpr, tpr, _ = roc_curve(y_true_top, proba_ex[:, 1])

        AUC = auc(fpr, tpr)
        results.append([iteration, AUC])
        print(results[-1])

        iteration += 1

    return results

In [6]:
def code(best_or_worst_datasets):
    """Run every selection method 10 times on each selected dataset.

    Datasets are taken from ``selected_datasets[f'{best_or_worst_datasets}_ba']``.
    For each dataset and each of 10 repetitions, ``AUC_10`` is called once per
    method. The accumulated results are re-written to a pickle file after each
    dataset finishes, so a partial run still leaves results on disk.

    Parameters
    ----------
    best_or_worst_datasets : str
        Prefix used to build both the ``selected_datasets`` key and the output
        file name.

    Returns
    -------
    collections.defaultdict
        ``result[dataset][repetition][method]`` holds the value returned by
        ``AUC_10(dataset, method)``.
    """
    methods = ['exploration', 'exploitation', 'random', 'bernulli']
    output_path = f'new_results/results_AUC_10_for_{best_or_worst_datasets}_ba.pickle'

    result = defaultdict(dict)
    for dataset in tqdm(selected_datasets[f'{best_or_worst_datasets}_ba']):
        for repetition in tqdm(range(10)):
            result[dataset][repetition] = {
                method: AUC_10(dataset, method) for method in tqdm(methods)
            }
        # Checkpoint everything gathered so far once the dataset is done.
        with open(output_path, 'wb') as out_file:
            pickle.dump(result, out_file)
    return result

In [33]:
def AUC_10_on_all_dataset(dataset):
    """Fit on the full training set and return the top-10% ROC AUC on the test set.

    A random forest classifier is trained on all training rows. For scoring,
    the external test labels are redefined: rows whose ``pKi`` is among the top
    10% of the external test set are positives, all others are negatives.

    Parameters
    ----------
    dataset : str
        Base name of the pickle file ``x_and_y/{dataset}.pickle``. The file
        starts with a pickled object count followed by that many pickled
        objects; objects 0, 1, 3, 4 and 5 are used (object 2 is ignored).

    Returns
    -------
    float
        Area under the ROC curve of the predicted probability of class 1
        against the top-10% pKi labels.
    """
    data = []
    # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
    with open(f'x_and_y/{dataset}.pickle', "rb") as f:
        for _ in range(pickle.load(f)):
            data.append(pickle.load(f))
    x_train_ex, x_test_ex, y_test_ex_regr, y_train_ex, y_test_ex = data[0], data[1], data[3], data[4], data[5]
    number_of_points = round(y_test_ex.shape[0] * 0.1)

    rfc = RandomForestClassifier(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
    rfc.fit(x_train_ex.values, y_train_ex['activity'].values)

    proba_ex = rfc.predict_proba(x_test_ex.values)

    # Build the evaluation labels with a vectorised membership test. The former
    # chained assignment (`df.loc[i]['activity'] = ...`) may write to a
    # temporary copy and leave the labels unchanged.
    top_index = y_test_ex_regr.sort_values(by='pKi', ascending=False).index[:number_of_points]
    y_true_top = np.where(y_test_ex.index.isin(top_index), 1, 0)

    fpr, tpr, _ = roc_curve(y_true_top, proba_ex[:, 1])

    return auc(fpr, tpr)