In [1]:
import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import random
from collections import defaultdict
import pickle
import seaborn as sns
from sklearn.utils import resample

In [7]:
# Read the pickled `selected_datasets` object; later cells index it with keys
# of the form '<group>_ba' and '<group>_r2'.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open('results/selected_datasets.pickle', 'rb') as selection_file:
    selected_datasets = pickle.load(selection_file)

In [8]:
def _top_labels(labels, scores, descending, k=5):
    """Return up to ``k`` labels ordered by ``scores``; ties keep pool order."""
    keys = np.asarray(scores, dtype=float)
    if descending:
        keys = -keys
    # A stable sort reproduces the tie-breaking of Python's ``sorted``.
    order = np.argsort(keys, kind='stable')[:k]
    return [labels[pos] for pos in order]


def search_of_max_pki_class(dataset, method):
    """Count how many acquisition batches a classifier needs to reach the top-pKi row.

    The pickle ``x_and_y/{dataset}.pickle`` stores an item count followed by
    that many objects. Item 0 is used as the feature table, item 2 as the
    table holding the ``'pKi'`` column and item 4 as the table holding the
    ``'activity'`` column; all three are expected to share one (unique) index.

    Ten rows start as the training set and the rest form the candidate pool.
    The row with the highest pKi is forced into the pool. In each iteration a
    random forest classifier is fitted on the training set and up to five pool
    rows are moved into it, chosen according to ``method``:

    * ``'exploration'``  - smallest ``|p0 - p1|``;
    * ``'exploitation'`` - largest probability in column 1 of ``predict_proba``
      (assumed to be the active class - TODO confirm labels are 0/1);
    * ``'bernulli'``     - largest ``p1 * (1 - p1)``;
    * ``'mixed 1:4'``    - top four exploitation picks plus top exploration pick;
    * ``'mixed 2:3'``    - top three exploitation picks plus top two exploration picks;
    * anything else (e.g. ``'random'``) - five random pool rows.

    Parameters
    ----------
    dataset : str
        Name used to build the pickle path.
    method : str
        Acquisition strategy, see above.

    Returns
    -------
    tuple
        ``(iteration, max_pki)`` where ``iteration`` is the 1-based number of
        the batch that contained the top-pKi row.
        NOTE(review): ``search_of_max_pki_regr`` reports a 0-based count.

    Raises
    ------
    ValueError
        If the pool is exhausted without meeting the top-pKi row (only
        possible when the loaded tables do not share an index).
    """
    data = []
    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(f'x_and_y/{dataset}.pickle', "rb") as f:
        for _ in range(pickle.load(f)):
            data.append(pickle.load(f))
    x_train_ex, y_train_ex_regr, y_train_ex = data[0], data[2], data[4]
    x_train_in, x_test_in, y_train_in, y_test_in = train_test_split(
        x_train_ex, y_train_ex, test_size=len(y_train_ex) - 10)

    pki = y_train_ex_regr['pKi']
    max_pki_index = pki.idxmax()
    max_pki = pki.max()

    if max_pki_index in y_train_in.index:
        # The target row must start in the pool: swap it with a random pool row.
        random_point = random.choice(list(x_test_in.index))
        x_best, y_best = x_train_in.loc[[max_pki_index]], y_train_in.loc[[max_pki_index]]
        x_swap, y_swap = x_test_in.loc[[random_point]], y_test_in.loc[[random_point]]
        x_train_in = pd.concat([x_train_in.drop(index=max_pki_index), x_swap])
        y_train_in = pd.concat([y_train_in.drop(index=max_pki_index), y_swap])
        x_test_in = pd.concat([x_test_in.drop(index=random_point), x_best])
        y_test_in = pd.concat([y_test_in.drop(index=random_point), y_best])

    model_methods = ('exploration', 'exploitation', 'bernulli', 'mixed 1:4', 'mixed 2:3')
    iteration = 0
    while True:
        if len(x_test_in) == 0:
            raise ValueError(
                f'candidate pool exhausted before reaching index {max_pki_index!r}')

        if method in model_methods:
            rfc = RandomForestClassifier(random_state=42, n_estimators=500,
                                         max_features='log2', n_jobs=20)
            rfc.fit(x_train_in.values, y_train_in['activity'].values)
            classes = set(y_train_in['activity'].values)

            probs = rfc.predict_proba(x_test_in.values)
            labels = x_test_in.index
            if probs.shape[1] > 1:
                prob_of_label_1 = probs[:, 1]
                probs_difference = np.abs(probs[:, 0] - probs[:, 1])
                prob_bernulli = prob_of_label_1 * (1 - prob_of_label_1)
            else:
                # Only one class seen so far: the forest is fully certain.
                prob_of_label_1 = np.full(len(labels), 1.0 if 1 in classes else 0.0)
                probs_difference = np.ones(len(labels))
                prob_bernulli = np.zeros(len(labels))

            least_sure = _top_labels(labels, probs_difference, descending=False)
            most_sure = _top_labels(labels, prob_of_label_1, descending=True)
            least_sure_bernulli = _top_labels(labels, prob_bernulli, descending=True)

            if method == 'exploration':
                adding_points = least_sure
            elif method == 'exploitation':
                adding_points = most_sure
            elif method == 'bernulli':
                adding_points = least_sure_bernulli
            elif method == 'mixed 1:4':
                adding_points = most_sure[0:4] + least_sure[0:1]
            else:  # 'mixed 2:3'
                adding_points = most_sure[0:3] + least_sure[0:2]
        else:
            # Random acquisition does not use a model, so none is fitted.
            pool = list(x_test_in.index)
            adding_points = random.sample(pool, 5) if len(pool) >= 5 else pool

        # De-duplicate while keeping a deterministic order (mixed strategies
        # may pick the same row twice).
        adding_points = list(dict.fromkeys(adding_points))

        # Move the whole batch at once; DataFrame.append no longer exists in pandas 2.
        x_train_in = pd.concat([x_train_in, x_test_in.loc[adding_points]])
        y_train_in = pd.concat([y_train_in, y_test_in.loc[adding_points]])
        x_test_in = x_test_in.drop(index=adding_points)
        y_test_in = y_test_in.drop(index=adding_points)

        iteration += 1
        if max_pki_index in adding_points:
            return iteration, max_pki

In [9]:
def code_class(best_or_worst_datasets):
    """Run the classification search for every selected dataset and persist the results.

    For each dataset listed under ``selected_datasets['{best_or_worst_datasets}_ba']``
    the search is repeated ten times for each acquisition strategy. The
    accumulated results are re-pickled after every dataset so that partial
    progress survives an interruption.

    Parameters
    ----------
    best_or_worst_datasets : str
        Prefix selecting the dataset group and naming the output file.

    Returns
    -------
    collections.defaultdict
        ``result[dataset][repeat][method]`` holds the value returned by
        ``search_of_max_pki_class(dataset, method)``.
    """
    methods = ['exploration', 'exploitation', 'bernulli', 'random', 'mixed 1:4', 'mixed 2:3']
    datasets = selected_datasets[f'{best_or_worst_datasets}_ba']
    output_path = f'new_results/new_searching_rank_for_{best_or_worst_datasets}_ba_in_intrnal_test.pickle'
    # Create the output directory before the long computation starts, so the
    # first dump cannot fail on a missing folder after the work is done.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    result = defaultdict(dict)
    for dataset in tqdm(datasets):
        for i in tqdm(range(10)):
            result[dataset][i] = {
                method: search_of_max_pki_class(dataset, method)
                for method in tqdm(methods)
            }
        with open(output_path, 'wb') as f:
            pickle.dump(result, f)
    return result

In [2]:
def exploration(regressor, x_train_in, y_train_in, x_test_in, n_bootstraps=7, n_points=5):
    """Pick the pool rows on which bootstrap-trained models disagree the most.

    A fresh, unfitted copy of ``regressor`` is trained on each bootstrap
    resample of the training data and used to predict the pool. The rows with
    the largest variance across those predictions are returned.

    Previously the bootstrap sample was drawn but never used: the same fitted
    ``regressor`` predicted every time, so every variance was zero and the
    first rows of the pool were always returned.

    Parameters
    ----------
    regressor : estimator
        Template estimator supporting ``sklearn.base.clone``; it is not refitted.
    x_train_in : pandas.DataFrame
        Training features.
    y_train_in : pandas.DataFrame or array-like
        Training targets. When it is a DataFrame with a ``'pKi'`` column that
        column is used, otherwise the values are flattened.
    x_test_in : pandas.DataFrame
        Candidate pool; its index labels are what is returned.
    n_bootstraps : int, default 7
        Number of bootstrap models (at least 2 for a defined variance).
    n_points : int, default 5
        Maximum number of labels to return.

    Returns
    -------
    list
        Index labels of the selected pool rows, highest variance first.

    Notes
    -----
    This fits ``n_bootstraps`` additional models per call.
    """
    from sklearn.base import clone  # local import keeps this cell self-contained

    predictions = []
    for _ in range(n_bootstraps):
        x_bootstrap, y_bootstrap = resample(x_train_in, y_train_in, replace=True)
        if isinstance(y_bootstrap, pd.DataFrame) and 'pKi' in y_bootstrap.columns:
            target = y_bootstrap['pKi'].values
        else:
            target = np.ravel(y_bootstrap)
        # Clone so the caller's fitted model is left untouched.
        member = clone(regressor)
        member.fit(x_bootstrap.values, target)
        predictions.append(member.predict(x_test_in.values))

    # Rows are bootstrap models, columns are pool labels.
    spread = pd.DataFrame(predictions, columns=x_test_in.index).var()
    adding_points = list(spread.nlargest(n_points).index)
    return adding_points

In [3]:
def exploitation(regressor, x_test_in):
    """Return the index labels of the five pool rows with the highest prediction.

    ``regressor.predict`` is called once on ``x_test_in.values``; rows are
    ranked from highest to lowest predicted value, ties keeping pool order.
    Fewer than five labels are returned when the pool is smaller than that.
    """
    predicted = regressor.predict(x_test_in.values)
    # Rank row positions rather than values so labels are looked up afterwards.
    ranking = sorted(range(len(predicted)), key=lambda pos: predicted[pos], reverse=True)
    return [x_test_in.index[pos] for pos in ranking[:5]]

In [25]:
def search_of_max_pki_regr(dataset, method):
    """Count how many acquisition batches a regressor needs to reach the top-pKi row.

    The pickle ``x_and_y/{dataset}.pickle`` stores an item count followed by
    that many objects. Item 0 is used as the feature table and item 2 as the
    table holding the ``'pKi'`` column; both are expected to share one
    (unique) index.

    Ten rows start as the training set and the rest form the candidate pool.
    The row with the highest pKi is forced into the pool. In each iteration a
    random forest regressor is fitted on the training set and up to five pool
    rows are moved into it, chosen according to ``method``:

    * ``'exploitation'`` - result of ``exploitation``;
    * ``'exploration'``  - result of ``exploration``;
    * ``'mixed 1:4'``    - first four exploitation picks plus first exploration pick;
    * ``'mixed 2:3'``    - first three exploitation picks plus first two exploration picks;
    * anything else (e.g. ``'random'``) - five random pool rows.

    Parameters
    ----------
    dataset : str
        Name used to build the pickle path.
    method : str
        Acquisition strategy, see above.

    Returns
    -------
    tuple
        ``(iteration, max_pki)`` where ``iteration`` is the 0-based number of
        the batch that contained the top-pKi row.
        NOTE(review): ``search_of_max_pki_class`` reports a 1-based count.

    Raises
    ------
    ValueError
        If the pool is exhausted without meeting the top-pKi row (only
        possible when the loaded tables do not share an index).
    """
    data = []
    # NOTE: pickle.load can execute arbitrary code; only read trusted files.
    with open(f'x_and_y/{dataset}.pickle', "rb") as f:
        for _ in range(pickle.load(f)):
            data.append(pickle.load(f))

    x_train_ex, y_train_ex = data[0], data[2]
    x_train_in, x_test_in, y_train_in, y_test_in = train_test_split(
        x_train_ex, y_train_ex, test_size=len(y_train_ex) - 10)

    pki = y_train_ex['pKi']
    max_pki_index = pki.idxmax()
    max_pki = pki.max()

    if max_pki_index in y_train_in.index:
        # The target row must start in the pool: swap it with a random pool row.
        random_point = random.choice(list(x_test_in.index))
        x_best, y_best = x_train_in.loc[[max_pki_index]], y_train_in.loc[[max_pki_index]]
        x_swap, y_swap = x_test_in.loc[[random_point]], y_test_in.loc[[random_point]]
        x_train_in = pd.concat([x_train_in.drop(index=max_pki_index), x_swap])
        y_train_in = pd.concat([y_train_in.drop(index=max_pki_index), y_swap])
        x_test_in = pd.concat([x_test_in.drop(index=random_point), x_best])
        y_test_in = pd.concat([y_test_in.drop(index=random_point), y_best])

    model_methods = ('exploitation', 'exploration', 'mixed 1:4', 'mixed 2:3')
    iteration = 0
    while True:
        if len(x_test_in) == 0:
            raise ValueError(
                f'candidate pool exhausted before reaching index {max_pki_index!r}')

        if method in model_methods:
            rfr = RandomForestRegressor(random_state=42, n_estimators=500,
                                        max_features='log2', n_jobs=20)
            rfr.fit(x_train_in.values, y_train_in['pKi'].values)
            if method == 'exploitation':
                adding_points = exploitation(rfr, x_test_in)
            elif method == 'exploration':
                adding_points = exploration(rfr, x_train_in, y_train_in, x_test_in)
            elif method == 'mixed 1:4':
                adding_points = (exploitation(rfr, x_test_in)[0:4]
                                 + exploration(rfr, x_train_in, y_train_in, x_test_in)[0:1])
            else:  # 'mixed 2:3'
                adding_points = (exploitation(rfr, x_test_in)[0:3]
                                 + exploration(rfr, x_train_in, y_train_in, x_test_in)[0:2])
        else:
            # Random acquisition does not use a model, so none is fitted.
            pool = list(x_test_in.index)
            adding_points = random.sample(pool, 5) if len(pool) >= 5 else pool

        # De-duplicate while keeping a deterministic order (mixed strategies
        # may pick the same row twice).
        adding_points = list(dict.fromkeys(adding_points))

        # Move the whole batch at once; DataFrame.append no longer exists in pandas 2.
        x_train_in = pd.concat([x_train_in, x_test_in.loc[adding_points]])
        y_train_in = pd.concat([y_train_in, y_test_in.loc[adding_points]])
        x_test_in = x_test_in.drop(index=adding_points)
        y_test_in = y_test_in.drop(index=adding_points)

        if max_pki_index in adding_points:
            return iteration, max_pki
        iteration += 1
                

In [26]:
def code_regr(best_or_worst_datasets):
    """Run the regression search for every selected dataset and persist the results.

    Datasets come from ``selected_datasets['{best_or_worst_datasets}_r2']``.
    Each one is searched ten times with every acquisition strategy, and the
    accumulated results are re-pickled after each dataset.

    Returns a ``defaultdict`` where ``result[dataset][repeat][method]`` is the
    value returned by ``search_of_max_pki_regr(dataset, method)``.
    """
    result = defaultdict(dict)
    dataset_names = selected_datasets[f'{best_or_worst_datasets}_r2']
    for dataset in tqdm(dataset_names):
        for repeat in tqdm(range(10)):
            strategies = ['exploration', 'exploitation', 'random', 'mixed 1:4', 'mixed 2:3']
            result[dataset][repeat] = {
                strategy: search_of_max_pki_regr(dataset, strategy)
                for strategy in tqdm(strategies)
            }
        # Checkpoint everything gathered so far once the dataset is finished.
        dump_path = f'results/results_searching_max_pki_for_{best_or_worst_datasets}_r2_in_internal_test.pickle'
        with open(dump_path, 'wb') as sink:
            pickle.dump(result, sink)
    return result