In [1]:
import os
import pickle
import random
from collections import defaultdict

import numpy as np
import pandas as pd
import seaborn as sns
from CGRtools import RDFRead
from CGRtools.files import RDFWrite
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from scipy.stats import rankdata
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from tqdm import tqdm

In [10]:
# Load the previously saved dataset selection. Later cells index it with keys of the
# form '<group>_ba' and '<group>_r2' and iterate over the dataset names stored there.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open ('results/selected_datasets.pickle', 'rb') as f:
    selected_datasets = pickle.load(f)

КЛАССИФИКАЦИЯ

In [6]:
def EF_classification(dataset, method):
    """Simulate active learning with a random-forest classifier and score each step.

    The pickled splits for ``dataset`` are loaded, 10 randomly chosen training
    rows form the initial labelled set and the remaining training rows form
    the pool. On every iteration a classifier is fitted, up to five pool rows
    chosen by ``method`` are moved into the labelled set, and the classifier
    is scored on the external test set.

    Parameters
    ----------
    dataset : str
        Name used to build the path ``x_and_y/{dataset}.pickle``.
    method : str
        ``'exploration'`` picks the pool rows with the smallest absolute
        difference between the two ``predict_proba`` columns,
        ``'exploitation'`` the rows with the largest second column,
        ``'bernulli'`` the rows with the largest ``p * (1 - p)`` for the
        second column ``p``; any other value picks rows at random.

    Returns
    -------
    list of [int, float, float]
        One entry per iteration: the iteration number, the fraction of the
        selected top-ranked test rows that are also among the same number of
        test rows with the highest ``pKi``, and the share of the test set
        that was selected.
    """
    data = []
    # NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
    with open(f'x_and_y/{dataset}.pickle', "rb") as f:
        # The first pickled object is the number of objects that follow it.
        for _ in range(pickle.load(f)):
            data.append(pickle.load(f))
    x_train_ex, x_test_ex, y_test_ex_regr, y_train_ex, y_test_ex = data[0], data[1], data[3], data[4], data[5]
    # Keep 10 rows as the initial labelled set; everything else becomes the pool.
    x_train_in, x_test_in, y_train_in, y_test_in = train_test_split(
        x_train_ex, y_train_ex, test_size=len(y_train_ex) - 10)

    def two_column_proba(proba, classes):
        """Pad predict_proba output to two columns when only one class was seen in training."""
        if len(classes) == 1:
            # Add a zero column for the class that is missing (labels are assumed to be 0/1).
            return np.insert(proba, 0 if 1 in classes else 1, 0, axis=1)
        return proba

    test_size = y_test_ex.shape[0]
    top_target = round(test_size * 0.1)
    # The true ordering by pKi does not change between iterations, so sort once.
    true_ranking = y_test_ex_regr.sort_values(by='pKi', ascending=False).index

    results = []
    iteration = 0
    while y_train_in.shape[0] < y_train_ex.shape[0]:

        rfc = RandomForestClassifier(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
        rfc.fit(x_train_in.values, y_train_in['activity'].values)
        classes = set(y_train_in['activity'].values)

        # --- choose the pool rows to label next ---------------------------------
        pool_labels = x_test_in.index
        if method in ('exploration', 'exploitation', 'bernulli'):
            probs = two_column_proba(rfc.predict_proba(x_test_in.values), classes)
            prob_of_label_1 = probs[:, 1]
            if method == 'exploration':
                sort_key = np.abs(probs[:, 0] - prob_of_label_1)
            elif method == 'exploitation':
                sort_key = -prob_of_label_1
            else:
                sort_key = -(prob_of_label_1 * (1 - prob_of_label_1))
            # A stable sort keeps tied rows in pool order.
            order = np.argsort(sort_key, kind='stable')
            adding_points = list(pool_labels[order[:5]])
        else:
            try:
                adding_points = random.sample(list(pool_labels), 5)
            except ValueError:
                # Fewer than five rows are left: take all of them.
                adding_points = list(pool_labels)

        # Move the chosen rows in one step; DataFrame.append no longer exists in pandas >= 2.0.
        adding_points = list(dict.fromkeys(adding_points))
        x_train_in = pd.concat([x_train_in, x_test_in.loc[adding_points]])
        y_train_in = pd.concat([y_train_in, y_test_in.loc[adding_points]])
        x_test_in = x_test_in.drop(adding_points)
        y_test_in = y_test_in.drop(adding_points)

        # --- score the classifier (fitted before the move) on the external test set
        proba_ex = two_column_proba(rfc.predict_proba(x_test_ex.values), classes)
        prob_of_label_1_test = proba_ex[:, 1]

        # Dense ranks of the probabilities only, inverted so that rank 0 is the highest one.
        ranks = rankdata(prob_of_label_1_test, method='dense')
        max_rank = ranks.max()
        ordered_probs = dict(zip(x_test_ex.index, max_rank - ranks))

        # Take whole groups of tied rows, best rank first, until about 10% of the
        # test set is covered. The worst rank is never visited (range excludes it).
        rank_counts = np.bincount(list(ordered_probs.values()))
        number_of_points = 0
        for rank in range(max(ordered_probs.values())):
            if number_of_points >= top_target:
                break
            number_of_points += int(rank_counts[rank])
            max_rank = rank

        predicted_max_pki = {key for key, item in ordered_probs.items() if item <= max_rank}
        true_max_pki = set(true_ranking[:number_of_points])

        if number_of_points:
            hit_rate = len(true_max_pki.intersection(predicted_max_pki)) / number_of_points
        else:
            hit_rate = 0
        results.append([iteration, hit_rate, number_of_points / test_size])

        iteration += 1

    return results

In [99]:
def EF_new_func(dataset, method, c):
    """Simulate active learning with a tunable acquisition function and score each step.

    The pickled splits are read from ``x_and_y/{dataset}.pickle`` or, if that
    file does not exist, from ``x_and_y/{dataset}_worst.pickle``. Starting
    from 10 randomly chosen training rows, every iteration fits a
    random-forest classifier, moves the five pool rows with the smallest
    acquisition value into the labelled set and scores the classifier on the
    external test set.

    With ``d`` being the second ``predict_proba`` column minus 0.5, the
    acquisition value is ``|d| - c * d`` for ``method='abs'`` and
    ``d ** 2 - c * d`` for ``method='squared'``.

    Parameters
    ----------
    dataset : str
        Dataset name used to build the pickle path.
    method : str
        ``'abs'`` or ``'squared'``.
    c : float
        Weight of the linear term in the acquisition value.

    Returns
    -------
    list of [int, float, float]
        One entry per iteration: the iteration number, the fraction of the
        selected top-ranked test rows that are also among the same number of
        test rows with the highest ``pKi``, and the share of the test set
        that was selected.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'abs'`` nor ``'squared'``.
    """
    if method not in ('abs', 'squared'):
        # Previously an unknown method left the selection undefined and failed mid-run.
        raise ValueError(f"method must be 'abs' or 'squared', got {method!r}")

    data = []
    # NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
    try:
        with open(f'x_and_y/{dataset}.pickle', "rb") as f:
            # The first pickled object is the number of objects that follow it.
            for _ in range(pickle.load(f)):
                data.append(pickle.load(f))
    except FileNotFoundError:
        with open(f'x_and_y/{dataset}_worst.pickle', "rb") as f:
            for _ in range(pickle.load(f)):
                data.append(pickle.load(f))
    x_train_ex, x_test_ex, y_test_ex_regr, y_train_ex, y_test_ex = data[0], data[1], data[3], data[4], data[5]
    # Keep 10 rows as the initial labelled set; everything else becomes the pool.
    x_train_in, x_test_in, y_train_in, y_test_in = train_test_split(
        x_train_ex, y_train_ex, test_size=len(y_train_ex) - 10)

    def two_column_proba(proba, classes):
        """Pad predict_proba output to two columns when only one class was seen in training."""
        if len(classes) == 1:
            # Add a zero column for the class that is missing (labels are assumed to be 0/1).
            return np.insert(proba, 0 if 1 in classes else 1, 0, axis=1)
        return proba

    test_size = y_test_ex.shape[0]
    top_target = round(test_size * 0.1)
    # The true ordering by pKi does not change between iterations, so sort once.
    true_ranking = y_test_ex_regr.sort_values(by='pKi', ascending=False).index

    results = []
    iteration = 0
    while y_train_in.shape[0] < y_train_ex.shape[0]:

        rfc = RandomForestClassifier(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
        rfc.fit(x_train_in.values, y_train_in['activity'].values)
        classes = set(y_train_in['activity'].values)

        # --- choose the pool rows to label next ---------------------------------
        probs = two_column_proba(rfc.predict_proba(x_test_in.values), classes)
        difference_with_05 = probs[:, 1] - 0.5
        if method == 'abs':
            acquisition = np.abs(difference_with_05) - c * difference_with_05
        else:
            acquisition = difference_with_05 ** 2 - c * difference_with_05
        # Smallest acquisition value first; a stable sort keeps tied rows in pool order.
        order = np.argsort(acquisition, kind='stable')
        adding_points = list(dict.fromkeys(x_test_in.index[order[:5]]))

        # Move the chosen rows in one step; DataFrame.append no longer exists in pandas >= 2.0.
        x_train_in = pd.concat([x_train_in, x_test_in.loc[adding_points]])
        y_train_in = pd.concat([y_train_in, y_test_in.loc[adding_points]])
        x_test_in = x_test_in.drop(adding_points)
        y_test_in = y_test_in.drop(adding_points)

        # --- score the classifier (fitted before the move) on the external test set
        proba_ex = two_column_proba(rfc.predict_proba(x_test_ex.values), classes)
        prob_of_label_1_test = proba_ex[:, 1]

        # Dense ranks of the probabilities only, inverted so that rank 0 is the highest one.
        ranks = rankdata(prob_of_label_1_test, method='dense')
        max_rank = ranks.max()
        ordered_probs = dict(zip(x_test_ex.index, max_rank - ranks))

        # Take whole groups of tied rows, best rank first, until about 10% of the
        # test set is covered. The worst rank is never visited (range excludes it).
        rank_counts = np.bincount(list(ordered_probs.values()))
        number_of_points = 0
        for rank in range(max(ordered_probs.values())):
            if number_of_points >= top_target:
                break
            number_of_points += int(rank_counts[rank])
            max_rank = rank

        if number_of_points == 0:
            # Diagnostic dump for the degenerate case in which nothing was selected.
            print(ordered_probs, [[label, prob] for label, prob in zip(x_test_ex.index, prob_of_label_1_test)])

        predicted_max_pki = {key for key, item in ordered_probs.items() if item <= max_rank}
        true_max_pki = set(true_ranking[:number_of_points])

        if number_of_points:
            hit_rate = len(true_max_pki.intersection(predicted_max_pki)) / number_of_points
        else:
            hit_rate = 0
        results.append([iteration, hit_rate, number_of_points / test_size])

        iteration += 1

    return results

In [112]:
def code_new_func(datasets):
    """Run ``EF_new_func`` for every dataset, repeat, method and ``c`` value.

    For each dataset five repeats are made. Within a repeat both methods
    (``'abs'`` and ``'squared'``) are evaluated for ``c`` in
    ``0.0, 0.5, 0.7`` plus a final value of ``1.1`` for ``'abs'`` and ``1.0``
    for ``'squared'``. After each method finishes, everything collected so
    far is pickled to
    ``new_results/results_new_function_EF_worst_{dataset}.pickle``.

    Returns the nested mapping ``dataset -> repeat -> method -> c -> result``.
    """
    all_runs = defaultdict(dict)
    for dataset in tqdm(datasets):
        checkpoint_path = f'new_results/results_new_function_EF_worst_{dataset}.pickle'
        for repeat in tqdm(range(0, 5)):
            per_method = defaultdict(dict)
            for method in tqdm(['abs', 'squared']):
                largest_c = 1.1 if method == 'abs' else 1.0
                for c in tqdm([0.0, 0.5, 0.7, largest_c]):
                    per_method[method][c] = EF_new_func(dataset, method, c)
                # Checkpoint after every method so partial progress survives an interruption.
                all_runs[dataset][repeat] = per_method
                with open(checkpoint_path, 'wb') as f:
                    pickle.dump(all_runs, f)
    return all_runs

In [None]:
def EF_class_on_all_dataset(dataset):
    """Score a random-forest classifier trained on the full training set.

    The classifier is fitted on all training rows from
    ``x_and_y/{dataset}.pickle`` and the test rows are ranked by the second
    ``predict_proba`` column. Whole groups of tied rows are selected, best
    first, until about 10% of the test set is covered. The probability of
    every selected row is printed.

    Parameters
    ----------
    dataset : str
        Name used to build the path ``x_and_y/{dataset}.pickle``.

    Returns
    -------
    list of [float, float]
        The fraction of the selected test rows that are also among the same
        number of test rows with the highest ``pKi``, and the share of the
        test set that was selected.
    """
    data = []
    # NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
    with open(f'x_and_y/{dataset}.pickle', "rb") as f:
        # The first pickled object is the number of objects that follow it.
        for _ in range(pickle.load(f)):
            data.append(pickle.load(f))

    x_train_ex, x_test_ex, y_test_ex_regr, y_train_ex, y_test_ex = data[0], data[1], data[3], data[4], data[5]
    rfc = RandomForestClassifier(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
    rfc.fit(x_train_ex.values, y_train_ex['activity'].values)
    prob_of_label_1_test = rfc.predict_proba(x_test_ex.values)[:, 1]

    # Dense ranks of the probabilities only, inverted so that rank 0 is the highest one.
    ranks = rankdata(prob_of_label_1_test, method='dense')
    max_rank = ranks.max()
    ordered_probs = dict(zip(x_test_ex.index, max_rank - ranks))

    # Take whole groups of tied rows, best rank first, until about 10% of the
    # test set is covered. The worst rank is never visited (range excludes it).
    test_size = y_test_ex.shape[0]
    top_target = round(test_size * 0.1)
    rank_counts = np.bincount(list(ordered_probs.values()))
    number_of_points = 0
    for rank in range(max(ordered_probs.values())):
        if number_of_points >= top_target:
            break
        number_of_points += int(rank_counts[rank])
        max_rank = rank

    predicted_max_pki = {key for key, item in ordered_probs.items() if item <= max_rank}
    # Show the probabilities of the selected rows, in test-set order.
    for label, prob in zip(x_test_ex.index, prob_of_label_1_test):
        if label in predicted_max_pki:
            print(prob)

    true_max_pki = set(y_test_ex_regr.sort_values(by='pKi', ascending=False).index[:number_of_points])
    if number_of_points:
        hit_rate = len(true_max_pki.intersection(predicted_max_pki)) / number_of_points
    else:
        hit_rate = 0
    return [hit_rate, number_of_points / test_size]

In [18]:
def code_class(best_or_worst_datasets):
    """Run the classification simulation for every dataset of one group.

    The dataset names come from
    ``selected_datasets[f'{best_or_worst_datasets}_ba']``. Each dataset is
    simulated 10 times with each of the four selection methods, and after
    every dataset the accumulated results are pickled to
    ``new_results/results_EF_for_{best_or_worst_datasets}_ba.pickle``.

    Returns the nested mapping ``dataset -> repeat -> method -> result``.
    """
    methods = ['exploration', 'exploitation', 'random', 'bernulli']
    output_path = f'new_results/results_EF_for_{best_or_worst_datasets}_ba.pickle'
    collected = defaultdict(dict)
    for dataset in tqdm(selected_datasets[f'{best_or_worst_datasets}_ba']):
        for repeat in tqdm(range(10)):
            collected[dataset][repeat] = {
                method: EF_classification(dataset, method) for method in tqdm(methods)
            }
        # Save after each dataset so finished work is not lost if a later one fails.
        with open(output_path, 'wb') as f:
            pickle.dump(collected, f)
    return collected

РЕГРЕССИЯ

In [11]:
def EF_regression(dataset, method):
    """Simulate active learning with a random-forest regressor and score each step.

    The pickled splits for ``dataset`` are loaded, 10 randomly chosen training
    rows form the initial labelled set and the remaining training rows form
    the pool. On every iteration a regressor is fitted, the pool rows chosen
    by ``method`` are moved into the labelled set, and the regressor is scored
    on the external test set.

    Parameters
    ----------
    dataset : str
        Name used to build the path ``x_and_y/{dataset}.pickle``.
    method : str
        ``'exploitation'`` and ``'exploration'`` use the helper functions of
        the same name; ``'mixed 1:4'`` combines the first four exploitation
        picks with the first exploration pick; ``'mixed 2:3'`` combines the
        first three exploitation picks with the first two exploration picks;
        any other value picks five rows at random.

    Returns
    -------
    list of [int, float, float]
        One entry per iteration: the iteration number, the fraction of the
        selected top-ranked test rows that are also among the same number of
        test rows with the highest ``pKi``, and the share of the test set
        that was selected.
    """
    data = []
    # NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
    with open(f'x_and_y/{dataset}.pickle', "rb") as f:
        # The first pickled object is the number of objects that follow it.
        for _ in range(pickle.load(f)):
            data.append(pickle.load(f))
    x_train_ex, x_test_ex, y_train_ex, y_test_ex = data[0], data[1], data[2], data[3]
    # Keep 10 rows as the initial labelled set; everything else becomes the pool.
    x_train_in, x_test_in, y_train_in, y_test_in = train_test_split(
        x_train_ex, y_train_ex, test_size=len(y_train_ex) - 10)

    test_size = y_test_ex.shape[0]
    top_target = round(test_size * 0.1)
    # The true ordering by pKi does not change between iterations, so sort once.
    true_ranking = y_test_ex.sort_values(by='pKi', ascending=False).index

    results = []
    iteration = 0
    while y_train_in.shape[0] < y_train_ex.shape[0]:

        rfr = RandomForestRegressor(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
        rfr.fit(x_train_in.values, y_train_in['pKi'].values)

        # --- choose the pool rows to label next ---------------------------------
        if method == 'exploitation':
            adding_points = exploitation(rfr, x_test_in)
        elif method == 'exploration':
            adding_points = exploration(rfr, x_train_in, y_train_in, x_test_in)
        elif method == 'mixed 1:4':
            adding_points = set(exploitation(rfr, x_test_in)[0:4] + exploration(rfr, x_train_in, y_train_in, x_test_in)[0:1])
        elif method == 'mixed 2:3':
            adding_points = set(exploitation(rfr, x_test_in)[0:3] + exploration(rfr, x_train_in, y_train_in, x_test_in)[0:2])
        else:
            try:
                adding_points = random.sample(list(x_test_in.index), 5)
            except ValueError:
                # Fewer than five rows are left: take all of them.
                adding_points = list(x_test_in.index)

        # .loc needs a list (not a set); dict.fromkeys also removes repeated labels.
        adding_points = list(dict.fromkeys(adding_points))
        # Move the chosen rows in one step; DataFrame.append no longer exists in pandas >= 2.0.
        x_train_in = pd.concat([x_train_in, x_test_in.loc[adding_points]])
        y_train_in = pd.concat([y_train_in, y_test_in.loc[adding_points]])
        x_test_in = x_test_in.drop(adding_points)
        y_test_in = y_test_in.drop(adding_points)

        # --- score the regressor (fitted before the move) on the external test set
        pred_test = rfr.predict(x_test_ex.values)

        # Dense ranks of the predictions only, inverted so that rank 0 is the highest one.
        ranks = rankdata(pred_test, method='dense')
        max_rank = ranks.max()
        ordered_preds = dict(zip(x_test_ex.index, max_rank - ranks))

        # Take whole groups of tied rows, best rank first, until about 10% of the
        # test set is covered. The worst rank is never visited (range excludes it).
        rank_counts = np.bincount(list(ordered_preds.values()))
        number_of_points = 0
        for rank in range(max(ordered_preds.values())):
            if number_of_points >= top_target:
                break
            number_of_points += int(rank_counts[rank])
            max_rank = rank

        predicted_max_pki = {key for key, item in ordered_preds.items() if item <= max_rank}
        true_max_pki = set(true_ranking[:number_of_points])

        if number_of_points:
            hit_rate = len(true_max_pki.intersection(predicted_max_pki)) / number_of_points
        else:
            hit_rate = 0
        results.append([iteration, hit_rate, number_of_points / test_size])

        iteration += 1

    return results

In [12]:
def exploitation(regressor, x_test_in):
    """Return the index labels of the (up to) five pool rows with the highest prediction.

    ``regressor`` must provide ``predict``; ``x_test_in`` is the pool as a
    DataFrame. Rows with equal predictions keep their pool order.
    """
    predictions = regressor.predict(x_test_in.values)
    labels = x_test_in.index
    # sorted() is stable, so ties are resolved by position in the pool.
    best_first = sorted(range(len(predictions)), key=lambda pos: predictions[pos], reverse=True)
    return [labels[pos] for pos in best_first[:5]]

In [13]:
def exploration(regressor, x_train_in, y_train_in, x_test_in, n_bootstraps=7, n_points=5):
    """Return the pool rows on which bootstrap models disagree the most.

    A fresh copy of ``regressor`` is fitted on each of ``n_bootstraps``
    bootstrap resamples of the labelled data; the pool rows whose predictions
    have the largest variance across those models are returned.

    The previous version drew the bootstrap samples but never fitted anything
    on them, so the same fitted model predicted every time, all variances
    were zero and the first rows of the pool were always returned.

    Parameters
    ----------
    regressor : scikit-learn estimator
        Template model; it is cloned, so the caller's fitted instance is not modified.
    x_train_in, y_train_in : pandas.DataFrame
        Labelled features and targets; the target is read from the ``'pKi'`` column.
    x_test_in : pandas.DataFrame
        Pool of candidate rows.
    n_bootstraps : int, optional
        Number of bootstrap models (default 7).
    n_points : int, optional
        Number of pool rows to return (default 5).

    Returns
    -------
    list
        Index labels of the selected pool rows, largest variance first.
    """
    bootstrap_predictions = dict()
    for n in range(n_bootstraps):
        x_bootstrap, y_bootstrap = resample(x_train_in, y_train_in, replace=True)
        # clone() copies the hyper-parameters only and yields an unfitted estimator.
        bootstrap_model = clone(regressor)
        bootstrap_model.fit(x_bootstrap.values, y_bootstrap['pKi'].values)
        bootstrap_predictions[n] = bootstrap_model.predict(x_test_in.values)
    # Rows are pool points, columns are bootstrap models.
    per_point = pd.DataFrame(bootstrap_predictions, index=x_test_in.index)
    return list(per_point.var(axis=1).nlargest(n_points).index)

In [79]:
def EF_regression_on_all_dataset(dataset):
    """Score a random-forest regressor trained on the full training set.

    The regressor is fitted on all training rows from
    ``x_and_y/{dataset}.pickle`` and the test rows are ranked by predicted
    value. Whole groups of tied rows are selected, best first, until about
    10% of the test set is covered.

    Parameters
    ----------
    dataset : str
        Name used to build the path ``x_and_y/{dataset}.pickle``.

    Returns
    -------
    list of [float, float]
        The fraction of the selected test rows that are also among the same
        number of test rows with the highest ``pKi``, and the share of the
        test set that was selected.
    """
    data = []
    # NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
    with open(f'x_and_y/{dataset}.pickle', "rb") as f:
        # The first pickled object is the number of objects that follow it.
        for _ in range(pickle.load(f)):
            data.append(pickle.load(f))
    x_train_ex, x_test_ex, y_train_ex, y_test_ex = data[0], data[1], data[2], data[3]
    rfr = RandomForestRegressor(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
    rfr.fit(x_train_ex.values, y_train_ex['pKi'].values)

    pred_test = rfr.predict(x_test_ex.values)

    # Dense ranks of the predictions only, inverted so that rank 0 is the highest one.
    ranks = rankdata(pred_test, method='dense')
    max_rank = ranks.max()
    ordered_preds = dict(zip(x_test_ex.index, max_rank - ranks))

    # Take whole groups of tied rows, best rank first, until about 10% of the
    # test set is covered. The worst rank is never visited (range excludes it).
    test_size = y_test_ex.shape[0]
    top_target = round(test_size * 0.1)
    rank_counts = np.bincount(list(ordered_preds.values()))
    number_of_points = 0
    for rank in range(max(ordered_preds.values())):
        if number_of_points >= top_target:
            break
        number_of_points += int(rank_counts[rank])
        max_rank = rank

    if number_of_points == 0:
        # Diagnostic dump for the degenerate case in which nothing was selected.
        print(ordered_preds, [[label, pred] for label, pred in zip(x_test_ex.index, pred_test)])

    predicted_max_pki = {key for key, item in ordered_preds.items() if item <= max_rank}
    true_max_pki = set(y_test_ex.sort_values(by='pKi', ascending=False).index[:number_of_points])
    if number_of_points:
        hit_rate = len(true_max_pki.intersection(predicted_max_pki)) / number_of_points
    else:
        hit_rate = 0

    return [hit_rate, number_of_points / test_size]

In [14]:
def code_regr(best_or_worst_datasets):
    """Run the regression simulation for every dataset of one group.

    The dataset names come from
    ``selected_datasets[f'{best_or_worst_datasets}_r2']``. Each dataset is
    simulated 10 times with each of the three selection methods, and after
    every dataset the accumulated results are pickled to
    ``new_results/new_results_EF_for_{best_or_worst_datasets}_r2.pickle``.

    Returns the nested mapping ``dataset -> repeat -> method -> result``.
    """
    methods = ['exploration', 'exploitation', 'random']
    output_path = f'new_results/new_results_EF_for_{best_or_worst_datasets}_r2.pickle'
    collected = defaultdict(dict)
    for dataset in tqdm(selected_datasets[f'{best_or_worst_datasets}_r2']):
        for repeat in tqdm(range(10)):
            collected[dataset][repeat] = {
                method: EF_regression(dataset, method) for method in tqdm(methods)
            }
        # Save after each dataset so finished work is not lost if a later one fails.
        with open(output_path, 'wb') as f:
            pickle.dump(collected, f)
    return collected