In [78]:
import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import random
from collections import defaultdict
import pickle
import seaborn as sns
from CGRtools import RDFRead
from CGRtools.files import RDFWrite
import matplotlib.pyplot as plt
from scipy.stats import rankdata

In [None]:
# Load the dataset selection stored in results/selected_datasets.pickle; the next cell
# reads its 'best_ba' and 'worst_ba' entries.
# NOTE(review): pickle.load can execute arbitrary code while deserialising -- only open
# pickle files that were produced by a trusted run of this project.
with open('results/selected_datasets.pickle', 'rb') as f:
    selected_datasets = pickle.load(f)

In [None]:
# Keep the first two entries stored under 'best_ba' followed by the first two under 'worst_ba'.
# NOTE(review): presumably these are dataset names ordered by balanced accuracy, and both
# values are assumed to be lists (so `+` concatenates) -- confirm against the code that
# wrote selected_datasets.pickle.
datasets_to_function = selected_datasets['best_ba'][:2] + selected_datasets['worst_ba'][:2]

In [92]:
def new_func(dataset, method, c):
    """Run one pool-based active-learning experiment and score every round externally.

    The run starts from 10 randomly chosen training compounds; the remaining training
    compounds form the pool. In every round a RandomForestClassifier is fitted on the
    current training subset and evaluated on the external test set (balanced accuracy
    and ROC AUC). Then up to five pool compounds with the lowest acquisition score are
    moved into the training subset. With ``d = P(class 1) - 0.5`` the score is

    * ``'abs'``:     ``|d| - c * d``
    * ``'squared'``: ``d ** 2 - c * d``

    The compound with the highest 'pKi' in the training regression targets is always
    placed in the pool (swapped with a random pool compound if the split put it into
    the initial training subset), so the round in which it is selected can be reported.

    Parameters
    ----------
    dataset : str
        Name of the pickle file (without extension) inside ``x_and_y/``. The file holds
        an object count followed by that many pickled objects; positions 0, 1, 2, 4 and
        5 are used here as training features, external test features, training
        regression targets (column 'pKi'), training labels (column 'activity') and
        external test labels.
    method : {'abs', 'squared'}
        Acquisition score to use.
    c : float
        Weight of the linear term in the acquisition score.

    Returns
    -------
    results : list of [int, float, float]
        ``[iteration, balanced_accuracy, roc_auc]`` for every fitted model.
    iteration_of_max_pki : int or str
        Round in which the highest-pKi compound was moved into the training subset,
        or ``'not found'`` if that never happened.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Fail fast: previously an unknown method only surfaced as an UnboundLocalError
    # after the data had been loaded and a forest had been trained.
    if method not in ('abs', 'squared'):
        raise ValueError(f"method must be 'abs' or 'squared', got {method!r}")

    def _as_two_columns(proba, classes):
        """Pad a one-column predict_proba result so that column 1 is always P(class 1).

        NOTE(review): assumes the 'activity' labels are 0 and 1 -- confirm.
        """
        if len(classes) == 1:
            # Only class 1 seen -> missing P(0) column goes first; otherwise P(1) goes last.
            return np.insert(proba, 0 if 1 in classes else 1, 0, axis=1)
        return proba

    # NOTE(review): pickle.load can execute arbitrary code; only read trusted files.
    data = []
    with open(f'x_and_y/{dataset}.pickle', "rb") as f:
        for _ in range(pickle.load(f)):
            data.append(pickle.load(f))
    x_train_ex, x_test_ex, y_train_ex_regr, y_train_ex, y_test_ex = data[0], data[1], data[2], data[4], data[5]

    # No random_state on purpose: callers repeat the run to sample different starts.
    x_train_in, x_test_in, y_train_in, y_test_in = train_test_split(
        x_train_ex, y_train_ex, test_size=len(y_train_ex) - 10)
    max_pki_index = y_train_ex_regr['pKi'].idxmax()

    if max_pki_index in y_train_in.index:
        # Swap the top compound out of the initial training subset with a random pool
        # compound. pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        random_point = random.choice(list(x_test_in.index))
        x_train_in, x_test_in = (
            pd.concat([x_train_in.drop(max_pki_index), x_test_in.loc[[random_point]]]),
            pd.concat([x_test_in.drop(random_point), x_train_in.loc[[max_pki_index]]]),
        )
        y_train_in, y_test_in = (
            pd.concat([y_train_in.drop(max_pki_index), y_test_in.loc[[random_point]]]),
            pd.concat([y_test_in.drop(random_point), y_train_in.loc[[max_pki_index]]]),
        )

    results = []
    iteration = 0
    # Initialised so the return statement can never hit an unbound local.
    iteration_of_max_pki = 'not found'
    while y_train_in.shape[0] <= y_train_ex.shape[0]:

        rfc = RandomForestClassifier(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
        rfc.fit(x_train_in.values, y_train_in['activity'].values)
        classes = set(y_train_in['activity'].values)

        # External evaluation of the model trained on the current subset.
        pred = rfc.predict(x_test_ex.values)
        proba = _as_two_columns(rfc.predict_proba(x_test_ex.values), classes)
        ba = balanced_accuracy_score(y_test_ex.values, pred)
        fpr, tpr, _ = roc_curve(y_test_ex.values, proba[:, 1])
        results.append([iteration, ba, auc(fpr, tpr)])

        if y_test_in.shape[0] == 0:
            # Pool exhausted: the model trained on every compound has been scored.
            break

        # Bug fix: pad the *pool* probabilities. The original padded `proba` (the
        # external-test matrix) here, so single-class rounds scored the wrong rows.
        probs = _as_two_columns(rfc.predict_proba(x_test_in.values), classes)
        diffs = probs[:, 1] - 0.5
        if method == 'abs':
            scores = np.abs(diffs) - c * diffs
        else:
            scores = diffs ** 2 - c * diffs

        # Stable sort keeps pool order among ties, like sorted() on the score list did.
        lowest = np.argsort(scores, kind='stable')[:5]
        adding_points = list(x_test_in.index[lowest])

        x_train_in = pd.concat([x_train_in, x_test_in.loc[adding_points]])
        y_train_in = pd.concat([y_train_in, y_test_in.loc[adding_points]])
        x_test_in = x_test_in.drop(adding_points)
        y_test_in = y_test_in.drop(adding_points)

        if max_pki_index in adding_points:
            iteration_of_max_pki = iteration

        iteration += 1

    return results, iteration_of_max_pki

In [102]:
def new_func_search_max_pki_in_test(dataset, method, c):
    """Run one active-learning experiment and track where the top test compound is ranked.

    The run starts from 10 randomly chosen training compounds; the remaining training
    compounds form the pool. In every round a RandomForestClassifier is fitted on the
    current training subset, up to five pool compounds with the lowest acquisition
    score are moved into the training subset, and the external test compounds are
    ranked by predicted P(class 1) (dense ranking, rank 0 = highest probability).
    With ``d = P(class 1) - 0.5`` the acquisition score is

    * ``'abs'``:     ``|d| - c * d``
    * ``'squared'``: ``d ** 2 - c * d``

    Parameters
    ----------
    dataset : str
        Name of the pickle file (without extension) inside ``x_and_y/``. The file holds
        an object count followed by that many pickled objects; positions 0, 1, 3 and 4
        are used here as training features, external test features, external test
        regression targets and training labels (column 'activity').
    method : {'abs', 'squared'}
        Acquisition score to use.
    c : float
        Weight of the linear term in the acquisition score.

    Returns
    -------
    rank_of_max_pki : dict
        Maps each iteration to the rank of the test compound that has the maximum value
        in the first column of the test regression targets.
    iteration_of_max_pki : int or str
        First iteration in which that compound lies within the top rank levels that
        together cover at least five test compounds, or ``'not found'``.
    max_pki_test : float
        The maximum value itself.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Fail fast: previously an unknown method only surfaced as an UnboundLocalError
    # after the data had been loaded and a forest had been trained.
    if method not in ('abs', 'squared'):
        raise ValueError(f"method must be 'abs' or 'squared', got {method!r}")

    def _as_two_columns(proba, classes):
        """Pad a one-column predict_proba result so that column 1 is always P(class 1).

        NOTE(review): assumes the 'activity' labels are 0 and 1 -- confirm.
        """
        if len(classes) == 1:
            # Only class 1 seen -> missing P(0) column goes first; otherwise P(1) goes last.
            return np.insert(proba, 0 if 1 in classes else 1, 0, axis=1)
        return proba

    # NOTE(review): pickle.load can execute arbitrary code; only read trusted files.
    data = []
    with open(f'x_and_y/{dataset}.pickle', "rb") as f:
        for _ in range(pickle.load(f)):
            data.append(pickle.load(f))
    x_train_ex, x_test_ex, y_r_test_ex, y_train_ex = data[0], data[1], data[3], data[4]

    # No random_state on purpose: callers repeat the run to sample different starts.
    x_train_in, x_test_in, y_train_in, y_test_in = train_test_split(
        x_train_ex, y_train_ex, test_size=len(y_train_ex) - 10)

    # First column of the test regression targets; .iloc avoids the deprecated
    # positional fallback of Series[0] on a label-indexed Series.
    # NOTE(review): presumably that first column is 'pKi' -- confirm.
    max_pki_index_test = y_r_test_ex.idxmax().iloc[0]
    max_pki_test = y_r_test_ex.max().iloc[0]

    rank_of_max_pki = dict()
    iteration = 0
    iteration_of_max_pki = 'not found'

    while y_train_in.shape[0] < y_train_ex.shape[0]:

        rfc = RandomForestClassifier(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
        rfc.fit(x_train_in.values, y_train_in['activity'].values)
        classes = set(y_train_in['activity'].values)

        # Acquisition: score every pool compound and pick the (up to) five lowest.
        probs = _as_two_columns(rfc.predict_proba(x_test_in.values), classes)
        diffs = probs[:, 1] - 0.5
        if method == 'abs':
            scores = np.abs(diffs) - c * diffs
        else:
            scores = diffs ** 2 - c * diffs

        # Stable sort keeps pool order among ties, like sorted() on the score list did.
        lowest = np.argsort(scores, kind='stable')[:5]
        adding_points = list(x_test_in.index[lowest])

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        x_train_in = pd.concat([x_train_in, x_test_in.loc[adding_points]])
        y_train_in = pd.concat([y_train_in, y_test_in.loc[adding_points]])
        x_test_in = x_test_in.drop(adding_points)
        y_test_in = y_test_in.drop(adding_points)

        # Rank the external test compounds with the model fitted above. Only the
        # probability column is ranked, so index labels are never coerced into the
        # same array as the probabilities.
        proba = _as_two_columns(rfc.predict_proba(x_test_ex.values), classes)
        dense_ranks = rankdata(proba[:, 1], method='dense')
        inverted_ranks = dense_ranks.max() - dense_ranks  # 0 == highest P(class 1)
        ordered_probs = dict(zip(x_test_ex.index, inverted_ranks))
        target_rank = ordered_probs[max_pki_index_test]
        rank_of_max_pki[iteration] = target_rank

        if iteration_of_max_pki == 'not found':
            # The original accumulated whole rank levels until >= 5 compounds were
            # collected; a level is reached exactly when fewer than 5 compounds rank
            # strictly better. This form also terminates when the test set has fewer
            # than 5 compounds, where the level-by-level scan looped forever.
            if np.count_nonzero(inverted_ranks < target_rank) < 5:
                iteration_of_max_pki = iteration

        iteration += 1

    return rank_of_max_pki, iteration_of_max_pki, max_pki_test

In [105]:
def code_serch(datasets):
    """Run the max-pKi search experiment over a grid of methods and c values.

    For every dataset, 10 repetitions are executed. Each repetition calls
    ``new_func_search_max_pki_in_test(dataset, method, c)`` for both methods with
    ``c`` taken from ``np.arange(0.0, end, 0.1)``, where ``end`` is 1.2 for 'abs' and
    1.1 for 'squared'.

    Parameters
    ----------
    datasets : iterable of str
        Dataset names passed through to ``new_func_search_max_pki_in_test``.

    Returns
    -------
    collections.defaultdict
        ``results[dataset][repetition][method][c]`` holds the value returned by the
        worker. The ``c`` keys are the numpy floats produced by ``np.arange``; compare
        them with a tolerance rather than with decimal literals.

    The same structure is pickled to
    ``results/results_new_function_search_max_pki_in_test.pickle``.
    """
    output_path = 'results/results_new_function_search_max_pki_in_test.pickle'
    # Create the output directory before the long computation: previously a missing
    # 'results/' folder only failed at the final dump and discarded the whole run.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    c_upper_bound = {'abs': 1.2, 'squared': 1.1}
    results = defaultdict(dict)
    for dataset in datasets:
        for i in tqdm(range(10)):
            result_for_iteration = defaultdict(dict)
            for method in tqdm(['abs', 'squared']):
                for c in tqdm(np.arange(0.0, c_upper_bound[method], 0.1)):
                    result_for_iteration[method][c] = new_func_search_max_pki_in_test(dataset, method, c)
            # Stored once per repetition (it was re-assigned after every method before).
            results[dataset][i] = result_for_iteration

    with open(output_path, 'wb') as f:
        pickle.dump(results, f)
    return results

In [68]:
def code(datasets):
    """Run the active-learning benchmark over a grid of methods and c values.

    For every dataset, 10 repetitions are executed. Each repetition calls
    ``new_func(dataset, method, c)`` for both methods with ``c`` taken from
    ``np.arange(0.0, end, 0.1)``, where ``end`` is 1.2 for 'abs' and 1.1 for 'squared'.

    Parameters
    ----------
    datasets : iterable of str
        Dataset names passed through to ``new_func``.

    Returns
    -------
    collections.defaultdict
        ``results[dataset][repetition][method][c]`` holds the value returned by
        ``new_func``. The ``c`` keys are the numpy floats produced by ``np.arange``;
        compare them with a tolerance rather than with decimal literals.

    The same structure is pickled to ``results/results_new_function_4860_.pickle``.
    """
    output_path = 'results/results_new_function_4860_.pickle'
    # Create the output directory before the long computation: previously a missing
    # 'results/' folder only failed at the final dump and discarded the whole run.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    c_upper_bound = {'abs': 1.2, 'squared': 1.1}
    results = defaultdict(dict)
    for dataset in datasets:
        for i in tqdm(range(10)):
            result_for_iteration = defaultdict(dict)
            for method in tqdm(['abs', 'squared']):
                for c in tqdm(np.arange(0.0, c_upper_bound[method], 0.1)):
                    result_for_iteration[method][c] = new_func(dataset, method, c)
            # Stored once per repetition (it was re-assigned after every method before).
            results[dataset][i] = result_for_iteration

    with open(output_path, 'wb') as f:
        pickle.dump(results, f)
    return results