In [1]:
import scipy
import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_predict
import numpy as np
from tqdm import tqdm
import random
from collections import defaultdict
import pickle
from sklearn.utils import resample

In [2]:
# Load the previously saved dataset selection.
# NOTE: pickle can execute arbitrary code while loading; only open files
# produced by this project.
with open('results/selected_datasets.pickle', 'rb') as selection_file:
    selected_datasets = pickle.load(selection_file)

In [3]:
# Keep only the datasets listed under both the '*_ba' and the '*_r2' keys
# (presumably the balanced-accuracy and R^2 rankings - confirm against the
# notebook that wrote results/selected_datasets.pickle).
best_datasets = set(selected_datasets['best_ba']) & set(selected_datasets['best_r2'])
worst_datasets = set(selected_datasets['worst_ba']) & set(selected_datasets['worst_r2'])

In [11]:
def perfect_model_classification(dataset, random_state=None):
    """Greedily grow a training set for a random-forest classifier.

    The pickle ``x_and_y/{dataset}.pickle`` is read as a count followed by
    that many objects; objects 0 and 1 are used as the train/test feature
    frames and objects 4 and 5 as the train/test label frames (labels are
    read from the ``'activity'`` column).

    Ten rows of the training split seed the model and the rest form a pool.
    On every iteration the current model is scored on the external test set,
    then up to ``number_of_points`` pool rows are tried one at a time: each is
    temporarily added to the training set, a forest is refit, and its balanced
    accuracy on the remaining pool is recorded. The first candidate with the
    highest score is moved into the training set for good.

    The loop stops when the training set reaches 150 rows, or earlier when
    fewer than two pool rows remain (a candidate could no longer be scored on
    a non-empty remainder).

    Parameters
    ----------
    dataset : str
        Name of the pickle inside ``x_and_y/`` (without extension).
    random_state : int or None, optional
        Seed for the initial split and the candidate sampling. ``None``
        (default) keeps the original unseeded behaviour, i.e. the global
        ``random`` module state is used for sampling.

    Returns
    -------
    results : list of [int, float, float]
        ``[iteration, balanced_accuracy, roc_auc]`` on the external test set,
        one entry per fitted training-set size.
    appended_points : list
        Index labels of the final training set, in insertion order.

    Raises
    ------
    ValueError
        From ``train_test_split`` when the training split has 10 rows or
        fewer, because the requested pool size is then not positive.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this
    # with files produced by this project.
    data = []
    with open(f'x_and_y/{dataset}.pickle', "rb") as f:
        for _ in range(pickle.load(f)):
            data.append(pickle.load(f))
    x_train_ex, x_test_ex, y_train_ex, y_test_ex = data[0], data[1], data[4], data[5]

    # Seed set of 10 rows; everything else becomes the candidate pool.
    x_train_in, x_test_in, y_train_in, y_test_in = train_test_split(
        x_train_ex, y_train_ex,
        test_size=len(y_train_ex) - 10,
        random_state=random_state,
    )
    # Fall back to the module-level generator when no seed is given so that
    # any global random.seed() set by the caller is still honoured.
    rng = random if random_state is None else random.Random(random_state)

    # Number of candidates screened per iteration depends on dataset size.
    len_of_dataset = len(x_train_ex) + len(x_test_ex)
    if len_of_dataset < 300:
        number_of_points = 100
    elif 300 <= len_of_dataset <= 600:
        number_of_points = 300
    else:
        number_of_points = 500

    results = []
    iteration = 0
    # Defined up front so the return value exists even if the loop exits on
    # its first pass.
    appended_points = list(x_train_in.index)

    while len(y_train_in) < 150:
        rfc = RandomForestClassifier(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
        rfc.fit(x_train_in.values, y_train_in['activity'].values)
        classes = set(y_train_in['activity'].values)

        pred = rfc.predict(x_test_ex.values)
        proba = rfc.predict_proba(x_test_ex.values)
        if len(classes) == 1:
            # A forest trained on one class returns a single probability
            # column; pad with zeros so that column 1 is always P(class 1).
            if 1 in classes:
                proba = np.insert(proba, 0, 0, axis=1)
            else:
                proba = np.insert(proba, 1, 0, axis=1)
        ba = balanced_accuracy_score(y_test_ex.values, pred)
        fpr, tpr, _ = roc_curve(y_test_ex.values, proba[:, 1])
        AUC = auc(fpr, tpr)
        results.append([iteration, ba, AUC])

        # A candidate is scored on the pool that remains after removing it,
        # so at least two pool rows are required to continue.
        if len(x_test_in) < 2:
            break

        pool = list(x_test_in.index)
        if number_of_points < len(pool):
            adding_points = rng.sample(pool, number_of_points)
        else:
            adding_points = pool

        results_of_iteration = []
        for point in adding_points:
            # Build trial frames instead of mutating the working frames.
            # .loc[[point]] keeps a one-row DataFrame, which pd.concat needs
            # (DataFrame.append no longer exists in pandas >= 2.0).
            x_trial = pd.concat([x_train_in, x_test_in.loc[[point]]])
            y_trial = pd.concat([y_train_in, y_test_in.loc[[point]]])
            x_rest = x_test_in.drop(point)
            y_rest = y_test_in.drop(point)

            rfc = RandomForestClassifier(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
            rfc.fit(x_trial.values, y_trial['activity'].values)
            pred = rfc.predict(x_rest.values)
            ba = balanced_accuracy_score(y_rest['activity'].values, pred)
            results_of_iteration.append([point, ba])

        # max() returns the first candidate with the highest score, matching
        # the original stable descending sort.
        best_point = max(results_of_iteration, key=lambda x: x[1])[0]

        x_train_in = pd.concat([x_train_in, x_test_in.loc[[best_point]]])
        y_train_in = pd.concat([y_train_in, y_test_in.loc[[best_point]]])
        x_test_in, y_test_in = x_test_in.drop(best_point), y_test_in.drop(best_point)
        appended_points = list(x_train_in.index)
        iteration += 1
    return results, appended_points

In [None]:
def perfect_model_regression(dataset, random_state=None):
    """Greedily grow a training set for a random-forest regressor.

    The pickle ``x_and_y/{dataset}.pickle`` is read as a count followed by
    that many objects; objects 0 and 1 are used as the train/test feature
    frames and objects 2 and 3 as the train/test target frames (targets are
    read from the ``'pKi'`` column).

    Ten rows of the training split seed the model and the rest form a pool.
    On every iteration the current model is scored on the external test set,
    then up to ``number_of_points`` pool rows are tried one at a time: each is
    temporarily added to the training set, a forest is refit, and its R^2 on
    the remaining pool is recorded. The first candidate with the highest R^2
    is moved into the training set for good.

    The loop stops when the training set reaches 150 rows, or earlier when
    fewer than three pool rows remain (R^2 needs at least two rows in the
    remainder to be defined).

    Parameters
    ----------
    dataset : str
        Name of the pickle inside ``x_and_y/`` (without extension).
    random_state : int or None, optional
        Seed for the initial split and the candidate sampling. ``None``
        (default) keeps the original unseeded behaviour, i.e. the global
        ``random`` module state is used for sampling.

    Returns
    -------
    results : list of [int, float, float]
        ``[iteration, r2, rmse]`` on the external test set, one entry per
        fitted training-set size.
    appended_points : list
        Index labels of the final training set, in insertion order.

    Raises
    ------
    ValueError
        From ``train_test_split`` when the training split has 10 rows or
        fewer, because the requested pool size is then not positive.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this
    # with files produced by this project.
    data = []
    with open(f'x_and_y/{dataset}.pickle', "rb") as f:
        for _ in range(pickle.load(f)):
            data.append(pickle.load(f))
    x_train_ex, x_test_ex, y_train_ex, y_test_ex = data[0], data[1], data[2], data[3]

    # Seed set of 10 rows; everything else becomes the candidate pool.
    x_train_in, x_test_in, y_train_in, y_test_in = train_test_split(
        x_train_ex, y_train_ex,
        test_size=len(y_train_ex) - 10,
        random_state=random_state,
    )
    # Fall back to the module-level generator when no seed is given so that
    # any global random.seed() set by the caller is still honoured.
    rng = random if random_state is None else random.Random(random_state)

    # Number of candidates screened per iteration depends on dataset size.
    len_of_dataset = len(x_train_ex) + len(x_test_ex)
    if len_of_dataset < 300:
        number_of_points = 100
    elif 300 <= len_of_dataset <= 600:
        number_of_points = 300
    else:
        number_of_points = 500

    results = []
    iteration = 0
    # Defined up front so the return value exists even if the loop exits on
    # its first pass.
    appended_points = list(x_train_in.index)

    while len(y_train_in) < 150:
        rfr_ex = RandomForestRegressor(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
        rfr_ex.fit(x_train_in.values, y_train_in['pKi'].values)

        pred_ex = rfr_ex.predict(x_test_ex.values)
        r2_ex = r2_score(y_test_ex.values, pred_ex)
        # sqrt(MSE) instead of squared=False, which newer scikit-learn
        # releases no longer accept.
        rmse = float(np.sqrt(mean_squared_error(y_test_ex.values, pred_ex)))

        results.append([iteration, r2_ex, rmse])

        # A candidate is scored by R^2 on the pool that remains after
        # removing it; that needs two rows, hence three pool rows in total.
        if len(x_test_in) < 3:
            break

        pool = list(x_test_in.index)
        if number_of_points < len(pool):
            adding_points = rng.sample(pool, number_of_points)
        else:
            adding_points = pool

        results_of_iteration = []
        for point in adding_points:
            # Build trial frames instead of mutating the working frames.
            # .loc[[point]] keeps a one-row DataFrame, which pd.concat needs
            # (DataFrame.append no longer exists in pandas >= 2.0).
            x_trial = pd.concat([x_train_in, x_test_in.loc[[point]]])
            y_trial = pd.concat([y_train_in, y_test_in.loc[[point]]])
            x_rest = x_test_in.drop(point)
            y_rest = y_test_in.drop(point)

            rfr = RandomForestRegressor(random_state=42, n_estimators=500, max_features='log2', n_jobs=20)
            rfr.fit(x_trial.values, y_trial['pKi'].values)
            pred = rfr.predict(x_rest.values)
            r2 = r2_score(y_rest.values, pred)
            results_of_iteration.append([point, r2])

        # max() returns the first candidate with the highest score, matching
        # the original stable descending sort.
        best_point = max(results_of_iteration, key=lambda x: x[1])[0]

        x_train_in = pd.concat([x_train_in, x_test_in.loc[[best_point]]])
        y_train_in = pd.concat([y_train_in, y_test_in.loc[[best_point]]])
        x_test_in, y_test_in = x_test_in.drop(best_point), y_test_in.drop(best_point)
        appended_points = list(x_train_in.index)
        iteration += 1
    return results, appended_points

In [15]:
def code_class(datasets):
    """Run ``perfect_model_classification`` on every dataset and checkpoint.

    After each dataset finishes, the results dict accumulated so far is
    pickled to ``new_results/perfect_model_class_{dataset}.pickle``. Each file
    therefore holds the results of that dataset and of every dataset processed
    before it in this call.

    Parameters
    ----------
    datasets : iterable of str
        Dataset names passed to ``perfect_model_classification``.

    Returns
    -------
    collections.defaultdict
        Maps dataset name to the ``(results, appended_points)`` tuple returned
        by ``perfect_model_classification``.
    """
    # Create the output directory first so a long run cannot fail at the
    # first checkpoint write.
    os.makedirs('new_results', exist_ok=True)
    result = defaultdict(dict)
    for dataset in tqdm(datasets):
        result[dataset] = perfect_model_classification(dataset)
        with open(f'new_results/perfect_model_class_{dataset}.pickle', 'wb') as f:
            pickle.dump(result, f)
    return result

In [None]:
def code_regr(datasets):
    """Run ``perfect_model_regression`` on every dataset and checkpoint.

    After each dataset finishes, the results dict accumulated so far is
    pickled to ``new_results/perfect_model_regr_{dataset}.pickle``. Each file
    therefore holds the results of that dataset and of every dataset processed
    before it in this call.

    Parameters
    ----------
    datasets : iterable of str
        Dataset names passed to ``perfect_model_regression``.

    Returns
    -------
    collections.defaultdict
        Maps dataset name to the ``(results, appended_points)`` tuple returned
        by ``perfect_model_regression``.
    """
    # Create the output directory first so a long run cannot fail at the
    # first checkpoint write.
    os.makedirs('new_results', exist_ok=True)
    result = defaultdict(dict)
    for dataset in tqdm(datasets):
        result[dataset] = perfect_model_regression(dataset)
        with open(f'new_results/perfect_model_regr_{dataset}.pickle', 'wb') as f:
            pickle.dump(result, f)
    return result