# Random Forest Self Distillation

In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report whether PyTorch can see a usable CUDA device from this kernel.
torch.cuda.is_available()

True

## Import section

In [1]:
from selfdest_toolkit.data_tools import preprocessing, loading, cleaning, sd_data_utils

In [5]:
import json
import os

import numpy as np
import pandas as pd
import sklearn
from rdkit.Chem import MolFromSmiles, RDKFingerprint
from rdkit.Chem.Descriptors import descList
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_validate
from tqdm import tqdm

## Data Preprocessing

In [3]:
# Directory holding the dataset files (the trailing slash is part of the value).
PATH_DATA = "data/"
# Main assay-entries CSV, located directly inside PATH_DATA.
PATH_MAIN_DATASET = f"{PATH_DATA}df_assay_entries.csv"

In [6]:
# Run the toolkit's whole-experiment preprocessing on the main dataset file.
# The result is kept in `aids` (presumably the available experiment ids — TODO confirm).
aids = preprocessing.experiment_whole_preprocess(PATH_MAIN_DATASET, PATH_DATA)

Data file already present, no need for download.


100%|████████████████████████████████████████████████████████████████████████████| 2481/2481 [00:00<00:00, 5353.54it/s]


Chemical descriptor data already generated
Fingerprints already generated


## Individual Data Loading

In [7]:
# Load the chemical-descriptor features and the labels for experiment 411.
c_sampledata, c_samplelabel = loading.load_chem_desc_data(411)

In [8]:
# Load the fingerprint features and the labels for the same experiment (411).
f_sampledata, f_samplelabel = loading.load_fingerprint_data(411)

In [9]:
# Display the chemical-descriptor feature array returned by load_chem_desc_data.
c_sampledata

array([[12.0147147 , -0.2333767 , 12.0147147 , ...,  0.        ,
         1.        ,  0.        ],
       [13.68844293, -1.04921672, 13.68844293, ...,  0.        ,
         0.        ,  0.        ],
       [13.75921567, -0.69660748, 13.75921567, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [12.81165769, -1.13973262, 12.81165769, ...,  0.        ,
         0.        ,  0.        ],
       [13.06424981, -1.12117352, 13.06424981, ...,  0.        ,
         0.        ,  0.        ],
       [12.39603269, -1.12400132, 12.39603269, ...,  0.        ,
         0.        ,  0.        ]])

In [10]:
# Display the label array that came with the chemical-descriptor data.
c_samplelabel

array([0, 0, 0, ..., 0, 0, 0])

In [11]:
# Display the fingerprint feature array returned by load_fingerprint_data.
f_sampledata

array([[0, 1, 1, ..., 0, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [0, 1, 1, ..., 0, 0, 1],
       ...,
       [1, 1, 0, ..., 0, 0, 1],
       [1, 1, 0, ..., 0, 0, 1],
       [1, 1, 0, ..., 0, 0, 1]])

In [12]:
# Display the label array that came with the fingerprint data.
f_samplelabel

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
# Shapes of the descriptor matrix and the fingerprint matrix, one per line.
print(c_sampledata.shape, f_sampledata.shape, sep="\n")

(68285, 208)
(68285, 2048)


In [14]:
# Sum of the label array; assuming labels are 0/1, this is the number of positive samples.
f_samplelabel.sum()

1536

## Preloading all data

In [4]:
# loading.preload_fingerprint_data_all(aids)

In [5]:
# loading.preload_chem_data_all(aids)

## Preparing for self distillation

In [4]:
# Experiment id (`aid`) passed to the self-distillation data generation below.
aid = 411

In [5]:
# Load the chemical-descriptor features and labels for the experiment selected by `aid`.
# Using `aid` (instead of repeating the literal id) keeps this cell in sync with the
# self-distillation data generation, which is also keyed on `aid`.
data, labels = loading.load_chem_desc_data(aid)

In [6]:
# Size of the self-distillation set, expressed as a fraction of the real dataset size.
SD_FRACTION = 0.2

# Number of elements to fetch for self-distillation: SD_FRACTION of the number of
# real samples, rounded half-up to the nearest integer.
number_sd = int(data.shape[0] * SD_FRACTION + 0.5)

In [7]:
# Ask the toolkit for `number_sd` self-distillation elements for experiment `aid`,
# generated with the "chem-desc" method so they match the features loaded into `data`.
# These elements carry no labels here; the teacher forest's predictions are used as
# their labels later in the notebook.
sd_data = sd_data_utils.generate_self_distillation_elements(
    aid=aid,
    number_to_generate=number_sd,
    data_gen_method="chem-desc",
    path_data=PATH_DATA
)

## Normal Random Forest

In [9]:
from selfdest_toolkit.randomforest_tools import creation, normal
import json
import typing

In [2]:
# Experiment id for the normal random forest run. This repeats the value already
# assigned to `aid` in the self-distillation preparation section.
aid = 411

In [3]:
# Build the toolkit's default random forest (its configuration lives in
# selfdest_toolkit.randomforest_tools.creation and is not visible in this notebook).
rf = creation.generate_default_rf()

In [4]:
# Evaluate the plain random forest on experiment `aid` using chemical-descriptor features.
# The result maps each metric name to a list of values (presumably one per
# cross-validation fold — TODO confirm against the toolkit).
accuracy_dict = normal.execute_normal_rf_test(
    rf=rf,
    aid=aid,
    mode="chem-desc"
)

5it [03:29, 41.84s/it]


In [7]:
# Pretty-print the metric lists as indented JSON.
print(json.dumps(accuracy_dict, indent=4))

{
    "accuracy": [
        0.978692245734788,
        0.9787654682580361,
        0.9786190232115398,
        0.9789119133045324,
        0.9781796880720509
    ],
    "balanced_accuracy": [
        0.5435619563493516,
        0.5404169869096853,
        0.5419332918542376,
        0.5516303724578804,
        0.5336788932095696
    ],
    "roc": [
        0.5435619563493517,
        0.5404169869096854,
        0.5419332918542376,
        0.5516303724578804,
        0.5336788932095696
    ],
    "precision": [
        0.9733954812766568,
        0.9743169963902334,
        0.9731493164173118,
        0.9737573577524699,
        0.9716583967760595
    ],
    "recall": [
        0.978692245734788,
        0.9787654682580361,
        0.9786190232115398,
        0.9789119133045324,
        0.9781796880720509
    ]
}


In [10]:
def convert_acc_dict(
    acc_dict: typing.Dict[str, typing.List[float]]
) -> typing.Dict[str, float]:
    """Collapse each metric's list of values into its arithmetic mean.

    Args:
        acc_dict: Mapping from metric name to the list of values recorded
            for that metric.

    Returns:
        A new dict with the same keys (in the same order), where every value
        is the mean of the corresponding input list.

    Raises:
        ZeroDivisionError: If any of the lists is empty.
    """
    return {
        metric: sum(scores) / len(scores)
        for metric, scores in acc_dict.items()
    }

In [11]:
# Average every metric of the normal random forest run over its list of values
# and pretty-print the resulting name -> mean mapping.
normal_mean_dict = convert_acc_dict(accuracy_dict)
print(json.dumps(normal_mean_dict, indent=4))

{
    "accuracy": 0.9786336677161895,
    "balanced_accuracy": 0.5422443001561448,
    "roc": 0.5422443001561449,
    "precision": 0.9732555097225463,
    "recall": 0.9786336677161895
}


## Self distillation Random Forest

In [10]:
# Pass the real data and the self-distillation data through the toolkit's
# scale/clean helper; it returns the transformed version of each input
# (the exact scaling method is defined in the toolkit, not in this notebook).
scaled_data, scaled_sdist = cleaning.rf_scale_clean_data(data, sd_data)

In [11]:
# Display the transformed self-distillation array.
scaled_sdist

array([[-0.66396962,  0.53449563, -0.66396962, ..., -0.27632667,
        -0.16993454, -0.21708494],
       [-0.85064459, -0.00846818, -0.85064459, ..., -0.27632667,
        -0.16993454, -0.21708494],
       [-1.06951623,  1.14994239, -1.06951623, ..., -0.27632667,
        -0.16993454, -0.21708494],
       ...,
       [ 0.14740551,  0.53735336,  0.14740551, ..., -0.27632667,
        -0.16993454, -0.21708494],
       [ 0.68401431,  0.25112809,  0.68401431, ..., -0.27632667,
        -0.16993454, -0.21708494],
       [ 0.49936869,  0.07506617,  0.49936869, ..., -0.27632667,
        -0.16993454, -0.21708494]])

In [12]:
# Display the transformed real-data array.
scaled_data

array([[ 0.11232432,  0.51223857,  0.11232432, ..., -0.27632667,
         1.53590601, -0.21708494],
       [ 0.87025184, -0.04022904,  0.87025184, ..., -0.27632667,
        -0.16993454, -0.21708494],
       [ 0.90230042,  0.19854962,  0.90230042, ..., -0.27632667,
        -0.16993454, -0.21708494],
       ...,
       [ 0.47321028, -0.10152427,  0.47321028, ..., -0.27632667,
        -0.16993454, -0.21708494],
       [ 0.58759356, -0.08895649,  0.58759356, ..., -0.27632667,
        -0.16993454, -0.21708494],
       [ 0.28499954, -0.09087141,  0.28499954, ..., -0.27632667,
        -0.16993454, -0.21708494]])

In [20]:
# Teacher forest: fitted on the real training folds only in the cross-validation cell.
# It uses the same fixed seed as the student so both forests are configured identically.
rf1 = RandomForestClassifier(
    class_weight="balanced",
    random_state=131313
)

In [21]:
# Student forest: fitted on the real training folds merged with the
# self-distillation data labelled by the teacher (see the cross-validation cell).
rf2 = RandomForestClassifier(
    class_weight="balanced",
    random_state=131313
)

In [22]:
# Stratified k-fold comparison of the teacher forest (rf1) and the student forest (rf2).
#
# For every fold the teacher is fitted on the real training data and labels the
# self-distillation set; the student is then fitted on the real training data merged
# with that teacher-labelled set. Both are scored on the same held-out test fold.
#
# After this cell each of `accuracy`, `balanced_accuracy`, `roc`, `precision` and
# `recall` is a two-element list [teacher_mean, student_mean], averaged over the folds.

METRIC_NAMES = ("accuracy", "balanced_accuracy", "roc", "precision", "recall")


def _score_predictions(y_true, y_pred):
    """Compute all tracked metrics for one set of hard class predictions.

    Args:
        y_true: Ground-truth labels of the test fold.
        y_pred: Predicted class labels for the same samples.

    Returns:
        Dict mapping every name in METRIC_NAMES to its score.
    """
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average="weighted"),
        "recall": recall_score(y_true, y_pred, average="weighted"),
        # NOTE(review): ROC-AUC is computed from hard labels, not from predicted
        # probabilities, which is why it mirrors the balanced accuracy. Kept as-is;
        # consider scoring with predict_proba if a real AUC is wanted.
        "roc": roc_auc_score(y_true, y_pred, average="weighted"),
    }


n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=131313)

# one metric dict per fold, collected separately for teacher and student
teacher_folds = []
student_folds = []

# total= lets tqdm show real progress; split() is a generator without a length
for train_index, test_index in tqdm(skf.split(scaled_data, labels), total=n_splits):
    # split dataset
    x_train, y_train = scaled_data[train_index], labels[train_index]
    x_test, y_test = scaled_data[test_index], labels[test_index]

    # TEACHER RANDOM FOREST
    rf1.fit(x_train, y_train)
    # teacher predictions become the labels of the self-distillation data
    sd_pred = rf1.predict(scaled_sdist)
    pred = rf1.predict(x_test)
    teacher_folds.append(_score_predictions(y_test, pred))

    # STUDENT RANDOM FOREST
    rf2.fit(*sd_data_utils.merge_data_self_dist_data(x_train, scaled_sdist, y_train, sd_pred))
    pred = rf2.predict(x_test)
    student_folds.append(_score_predictions(y_test, pred))

# average every metric over the folds: index 0 = teacher, index 1 = student
averaged_scores = {
    name: [
        sum(fold[name] for fold in teacher_folds) / n_splits,
        sum(fold[name] for fold in student_folds) / n_splits,
    ]
    for name in METRIC_NAMES
}

accuracy = averaged_scores["accuracy"]
balanced_accuracy = averaged_scores["balanced_accuracy"]
roc = averaged_scores["roc"]
precision = averaged_scores["precision"]
recall = averaged_scores["recall"]

5it [07:22, 88.57s/it] 


0.9786043787068902

In [23]:
# Print every metric as "<name>:  [teacher, student]".
for metric_label, metric_scores in (
    ("accuracy: ", accuracy),
    ("balanced accuracy: ", balanced_accuracy),
    ("roc: ", roc),
    ("precision: ", precision),
    ("recall: ", recall),
):
    print(metric_label, metric_scores)

accuracy:  [4.892509335871714, 4.893021893534451]
balanced accuracy:  [2.7013212878385744, 2.6888802230176188]
roc:  [2.7013212878385744, 2.6888802230176188]
precision:  [4.863548589555977, 4.868850278875003]
recall:  [4.892509335871714, 4.893021893534451]


In [24]:
# For each metric, print whether the student score (index 1) is strictly higher
# than the teacher score (index 0).
for teacher_score, student_score in (accuracy, balanced_accuracy, roc, precision, recall):
    print(teacher_score < student_score)

True
False
False
True
True


## Testing for self distillation success for a number of experiments

In [1]:
from selfdest_toolkit.data_tools import analysis

In [2]:
# Ask the toolkit for 10 experiment ids on which to test self-distillation
# (the selection criteria behind "good" are defined in the toolkit's analysis module).
exp_to_test = analysis.get_good_experiment_ids(
    number_to_sample=10
)

100%|██████████████████████████████████████████████████████████████████████████████| 2481/2481 [01:52<00:00, 22.02it/s]


In [3]:
# Display the sampled experiment ids.
exp_to_test

array([  1688, 624297,    902, 485314, 651965,   1461,   1458, 485313,
       652104,   2551], dtype=int64)