# Random Forest Self Distillation

In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.is_available()

True

## Import section

In [1]:
from selfdest_toolkit.data_tools import preprocessing, loading, cleaning, sd_data_utils, analysis
from selfdest_toolkit.randomforest_tools import self_distillation, rf_analysis, normal, creation

import os
from tqdm import tqdm
import json
import pandas as pd
import numpy as np

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

from rdkit.Chem.Descriptors import descList
from rdkit.Chem import MolFromSmiles, RDKFingerprint

## Data Preprocessing

In [2]:
# path to dataset
PATH_DATA = "data/"
PATH_MAIN_DATASET = PATH_DATA + "df_assay_entries.csv"

In [5]:
aids = preprocessing.experiment_whole_preprocess(PATH_MAIN_DATASET, PATH_DATA)

Data file already present, no need for download.


100%|███████████████████████████████████████████████████████████████████████████| 2481/2481 [00:00<00:00, 12025.35it/s]


Chemical descriptor data already generated
Fingerprints already generated


## Individual Data Loading

In [6]:
c_sampledata, c_samplelabel = loading.load_chem_desc_data(411)

In [7]:
f_sampledata, f_samplelabel = loading.load_fingerprint_data(411)

In [8]:
c_sampledata

array([[12.0147147 , -0.2333767 , 12.0147147 , ...,  0.        ,
         1.        ,  0.        ],
       [13.68844293, -1.04921672, 13.68844293, ...,  0.        ,
         0.        ,  0.        ],
       [13.75921567, -0.69660748, 13.75921567, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [12.81165769, -1.13973262, 12.81165769, ...,  0.        ,
         0.        ,  0.        ],
       [13.06424981, -1.12117352, 13.06424981, ...,  0.        ,
         0.        ,  0.        ],
       [12.39603269, -1.12400132, 12.39603269, ...,  0.        ,
         0.        ,  0.        ]])

In [9]:
c_samplelabel

array([0, 0, 0, ..., 0, 0, 0])

In [10]:
f_sampledata

array([[0, 1, 1, ..., 0, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [0, 1, 1, ..., 0, 0, 1],
       ...,
       [1, 1, 0, ..., 0, 0, 1],
       [1, 1, 0, ..., 0, 0, 1],
       [1, 1, 0, ..., 0, 0, 1]])

In [11]:
f_samplelabel

array([0, 0, 0, ..., 0, 0, 0])

In [12]:
print(c_sampledata.shape)
print(f_sampledata.shape)

(68285, 208)
(68285, 2048)


In [13]:
f_samplelabel.sum()

1536

## Preloading all data

In [14]:
# loading.preload_fingerprint_data_all(aids)

In [15]:
# loading.preload_chem_data_all(aids)

## Preparing for self distillation

In [16]:
# define experiment id
aid = 411

In [23]:
# get the prediction data
data, labels = loading.load_chem_desc_data(aid)

In [24]:
# determine number of elements to fetch for self destillation
number_sd = int(data.shape[0]*0.2+0.5) # set to percentage perhaps

In [25]:
# get self distillation elements
sd_data = sd_data_utils.generate_self_distillation_elements(
    aid=aid,
    number_to_generate=number_sd,
    data_gen_method="chem-desc",
    path_data=PATH_DATA
)

## Normal Random Forest

In [3]:
# define experiment id
# aid = 411
aid = 1688

In [4]:
# create random forest
rf = creation.generate_default_rf()

In [5]:
# execute normal random forest testing
accuracy_dict = normal.execute_normal_rf_test(
    rf=rf,
    aid=aid,
    mode="chem-desc"
)

  temp **= 2
  new_unnormalized_variance -= correction**2 / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2
1it [00:34, 34.09s/it]


KeyboardInterrupt: 

In [None]:
print(json.dumps(accuracy_dict, indent=4))

In [None]:
normal_mean_dict = rf_analysis.convert_acc_dict(accuracy_dict)
print(json.dumps(normal_mean_dict, indent=4))

## Self distillation Random Forest

In [25]:
# generating random forests
rf_teacher = creation.generate_default_rf()
rf_student = creation.generate_default_rf()

In [26]:
normal_accuracy_dict, sd_accuracy_dict = self_distillation.execute_sd_rf_test(
    rf_teacher=rf_teacher,
    rf_student=rf_student,
    aid=aid,
    mode="chem-desc"
)

5it [06:50, 82.12s/it]


In [27]:
normal_mean_dict = rf_analysis.convert_acc_dict(normal_accuracy_dict)
sd_mean_dict = rf_analysis.convert_acc_dict(sd_accuracy_dict)

In [28]:
print(json.dumps(normal_mean_dict, indent=4))

{
    "accuracy": 0.9785018671743428,
    "balanced_accuracy": 0.5402642575677149,
    "roc": 0.5402642575677149,
    "precision": 0.9727097179111954,
    "recall": 0.9785018671743428
}


In [29]:
print(json.dumps(sd_mean_dict, indent=4))

{
    "accuracy": 0.9786043787068902,
    "balanced_accuracy": 0.5377760446035238,
    "roc": 0.5377760446035238,
    "precision": 0.9737700557750006,
    "recall": 0.9786043787068902
}


In [30]:
comparison_dict = rf_analysis.compare_accuracy_dict(normal_mean_dict, sd_mean_dict)

In [31]:
print(json.dumps(comparison_dict, indent=4))

{
    "accuracy": 0.00010251153254736689,
    "balanced_accuracy": -0.0024882129641911055,
    "roc": -0.0024882129641911055,
    "precision": 0.0010603378638052163,
    "recall": 0.00010251153254736689
}


## Testing for self distillation success for a number of experiments

### Chemical descriptor data mode

In [32]:
# get the experiment list to test
exp_to_test = analysis.get_good_experiment_ids(
    number_to_sample=10
)

100%|██████████████████████████████████████████████████████████████████████████████| 2481/2481 [01:28<00:00, 28.15it/s]


In [33]:
exp_to_test

array([   902,   1458,   1461,   1688,   2551, 485313, 485314, 624297,
       651965, 652104], dtype=int64)

In [34]:
# take balanced accuracy and roc into account as it seems to be the most expressive out of all, 
# because it actually is not over 90% from the beginning and from the documentation it seemed 
# to be the best for imbalanced datasets. Although it is astonishing that the average parameter
# to weighted does not work. Maybe a weight needs to be supplied besides the parameter but I
# rather go with the other options

In [35]:
# better worse counter
roc_better = []
roc_worse = []
ba_better = []
ba_worse = []

In [36]:
mode = "chem-desc"
# mode = "fingerprint"

In [37]:
# iterate over experiments to test
for aid in tqdm(exp_to_test):
    
    # generating random forests
    rf_teacher = creation.generate_default_rf()
    rf_student = creation.generate_default_rf()
    
    # execute self destillation test
    normal_accuracy_dict, sd_accuracy_dict = self_distillation.execute_sd_rf_test(
        rf_teacher=rf_teacher,
        rf_student=rf_student,
        aid=aid,
        mode=mode,
        verbose=False
    )
    
    # calculate mean accuracy values
    normal_mean_dict = rf_analysis.convert_acc_dict(normal_accuracy_dict)
    sd_mean_dict = rf_analysis.convert_acc_dict(sd_accuracy_dict)
    
    # get the difference in accuracy scores
    comparison_dict = rf_analysis.compare_accuracy_dict(normal_mean_dict, sd_mean_dict)
    
    # analyze it
    if comparison_dict["roc"] > 0:
        roc_better.append(aid)
    else:
        roc_worse.append(aid)
    if comparison_dict["balanced_accuracy"] > 0:
        ba_better.append(aid)
    else:
        ba_worse.append(aid)

  temp **= 2
  new_unnormalized_variance -= correction**2 / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2
 30%|███████████████████████                                                      | 3/10 [1:01:52<2:24:23, 1237.66s/it]


ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
print("roc wise {}({}%) experiments worked better with self destillation".format(len(roc_better), len(roc_better)/(len(roc_better)+len(roc_worse))))
print("balanced accuracy wise {}({}%) experiments worked better with self destillation".format(len(ba_better), len(ba_better)/(len(ba_better)+len(ba_worse))))

### Fingerprint mode

In [None]:
# get the experiment list to test
exp_to_test = analysis.get_good_experiment_ids(
    number_to_sample=10
)

In [None]:
exp_to_test

In [None]:
# take balanced accuracy and roc into account as it seems to be the most expressive out of all, 
# because it actually is not over 90% from the beginning and from the documentation it seemed 
# to be the best for imbalanced datasets. Although it is astonishing that the average parameter
# to weighted does not work. Maybe a weight needs to be supplied besides the parameter but I
# rather go with the other options

In [None]:
# better worse counter
roc_better = []
roc_worse = []
ba_better = []
ba_worse = []

In [None]:
# mode = "chem-desc"
mode = "fingerprint"

In [None]:
# iterate over experiments to test
for aid in tqdm(exp_to_test):
    
    # generating random forests
    rf_teacher = creation.generate_default_rf()
    rf_student = creation.generate_default_rf()
    
    # execute self destillation test
    normal_accuracy_dict, sd_accuracy_dict = self_distillation.execute_sd_rf_test(
        rf_teacher=rf_teacher,
        rf_student=rf_student,
        aid=aid,
        mode=mode,
        verbose=False
    )
    
    # calculate mean accuracy values
    normal_mean_dict = rf_analysis.convert_acc_dict(normal_accuracy_dict)
    sd_mean_dict = rf_analysis.convert_acc_dict(sd_accuracy_dict)
    
    # get the difference in accuracy scores
    comparison_dict = rf_analysis.compare_accuracy_dict(normal_mean_dict, sd_mean_dict)
    
    # analyze it
    if comparison_dict["roc"] > 0:
        roc_better.append(aid)
    else:
        roc_worse.append(aid)
    if comparison_dict["balanced_accuracy"] > 0:
        ba_better.append(aid)
    else:
        ba_worse.append(aid)

In [None]:
print("roc wise {}({}%) experiments worked better with self destillation".format(len(roc_better), len(roc_better)/(len(roc_better)+len(roc_worse))))
print("balanced accuracy wise {}({}%) experiments worked better with self destillation".format(len(ba_better), len(ba_better)/(len(ba_better)+len(ba_worse))))