# Random Forest Self Distillation

In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.is_available()

True

## Import section

In [1]:
from selfdest_toolkit.data_tools import preprocessing, loading
from selfdest_toolkit.data_tools import sd_data_utils

In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
import os
from tqdm import tqdm
import json
from rdkit.Chem.Descriptors import descList
from rdkit.Chem import MolFromSmiles, RDKFingerprint
from sklearn.model_selection import cross_validate

## Data Preprocessing

In [3]:
# path to dataset
PATH_DATA = "data/"
PATH_MAIN_DATASET = PATH_DATA + "df_assay_entries.csv"

In [6]:
aids = preprocessing.experiment_whole_preprocess(PATH_MAIN_DATASET, PATH_DATA)

Data file already present, no need for download.


100%|████████████████████████████████████████████████████████████████████████████| 2481/2481 [00:00<00:00, 5353.54it/s]


Chemical descriptor data already generated
Fingerprints already generated


## Individual Data Loading

In [7]:
c_sampledata, c_samplelabel = loading.load_chem_desc_data(411)

In [8]:
f_sampledata, f_samplelabel = loading.load_fingerprint_data(411)

In [9]:
c_sampledata

array([[12.0147147 , -0.2333767 , 12.0147147 , ...,  0.        ,
         1.        ,  0.        ],
       [13.68844293, -1.04921672, 13.68844293, ...,  0.        ,
         0.        ,  0.        ],
       [13.75921567, -0.69660748, 13.75921567, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [12.81165769, -1.13973262, 12.81165769, ...,  0.        ,
         0.        ,  0.        ],
       [13.06424981, -1.12117352, 13.06424981, ...,  0.        ,
         0.        ,  0.        ],
       [12.39603269, -1.12400132, 12.39603269, ...,  0.        ,
         0.        ,  0.        ]])

In [10]:
c_samplelabel

array([0, 0, 0, ..., 0, 0, 0])

In [11]:
f_sampledata

array([[0, 1, 1, ..., 0, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [0, 1, 1, ..., 0, 0, 1],
       ...,
       [1, 1, 0, ..., 0, 0, 1],
       [1, 1, 0, ..., 0, 0, 1],
       [1, 1, 0, ..., 0, 0, 1]])

In [12]:
f_samplelabel

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
print(c_sampledata.shape)
print(f_sampledata.shape)

(68285, 208)
(68285, 2048)


In [14]:
f_samplelabel.sum()

1536

## Preloading all data

In [15]:
# loading.preload_fingerprint_data_all(aids)

In [16]:
# loading.preload_chem_data_all(aids)

## Preparing for self distillation

In [4]:
# define experiment id
aid = 411

In [5]:
# get the prediction data
data, labels = loading.load_chem_desc_data(411)

In [6]:
# determine number of elements to fetch for self destillation
number_sd = 5 # set to percentage perhaps

In [7]:
# get self distillation elements
sd_data = sd_data_utils.generate_self_distillation_elements(
    aid=aid,
    number_to_generate=number_sd,
    data_gen_method="chem-desc",
    path_data=PATH_DATA
)

## Normal Random Forest

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, roc_auc_score

In [9]:
scaler = StandardScaler()

In [10]:
scaled_data = scaler.fit_transform(np.nan_to_num(data))

In [11]:
np.nan_to_num(scaled_data)

array([[ 0.13386196,  0.49298247,  0.13386196, ..., -0.2739182 ,
         1.79498337, -0.21515261],
       [ 0.88315961, -0.06740821,  0.88315961, ..., -0.2739182 ,
        -0.193957  , -0.21515261],
       [ 0.91484327,  0.17479483,  0.91484327, ..., -0.2739182 ,
        -0.193957  , -0.21515261],
       ...,
       [ 0.49063882, -0.12958249,  0.49063882, ..., -0.2739182 ,
        -0.193957  , -0.21515261],
       [ 0.60371971, -0.11683447,  0.60371971, ..., -0.2739182 ,
        -0.193957  , -0.21515261],
       [ 0.30457108, -0.11877685,  0.30457108, ..., -0.2739182 ,
        -0.193957  , -0.21515261]])

In [15]:
rf = RandomForestClassifier(
    class_weight="balanced",
    random_state=131313
)

In [37]:
cv_results = cross_validate(rf, scaled_data, labels, cv=3)

In [38]:
cv_results

{'fit_time': array([31.26763964, 34.84175944, 33.04539967]),
 'score_time': array([0.46308899, 0.64827251, 0.46558285]),
 'test_score': array([0.97724277, 0.97825323, 0.97895523])}

In [13]:
np.any(data>=10000000)

True

In [16]:
from tqdm import tqdm

In [17]:
accuracy = []
balanced_accuracy = []
roc = []
precision = []
recall = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=131313)
for train_index, test_index in tqdm(skf.split(scaled_data, labels)):
    # split dataset
    x_train, y_train = scaled_data[train_index], labels[train_index]
    x_test, y_test = scaled_data[test_index], labels[test_index]
    # fit data
    rf.fit(x_train, y_train)
    # generate prediction
    pred = rf.predict(x_test)
    # calculate and append accuracies
    accuracy.append(accuracy_score(y_test, pred))
    balanced_accuracy.append(balanced_accuracy_score(y_test, pred))
    precision.append(precision_score(y_test, pred, average="weighted"))
    recall.append(recall_score(y_test, pred, average="weighted"))
    roc.append(roc_auc_score(y_test, pred, average="weighted"))

5it [02:50, 34.06s/it]


In [18]:
print("accuracy: ", accuracy)
print("balanced accuracy: ", balanced_accuracy)
print("roc: ", roc)
print("precision: ", precision)
print("recall: ", recall)

accuracy:  [0.978692245734788, 0.9787654682580361, 0.9786190232115398, 0.9789119133045324, 0.9781796880720509]
balanced accuracy:  [0.5435619563493516, 0.5404169869096853, 0.5419332918542376, 0.5516303724578804, 0.5336788932095696]
roc:  [0.5435619563493517, 0.5404169869096854, 0.5419332918542376, 0.5516303724578804, 0.5336788932095696]
precision:  [0.9733954812766568, 0.9743169963902334, 0.9731493164173118, 0.9737573577524699, 0.9716583967760595]
recall:  [0.978692245734788, 0.9787654682580361, 0.9786190232115398, 0.9789119133045324, 0.9781796880720509]
