# Random Forest Self-Distillation

In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# check whether PyTorch can use a CUDA device in this environment
torch.cuda.is_available()

True

## Import section

In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
import os
from tqdm import tqdm

## Data Preprocessing

In [4]:
# data directory (note the trailing separator) and the main dataset CSV inside it
PATH_DATA = "data/"
PATH_MAIN_DATASET = f"{PATH_DATA}df_assay_entries.csv"

In [5]:
# load whole dataset
# df = pd.read_csv(PATH_MAIN_DATASET)
# df.head(10)

In [10]:
# split into experiments
def experiment_split(
    df: pd.DataFrame,
    PATH_DATA: str
) -> np.ndarray:
    """Split the dataset into one CSV file per experiment (unique ``aid``).

    Each subset is written to ``<PATH_DATA>/experiment-wise/<aid>.csv`` and
    the array of unique aids is stored next to them as ``ToC.npy`` (the
    table of contents). An experiment whose CSV file already exists is
    skipped, so repeated calls only write what is missing; an existing file
    is never refreshed, even if ``df`` has changed.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset with an ``aid`` column identifying the experiment of each row.
    PATH_DATA : str
        Existing data directory, with or without a trailing path separator.

    Returns
    -------
    np.ndarray
        Sorted unique values of ``df.aid``.

    Raises
    ------
    AssertionError
        If ``PATH_DATA`` does not exist.
    """
    # the data directory itself must already be present; only the
    # experiment sub-folder is created on demand
    assert os.path.exists(PATH_DATA), f"data directory not found: {PATH_DATA}"

    # join path components instead of concatenating strings, so that a
    # PATH_DATA without a trailing separator still resolves to a sub-folder
    # of the data directory rather than to a sibling with a fused name
    path_experiments = os.path.join(PATH_DATA, "experiment-wise")
    os.makedirs(path_experiments, exist_ok=True)

    # get unique aids
    aid_unique = np.unique(df.aid.to_numpy())

    # save aids as a content table (overwritten on every call)
    np.save(os.path.join(path_experiments, "ToC.npy"), aid_unique)

    # iterate over aids and compute subset - save subset to file
    for aid in tqdm(aid_unique):
        # create file name
        file_name = os.path.join(path_experiments, str(aid) + ".csv")

        # check if the dataset to this has already been created
        if os.path.isfile(file_name):
            continue

        # get subset and save it to the experiment folder
        subset = df[df.aid == aid]
        subset.to_csv(
            path_or_buf=file_name,
            index=False
        )

    # return experiment ids
    return aid_unique

In [11]:
# load and split into experiments
def experiment_loadsplit(
    PATH_DATA: str,
    PATH_MAIN_DATASET: str = None
) -> np.ndarray:
    """Load the main dataset from CSV and split it into per-experiment files.

    Parameters
    ----------
    PATH_DATA : str
        Data directory handed on to ``experiment_split``.
    PATH_MAIN_DATASET : str, optional
        Path of the CSV file to load. When omitted (``None``) it defaults to
        ``df_assay_entries.csv`` inside ``PATH_DATA``.

    Returns
    -------
    np.ndarray
        The experiment ids returned by ``experiment_split``.
    """
    # resolve the default at call time from the PATH_DATA *argument*; a
    # default expression in the signature would be evaluated once, at
    # definition time, against the module-level PATH_DATA instead
    if PATH_MAIN_DATASET is None:
        PATH_MAIN_DATASET = os.path.join(PATH_DATA, "df_assay_entries.csv")

    # load the dataset into memory
    df = pd.read_csv(PATH_MAIN_DATASET)

    # execute normal split
    return experiment_split(df, PATH_DATA)

In [12]:
# run the load-and-split step on the main dataset; the returned array of
# experiment ids is displayed as the cell output
experiment_loadsplit(PATH_DATA, PATH_MAIN_DATASET)

100%|████████████████████████████████████████████████████████████████████████████| 2481/2481 [00:00<00:00, 7865.47it/s]


array([    411,     519,     523, ..., 1347425, 1479145, 1479148],
      dtype=int64)

## Individual Data Loading