# Random Forest Self Destillation

In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.is_available()

True

## Import section

In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
import os
from tqdm import tqdm
import json
from rdkit.Chem.Descriptors import descList
from rdkit.Chem import MolFromSmiles, RDKFingerprint

## Data Preprocessing

In [4]:
# path to dataset
PATH_DATA = "data/"
PATH_MAIN_DATASET = PATH_DATA + "df_assay_entries.csv"

In [5]:
def check_data_file(
    PATH_MAIN_DATASET: str
):
    #check if data folder is present and if not create it
    # if not os.path.exists(PATH_DATA):
    #     os.makedirs(PATH_DATA)
    # if file not present - download it from link to ucloud
    if not os.path.isfile(PATH_MAIN_DATASET):
        print("Starting download of dataset")
        urlretrieve("https://ucloud.univie.ac.at/index.php/s/qRoEjX26GVH5jnx/download?path=%2F&files=df_assay_entries.csv&downloadStartSecret=0t4mnkxbz8am", PATH_MAIN_DATASET)
        print("Download finished")
    else:
        print("Data file already present")

In [6]:
check_data_file(PATH_MAIN_DATASET)

Data file already present


In [7]:
# load whole dataset
# df = pd.read_csv(PATH_MAIN_DATASET)
# df.head(10)

In [8]:
# split into experiments
def experiment_preprocess(
    PATH_MAIN_DATASET: str,
    PATH_DATA: str
) -> np.ndarray:
    
    # load the dataset into memory
    df = pd.read_csv(PATH_MAIN_DATASET)

    # check if folder is present and if not create it
    assert os.path.exists(PATH_DATA)
    PATH_EXPERIMENTS = PATH_DATA + "experiment-wise/"
    if not os.path.exists(PATH_EXPERIMENTS):
        os.makedirs(PATH_EXPERIMENTS)

    # get unique aids
    aid_unique = np.unique(df.aid.to_numpy())
    
    # save aids as a content table
    np.save(PATH_EXPERIMENTS + "ToC.npy", aid_unique)

    # iterate over aids and compute subset - save subset to file
    for id in tqdm(aid_unique):
        # create file name
        file_name = PATH_EXPERIMENTS + str(id) + ".csv"
        
        # check if the dataset to this has already been created
        if os.path.isfile(file_name):
            continue
        
        # get subset
        subset = df[df.aid==id]
        # save subset to folder
        subset.to_csv(
            path_or_buf = file_name,
            index = False
        )
    
    # check if smiles string file is present or else write list of all smiles strings to file
    FILENAME_SMILES = "smiles.npy"
    if not os.path.isfile(PATH_DATA + FILENAME_SMILES):
        np.save(PATH_DATA + FILENAME_SMILES, np.unique(df.smiles.to_numpy()))
    
    # return experiment ids
    return aid_unique

In [9]:
def generate_chem_smiles(
    PATH_DATA: str,
    PATH_MAIN_DATASET: str
):
    # create saving paths
    CHEM_DATA_PATH = {
        "map": PATH_DATA + "chem-desc_map.npy",
        "data": PATH_DATA + "chem-desc_data.npy"
    }
    
    # check if data already exists
    if (not os.path.isfile(CHEM_DATA_PATH["map"])) and (not os.path.isfile(CHEM_DATA_PATH["data"])):
        print("Generating chemical descriptor data")
        
        # load dataframe
        df = pd.read_csv(PATH_MAIN_DATASET)

        # select subset of dataframe
        df = df[['cid', 'smiles']].sort_values(by=['cid']).drop_duplicates(subset=['cid']).reset_index()

        # pre-allocate storage to put data into
        storage = np.zeros((len(df), 208))

        # iterate over rows of dataset
        for idx, row in tqdm(df.iterrows()):
            storage[idx, :] = np.array([func(MolFromSmiles(row.smiles)) for _, func in descList])

        # save resulting data into files
        np.save(CHEM_DATA_PATH["map"], df.cid.to_numpy())
        np.save(CHEM_DATA_PATH["data"], storage)
    else:
        print("Chemical descriptor data already generated")
    return

In [10]:
# generate_chem_smiles(PATH_DATA, PATH_MAIN_DATASET)

In [11]:
def generate_fingerprints(
    PATH_DATA: str,
    PATH_MAIN_DATASET: str
):
    # create saving paths
    FINGERPRINT_DATA_PATH = {
        "map": PATH_DATA + "fingerprints_map.npy",
        "data": PATH_DATA + "fingerprints_data.npy"
    }
    
    # check if data already exists
    if (not os.path.isfile(FINGERPRINT_DATA_PATH["map"])) and (not os.path.isfile(FINGERPRINT_DATA_PATH["data"])):
        print("Generating fingerprints")
        
        # load dataframe
        df = pd.read_csv(PATH_MAIN_DATASET)

        # select subset of dataframe
        df = df[['cid', 'smiles']].sort_values(by=['cid']).drop_duplicates(subset=['cid']).reset_index()

        # pre-allocate storage to put data into
        storage = np.zeros((len(df), 2048))

        # iterate over rows of dataset
        for idx, row in tqdm(df.iterrows()):
            storage[idx, :] = RDKFingerprint(MolFromSmiles(row.smiles))

        # save resulting data into files
        np.save(FINGERPRINT_DATA_PATH["map"], df.cid.to_numpy())
        np.save(FINGERPRINT_DATA_PATH["data"], storage)
    else:
        print("Fingerprints already generated")
    return

In [12]:
# generate_fingerprints(PATH_DATA, PATH_MAIN_DATASET)

In [13]:
def experiment_whole_preprocess(
    PATH_DATA: str,
    PATH_MAIN_DATASET: str = PATH_DATA + "df_assay_entries.csv"
) -> np.ndarray:

    # check if dataset is downloaded
    check_data_file(PATH_MAIN_DATASET)
    
    # execute normal split preprocessing
    aids = experiment_preprocess(PATH_MAIN_DATASET, PATH_DATA)
    
    # generate the chemical descriptor data
    generate_chem_smiles(PATH_DATA, PATH_MAIN_DATASET)
    
    # generate the fingerprint data
    generate_fingerprints(PATH_DATA, PATH_MAIN_DATASET)
    
    return

In [14]:
experiment_whole_preprocess(PATH_DATA, PATH_MAIN_DATASET)

Data file already present


100%|███████████████████████████████████████████████████████████████████████████| 2481/2481 [00:00<00:00, 12428.81it/s]


Chemical descriptor data already generated
Fingerprints already generated


## Individual Data Loading

In [15]:
def load_pure_data(
    aid_to_load: int,
    PATH_EXPERIMENTS: str = "data/experiment-wise/"
) -> pd.DataFrame:
    
    # create file path
    file_path = PATH_EXPERIMENTS + str(aid_to_load) + ".csv"
    
    # check if file/path exists
    if not os.path.isfile(file_path):
        raise Exception('The experiment data with id {} could not be loaded. Either this experiment id is invalid or the data has yet to be split using the experiment_loadsplit(...) function.'.format(aid_to_load))
    
    # load and return data
    return pd.read_csv(file_path)

In [16]:
test = load_pure_data(411)
test.head(10)

Unnamed: 0,aid,cid,smiles,activity
0,411,644390,CCOC(=O)N1CCC(CC1)NCCNC(=O)C2=CC=CC=C2Cl,inactive
1,411,644391,CCOC1=CC=CC=C1N(C(C2=CC=NC=C2)C(=O)NC(C)(C)C)C...,inactive
2,411,644392,CN1C(=O)CC(SC1=NC2=CC(=CC=C2)OC)C(=O)NC3=CC=CC...,inactive
3,411,644393,CC1=CC2=CC(=C(N=C2C=C1)N3CCN(CC3)CC4=NN=NN4CC5...,inactive
4,411,644394,CC1(C(=O)N(C(=O)N1)CC(COC2CCCCC2)O)C,inactive
5,411,644395,C1CC1C2=CC=[N+](C=C2)[O-],inactive
6,411,644396,COC1=CC=C(C=C1)C(=O)NC2=NN=C(S2)C(F)(F)F,inactive
7,411,644397,CCOC1=CC=C(C=C1)C2=NN(N=N2)CC(=O)C3=C(C=CC(=C3...,inactive
8,411,5770444,CC1=CC=C(C=C1)C2=NN=C(O2)CSC3=NC4=C(C5=C(N4)C=...,inactive
9,411,644399,CC(C)CCN1C(=NN=N1)C(C2=CC=CC=C2)N3CCN(CC3)C,inactive


In [17]:
def load_chem_desc_data(
    aid: int,
    PATH_EXPERIMENTS: str = "data/experiment-wise/"
) -> np.ndarray:
    
    # load pure data
    loaded_data = load_pure_data(
        aid_to_load = aid,
        PATH_TO_EXPERIMENTS
    )
    
    # for each row load the chemical descriptor data from the file
    

SyntaxError: positional argument follows keyword argument (1164340177.py, line 10)

In [None]:
def load_fingerprint_data(
    
) -> np.ndarray: