In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
import os
from tqdm import tqdm
import json
from rdkit.Chem.Descriptors import descList
from rdkit.Chem import MolFromSmiles, RDKFingerprint
import typing

In [44]:
def _lookup_fingerprint_rows(
        fingerprint_map: np.ndarray,
        cids: np.ndarray
) -> np.ndarray:
    """
    Helper that finds for every cid the row index at which it is stored in fingerprint_map.

    Parameters
    ----------
    fingerprint_map : np.ndarray
        1-D array of cids, entry i belongs to row i of the fingerprint data. Does not need to be sorted.
    cids : np.ndarray
        cids to look up

    Returns
    -------
    np.ndarray
        For every element of cids the index of that cid in fingerprint_map

    Raises
    ------
    ValueError
        If at least one cid is not contained in fingerprint_map
    """
    # nothing to look up
    if len(cids) == 0:
        return np.zeros(0, dtype=np.intp)
    if len(fingerprint_map) == 0:
        raise ValueError("fingerprint map is empty but {} cid(s) were requested".format(len(cids)))

    # searchsorted needs sorted input - do not rely on the file being sorted, use a sorter instead
    order = np.argsort(fingerprint_map, kind="stable")
    pos_sorted = np.searchsorted(fingerprint_map, cids, sorter=order)

    # searchsorted returns len(fingerprint_map) for values larger than every entry. Clip those so that they are
    # reported by the membership check below instead of causing an IndexError.
    pos_sorted = np.minimum(pos_sorted, len(order) - 1)
    pos = order[pos_sorted]

    # searchsorted only yields insertion points - verify that every cid was really found
    missing = fingerprint_map[pos] != cids
    if missing.any():
        raise ValueError(
            "{} cid(s) have no entry in the fingerprint map, e.g. {}".format(
                int(missing.sum()), cids[missing][:5].tolist()
            )
        )

    return pos


def load_fingerprint_data(
        aid: int,
        path_data: str = "data/"
) -> typing.Tuple[np.ndarray, np.ndarray]:
    """
    Function that loads the precomputed fingerprint dataset for the provided experiment (id).
    Addition: due to timing reasons the result is saved to disk (`<path_data>/precomputed/fingerprint/`) and loaded
    from there if possible. Both ways return the same values and dtypes.

    Parameters
    ----------
    aid : int
        Experiment id for which the data is to be fetched
    path_data : str, optional
        Path to data folder (with or without trailing separator)

    Returns
    -------
    np.ndarray
        fingerprint data as float array, one row per compound of the experiment
    np.ndarray
        labels of the data as int array (0 for inactive, 1 for active)

    Raises
    ------
    ValueError
        If a cid of the experiment has no entry in `fingerprints_map.npy`
    """

    # make sure the folder path ends with a separator (callers may pass "data" as well as "data/")
    path_data = os.path.join(path_data, "")

    # PRESAVING AND FETCHING PART
    # path where the data would be prestored
    path_fingerprint = os.path.join(path_data, "precomputed", "fingerprint")

    # create folder if not present (exist_ok avoids the check-then-create race)
    os.makedirs(path_fingerprint, exist_ok=True)

    # check if files already exist and can be loaded instead of creating them
    path_cached_data = os.path.join(path_fingerprint, str(aid) + "_data.npy")
    path_cached_label = os.path.join(path_fingerprint, str(aid) + "_label.npy")
    if os.path.isfile(path_cached_data) and os.path.isfile(path_cached_label):
        return np.load(path_cached_data).astype(float), \
               np.load(path_cached_label).astype(int)

    # NORMAL LOADING PART
    # load the pure data
    loaded_data = load_pure_data(
        aid_to_load=aid,
        path_data=path_data
    )

    # load fingerprint data of all compounds. No dtype conversion here: converting the complete array is
    # expensive, only the rows selected below are converted.
    fingerprint_map = np.load(os.path.join(path_data, "fingerprints_map.npy"))
    fingerprint_data = np.load(os.path.join(path_data, "fingerprints_data.npy"))

    # MAPPING CIDS TO DATA
    # get cids
    cids = loaded_data.cid.to_numpy()
    # get positions of cids in fingerprint_map (raises if a cid is unknown)
    pos = _lookup_fingerprint_rows(fingerprint_map, cids)
    # the cache stores the fingerprints as booleans
    data = fingerprint_data[pos].astype(int).astype(bool)

    # fetch the labels of the data elements:
    labels = (loaded_data.activity.to_numpy() == "active").astype(int)

    # save the generated data to disk
    np.save(path_cached_data, data)
    np.save(path_cached_label, labels)

    # return data and labels with the same dtypes the cached branch above returns
    return data.astype(float), labels

In [3]:
from time import time
from selfdest_toolkit.data_tools.loading import load_pure_data
from selfdest_toolkit.data_tools.cleaning import clean_numpy_data

In [13]:
# time one call of load_fingerprint_data for experiment 411 (wall-clock seconds, result is discarded)
t = time()
load_fingerprint_data(411)
print(time()-t)

79.96671652793884


In [45]:
# same timing once more; the recorded time is far below the earlier run
# NOTE(review): presumably this call was answered from data/precomputed/fingerprint/ - confirm
t = time()
load_fingerprint_data(411)
print(time()-t)

2.846043825149536


In [4]:
# experiment id and data folder used by the cells below
# (path_data is concatenated with file names directly, so it must end with "/")
aid = 411
path_data = "data/"

In [5]:
# working area: the cells below step through the body of load_fingerprint_data by hand

In [6]:
# load the pure data of experiment `aid` (same call that load_fingerprint_data makes);
# the cells below read its `cid` column
loaded_data = load_pure_data(
    aid_to_load=aid,
    path_data=path_data
)

In [7]:
# load the fingerprint files: fingerprint_map holds cids, fingerprint_data holds the fingerprint rows.
# The lookups below use a position found in fingerprint_map as row index into fingerprint_data.
# Note: astype(int) converts the complete array, not only the rows needed for this experiment.
fingerprint_map = np.load(path_data + "fingerprints_map.npy")
fingerprint_data = np.load(path_data + "fingerprints_data.npy").astype(int)

In [8]:
# cid column of the loaded experiment data as numpy array
cids = loaded_data.cid.to_numpy()

In [9]:
# number of cids that have to be looked up
cids.shape

(68285,)

In [34]:
# permutation that sorts fingerprint_map; used as `sorter` so searchsorted does not depend on the file order
index = np.argsort(fingerprint_map)

In [35]:
# position in fingerprint_map for every cid: searchsorted gives the position in sorted order,
# indexing `index` with it maps back to the original position.
# searchsorted returns insertion points, so a cid that is not in the map is not detected here.
index[np.searchsorted(fingerprint_map, cids, sorter=index)]

array([ 23786,  23787,  23788, ..., 355206, 355207, 355208], dtype=int64)

In [36]:
# spot check of the lookup: first cid of the experiment ...
cids[0]

644390

In [37]:
# ... and the map entry at the position found for it (should be the same cid)
fingerprint_map[23786]

644390

In [38]:
# fingerprint row stored at that position
fingerprint_data[23786]

array([0, 1, 1, ..., 0, 1, 1])

In [40]:
# same lookup without sorter; only valid if fingerprint_map is already sorted ascending
# and contains every cid (searchsorted returns insertion points, not matches)
pos = np.searchsorted(fingerprint_map, cids)

In [41]:
# gather one fingerprint row per cid
data = fingerprint_data[pos]

In [43]:
# expect one row per cid (first dimension equal to cids.shape[0])
data.shape

(68285, 2048)