In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
import os
from tqdm import tqdm
import json
from rdkit.Chem.Descriptors import descList
from rdkit.Chem import MolFromSmiles, RDKFingerprint

In [2]:
def load_pure_data(
    aid_to_load: int,
    PATH_EXPERIMENTS: str = "data/experiment-wise/"
) -> pd.DataFrame:
    
    # create file path
    file_path = PATH_EXPERIMENTS + str(aid_to_load) + ".csv"
    
    # check if file/path exists
    if not os.path.isfile(file_path):
        raise Exception('The experiment data with id {} could not be loaded. Either this experiment id is invalid or the data has yet to be split using the experiment_loadsplit(...) function.'.format(aid_to_load))
    
    # load and return data
    return pd.read_csv(file_path)

In [2]:
# path to dataset
PATH_DATA = "data/"
PATH_MAIN_DATASET = PATH_DATA + "df_assay_entries.csv"

In [8]:
df = pd.read_csv(PATH_MAIN_DATASET)

In [5]:
len(df)

41620091

In [6]:
df

Unnamed: 0,aid,cid,smiles,activity
0,891,3232584,CCNC1=NC=C2C(=N1)N(C(=O)C(=N2)C3=CC=CC(=C3)C#N...,active
1,891,3232585,COC1=CC=C(C=C1)OC2=NC=C3C(=N2)N(C(=O)C(=N3)C4=...,inactive
2,891,3232587,COC1=CC=CC=C1C2=NC3=CC=CC=C3C(=N2)NCC4=CC=CC=C4,active
3,891,3232589,C1CN(CCC12CCN(CC2)C(=O)OC3=CC=CC=C3)C4=CC=CC=C4,inactive
4,891,3232590,COCCN1C2=NC(=NC=C2N=C(C1=O)C3=CC=CC(=C3)C#N)N4...,inactive
...,...,...,...,...
41620086,1479148,73111,C1=CC2=C(C(=C1)O[C@H]3[C@@H]([C@H]([C@@H]([C@H...,inactive
41620087,1479148,4724,CC(C)(C)NCC(COC1=CC=CC=C1C2CCCC2)O,active
41620088,1479148,6708778,COC1=CC(=CC(=C1O)OC)[C@H]2[C@@H]3C(COC3=O)C(C4...,active
41620089,1479148,54728271,CC1=NN=C(O1)C(=O)NC(C)(C)C2=NC(=C(C(=O)N2C)[O-...,active


In [7]:
df = df[['cid', 'smiles']].sort_values(by=['cid']).drop_duplicates(subset=['cid']).reset_index()

In [11]:
storage = np.zeros((len(df), 2048))

In [12]:
df.iat[0, 2]

'CC(CN)O'

In [13]:
storage[0]=RDKFingerprint(MolFromSmiles(df.iat[0,2]))

In [14]:
for idx, row in tqdm(df.iterrows()):
    storage[idx, :] = RDKFingerprint(MolFromSmiles(row.smiles))

455079it [21:54, 346.16it/s]


In [16]:
# save stuff
np.save(PATH_DATA + "fingerprints_map.npy", df.cid.to_numpy())
np.save(PATH_DATA + "fingerprints_data.npy", storage)

In [4]:
len(descList)

208

In [3]:
load = np.load(PATH_DATA + "fingerprints_data.npy")

In [4]:
load.shape

(455079, 2048)

In [5]:
load2 = np.load(PATH_DATA + "fingerprints_map.npy")

In [6]:
load2.shape

(455079,)

In [7]:
def generate_chem_smiles(
    PATH_DATA: str,
    PATH_MAIN_DATASET: str
):
    # create saving paths
    CHEM_DATA_PATH = {
        "map": PATH_DATA + "chem-desc_map.npy",
        "data": PATH_DATA + "chem-desc_data.npy"
    }
    
    # check if data already exists
    if not (os.path.isfile(CHEM_DATA_PATH["map"]) and os.path.isfile(CHEM_DATA_PATH["data"])):
        print("Generating fingerprints")
        
        # load dataframe
        df = pd.read_csv(PATH_MAIN_DATASET)

        # select subset of dataframe
        df = df[['cid', 'smiles']].sort_values(by=['cid']).drop_duplicates(subset=['cid']).reset_index()

        # save resulting data into files
        np.save(CHEM_DATA_PATH["map"], df.cid.to_numpy())
        np.save(CHEM_DATA_PATH["data"], storage)
    else:
        print("Fingerprints already generated")
    return

In [8]:
generate_chem_smiles(PATH_DATA, PATH_MAIN_DATASET)

Generating fingerprints


NameError: name 'storage' is not defined