## Selecting the fingerprints

In this notebook I'm going to select the fingerprints that store the information in a more appropriate manner for a CNN model of 1 dimension. To do that, I'm going to program a script that uses different fingerprints to train a CNN and I will chose the ones giving a model with the best accuracy. The model is build to be fast to speed up the process.

## Imports

In [39]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPool1D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
import warnings

### Load data

In [40]:
pd.options.mode.chained_assignment = None

In [41]:
drugs = pd.read_pickle(os.path.join('res', 'pickles', 'all_label_drugs.pkl')).drop('FeatInvariants', axis=1) #I discard the Feature Invariants because they give an error I have to look into yet
display(drugs.head())
drugs.iloc[0]['Morgan2FP'].shape
drugs.info()
drugs.dropna(inplace=True)

Unnamed: 0,CID,ATC_Code_Short,Molecule,ATC_Code_#,ConnInvariants,Morgan2FP,MACCSKeys,AtomPairFP,TopTorFP,AvalonFP,PubchemFP,CactvsFP
0,1,N,<rdkit.Chem.rdchem.Mol object at 0x00000236B52...,10,"[2246728737, 2246699815, 864942730, 864674487,...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, ..."
1,119,N,<rdkit.Chem.rdchem.Mol object at 0x00000236B52...,10,"[2245384272, 2245384272, 2246699815, 864942730...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, ..."
2,137,L,<rdkit.Chem.rdchem.Mol object at 0x00000236B52...,8,"[2245384272, 2245384272, 2246699815, 864942730...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, ..."
3,176,G,<rdkit.Chem.rdchem.Mol object at 0x00000236B52...,4,"[2246728737, 2246699815, 864942730, 864662311]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
4,187,S,<rdkit.Chem.rdchem.Mol object at 0x00000236B52...,14,"[2246728737, 2246699815, 864942730, 864674487,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, ...","[1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, ..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10183 entries, 0 to 10182
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CID             10183 non-null  int64 
 1   ATC_Code_Short  10183 non-null  object
 2   Molecule        10183 non-null  object
 3   ATC_Code_#      10183 non-null  int32 
 4   ConnInvariants  10183 non-null  object
 5   Morgan2FP       10183 non-null  object
 6   MACCSKeys       10183 non-null  object
 7   AtomPairFP      10183 non-null  object
 8   TopTorFP        10183 non-null  object
 9   AvalonFP        10183 non-null  object
 10  PubchemFP       10183 non-null  object
 11  CactvsFP        10183 non-null  object
dtypes: int32(1), int64(1), object(10)
memory usage: 915.0+ KB


## Functions

In [42]:
def select_fp(column_list):
    """
    Function that takes a list of columns and makes a train/test split for each of them with the column containing the labels as y.
    Then returns a dictionary with the column name as a key and the tuple with X_train, X_test, y_train, y_test as value.
    Input: list with column names
    Output: dictionary
    """
    splits_dic = {}
    for column in column_list:
        X = drugs[column]
        y = drugs['ATC_Code_#'] #atc_code_# drug_class_code	
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        splits_dic[column] = X_train, X_test, y_train, y_test
    return splits_dic

In [72]:
fp_list = list(drugs.columns[4:12])
print(fp_list)
splits_dic = select_fp(fp_list)
print(type(splits_dic['ConnInvariants'][0]))
print(splits_dic['ConnInvariants'][0].shape)
print(splits_dic['ConnInvariants'][0])

['ConnInvariants', 'Morgan2FP', 'MACCSKeys', 'AtomPairFP', 'TopTorFP', 'AvalonFP', 'PubchemFP', 'CactvsFP']
<class 'pandas.core.series.Series'>
(7128,)
8947    [2246728737, 2092489639, 3218693969, 204143449...
5371    [2246728737, 2245273601, 2246728737, 224538427...
3397    [2246728737, 2245384272, 2976816164, 296896809...
3082    [2246728737, 2246699815, 864942730, 847961216,...
5474    [2246728737, 2245273601, 2245384272, 224538427...
                              ...                        
5734    [2246728737, 3217380708, 3217380708, 321738070...
5191    [2246728737, 2245273601, 2245273601, 321738070...
5390    [2246728737, 2976033787, 2968968094, 296896809...
860     [2968968094, 2976033787, 2976033787, 318945755...
7270    [2246728737, 2092489639, 2968968094, 296896809...
Name: ConnInvariants, Length: 7128, dtype: object


In [68]:
def build_model(inshape, nclasses):
    """
    Function that builds a CNN model with non optimal conditions.
    Input: tuple with inner shape of arrays, integer with number of classes
    Output: a CNN model
    """
    model = Sequential()
    model.add(Conv1D(100, 9, activation='relu', kernel_initializer='he_uniform', input_shape=inshape))
    model.add(MaxPool1D(2))
    model.add(Flatten())
    model.add(Dense(100, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dropout(0.5))
    model.add(Dense(nclasses, activation='softmax'))

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [81]:
def reshape_and_build(splits_dic):
    """
    Function that takes the dictionary generated by "select_fp" and reshapes X_train to fit a CNN model.
    Then builds the CNN model using the function 'build_model'. Return both the reshaped arrays and the models.
    Input: dictionary with column names as keys and a tuple of arrays as values
    Output: dictionary with column names as keys and a tuple with both reshaped arrays and CNN models 
    """
    arrays_models_dic = {}
    for column, tup in splits_dic.items():
        print(column)
        print(tup[0])
        X_train = np.array(list(tup[0]), dtype=object)
        print('Shape X_train: ', X_train.shape, type(X_train))
        display(X_train)
        X_test = np.array(list(tup[1]), dtype=object)
        n_classes = len(np.unique(tup[2]))
        print('Number of classes: ', n_classes)
        X_train= X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
        X_test= X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
        print('Reshaped X_train: ', X_train.shape)
        in_shape = X_train.shape[1:]
        print('In_shape: ', in_shape)
        print('Shape y_train: ', tup[2].shape)
        
        model = build_model(in_shape, n_classes)
        arrays_models_dic[column] = X_train, X_test, tup[2], tup[3], model

    return arrays_models_dic

In [82]:
array_model_dic= reshape_and_build(splits_dic)

ConnInvariants
8947    [2246728737, 2092489639, 3218693969, 204143449...
5371    [2246728737, 2245273601, 2246728737, 224538427...
3397    [2246728737, 2245384272, 2976816164, 296896809...
3082    [2246728737, 2246699815, 864942730, 847961216,...
5474    [2246728737, 2245273601, 2245384272, 224538427...
                              ...                        
5734    [2246728737, 3217380708, 3217380708, 321738070...
5191    [2246728737, 2245273601, 2245273601, 321738070...
5390    [2246728737, 2976033787, 2968968094, 296896809...
860     [2968968094, 2976033787, 2976033787, 318945755...
7270    [2246728737, 2092489639, 2968968094, 296896809...
Name: ConnInvariants, Length: 7128, dtype: object
Shape X_train:  (7128,) <class 'numpy.ndarray'>
0: 7128


array([array([2246728737, 2092489639, 3218693969, 2041434490, 3218693969,
              3217380708, 2245277810, 3217380708, 3218693969, 3218693969,
              3217380708, 3218693969, 3218693969, 1016841875, 3217380708,
              3218693969, 3217380708, 3217380708, 3218693969, 3218693969,
              2092489639, 3217380708,  864942730, 3218693969, 3217380708,
              3217380708, 3218693969, 3217380708, 3218693969, 3218693969,
              3218693969, 1016841875, 2246728737,  847957139], dtype=int64),
       array([2246728737, 2245273601, 2246728737, 2245384272, 2976033787,
              3217380708,  864942730, 2092489639, 2968968094, 2968968094,
              2968968094, 2976033787, 2976816164, 2092489639, 3217380708,
               864942730, 2976816164, 3189457552, 2245273601, 2246728737,
              2246728737,  847961216, 2246699815,  864942730, 2976033787,
              2968968094, 2092489639, 2976033787, 2968968094, 3217380708,
              3217380708, 213251183

Number of classes:  16


IndexError: tuple index out of range

In [19]:
def fit_model(arrays_models_dic):
    """
    Function that takes the dictionary generated by 'reshape_and_build' and fits the CNN model contained in the values with the X_train and y_train arrays also in the values.
    Then evaluates the model against X_test e y_test and returns the accuracy of each model.
    Input: dictionary with column names as keys and a tuple with reshaped arrays and CNN models
    Output: a dictionary with column names as keys and the accuracy obtained by the CNN model for that column
    """
    accuracies_dic = {}
    es = EarlyStopping(monitor='val_loss', patience=1)
    for column, tup in arrays_models_dic.items():
        print(f"Analysing {column}")
        tup[4].fit(tup[0], tup[2], epochs=10, batch_size=128, verbose=1, validation_split = 0.2, callbacks = [es])
        loss, acc = tup[4].evaluate(tup[1], tup[3], verbose=1)
        accuracies_dic[column] = f'{acc:.3f}'
    return accuracies_dic

## Selection of the best fingerprints

Select the columns containing fingerprints

In [56]:
fp_list = list(drugs.columns[4:12])
fp_list

['ConnInvariants',
 'Morgan2FP',
 'MACCSKeys',
 'AtomPairFP',
 'TopTorFP',
 'AvalonFP',
 'PubchemFP',
 'CactvsFP']

Run the functions to obtain the dictionary with the accuracies

In [57]:
# Note for the evaluator: This step may take long,
# you may want to run it partially and see that it works and then skip it and see the result below
# or decrease substantially the number of epochs
splits_dic = select_fp(fp_list)
array_model_dic= reshape_and_build(splits_dic)
accuracies_dic = fit_model(array_model_dic)

ConnInvariants
Shape X_train:  (7128,) <class 'numpy.ndarray'>
0: 7128


array([2246728737, 2092489639, 3218693969, 2041434490, 3218693969,
       3217380708, 2245277810, 3217380708, 3218693969, 3218693969,
       3217380708, 3218693969, 3218693969, 1016841875, 3217380708,
       3218693969, 3217380708, 3217380708, 3218693969, 3218693969,
       2092489639, 3217380708,  864942730, 3218693969, 3217380708,
       3217380708, 3218693969, 3217380708, 3218693969, 3218693969,
       3218693969, 1016841875, 2246728737,  847957139], dtype=int64)

Number of classes:  16


IndexError: tuple index out of range

In [30]:
accuracies_dic

{'Morgan2FP': '0.556',
 'MACCSKeys': '0.484',
 'AtomPairFP': '0.253',
 'TopTorFP': '0.572',
 'AvalonFP': '0.261',
 'PubchemFP': '0.353',
 'CactvsFP': '0.353'}

With 100 epochs and a patiente of 3, the best accuracy is achieved with the Morgan fingerprints. These will be used to tune and train our CNN model.

In [2]:
accuracies_dic = {'Morgan2FP': '0.556',
 'MACCSKeys': '0.484',
 'AtomPairFP': '0.253',
 'TopTorFP': '0.572',
 'AvalonFP': '0.261',
 'PubchemFP': '0.353',
 'CactvsFP': '0.353'}

In [3]:
selected_fp = max(accuracies_dic, key=accuracies_dic.get)

In [4]:
selected_fp

'TopTorFP'