## Installing required library and import fingerprints

In [1]:
! pip install padelpy

Collecting padelpy
  Downloading padelpy-0.1.14-py2.py3-none-any.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.14


In [2]:
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
! unzip fingerprints_xml.zip

--2023-08-10 01:12:24--  https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip [following]
--2023-08-10 01:12:24--  https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10871 (11K) [application/zip]
Saving to: ‘fingerprints_xml.zip’


2023-08-10 01:12:24 (93.5 MB/s) - ‘fingerprints_xml.zip’ saved [10871/10871]

Archive:  fingerprints_xml.zip
  inflating: AtomPairs2DFingerprintCount.xml  
  inflating: AtomPairs2DFin

In [3]:
import glob
xml_files = glob.glob("*.xml")
xml_files.sort()
xml_files

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [4]:
FP_list = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'Extended',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

## Creating a dictionary of the fingerprints

In [5]:
fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'Extended': 'ExtendedFingerprinter.xml',
 'CDKextended': 'Fingerprinter.xml',
 'CDK': 'GraphOnlyFingerprinter.xml',
 'CDKgraphonly': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprinter.xml',
 'KlekotaRoth': 'MACCSFingerprinter.xml',
 'MACCS': 'PubchemFingerprinter.xml',
 'PubChem': 'SubstructureFingerprintCount.xml',
 'SubstructureCount': 'SubstructureFingerprinter.xml'}

## Load HIV dataset

In [6]:
import pandas as pd

In [7]:
df = pd.read_csv('/content/HIV-all-without-pubchem-PAINS_pIC50.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,Smiles,Activity,pIC50
0,0,O=C(NCc1nccs1)c1[nH]c2ccc(Br)cc2c1S(=O)(=O)N1C...,Yes,6.69897
1,1,Cc1c(C(C)(C)C)s/c(=N\S(=O)(=O)c2cc(Cl)ccc2N)n1C,No,4.8


## creating .smi file and do the molecular description step

In [8]:
df1 = df['Smiles']
df1.to_csv('molecule.smi', sep='\t', index=False, header=False)
df1

0       O=C(NCc1nccs1)c1[nH]c2ccc(Br)cc2c1S(=O)(=O)N1C...
1         Cc1c(C(C)(C)C)s/c(=N\S(=O)(=O)c2cc(Cl)ccc2N)n1C
2       O=C(COc1ccc(Cl)cc1C(=O)c1cc(F)cc(F)c1)Nc1ccc(C...
3                CCCOC(=O)N(C(=S)OC(C)COc1ccccc1)c1ccccc1
4       Cc1[nH]nc(OCC(=O)Nc2ccc(C#CC(C)(C)CO)cc2Cl)c1-...
                              ...                        
9207    c1cc(C=CC(=S)O2)c2c(C(OC(C(OC3=O)(C(C)(C)C34C)...
9208             N1C(=O)C(C)=CN(COCCCOCC(c2ccccc2)=O)C1=O
9209    N1C(=O)C(C)=CN(C(O2)CC(N(O)C(CCC[Se]c3ccccc3)O...
9210    N1C(=O)C(C)=CN(C(O2)CC(N(O)C(CCCC[Se]c3ccccc3)...
9211    C1(NC(=O)C(C)=CN1C(O2)CC(N(O)C(CC[Se]C#N)OC3)C...
Name: Smiles, Length: 9212, dtype: object

In [9]:
from padelpy import padeldescriptor

fingerprint = 'Extended'

fingerprint_output_file = ''.join([fingerprint,'.csv']) #Substructure.csv
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='molecule.smi',
                d_file=fingerprint_output_file, #'Substructure.csv'
                #descriptortypes='/content/ExtendedFingerprinter.xml',
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                log=True,
                fingerprints=True)

## Look into the descriptors

In [10]:
descriptors = pd.read_csv('/content/Extended.csv')
descriptors

Unnamed: 0,Name,ExtFP1,ExtFP2,ExtFP3,ExtFP4,ExtFP5,ExtFP6,ExtFP7,ExtFP8,ExtFP9,...,ExtFP1015,ExtFP1016,ExtFP1017,ExtFP1018,ExtFP1019,ExtFP1020,ExtFP1021,ExtFP1022,ExtFP1023,ExtFP1024
0,AUTOGEN_molecule_1,0,0,1,1,1,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,AUTOGEN_molecule_2,0,1,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AUTOGEN_molecule_3,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,AUTOGEN_molecule_4,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AUTOGEN_molecule_5,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9207,AUTOGEN_molecule_9208,1,0,1,0,1,0,1,0,1,...,1,1,0,0,0,0,0,0,0,0
9208,AUTOGEN_molecule_9209,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
9209,AUTOGEN_molecule_9210,1,1,0,1,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
9210,AUTOGEN_molecule_9211,1,0,0,1,1,1,1,0,1,...,1,0,0,0,0,0,0,0,0,0


## Assign the X and y value

In [11]:
X = descriptors.drop('Name', axis=1)
y = df['Activity']

## Remove low variance features

In [12]:
from sklearn.feature_selection import VarianceThreshold

def remove_low_variance(input_data, threshold=0.1):
    selection = VarianceThreshold(threshold)
    selection.fit(input_data)
    return input_data[input_data.columns[selection.get_support(indices=True)]]

X = remove_low_variance(X, threshold=0.1)
X

Unnamed: 0,ExtFP1,ExtFP2,ExtFP3,ExtFP4,ExtFP5,ExtFP6,ExtFP7,ExtFP8,ExtFP9,ExtFP10,...,ExtFP994,ExtFP995,ExtFP997,ExtFP998,ExtFP999,ExtFP1012,ExtFP1013,ExtFP1015,ExtFP1016,ExtFP1017
0,0,0,1,1,1,1,0,0,0,0,...,0,1,1,0,0,1,1,1,0,0
1,0,1,0,0,1,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,1,...,0,1,0,0,0,1,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,1,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9207,1,0,1,0,1,0,1,0,1,1,...,0,1,0,0,0,1,1,1,1,0
9208,0,0,0,1,0,0,1,0,1,0,...,0,1,1,0,0,0,0,0,0,0
9209,1,1,0,1,1,1,1,0,0,1,...,1,1,1,1,1,1,1,1,0,0
9210,1,0,0,1,1,1,1,0,1,1,...,1,1,1,1,0,1,1,1,0,0


In [13]:
! pip install torch skorch scikit-learn

Collecting skorch
  Downloading skorch-0.14.0-py3-none-any.whl (221 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.3/221.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: skorch
Successfully installed skorch-0.14.0


In [14]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from skorch import NeuralNetBinaryClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score

###NeuralNetBinaryClassifier

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
X_test.shape, y_test.shape

((1843, 998), (1843,))

In [17]:
X_train.shape, y_train.shape

((7369, 998), (7369,))

In [18]:
encoder = LabelEncoder()
encoder.fit(y_train)
y_train = encoder.transform(y_train)

# Convert to 2D PyTorch tensors
X_train = torch.tensor(X_train.values, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)

In [19]:
encoder = LabelEncoder()
encoder.fit(y_test)
y_test = encoder.transform(y_test)

# Convert to 2D PyTorch tensors
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [21]:
class HIVClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(998,998)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(998, 998)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(998, 998)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(998, 1)

    def forward(self, x):
        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.act3(self.layer3(x))
        x = self.output(x)
        return x

In [22]:
# create the skorch wrapper
model = NeuralNetBinaryClassifier(
    HIVClassifier,
    criterion=torch.nn.BCEWithLogitsLoss,
    optimizer=torch.optim.Adam,
    lr=0.0001,
    max_epochs=150,
    batch_size=50,
    verbose=False
)

This NeuralNetBinaryClassifier instance is not initialized yet. Call 'initialize' or 'fit'

In [23]:
model.fit(X_train, y_train)

<class 'skorch.classifier.NeuralNetBinaryClassifier'>[initialized](
  module_=HIVClassifier(
    (layer1): Linear(in_features=998, out_features=998, bias=True)
    (act1): ReLU()
    (layer2): Linear(in_features=998, out_features=998, bias=True)
    (act2): ReLU()
    (layer3): Linear(in_features=998, out_features=998, bias=True)
    (act3): ReLU()
    (output): Linear(in_features=998, out_features=1, bias=True)
  ),
)

In [25]:
y_train_pred = model.predict(X_train)

In [26]:
y_test_pred = model.predict(X_test)

In [27]:
from sklearn.metrics import matthews_corrcoef

In [28]:
mcc_train = matthews_corrcoef(y_train, y_train_pred)
mcc_train

0.9185215595958222

In [29]:
mcc_test = matthews_corrcoef(y_test, y_test_pred)
mcc_test

0.6600572360311973

In [30]:
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(model, X_train, y_train, cv=kfold)
print("mean = %.3f; std = %.3f" % (results.mean(), results.std()))

mean = 0.913; std = 0.007


In [31]:
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results_test = cross_val_score(model, X_test, y_test, cv=kfold)
print("mean = %.3f; std = %.3f" % (results_test.mean(), results_test.std()))

mean = 0.892; std = 0.008


In [32]:

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [33]:
#prediction probability
r_probs_train = [0 for _ in range(len(y_train))]
model_probs_train = model.predict_proba(X_train)
#nb_probs = nb.predict_proba(X_testscaled)

In [34]:
model_probs_train = model_probs_train[:, 1]
model_probs_train

array([8.6896679e-10, 2.1214204e-09, 9.9999666e-01, ..., 2.5360569e-05,
       9.9999881e-01, 4.2187551e-09], dtype=float32)

In [35]:
r_fpr_train, r_tpr_train, _ = roc_curve(y_train, r_probs_train, pos_label=2)
model_fpr_train, model_tpr_train, _ = roc_curve(y_train, model_probs_train, pos_label=2)



In [36]:
r_auc_train = roc_auc_score(y_train, r_probs_train)
model_auc_train = roc_auc_score(y_train, model_probs_train)

In [37]:
print('NNBinary Classifier Training Data set: ROC Score = %.3f' % (model_auc_train))

NNBinary Classifier Training Data set: ROC Score = 0.989


In [38]:
#prediction probability
r_probs = [0 for _ in range(len(y_test))]
model_probs = model.predict_proba(X_test)
#nb_probs = nb.predict_proba(X_testscaled)

In [39]:

model_probs = model_probs[:, 1]
model_probs


array([1.9915904e-09, 1.4803180e-06, 9.9999952e-01, ..., 1.3343093e-08,
       9.9996877e-01, 4.4813864e-09], dtype=float32)

In [40]:
r_fpr, r_tpr, _ = roc_curve(y_test, r_probs, pos_label=2)
model_fpr, model_tpr, _ = roc_curve(y_test, model_probs, pos_label=2)



In [41]:
r_auc = roc_auc_score(y_test, r_probs)
model_auc = roc_auc_score(y_test, model_probs)

In [42]:
print('NNBinary Classifier Testing Data set: ROC Score = %.3f' % (model_auc_train))

NNBinary Classifier Testing Data set: ROC Score = 0.989


"model.predict" refers to a method in machine learning where a trained model is used to make predictions on new data. The "model" refers to the machine learning algorithm that has been trained on a dataset, and "predict" is the function used to make predictions on new data. In order to use this method, the input data must be preprocessed in the same way as the training data, and the output of the prediction will depend on the type of model being used (e.g. regression for continuous output, classification for categorical output).

method called "predict_proba" in a machine learning model. "predict_proba" is a method used to predict the probability of a certain outcome or class in a classification problem. It is commonly used in supervised learning algorithms such as logistic regression, decision trees, and random forests. The output of this method is a probability score between 0 and 1 for each class, indicating the likelihood of the input belonging to that class.