In [2]:
import pandas as pd
import numpy as np

from padelpy import padeldescriptor

In [3]:
df = pd.read_csv('aromatase_inhibitors.csv')
df.head()

Unnamed: 0,Molecule ChEMBL ID,Smiles,pChEMBL Value
0,CHEMBL1170678,COc1cc(/C=C(\Cn2ccnc2)c2ccc([N+](=O)[O-])cc2)c...,7.16
1,CHEMBL308537,CO[C@@H]1CC2C3CCC(=O)C3(C)CCC2C2(C)CCCC=C12,9.89
2,CHEMBL1083353,O=c1c2ccccc2sc2c(Cn3ccnc3)cccc12,8.4
3,CHEMBL454705,CC1=C[C@@H]2c3c(O)cc(-c4cc5ccc(O)cc5o4)cc3O[C@...,5.12
4,CHEMBL457679,CC(C)=CCc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,7.0


In [4]:
df.to_csv('molecule.smi', sep='\t', index=False, header=False)
df

Unnamed: 0,Molecule ChEMBL ID,Smiles,pChEMBL Value
0,CHEMBL1170678,COc1cc(/C=C(\Cn2ccnc2)c2ccc([N+](=O)[O-])cc2)c...,7.16
1,CHEMBL308537,CO[C@@H]1CC2C3CCC(=O)C3(C)CCC2C2(C)CCCC=C12,9.89
2,CHEMBL1083353,O=c1c2ccccc2sc2c(Cn3ccnc3)cccc12,8.40
3,CHEMBL454705,CC1=C[C@@H]2c3c(O)cc(-c4cc5ccc(O)cc5o4)cc3O[C@...,5.12
4,CHEMBL457679,CC(C)=CCc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,7.00
...,...,...,...
2290,CHEMBL4859743,C[C@@H]1O[C@]2(C(C)(C)O)C=C3CC[C@H]4C(C)(C)CCC...,4.97
2291,CHEMBL5186519,COc1ccc2oc(C(c3ccc(Cl)cc3)n3cncn3)cc2c1,8.70
2292,CHEMBL16782,COc1ccc2c(=O)cc(-c3ccccc3)oc2c1,6.29
2293,CHEMBL276915,O=c1cc(-c2ccccc2)oc2cc(O)ccc12,5.72


In [9]:
import glob
xml_files = glob.glob("*.xml")
xml_files.sort()
xml_files

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [10]:

FP_list = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

In [11]:

fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'PubChem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [15]:

df2 = pd.concat( [df['Smiles'],df['Molecule ChEMBL ID']], axis=1 )
df2.to_csv('molecule.smi', sep='\t', index=False, header=False)
df2


Unnamed: 0,Smiles,Molecule ChEMBL ID
0,COc1cc(/C=C(\Cn2ccnc2)c2ccc([N+](=O)[O-])cc2)c...,CHEMBL1170678
1,CO[C@@H]1CC2C3CCC(=O)C3(C)CCC2C2(C)CCCC=C12,CHEMBL308537
2,O=c1c2ccccc2sc2c(Cn3ccnc3)cccc12,CHEMBL1083353
3,CC1=C[C@@H]2c3c(O)cc(-c4cc5ccc(O)cc5o4)cc3O[C@...,CHEMBL454705
4,CC(C)=CCc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2O)ccc1O,CHEMBL457679
...,...,...
2290,C[C@@H]1O[C@]2(C(C)(C)O)C=C3CC[C@H]4C(C)(C)CCC...,CHEMBL4859743
2291,COc1ccc2oc(C(c3ccc(Cl)cc3)n3cncn3)cc2c1,CHEMBL5186519
2292,COc1ccc2c(=O)cc(-c3ccccc3)oc2c1,CHEMBL16782
2293,O=c1cc(-c2ccccc2)oc2cc(O)ccc12,CHEMBL276915


In [16]:
from padelpy import padeldescriptor

fingerprint = 'Substructure'

fingerprint_output_file = ''.join([fingerprint,'.csv']) #Substructure.csv
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='molecule.smi', 
                d_file=fingerprint_output_file, #'Substructure.csv'
                #descriptortypes='SubstructureFingerprint.xml', 
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                log=True,
                fingerprints=True)
     

In [17]:
descriptors = pd.read_csv(fingerprint_output_file)
descriptors

Unnamed: 0,Name,SubFP1,SubFP2,SubFP3,SubFP4,SubFP5,SubFP6,SubFP7,SubFP8,SubFP9,...,SubFP298,SubFP299,SubFP300,SubFP301,SubFP302,SubFP303,SubFP304,SubFP305,SubFP306,SubFP307
0,CHEMBL1170678,0,0,0,0,1,0,0,0,0,...,1,1,1,1,1,0,0,0,0,1
1,CHEMBL308537,1,1,1,1,1,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
2,CHEMBL1083353,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
3,CHEMBL454705,1,1,1,0,1,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
4,CHEMBL457679,1,1,0,0,1,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2290,CHEMBL4859743,1,1,1,1,1,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
2291,CHEMBL5186519,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
2292,CHEMBL16782,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
2293,CHEMBL276915,0,1,0,0,0,0,0,0,0,...,0,0,1,1,1,1,0,0,0,1


In [18]:
X = descriptors.drop('Name', axis=1)
y = df['pChEMBL Value']

In [19]:
X.shape, y.shape

((2295, 307), (2295,))

REMOVE CORRELATED FEATURES

In [20]:
def remove_correlated_features(descriptors):
    # Calculate correlation
    correlated_matrix = descriptors.corr().abs()

    # Upper triangle of correlation matrix
    upper_triangle = correlated_matrix.where(np.triu(np.ones(correlated_matrix.shape),k=1).astype(np.bool))

    # Identify columns that have above 0.9 values of correlation
    to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] >= 0.9)]
    print(to_drop)
    descriptors_correlated_dropped = descriptors.drop(columns=to_drop, axis=1)
    return descriptors_correlated_dropped   

In [21]:
descriptors_new = remove_correlated_features(X)
descriptors_new

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_triangle = correlated_matrix.where(np.triu(np.ones(correlated_matrix.shape),k=1).astype(np.bool))


['SubFP37', 'SubFP38', 'SubFP184', 'SubFP278', 'SubFP280', 'SubFP284', 'SubFP285', 'SubFP286', 'SubFP288', 'SubFP296', 'SubFP297', 'SubFP298', 'SubFP299', 'SubFP301']


Unnamed: 0,SubFP1,SubFP2,SubFP3,SubFP4,SubFP5,SubFP6,SubFP7,SubFP8,SubFP9,SubFP10,...,SubFP293,SubFP294,SubFP295,SubFP300,SubFP302,SubFP303,SubFP304,SubFP305,SubFP306,SubFP307
0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
1,1,1,1,1,1,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
3,1,1,1,0,1,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
4,1,1,0,0,1,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2290,1,1,1,1,1,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
2291,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
2292,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
2293,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,0,0,0,1


In [22]:
from sklearn.feature_selection import VarianceThreshold

def remove_low_variance(input_data, threshold=0.1):
    selection = VarianceThreshold(threshold)
    selection.fit(input_data)
    return input_data[input_data.columns[selection.get_support(indices=True)]]

X = remove_low_variance(descriptors_new, threshold=0.1)
X

Unnamed: 0,SubFP1,SubFP2,SubFP3,SubFP4,SubFP5,SubFP18,SubFP49,SubFP88,SubFP133,SubFP137,SubFP180,SubFP181,SubFP274,SubFP275,SubFP287,SubFP303
0,0,0,0,0,1,1,0,0,0,0,1,1,1,1,1,0
1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0
3,1,1,1,0,1,0,0,0,0,0,0,0,1,1,0,0
4,1,1,0,0,1,0,1,0,0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2290,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
2291,0,0,0,0,0,1,0,0,0,0,1,1,1,1,0,0
2292,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0
2293,0,1,0,0,0,0,0,0,0,1,0,0,1,1,1,1


In [35]:

from sklearn.metrics import matthews_corrcoef, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, RandomizedSearchCV



In [41]:


# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Define the hyperparameter grids for each model
param_grids = {
    'LinearRegression': {},
    'RandomForest': {
        'n_estimators': [100, 200, 500],
        'max_depth': [None, 10, 30],
        'min_samples_split': [2, 5, 10],
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 6, 10],
    }
}

# 3-fold cross-validation
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Train and tune the models
grids = {}
for model_name, model in models.items():
    #print(f'Training and tuning {model_name}...')
    grids[model_name] = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
    grids[model_name].fit(X_train, y_train)
    best_params = grids[model_name].best_params_
    best_score = np.sqrt(-1 * grids[model_name].best_score_)
    
    
    y_train_pred = grids[model_name].predict(X_train)
    y_test_pred = grids[model_name].predict(X_test)
    r_score = r2_score(y_train, y_train_pred)
    rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

    
    print(f'Best parameters for {model_name}: {best_params}')
    print(f'Best RMSE for {model_name}: {best_score}\n')
    print(f'Best Rscore for {model_name}: {r_score}\n')
    print(f'Best RMSE for {model_name}: {rmse}\n')

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters for LinearRegression: {}
Best RMSE for LinearRegression: 1.0530824357908433

Best Rscore for LinearRegression: 0.3101746190187379

Best RMSE for LinearRegression: 1.0434792233711259

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for RandomForest: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 500}
Best RMSE for RandomForest: 0.9845190551700921

Best Rscore for RandomForest: 0.5036712619977792

Best RMSE for RandomForest: 0.8851134343882691

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 200}
Best RMSE for XGBoost: 0.9948294326310559

Best Rscore for XGBoost: 0.5062029462495508

Best RMSE for XGBoost: 0.8828531455940027



In [32]:

mse = mean_squared_error(y_test, y_test_pred)

In [34]:
r = r2_score(y_test, y_test_pred)
r

0.3968043047816301