In [1]:
import pandas as pd
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import pandas as pd
import numpy as np
from mordred import Calculator, descriptors

In [None]:
#Use this function to generate descriptors based on a molecular descriptor calculator known as mordred
def All_Mordred_descriptors(data):
    calc = Calculator(descriptors, ignore_3D=False)
    mols = [Chem.MolFromSmiles(smi) for smi in data]
    
    df = calc.pandas(mols)
    return df
mordred_descriptors = All_Mordred_descriptors(dataset_new['SMILES'])
mordred_descriptors.to_csv('mordred_descriptors.csv', index = False)

In [None]:
#This function can be used to generate molecular descriptors from SMILES column
def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        # add hydrogens to molecules
        mol=Chem.AddHs(mol)
        # Calculate all 200 descriptors for each molecule
        descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 

# Function call
#Mol_descriptors,desc_names = RDkit_descriptors(dataset_new['SMILES'])

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm


original_data = pd.read_csv('morgan_fpts.csv')

X = original_data.drop(['label'], axis=1)  
y = original_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


model.compile(optimizer='adam', loss='binary_crossentropy')

epochs = 10
batch_size = 32
steps_per_epoch = len(X_train_normalized) // batch_size

with tqdm(total=epochs * steps_per_epoch) as pbar:
    for epoch in range(epochs):
        for step in range(steps_per_epoch):
            start_idx = step * batch_size
            end_idx = (step + 1) * batch_size
            X_batch = X_train_normalized[start_idx:end_idx]
            y_batch = y_train[start_idx:end_idx]

            model.train_on_batch(X_batch, y_batch)

            pbar.update(1)


y_pred = model.predict(X_test_normalized)
_pred_labels = (y_pred >= 0.7).astype(int)

accuracy = accuracy_score(y_test, y_pred_labels)
precision = precision_score(y_test, y_pred_labels)
recall = recall_score(y_test, y_pred_labels)
f1 = f1_score(y_test, y_pred_labels)
auc_roc = roc_auc_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC-ROC: {auc_roc:.4f}")


In [None]:
model.save('model.h5')

In [3]:
dataset_new = pd.read_csv('phyto_with_smiles.csv',low_memory = False)
dataset_new.shape

(3651, 2)

In [8]:
def canonical_smiles(smiles):
    mols = [Chem.MolFromSmiles(smi) for smi in smiles] 
    smiles = [Chem.MolToSmiles(mol) for mol in mols]
    return smiles

Canon_SMILES = canonical_smiles(dataset_new.SMILES)
duplicates_smiles = dataset_new[dataset_new['SMILES'].duplicated()]['SMILES'].values
len(duplicates_smiles)
dataset_new = dataset_new.drop_duplicates(subset=['SMILES'])
dataset_new.shape

(3316, 2)

In [9]:
def morgan_fpts(data):
    Morgan_fpts = []
    for i in data:
        mol = Chem.MolFromSmiles(i) 
        fpts =  AllChem.GetMorganFingerprintAsBitVect(mol,2,2048)
        mfpts = np.array(fpts)
        Morgan_fpts.append(mfpts)  
    return np.array(Morgan_fpts)

In [10]:
Morgan_fpts = morgan_fpts(dataset_new['SMILES'])
morgan_fpts_prediction = pd.DataFrame(Morgan_fpts)

In [11]:
morgan_fpts_prediction.shape

(3316, 2048)

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

new_data = morgan_fpts


X_new = new_data

scaler = MinMaxScaler()
X_new_normalized = scaler.fit_transform(X_new)

model = tf.keras.models.load_model('model.h5')

y_pred = model.predict(X_new_normalized)
y_pred_labels = (y_pred >= 0.5).astype(int)

prediction_scores = y_pred.flatten()
predictions = pd.DataFrame({
    'label': y_pred_labels.flatten(),
    'ranking': pd.Series(prediction_scores).rank(ascending=False),
    'score': prediction_scores
})
predictions