<a href="https://colab.research.google.com/github/JyothyVariyampat/Benchmark-ML-and-DL-Models-for-Small-Molecule-Immunity-Target-Bioactivity-Prediction./blob/main/Ensemble_of_CNN_Deepclassifier_1239_Full_RDKitDescriptors_17Nov2024_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""Deepclassifier-aid1239_revised.ipynb"""

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score, cohen_kappa_score, precision_score, recall_score

from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

from google.colab import drive


In [None]:
# Ensure reproducibility
import random
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)


In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Load the dataset
#df = pd.read_csv('/content/drive/MyDrive/Datasets/AID_1239/AID1239_extracted_descriptor_data.csv')
#/content/drive/MyDrive/Datasets/AID_1239/concatenated_AID_1239_Active_Inactive.csv
df = pd.read_csv('/content/drive/MyDrive/Datasets/AID_1239/concatenated_AID_1239_Active_Inactive.csv')
df_copy = df.copy()


In [None]:
df.shape

(4000, 24283)

In [None]:
# EDA 1
# Calculate the percentage of NaN values in each column


nan_percentage = df.isna().mean() * 100

# Find columns where more than 50% of the values are NaN
columns_with_nan_above_50 = nan_percentage[nan_percentage > 50].index.tolist()

print(f"Number of columns with more than 50% NaN values: {len(columns_with_nan_above_50)}")
print("Columns:", columns_with_nan_above_50)


Number of columns with more than 50% NaN values: 8416
Columns: ['morgan_counts_331095558', 'morgan_counts_670649866', 'morgan_counts_4104655381', 'morgan_counts_3310046752', 'morgan_counts_810097194', 'morgan_counts_729008687', 'morgan_counts_2995670577', 'morgan_counts_1279384625', 'morgan_counts_616215095', 'morgan_counts_3183596602', 'morgan_counts_2742604860', 'morgan_counts_3703709255', 'morgan_counts_3867751498', 'morgan_counts_2190227530', 'morgan_counts_270573645', 'morgan_counts_672648274', 'morgan_counts_4018573394', 'morgan_counts_1766270060', 'morgan_counts_3020277366', 'morgan_counts_670700536', 'morgan_counts_4021193861', 'morgan_counts_752412296', 'morgan_counts_3660960393', 'morgan_counts_4173243026', 'morgan_counts_4019697815', 'morgan_counts_3766531226', 'morgan_counts_810098337', 'morgan_counts_4263347881', 'morgan_counts_296409257', 'morgan_counts_178375861', 'morgan_counts_2641106110', 'morgan_counts_318115522', 'morgan_counts_1496549075', 'morgan_counts_3020085465

In [None]:
# EDA 2
# Calculate the percentage of zero values in each column



zero_percentage = (df == 0).mean() * 100

# Find columns where more than 50% of the values are 0
columns_with_zero_above_50 = zero_percentage[zero_percentage > 50].index.tolist()

print(f"Number of columns with more than 50% zero values: {len(columns_with_zero_above_50)}")
print("Columns:", columns_with_zero_above_50)


Number of columns with more than 50% zero values: 3336
Columns: ['rdf27', 'rdf28', 'rdf29', 'rdf56', 'rdf57', 'rdf58', 'rdf59', 'rdf86', 'rdf87', 'rdf88', 'rdf89', 'rdf117', 'rdf118', 'rdf119', 'rdf147', 'rdf148', 'rdf149', 'rdf177', 'rdf178', 'rdf179', 'rdf207', 'rdf208', 'rdf209', 'maccs0', 'maccs1', 'maccs2', 'maccs3', 'maccs4', 'maccs5', 'maccs6', 'maccs7', 'maccs8', 'maccs9', 'maccs10', 'maccs11', 'maccs12', 'maccs13', 'maccs14', 'maccs15', 'maccs16', 'maccs17', 'maccs18', 'maccs19', 'maccs20', 'maccs21', 'maccs22', 'maccs23', 'maccs24', 'maccs25', 'maccs26', 'maccs27', 'maccs28', 'maccs29', 'maccs30', 'maccs31', 'maccs32', 'maccs33', 'maccs34', 'maccs35', 'maccs36', 'maccs37', 'maccs38', 'maccs39', 'maccs40', 'maccs41', 'maccs42', 'maccs43', 'maccs44', 'maccs45', 'maccs46', 'maccs47', 'maccs48', 'maccs49', 'maccs50', 'maccs51', 'maccs52', 'maccs53', 'maccs54', 'maccs55', 'maccs56', 'maccs57', 'maccs58', 'maccs59', 'maccs60', 'maccs61', 'maccs62', 'maccs63', 'maccs64', 'maccs66', 

In [None]:
# Remove all the columns which falls in this categories:
# Number of columns with more than 50% NaN values:
# Number of columns with more than 50% zero values:

import pandas as pd

# Assuming 'data' is your DataFrame

# 1. Identify columns with more than 50% NaN values
threshold_nan = len(df) * 0.5
columns_with_nan = df.columns[df.isna().sum() > threshold_nan]

# 2. Identify columns with more than 50% zero values
threshold_zero = len(df) * 0.5
columns_with_zero = df.columns[(df == 0).sum() > threshold_zero]

# 3. Combine both sets of columns to drop
columns_to_drop = set(columns_with_nan).union(set(columns_with_zero))

# 4. Drop these columns from the DataFrame
data_cleaned = df.drop(columns=columns_to_drop)

# Check the shape of the new DataFrame to confirm the columns have been removed
print(f"Original shape: {df.shape}")
print(f"New shape after dropping columns: {data_cleaned.shape}")


Original shape: (4000, 24283)
New shape after dropping columns: (4000, 12531)


In [None]:
df = data_cleaned.copy()

In [None]:
# Step 1: Fill null values with zero
df.fillna(0, inplace=True)
df = df.round(3)


In [None]:
# Save the preprocessed dataset for future reference
df.to_csv('/content/drive/MyDrive/Datasets/AID_1239/preprocessed_AID1239_12531Columns_CNN__Active_Inactive.csv', index=False)
print(df.head())

   Unnamed: 0  PUBCHEM_CID  PUBCHEM_SID  \
0           0      1449342     24817956   
1           1      3242114      4247974   
2           2      9594900     17507393   
3           3       823601      7975245   
4           4      3239831      4245352   

                                            SMILES PUBCHEM_ACTIVITY_OUTCOME  \
0  C1=CC=C(C=C1)C(=O)NC2=NC=C(C=C2)NC(=O)C3=CC=CS3                   Active   
1          C1COCCN1C(=O)C2=NOC(=C2)C3=CC=C(C=C3)Cl                   Active   
2      CC1=CC(=NC2=CC=CC=C12)N/N=C/C3=CC(=CC=C3)Br                   Active   
3            CC1=C(C=CO1)C(=O)NC2=CC3=CC=CC=C3C=C2                   Active   
4            C1=CC=C(C(=C1)NC(=O)C2=CC=C(C=C2)Cl)O                   Active   

                                        SMILES.1 MOLECULEID  autocorr2d0  \
0  O=C(NC1=CN=C(NC(=O)C2=CC=CC=C2)C=C1)C1=CC=CS1   M3503957        3.470   
1        ClC1=CC=C(C=C1)C1=CC(=NO1)C(=O)N1CCOCC1   M2730310        3.353   
2    CC1=CC(N\N=C\C2=CC(Br)=CC=C2)=NC2

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,PUBCHEM_CID,PUBCHEM_SID,SMILES,PUBCHEM_ACTIVITY_OUTCOME,SMILES.1,MOLECULEID,autocorr2d0,autocorr2d1,autocorr2d2,...,"atom_pairs_((N,1,2),14,(*,1,0))","atom_pairs_((N,3,0),12,(O,1,1))",morgan_counts_943520092,"atom_pairs_((O,2,0),6,(S,4,0))",morgan_counts_640577968,"atom_pairs_((C,2,2),14,(*,1,0))","bpf_((B,5,0),16,(B,5,0))","atom_pairs_((C,4,0),15,(Cl,1,0))",morgan_counts_469020719,morgan_counts_594640005
0,0,1449342,24817956,C1=CC=C(C=C1)C(=O)NC2=NC=C(C=C2)NC(=O)C3=CC=CS3,Active,O=C(NC1=CN=C(NC(=O)C2=CC=CC=C2)C=C1)C1=CC=CS1,M3503957,3.47,3.799,3.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3242114,4247974,C1COCCN1C(=O)C2=NOC(=C2)C3=CC=C(C=C3)Cl,Active,ClC1=CC=C(C=C1)C1=CC(=NO1)C(=O)N1CCOCC1,M2730310,3.353,3.709,3.697,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,9594900,17507393,CC1=CC(=NC2=CC=CC=C12)N/N=C/C3=CC(=CC=C3)Br,Active,CC1=CC(N\N=C\C2=CC(Br)=CC=C2)=NC2=CC=CC=C12,M3349978,3.462,3.85,3.88,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,823601,7975245,CC1=C(C=CO1)C(=O)NC2=CC3=CC=CC=C3C=C2,Active,CC1=C(C=CO1)C(=O)NC1=CC2=CC=CC=C2C=C1,M2603287,3.196,3.537,3.501,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,3239831,4245352,C1=CC=C(C(=C1)NC(=O)C2=CC=C(C=C2)Cl)O,Active,OC1=CC=CC=C1NC(=O)C1=CC=C(Cl)C=C1,M1281287,3.128,3.483,3.514,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Define target and columns to drop
target_column = 'PUBCHEM_ACTIVITY_OUTCOME'
drop_columns = ['Unnamed: 0','SMILES.1','PUBCHEM_SID', 'PUBCHEM_CID', 'SMILES', 'MOLECULEID']


In [None]:
# Drop unnecessary columns
df.drop(columns=drop_columns, inplace=True)


In [None]:
# # Encode target column (Active/Inactive)
# label_encoder = LabelEncoder()
# df[target_column] = label_encoder.fit_transform(df[target_column])

# prompt: i wnat ed to do label encoding:
# # Encode target column (Active/Inactive)
# label_encoder = LabelEncoder()
# df[target_column] = label_encoder.fit_transform(df[target_column])
# and wanted to know 0 for which class and 1 for which class

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# ... (Your existing code)

# Encode target column (Active/Inactive)
label_encoder = LabelEncoder()
df[target_column] = label_encoder.fit_transform(df[target_column])

# Print the mapping
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

{'Active': 0, 'Inactive': 1}


In [None]:
# Split the features and target
X = df.drop(columns=[target_column]).values
y = df[target_column].values

In [None]:
# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Check for NaN and infinity values, replace if needed
X_train[np.isinf(X_train)] = np.nan
X_train = np.nan_to_num(X_train, nan=np.nanmax(X_train))

X_test[np.isinf(X_test)] = np.nan
X_test = np.nan_to_num(X_test, nan=np.nanmax(X_test))

In [None]:
# Step 3: Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Define the CNN model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1), kernel_regularizer=l2(0.01)))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Conv1D(filters=32, kernel_size=3, activation='relu', kernel_regularizer=l2(0.01)))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Flatten())
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the model
optimizer = Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Early stopping
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [None]:
# Train the model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64, callbacks=[early_stopping])


Epoch 1/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 52ms/step - accuracy: 0.6137 - loss: 2.2498 - val_accuracy: 0.7713 - val_loss: 1.7399
Epoch 2/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.7737 - loss: 1.6177 - val_accuracy: 0.7862 - val_loss: 1.4294
Epoch 3/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8122 - loss: 1.3315 - val_accuracy: 0.7950 - val_loss: 1.2602
Epoch 4/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8316 - loss: 1.1566 - val_accuracy: 0.8075 - val_loss: 1.1566
Epoch 5/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8530 - loss: 1.0433 - val_accuracy: 0.8050 - val_loss: 1.0821
Epoch 6/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.8829 - loss: 0.9329 - val_accuracy: 0.8050 - val_loss: 1.0277
Epoch 7/50
[1m50/50[0m [32m━━━

In [None]:
# Evaluate the model
train_loss, train_acc = model.evaluate(X_train, y_train, verbose=0)
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)



In [None]:
# Make predictions
y_train_pred = model.predict(X_train).flatten()
y_train_pred_classes = (y_train_pred > 0.5).astype(int)
y_pred = model.predict(X_test).flatten()
y_pred_classes = (y_pred > 0.5).astype(int)


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [None]:
# Calculate metrics for training data
train_roc_auc = roc_auc_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred_classes)
train_kappa = cohen_kappa_score(y_train, y_train_pred_classes)
train_precision = precision_score(y_train, y_train_pred_classes)
train_recall = recall_score(y_train, y_train_pred_classes)


In [None]:
# Calculate metrics for testing data
test_roc_auc = roc_auc_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred_classes)
test_kappa = cohen_kappa_score(y_test, y_pred_classes)
test_precision = precision_score(y_test, y_pred_classes)
test_recall = recall_score(y_test, y_pred_classes)

In [None]:
# Output the results
results = {
    "train_accuracy": train_acc,
    "train_roc_auc": train_roc_auc,
    "train_f1": train_f1,
    "train_kappa": train_kappa,
    "train_precision": train_precision,
    "train_recall": train_recall,
    "test_accuracy": test_acc,
    "test_roc_auc": test_roc_auc,
    "test_f1": test_f1,
    "test_kappa": test_kappa,
    "test_precision": test_precision,
    "test_recall": test_recall
}

print("\nModel Evaluation Metrics:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")


Model Evaluation Metrics:
train_accuracy: 0.9388
train_roc_auc: 0.9830
train_f1: 0.9409
train_kappa: 0.8774
train_precision: 0.9204
train_recall: 0.9624
test_accuracy: 0.8087
test_roc_auc: 0.8752
test_f1: 0.7989
test_kappa: 0.6166
test_precision: 0.7937
test_recall: 0.8042


In [None]:

# Find unique values in y_test and y_pred_classes
unique_y_test = np.unique(y_test)
unique_y_pred = np.unique(y_pred_classes)

print("Unique values in y_test:", unique_y_test)
print("Unique values in y_pred_classes:", unique_y_pred)

Unique values in y_test: [0 1]
Unique values in y_pred_classes: [0 1]


In [None]:

results_df = pd.DataFrame({
    'True_Label': y_test,
    'Predicted_Label': y_pred_classes
})

# Map the numerical values back to their respective class labels if required
label_mapping = {0: 'Active', 1: 'Inactive'}
results_df['True_Label'] = results_df['True_Label'].map(label_mapping)
results_df['Predicted_Label'] = results_df['Predicted_Label'].map(label_mapping)

# Display the DataFrame
print(results_df.head())

# Save the DataFrame to a CSV file for reference
results_df.to_csv('/content/drive/MyDrive/Datasets/AID_1239/CNN_AID_1239_Predicted_Results.csv', index=False)


  True_Label Predicted_Label
0     Active          Active
1   Inactive        Inactive
2     Active          Active
3   Inactive        Inactive
4   Inactive        Inactive


In [None]:


# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_classes)
print("Confusion Matrix:\n", conf_matrix)

# Generate classification report
class_report = classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_)
print("Classification Report:\n", class_report)

Confusion Matrix:
 [[343  79]
 [ 74 304]]
Classification Report:
               precision    recall  f1-score   support

      Active       0.82      0.81      0.82       422
    Inactive       0.79      0.80      0.80       378

    accuracy                           0.81       800
   macro avg       0.81      0.81      0.81       800
weighted avg       0.81      0.81      0.81       800



In [None]:
# Confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred_classes)
print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes))


Confusion Matrix:
[[343  79]
 [ 74 304]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.81      0.82       422
           1       0.79      0.80      0.80       378

    accuracy                           0.81       800
   macro avg       0.81      0.81      0.81       800
weighted avg       0.81      0.81      0.81       800



In [None]:
# prompt: save the model name "FFNN_Deepclassifier_1239_Full_RDKitDescriptors_17Nov2024_final.mdl"

model_name = "CNN_Deepclassifier_1239_Full_RDKitDescriptors_17Nov2024_final.h5"
model.save(f'/content/drive/MyDrive/Models/AID_1239/{model_name}')



In [None]:
# Change the model name to use the native Keras format
model_name = "CNN_Deepclassifier_1239_Full_RDKitDescriptors_17Nov2024_final.keras"
model.save(f'/content/drive/MyDrive/Models/AID_1239/{model_name}')

**This code is for checking AND and OR operation for ensemble learning on the result with same test sample data**


In [None]:
CNN_1239_same_test_data = pd.read_csv("/content/drive/MyDrive/Datasets/AID_1239/CNN_AID_1239_Predicted_Results.csv")
FFNN_1239_same_test_data = pd.read_csv("/content/drive/MyDrive/Datasets/AID_1239/FFNN_AID_1239_Predicted_Results.csv")

In [None]:
CNN_1239_same_test_data.head()

Unnamed: 0,True_Label,Predicted_Label
0,Active,Active
1,Inactive,Inactive
2,Active,Active
3,Inactive,Inactive
4,Inactive,Inactive


In [None]:
FFNN_1239_same_test_data.head()

Unnamed: 0,True_Label,Predicted_Label
0,Active,Active
1,Inactive,Inactive
2,Active,Active
3,Inactive,Inactive
4,Inactive,Inactive


In [None]:
# prompt: # Perform logical AND operation between CNN_1239_same_test_data and FFNN_1239_same_test_data for the column Predicted_Label.
# Before logical AND operation change the column names of each df's Predicted_Label with prefix CNN_1239 and FFNN_1239.
# ALso give following metrics as result: test_accuracy	roc_auc	f1 Score	kappa	precision	recall

import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, cohen_kappa_score, precision_score, recall_score

# Load the dataframes
CNN_1239_same_test_data = pd.read_csv("/content/drive/MyDrive/Datasets/AID_1239/CNN_AID_1239_Predicted_Results.csv")
FFNN_1239_same_test_data = pd.read_csv("/content/drive/MyDrive/Datasets/AID_1239/FFNN_AID_1239_Predicted_Results.csv")

# Rename the 'Predicted_Label' columns
CNN_1239_same_test_data = CNN_1239_same_test_data.rename(columns={'Predicted_Label': 'CNN_1239_Predicted_Label'})
FFNN_1239_same_test_data = FFNN_1239_same_test_data.rename(columns={'Predicted_Label': 'FFNN_1239_Predicted_Label'})

# Merge the dataframes
merged_df = pd.merge(CNN_1239_same_test_data, FFNN_1239_same_test_data, on='True_Label')

# Perform logical AND operation
merged_df['Logical_AND'] = (merged_df['CNN_1239_Predicted_Label'] == 'Inactive') & (merged_df['FFNN_1239_Predicted_Label'] == 'Inactive')

# Convert boolean to string 'Inactive' or 'Active'
merged_df['Logical_AND'] = merged_df['Logical_AND'].map({True: 'Inactive', False: 'Active'})


# Calculate metrics
y_true = merged_df['True_Label']
y_pred = merged_df['Logical_AND']

test_accuracy = accuracy_score(y_true, y_pred)
# Assuming 'Inactive' represents the positive class (1) and 'Active' the negative class (0)
# Create a mapping for label encoding (binary classification)
label_mapping = {'Inactive': 1, 'Active': 0}
y_true_encoded = y_true.map(label_mapping)
y_pred_encoded = y_pred.map(label_mapping)

roc_auc = roc_auc_score(y_true_encoded, y_pred_encoded)
f1 = f1_score(y_true_encoded, y_pred_encoded)
kappa = cohen_kappa_score(y_true, y_pred)
precision = precision_score(y_true_encoded, y_pred_encoded)
recall = recall_score(y_true_encoded, y_pred_encoded)


# Output the results
print(f"test_accuracy: {test_accuracy:.2f}")
print(f"roc_auc: {roc_auc:.2f}")
print(f"f1 Score: {f1:.2f}")
print(f"kappa: {kappa:.2f}")
print(f"precision: {precision:.2f}")
print(f"recall: {recall:.2f}")

test_accuracy: 0.83
roc_auc: 0.82
f1 Score: 0.78
kappa: 0.65
precision: 0.93
recall: 0.67


In [None]:
#  logical OR

# Perform logical OR operation
merged_df['Logical_OR'] = (merged_df['CNN_1239_Predicted_Label'] == 'Inactive') | (merged_df['FFNN_1239_Predicted_Label'] == 'Inactive')

# Convert boolean to string 'Inactive' or 'Active'
merged_df['Logical_OR'] = merged_df['Logical_OR'].map({True: 'Inactive', False: 'Active'})

# Calculate metrics
y_true = merged_df['True_Label']
y_pred = merged_df['Logical_OR']

test_accuracy = accuracy_score(y_true, y_pred)
# Assuming 'Inactive' represents the positive class (1) and 'Active' the negative class (0)
# Create a mapping for label encoding (binary classification)
label_mapping = {'Inactive': 1, 'Active': 0}
y_true_encoded = y_true.map(label_mapping)
y_pred_encoded = y_pred.map(label_mapping)

roc_auc = roc_auc_score(y_true_encoded, y_pred_encoded)
f1 = f1_score(y_true_encoded, y_pred_encoded)
kappa = cohen_kappa_score(y_true, y_pred)
precision = precision_score(y_true_encoded, y_pred_encoded)
recall = recall_score(y_true_encoded, y_pred_encoded)

# Output the results
print(f"test_accuracy: {test_accuracy:.2f}")
print(f"roc_auc: {roc_auc:.2f}")
print(f"f1 Score: {f1:.2f}")
print(f"kappa: {kappa:.2f}")
print(f"precision: {precision:.2f}")
print(f"recall: {recall:.2f}")

test_accuracy: 0.79
roc_auc: 0.80
f1 Score: 0.80
kappa: 0.58
precision: 0.68
recall: 0.97


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, cohen_kappa_score, precision_score, recall_score

# Load the dataframes
CNN_1239_same_test_data = pd.read_csv("/content/drive/MyDrive/Datasets/AID_1239/CNN_AID_1239_Predicted_Results.csv")
FFNN_1239_same_test_data = pd.read_csv("/content/drive/MyDrive/Datasets/AID_1239/FFNN_AID_1239_Predicted_Results.csv")

# Rename the 'Predicted_Label' columns
CNN_1239_same_test_data.rename(columns={'Predicted_Label': 'Predicted_Label_CNN_1239'}, inplace=True)
FFNN_1239_same_test_data.rename(columns={'Predicted_Label': 'Predicted_Label_FFNN_1239'}, inplace=True)

# Merge the dataframes on the common ground truth column
merged_df = pd.merge(CNN_1239_same_test_data, FFNN_1239_same_test_data, on='True_Label')

# Map 'Active' to 0 and 'Inactive' to 1 for logical operations
label_mapping = {'Active': 0, 'Inactive': 1}
merged_df['Predicted_Label_CNN_1239'] = merged_df['Predicted_Label_CNN_1239'].map(label_mapping)
merged_df['Predicted_Label_FFNN_1239'] = merged_df['Predicted_Label_FFNN_1239'].map(label_mapping)
merged_df['True_Label'] = merged_df['True_Label'].map(label_mapping)

# Perform logical AND operation between the predicted labels
merged_df['Logical_AND'] = (
    merged_df['Predicted_Label_CNN_1239'] & merged_df['Predicted_Label_FFNN_1239']
)

# Map back the results to 'Active' and 'Inactive' for readability
reverse_mapping = {0: 'Active', 1: 'Inactive'}
merged_df['Logical_AND'] = merged_df['Logical_AND'].map(reverse_mapping)
merged_df['True_Label'] = merged_df['True_Label'].map(reverse_mapping)

# Calculate metrics
y_true = merged_df['True_Label'].map(label_mapping).values
y_pred = merged_df['Logical_AND'].map(label_mapping).values

# Metrics Calculation
metrics = {
    "test_accuracy": accuracy_score(y_true, y_pred),
    "roc_auc": roc_auc_score(y_true, y_pred),
    "f1_score": f1_score(y_true, y_pred),
    "kappa": cohen_kappa_score(y_true, y_pred),
    "precision": precision_score(y_true, y_pred),
    "recall": recall_score(y_true, y_pred),
}

# Print metrics
print("Metrics after Logical AND:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")


Metrics after Logical AND:
test_accuracy: 0.8329
roc_auc: 0.8173
f1_score: 0.7823
kappa: 0.6523
precision: 0.9313
recall: 0.6744


In [None]:
merged_df

Unnamed: 0,True_Label,Predicted_Label_CNN_1239,Predicted_Label_FFNN_1239,Logical_AND
0,Active,0,0,Active
1,Active,0,0,Active
2,Active,0,0,Active
3,Active,0,0,Active
4,Active,0,0,Active
...,...,...,...,...
320963,Inactive,1,1,Inactive
320964,Inactive,1,1,Inactive
320965,Inactive,1,1,Inactive
320966,Inactive,1,1,Inactive


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, cohen_kappa_score, precision_score, recall_score

# Load the dataframes
CNN_1239_same_test_data = pd.read_csv("/content/drive/MyDrive/Datasets/AID_1239/CNN_AID_1239_Predicted_Results.csv")
FFNN_1239_same_test_data = pd.read_csv("/content/drive/MyDrive/Datasets/AID_1239/FFNN_AID_1239_Predicted_Results.csv")

# Rename the 'Predicted_Label' columns
CNN_1239_same_test_data.rename(columns={'Predicted_Label': 'Predicted_Label_CNN_1239'}, inplace=True)
FFNN_1239_same_test_data.rename(columns={'Predicted_Label': 'Predicted_Label_FFNN_1239'}, inplace=True)

# Merge the dataframes
merged_df = pd.merge(CNN_1239_same_test_data, FFNN_1239_same_test_data, on='True_Label')

# Map Active/Inactive to binary values
label_mapping = {'Active': 0, 'Inactive': 1}
merged_df['Predicted_Label_CNN_1239'] = merged_df['Predicted_Label_CNN_1239'].map(label_mapping)
merged_df['Predicted_Label_FFNN_1239'] = merged_df['Predicted_Label_FFNN_1239'].map(label_mapping)
merged_df['True_Label'] = merged_df['True_Label'].map(label_mapping)

# Perform Logical OR operation
merged_df['Logical_OR'] = (
    merged_df['Predicted_Label_CNN_1239'] | merged_df['Predicted_Label_FFNN_1239']
)

# Map back to readable labels
reverse_mapping = {0: 'Active', 1: 'Inactive'}
merged_df['Logical_OR'] = merged_df['Logical_OR'].map(reverse_mapping)
merged_df['True_Label'] = merged_df['True_Label'].map(reverse_mapping)

# Prepare data for metric calculations
y_true = merged_df['True_Label'].map(label_mapping).values
y_pred = merged_df['Logical_OR'].map(label_mapping).values

# Calculate evaluation metrics
metrics = {
    "test_accuracy": accuracy_score(y_true, y_pred),
    "roc_auc": roc_auc_score(y_true, y_pred),
    "f1_score": f1_score(y_true, y_pred),
    "kappa": cohen_kappa_score(y_true, y_pred),
    "precision": precision_score(y_true, y_pred),
    "recall": recall_score(y_true, y_pred),
}

# Output the metrics
print("Metrics after Logical OR:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")


Metrics after Logical OR:
test_accuracy: 0.7859
roc_auc: 0.8039
f1_score: 0.8011
kappa: 0.5838
precision: 0.6830
recall: 0.9684


**Logical AND**: Focuses on precision, improving test accuracy, precision, and agreement (Kappa) at the cost of recall and F1 score.

**Logical OR:** Prioritizes inclusivity, achieving the highest recall but slightly lower precision and Kappa scores.

**Below code is with different sample test data**

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Datasets/AID_1239/preprocessed_AID1239_12531Columns_CNN__Active_Inactive.csv')

In [None]:
df

Unnamed: 0.1,Unnamed: 0,PUBCHEM_CID,PUBCHEM_SID,SMILES,PUBCHEM_ACTIVITY_OUTCOME,SMILES.1,MOLECULEID,autocorr2d0,autocorr2d1,autocorr2d2,...,"atom_pairs_((N,1,2),14,(*,1,0))","atom_pairs_((N,3,0),12,(O,1,1))",morgan_counts_943520092,"atom_pairs_((O,2,0),6,(S,4,0))",morgan_counts_640577968,"atom_pairs_((C,2,2),14,(*,1,0))","bpf_((B,5,0),16,(B,5,0))","atom_pairs_((C,4,0),15,(Cl,1,0))",morgan_counts_469020719,morgan_counts_594640005
0,0,1449342,24817956,C1=CC=C(C=C1)C(=O)NC2=NC=C(C=C2)NC(=O)C3=CC=CS3,Active,O=C(NC1=CN=C(NC(=O)C2=CC=CC=C2)C=C1)C1=CC=CS1,M3503957,3.470,3.799,3.750,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3242114,4247974,C1COCCN1C(=O)C2=NOC(=C2)C3=CC=C(C=C3)Cl,Active,ClC1=CC=C(C=C1)C1=CC(=NO1)C(=O)N1CCOCC1,M2730310,3.353,3.709,3.697,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,9594900,17507393,CC1=CC(=NC2=CC=CC=C12)N/N=C/C3=CC(=CC=C3)Br,Active,CC1=CC(N\N=C\C2=CC(Br)=CC=C2)=NC2=CC=CC=C12,M3349978,3.462,3.850,3.880,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,823601,7975245,CC1=C(C=CO1)C(=O)NC2=CC3=CC=CC=C3C=C2,Active,CC1=C(C=CO1)C(=O)NC1=CC2=CC=CC=C2C=C1,M2603287,3.196,3.537,3.501,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,3239831,4245352,C1=CC=C(C(=C1)NC(=O)C2=CC=C(C=C2)Cl)O,Active,OC1=CC=CC=C1NC(=O)C1=CC=C(Cl)C=C1,M1281287,3.128,3.483,3.514,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3995,826714,22411915,C1=CC=C(C=C1)CNC(=S)NC2=CC(=CC=C2)Cl,Inactive,ClC1=CC=CC(NC(=S)NCC2=CC=CC=C2)=C1,M1909727,3.235,3.589,3.529,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3996,3996,884669,17411789,CCCC(=O)NC(=S)NC1=CC=CC=C1C(=O)OC,Inactive,CCCC(=O)NC(=S)NC1=CC=CC=C1C(=O)OC,M1293272,3.220,3.548,3.580,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3997,3997,2427661,22404811,COC(=O)C1=CC(=CC(=C1)[N+](=O)[O-])C(=O)OCC(=O)...,Inactive,COC(=O)C1=CC(=CC(=C1)[N+]([O-])=O)C(=O)OCC(=O)...,M1153565,3.679,4.027,4.047,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3998,3998,662514,861255,CCOC(=O)C1=CC=C(C=C1)N2C(C3=C(NN=C3C2=O)C)C4=C...,Inactive,CCOC(=O)C1=CC=C(C=C1)N1C(C2=C(C)NN=C2C1=O)C1=C...,M1203346,3.709,4.079,4.206,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# prompt: from the above dataframe select 10 % Active and 10% Inactive randomly and  reproducible and name it df_test

# Assuming 'df' is your DataFrame and 'target_column' is defined as before
target_column = "PUBCHEM_ACTIVITY_OUTCOME"
active_indices = df[df[target_column] == 'Active'].index
inactive_indices = df[df[target_column] == 'Inactive'].index

# Calculate the number of samples to select
n_active = int(0.1 * len(active_indices))
n_inactive = int(0.1 * len(inactive_indices))

# Create a RandomState instance with a seed for reproducibility
rng = np.random.RandomState(42)

# Randomly select indices (reproducible with random_state)
active_test_indices = rng.choice(active_indices, size=n_active, replace=False) # Use rng.choice()
inactive_test_indices = rng.choice(inactive_indices, size=n_inactive, replace=False) # Use rng.choice()

# Combine the selected indices
test_indices = np.concatenate((active_test_indices, inactive_test_indices))

# Create the test DataFrame
df_test = df.loc[test_indices]

In [None]:
df_test

Unnamed: 0.1,Unnamed: 0,PUBCHEM_CID,PUBCHEM_SID,SMILES,PUBCHEM_ACTIVITY_OUTCOME,SMILES.1,MOLECULEID,autocorr2d0,autocorr2d1,autocorr2d2,...,"atom_pairs_((N,1,2),14,(*,1,0))","atom_pairs_((N,3,0),12,(O,1,1))",morgan_counts_943520092,"atom_pairs_((O,2,0),6,(S,4,0))",morgan_counts_640577968,"atom_pairs_((C,2,2),14,(*,1,0))","bpf_((B,5,0),16,(B,5,0))","atom_pairs_((C,4,0),15,(Cl,1,0))",morgan_counts_469020719,morgan_counts_594640005
1860,1860,652981,850460,CC(C)COC1=CC=C(C=C1)C2=NN=C(O2)C3=CC=CO3,Active,CC(C)COC1=CC=C(C=C1)C1=NN=C(O1)C1=CC=CO1,M2602639,3.333,3.659,3.553,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353,353,3242714,4248662,CC1=CC(=CC=C1)C2=NSC(=N2)SCC(=O)N3CCCC3,Active,CC1=CC(=CC=C1)C1=NSC(SCC(=O)N2CCCC2)=N1,M2839771,3.523,3.931,3.790,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1333,1333,2999012,3714920,CC1=CC=C(C=C1)N=CC2=C(OC(=N2)C3=CC=CO3)O,Active,CC1=CC=C(C=C1)N=CC1=C(O)OC(=N1)C1=CC=CO1,M3719518,3.272,3.613,3.494,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
905,905,6870079,14724612,C1CC2=C(C1)NN=C2C(=O)N/N=C/C(=C/C3=CC=CC=C3)/Br,Active,Br\C(\C=N\NC(=O)C1=NNC2=C1CCC2)=C/C1=CC=CC=C1,M3462495,3.516,3.901,3.897,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1289,1289,714147,17402026,C1CC1C2=CSC(=N2)NC3=CC=C(C=C3)Cl,Active,ClC1=CC=C(NC2=NC(=CS2)C2CC2)C=C1,M3356492,3.254,3.581,3.438,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3260,3260,2099465,24783803,CCN1C2=C(C=C(C=C2)NC(=O)COC(=O)C3=CC=CC=C3O)C4...,Inactive,CCN1C2=CC=CC=C2C2=C1C=CC(NC(=O)COC(=O)C1=CC=CC...,M1493870,3.617,3.985,4.074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2357,2357,2947126,7964816,C1C(NC2=NC(=NN2C1C3=CC(=CC=C3)Br)N)C4=CC=CC=C4,Inactive,NC1=NN2C(CC(NC2=N1)C1=CC=CC=C1)C1=CC(Br)=CC=C1,M1076572,3.576,4.014,4.031,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2534,2534,2205262,24791987,CCCCC(=O)NC1=CC=C(C=C1)C(=O)NCCC2=CC=C(C=C2)S(...,Inactive,CCCCC(=O)NC1=CC=C(C=C1)C(=O)NCCC1=CC=C(C=C1)S(...,M3494096,3.747,3.967,3.963,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
3118,3118,2200259,4260589,COC(=O)C1=CC=CC=C1CON2C3=CC=CC=C3[N+](=CC2=O)[O-],Inactive,COC(=O)C1=CC=CC=C1CON1C(=O)C=[N+]([O-])C2=CC=C...,M3454341,3.454,3.794,3.934,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#df_test.to_csv('/content/drive/MyDrive/Datasets/AID_1239/df_test_10percent_Active_Inactive.csv')

In [None]:
df_test = pd.read_csv('/content/drive/MyDrive/Datasets/AID_1239/df_test_10percent_Active_Inactive.csv')

In [None]:
df_test['PUBCHEM_ACTIVITY_OUTCOME'].value_counts()

Unnamed: 0_level_0,count
PUBCHEM_ACTIVITY_OUTCOME,Unnamed: 1_level_1
Active,200
Inactive,200


In [None]:
df_test_copy = df_test.copy()

In [None]:
df_test_copy.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,PUBCHEM_CID,PUBCHEM_SID,SMILES,PUBCHEM_ACTIVITY_OUTCOME,SMILES.1,MOLECULEID,autocorr2d0,autocorr2d1,...,"atom_pairs_((N,1,2),14,(*,1,0))","atom_pairs_((N,3,0),12,(O,1,1))",morgan_counts_943520092,"atom_pairs_((O,2,0),6,(S,4,0))",morgan_counts_640577968,"atom_pairs_((C,2,2),14,(*,1,0))","bpf_((B,5,0),16,(B,5,0))","atom_pairs_((C,4,0),15,(Cl,1,0))",morgan_counts_469020719,morgan_counts_594640005
0,1860,1860,652981,850460,CC(C)COC1=CC=C(C=C1)C2=NN=C(O2)C3=CC=CO3,Active,CC(C)COC1=CC=C(C=C1)C1=NN=C(O1)C1=CC=CO1,M2602639,3.333,3.659,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,353,353,3242714,4248662,CC1=CC(=CC=C1)C2=NSC(=N2)SCC(=O)N3CCCC3,Active,CC1=CC(=CC=C1)C1=NSC(SCC(=O)N2CCCC2)=N1,M2839771,3.523,3.931,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1333,1333,2999012,3714920,CC1=CC=C(C=C1)N=CC2=C(OC(=N2)C3=CC=CO3)O,Active,CC1=CC=C(C=C1)N=CC1=C(O)OC(=N1)C1=CC=CO1,M3719518,3.272,3.613,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,905,905,6870079,14724612,C1CC2=C(C1)NN=C2C(=O)N/N=C/C(=C/C3=CC=CC=C3)/Br,Active,Br\C(\C=N\NC(=O)C1=NNC2=C1CCC2)=C/C1=CC=CC=C1,M3462495,3.516,3.901,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1289,1289,714147,17402026,C1CC1C2=CSC(=N2)NC3=CC=C(C=C3)Cl,Active,ClC1=CC=C(NC2=NC(=CS2)C2CC2)C=C1,M3356492,3.254,3.581,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Define target and columns to drop
target_column = 'PUBCHEM_ACTIVITY_OUTCOME'
drop_columns = ['Unnamed: 0','SMILES.1','PUBCHEM_SID', 'PUBCHEM_CID', 'SMILES', 'MOLECULEID','PUBCHEM_ACTIVITY_OUTCOME']
#drop_columns = [ 'SMILES', 'MOLECULEID']


In [None]:
# Drop unnecessary columns
df_test_copy.drop(columns=drop_columns, inplace=True)

In [None]:
df_test_copy.head()

Unnamed: 0,Unnamed: 0.1,autocorr2d0,autocorr2d1,autocorr2d2,autocorr2d3,autocorr2d4,autocorr2d5,autocorr2d6,autocorr2d7,autocorr2d8,...,"atom_pairs_((N,1,2),14,(*,1,0))","atom_pairs_((N,3,0),12,(O,1,1))",morgan_counts_943520092,"atom_pairs_((O,2,0),6,(S,4,0))",morgan_counts_640577968,"atom_pairs_((C,2,2),14,(*,1,0))","bpf_((B,5,0),16,(B,5,0))","atom_pairs_((C,4,0),15,(Cl,1,0))",morgan_counts_469020719,morgan_counts_594640005
0,1860,3.333,3.659,3.553,3.486,3.318,3.179,3.075,2.872,3.242,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,353,3.523,3.931,3.79,3.71,3.747,3.549,3.348,2.989,3.303,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1333,3.272,3.613,3.494,3.338,3.237,3.218,3.02,2.836,3.165,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,905,3.516,3.901,3.897,3.901,3.826,3.767,3.497,3.66,3.296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1289,3.254,3.581,3.438,3.338,3.132,2.881,3.07,2.592,3.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# prompt: use standard scaler to the df_test_copy

from sklearn.preprocessing import StandardScaler

# Assuming df_test_copy is defined as in the previous code

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the data
df_test_copy_scaled = scaler.fit_transform(df_test_copy)

# Convert back to a DataFrame (optional, but often useful)
df_test_copy_scaled = pd.DataFrame(df_test_copy_scaled, columns=df_test_copy.columns)

In [None]:
from tensorflow.keras.models import load_model
import pandas as pd

# Load the trained model
model_path = "/content/drive/MyDrive/Models/AID_1239/CNN_Deepclassifier_1239_Full_RDKitDescriptors_17Nov2024_final.keras"
model = load_model(model_path)

# Predict using the scaled test data
predictions = model.predict(df_test_copy_scaled).flatten()

# Convert predictions to binary classes (0 or 1) based on a threshold of 0.5
predicted_classes = (predictions > 0.5).astype(int)

# Add the predicted outcomes to the original test DataFrame
df_test["PUBCHEM_ACTIVITY_OUTCOME_Predicted"] = predicted_classes

# Create the final DataFrame with the required columns
final_df = df_test[["PUBCHEM_CID", "SMILES", "PUBCHEM_ACTIVITY_OUTCOME"]].copy()
final_df["PUBCHEM_ACTIVITY_OUTCOME_Predicted"] = df_test["PUBCHEM_ACTIVITY_OUTCOME_Predicted"]

# Save the final DataFrame as a CSV for reference
output_path = "/content/drive/MyDrive/Datasets/AID_1239/CNN_predictions.csv"
final_df.to_csv(output_path, index=False)

# Display the first few rows of the final DataFrame
print(final_df.head())


  saveable.load_own_variables(weights_store.get(inner_path))


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step
   PUBCHEM_CID                                           SMILES  \
0       652981         CC(C)COC1=CC=C(C=C1)C2=NN=C(O2)C3=CC=CO3   
1      3242714          CC1=CC(=CC=C1)C2=NSC(=N2)SCC(=O)N3CCCC3   
2      2999012         CC1=CC=C(C=C1)N=CC2=C(OC(=N2)C3=CC=CO3)O   
3      6870079  C1CC2=C(C1)NN=C2C(=O)N/N=C/C(=C/C3=CC=CC=C3)/Br   
4       714147                 C1CC1C2=CSC(=N2)NC3=CC=C(C=C3)Cl   

  PUBCHEM_ACTIVITY_OUTCOME  PUBCHEM_ACTIVITY_OUTCOME_Predicted  
0                   Active                                   0  
1                   Active                                   0  
2                   Active                                   0  
3                   Active                                   0  
4                   Active                                   0  


In [None]:
# Convert predictions (0 or 1) back to their original labels
label_mapping = {0: "Active", 1: "Inactive"}
df_test["PUBCHEM_ACTIVITY_OUTCOME_Predicted"] = df_test["PUBCHEM_ACTIVITY_OUTCOME_Predicted"].map(label_mapping)

# Create the final DataFrame with the required columns
final_df = df_test[["PUBCHEM_CID", "SMILES", "PUBCHEM_ACTIVITY_OUTCOME"]].copy()
final_df["PUBCHEM_ACTIVITY_OUTCOME_Predicted"] = df_test["PUBCHEM_ACTIVITY_OUTCOME_Predicted"]

# Save the final DataFrame as a CSV for reference
output_path = "/content/drive/MyDrive/Datasets/AID_1239/CNN_1239_10Percent_Data_predictions_with_labels_final_result.csv"
final_df.to_csv(output_path, index=False)

# Display the first few rows of the final DataFrame
print(final_df.head())


   PUBCHEM_CID                                           SMILES  \
0       652981         CC(C)COC1=CC=C(C=C1)C2=NN=C(O2)C3=CC=CO3   
1      3242714          CC1=CC(=CC=C1)C2=NSC(=N2)SCC(=O)N3CCCC3   
2      2999012         CC1=CC=C(C=C1)N=CC2=C(OC(=N2)C3=CC=CO3)O   
3      6870079  C1CC2=C(C1)NN=C2C(=O)N/N=C/C(=C/C3=CC=CC=C3)/Br   
4       714147                 C1CC1C2=CSC(=N2)NC3=CC=C(C=C3)Cl   

  PUBCHEM_ACTIVITY_OUTCOME PUBCHEM_ACTIVITY_OUTCOME_Predicted  
0                   Active                             Active  
1                   Active                             Active  
2                   Active                             Active  
3                   Active                             Active  
4                   Active                             Active  


In [None]:
final_df['PUBCHEM_ACTIVITY_OUTCOME_Predicted'].value_counts()

Unnamed: 0_level_0,count
PUBCHEM_ACTIVITY_OUTCOME_Predicted,Unnamed: 1_level_1
Inactive,203
Active,197


In [None]:
# Code till abov is ok as on 27-11-2024 and make it the final code

**This code is for checking AND and OR operation for ensemble learning on the result with different test sample data**

1.   List item
2.   List item




In [None]:
CNN_1239 = pd.read_csv("/content/drive/MyDrive/Datasets/AID_1239/CNN_1239_10Percent_Data_predictions_with_labels_final_result.csv")
FFNN_1239 = pd.read_csv("/content/drive/MyDrive/Datasets/AID_1239/FFNN_1239_10Percent_Data_predictions_with_labels_final_result.csv")


In [None]:
CNN_1239.head()

Unnamed: 0,PUBCHEM_CID,SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_OUTCOME_Predicted
0,652981,CC(C)COC1=CC=C(C=C1)C2=NN=C(O2)C3=CC=CO3,Active,Active
1,3242714,CC1=CC(=CC=C1)C2=NSC(=N2)SCC(=O)N3CCCC3,Active,Active
2,2999012,CC1=CC=C(C=C1)N=CC2=C(OC(=N2)C3=CC=CO3)O,Active,Active
3,6870079,C1CC2=C(C1)NN=C2C(=O)N/N=C/C(=C/C3=CC=CC=C3)/Br,Active,Active
4,714147,C1CC1C2=CSC(=N2)NC3=CC=C(C=C3)Cl,Active,Active


In [None]:
FFNN_1239.head()

Unnamed: 0,PUBCHEM_CID,SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_OUTCOME_Predicted
0,652981,CC(C)COC1=CC=C(C=C1)C2=NN=C(O2)C3=CC=CO3,Active,Active
1,3242714,CC1=CC(=CC=C1)C2=NSC(=N2)SCC(=O)N3CCCC3,Active,Active
2,2999012,CC1=CC=C(C=C1)N=CC2=C(OC(=N2)C3=CC=CO3)O,Active,Active
3,6870079,C1CC2=C(C1)NN=C2C(=O)N/N=C/C(=C/C3=CC=CC=C3)/Br,Active,Active
4,714147,C1CC1C2=CSC(=N2)NC3=CC=C(C=C3)Cl,Active,Active


In [None]:
# Rename column in CNN_1239 DataFrame
CNN_1239.rename(columns={"PUBCHEM_ACTIVITY_OUTCOME_Predicted": "PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239"}, inplace=True)

# Rename column in FFNN_1239 DataFrame
FFNN_1239.rename(columns={"PUBCHEM_ACTIVITY_OUTCOME_Predicted": "PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239"}, inplace=True)

# Display the updated DataFrames to verify
print("Updated CNN_1239 DataFrame:")
print(CNN_1239.head())

print("\nUpdated FFNN_1239 DataFrame:")
print(FFNN_1239.head())


Updated CNN_1239 DataFrame:
   PUBCHEM_CID                                           SMILES  \
0       652981         CC(C)COC1=CC=C(C=C1)C2=NN=C(O2)C3=CC=CO3   
1      3242714          CC1=CC(=CC=C1)C2=NSC(=N2)SCC(=O)N3CCCC3   
2      2999012         CC1=CC=C(C=C1)N=CC2=C(OC(=N2)C3=CC=CO3)O   
3      6870079  C1CC2=C(C1)NN=C2C(=O)N/N=C/C(=C/C3=CC=CC=C3)/Br   
4       714147                 C1CC1C2=CSC(=N2)NC3=CC=C(C=C3)Cl   

  PUBCHEM_ACTIVITY_OUTCOME PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239  
0                   Active                                      Active  
1                   Active                                      Active  
2                   Active                                      Active  
3                   Active                                      Active  
4                   Active                                      Active  

Updated FFNN_1239 DataFrame:
   PUBCHEM_CID                                           SMILES  \
0       652981         CC(C)COC1=C

In [None]:
CNN_1239.columns

Index(['PUBCHEM_CID', 'SMILES', 'PUBCHEM_ACTIVITY_OUTCOME',
       'PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239'],
      dtype='object')

In [None]:
FFNN_1239.columns

Index(['PUBCHEM_CID', 'SMILES', 'PUBCHEM_ACTIVITY_OUTCOME',
       'PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239'],
      dtype='object')

In [None]:
# Create copies of the DataFrames to avoid modifying the originals
CNN_1239_copy = CNN_1239.copy()
FFNN_1239_copy = FFNN_1239.copy()

# Map 'Active' to 0 and 'Inactive' to 1 for logical operations
mapping = {'Active': 0, 'Inactive': 1}
CNN_1239_copy['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239'] = CNN_1239_copy['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239'].map(mapping)
FFNN_1239_copy['PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239'] = FFNN_1239_copy['PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239'].map(mapping)

# Perform logical AND operation
CNN_1239_AND_FFNN_1239 = CNN_1239_copy[['PUBCHEM_CID', 'SMILES', 'PUBCHEM_ACTIVITY_OUTCOME']].copy()
CNN_1239_AND_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239'] = CNN_1239_copy['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239']
CNN_1239_AND_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239'] = FFNN_1239_copy['PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239']
CNN_1239_AND_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239_&_FFNN_1239'] = (
    CNN_1239_copy['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239'] & FFNN_1239_copy['PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239']
)

# Map back 'Active' and 'Inactive' for readability
reverse_mapping = {0: 'Active', 1: 'Inactive'}
CNN_1239_AND_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239'] = CNN_1239_AND_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239'].map(reverse_mapping)
CNN_1239_AND_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239'] = CNN_1239_AND_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239'].map(reverse_mapping)
CNN_1239_AND_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239_&_FFNN_1239'] = CNN_1239_AND_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239_&_FFNN_1239'].map(reverse_mapping)

# Display the resulting DataFrame
print("CNN_1239_AND_FFNN_1239 DataFrame:")
print(CNN_1239_AND_FFNN_1239.head())


CNN_1239_AND_FFNN_1239 DataFrame:
   PUBCHEM_CID                                           SMILES  \
0       652981         CC(C)COC1=CC=C(C=C1)C2=NN=C(O2)C3=CC=CO3   
1      3242714          CC1=CC(=CC=C1)C2=NSC(=N2)SCC(=O)N3CCCC3   
2      2999012         CC1=CC=C(C=C1)N=CC2=C(OC(=N2)C3=CC=CO3)O   
3      6870079  C1CC2=C(C1)NN=C2C(=O)N/N=C/C(=C/C3=CC=CC=C3)/Br   
4       714147                 C1CC1C2=CSC(=N2)NC3=CC=C(C=C3)Cl   

  PUBCHEM_ACTIVITY_OUTCOME PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239  \
0                   Active                                      Active   
1                   Active                                      Active   
2                   Active                                      Active   
3                   Active                                      Active   
4                   Active                                      Active   

  PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239  \
0                                       Active   
1                 

In [None]:
CNN_1239_AND_FFNN_1239.head()

Unnamed: 0,PUBCHEM_CID,SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239,PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239,PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239_&_FFNN_1239
0,652981,CC(C)COC1=CC=C(C=C1)C2=NN=C(O2)C3=CC=CO3,Active,Active,Active,Active
1,3242714,CC1=CC(=CC=C1)C2=NSC(=N2)SCC(=O)N3CCCC3,Active,Active,Active,Active
2,2999012,CC1=CC=C(C=C1)N=CC2=C(OC(=N2)C3=CC=CO3)O,Active,Active,Active,Active
3,6870079,C1CC2=C(C1)NN=C2C(=O)N/N=C/C(=C/C3=CC=CC=C3)/Br,Active,Active,Active,Active
4,714147,C1CC1C2=CSC(=N2)NC3=CC=C(C=C3)Cl,Active,Active,Active,Active


In [None]:
CNN_1239_AND_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239_&_FFNN_1239'].value_counts()

Unnamed: 0_level_0,count
PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239_&_FFNN_1239,Unnamed: 1_level_1
Active,206
Inactive,194


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, cohen_kappa_score, precision_score, recall_score

# Map 'Active' and 'Inactive' to 0 and 1 for metric calculations
mapping = {'Active': 0, 'Inactive': 1}
ground_truth = CNN_1239_AND_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME'].map(mapping)
predictions_and = CNN_1239_AND_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239_&_FFNN_1239'].map(mapping)

# Calculate metrics
test_accuracy = accuracy_score(ground_truth, predictions_and)
roc_auc = roc_auc_score(ground_truth, predictions_and)
f1 = f1_score(ground_truth, predictions_and)
kappa = cohen_kappa_score(ground_truth, predictions_and)
precision = precision_score(ground_truth, predictions_and)
recall = recall_score(ground_truth, predictions_and)

# Display the results
metrics = {
    "test_accuracy": test_accuracy,
    "roc_auc": roc_auc,
    "f1_score": f1,
    "kappa": kappa,
    "precision": precision,
    "recall": recall
}

print("Metrics after Logical AND:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")


Metrics after Logical AND:
test_accuracy: 0.8850
roc_auc: 0.8850
f1_score: 0.8832
kappa: 0.7700
precision: 0.8969
recall: 0.8700


In [None]:
# logical OR

# Perform logical OR operation
CNN_1239_OR_FFNN_1239 = CNN_1239_copy[['PUBCHEM_CID', 'SMILES', 'PUBCHEM_ACTIVITY_OUTCOME']].copy()
CNN_1239_OR_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239'] = CNN_1239_copy['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239']
CNN_1239_OR_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239'] = FFNN_1239_copy['PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239']
CNN_1239_OR_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239_OR_FFNN_1239'] = (
    CNN_1239_copy['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239'] | FFNN_1239_copy['PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239']
)

# Map back 'Active' and 'Inactive' for readability
CNN_1239_OR_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239'] = CNN_1239_OR_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239'].map(reverse_mapping)
CNN_1239_OR_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239'] = CNN_1239_OR_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239'].map(reverse_mapping)
CNN_1239_OR_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239_OR_FFNN_1239'] = CNN_1239_OR_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239_OR_FFNN_1239'].map(reverse_mapping)

# Display the resulting DataFrame
print("CNN_1239_OR_FFNN_1239 DataFrame:")
print(CNN_1239_OR_FFNN_1239.head())


CNN_1239_OR_FFNN_1239 DataFrame:
   PUBCHEM_CID                                           SMILES  \
0       652981         CC(C)COC1=CC=C(C=C1)C2=NN=C(O2)C3=CC=CO3   
1      3242714          CC1=CC(=CC=C1)C2=NSC(=N2)SCC(=O)N3CCCC3   
2      2999012         CC1=CC=C(C=C1)N=CC2=C(OC(=N2)C3=CC=CO3)O   
3      6870079  C1CC2=C(C1)NN=C2C(=O)N/N=C/C(=C/C3=CC=CC=C3)/Br   
4       714147                 C1CC1C2=CSC(=N2)NC3=CC=C(C=C3)Cl   

  PUBCHEM_ACTIVITY_OUTCOME PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239  \
0                   Active                                      Active   
1                   Active                                      Active   
2                   Active                                      Active   
3                   Active                                      Active   
4                   Active                                      Active   

  PUBCHEM_ACTIVITY_OUTCOME_Predicted_FFNN_1239  \
0                                       Active   
1                  

In [None]:
# Map 'Active' and 'Inactive' to 0 and 1 for metric calculations
ground_truth = CNN_1239_OR_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME'].map(mapping)
predictions_or = CNN_1239_OR_FFNN_1239['PUBCHEM_ACTIVITY_OUTCOME_Predicted_CNN_1239_OR_FFNN_1239'].map(mapping)

# Calculate metrics
test_accuracy_or = accuracy_score(ground_truth, predictions_or)
roc_auc_or = roc_auc_score(ground_truth, predictions_or)
f1_or = f1_score(ground_truth, predictions_or)
kappa_or = cohen_kappa_score(ground_truth, predictions_or)
precision_or = precision_score(ground_truth, predictions_or)
recall_or = recall_score(ground_truth, predictions_or)

# Display the results
metrics_or = {
    "test_accuracy": test_accuracy_or,
    "roc_auc": roc_auc_or,
    "f1_score": f1_or,
    "kappa": kappa_or,
    "precision": precision_or,
    "recall": recall_or
}

print("Metrics after Logical OR:")
for metric, value in metrics_or.items():
    print(f"{metric}: {value:.4f}")


Metrics after Logical OR:
test_accuracy: 0.9000
roc_auc: 0.9000
f1_score: 0.9061
kappa: 0.8000
precision: 0.8540
recall: 0.9650


**Below is Code with Optuna**

In [None]:
!pip install --quiet optuna

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/364.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m358.4/364.4 kB[0m [31m11.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/233.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/78.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import optuna
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score, cohen_kappa_score, precision_score, recall_score
from tensorflow.keras.callbacks import EarlyStopping

# Define the Optuna objective function
def objective(trial):
    # Hyperparameter suggestions
    n_layers = trial.suggest_int("n_layers", 2, 4)
    units = trial.suggest_int("units", 32, 256, step=32)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5, step=0.1)
    l2_reg = trial.suggest_float("l2_reg", 1e-5, 1e-2, log=True)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])

    # Build the model
    model = Sequential()
    model.add(Dense(units, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    for _ in range(n_layers - 1):
        model.add(Dense(units // 2, activation='relu', kernel_regularizer=l2(l2_reg)))
        model.add(Dropout(dropout_rate))
        units //= 2
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50,
                        batch_size=batch_size, callbacks=[early_stopping], verbose=0)

    # Evaluate the model on test data
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

    # Calculate additional metrics on test data
    y_pred = model.predict(X_test).flatten()
    y_pred_classes = (y_pred > 0.5).astype(int)

    if len(np.unique(y_test)) < 2:
        test_roc_auc = None
    else:
        test_roc_auc = roc_auc_score(y_test, y_pred)

    test_f1 = f1_score(y_test, y_pred_classes)
    test_kappa = cohen_kappa_score(y_test, y_pred_classes)
    test_precision = precision_score(y_test, y_pred_classes)
    test_recall = recall_score(y_test, y_pred_classes)

    # Return the test accuracy as the objective to maximize
    return test_acc

# Run Optuna for hyperparameter optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Best hyperparameters from Optuna
best_params = study.best_params
print("Best hyperparameters:", best_params)

# Build the final model with the best hyperparameters
n_layers = best_params["n_layers"]
units = best_params["units"]
dropout_rate = best_params["dropout_rate"]
l2_reg = best_params["l2_reg"]
learning_rate = best_params["learning_rate"]
batch_size = best_params["batch_size"]

model = Sequential()
model.add(Dense(units, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(l2_reg)))
model.add(Dropout(dropout_rate))
for _ in range(n_layers - 1):
    model.add(Dense(units // 2, activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    units //= 2
model.add(Dense(1, activation='sigmoid'))

# Compile the model
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the final model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50,
                    batch_size=batch_size, callbacks=[early_stopping], verbose=1)

# Evaluate the final model
train_loss, train_acc = model.evaluate(X_train, y_train, verbose=0)
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

# Predictions
y_train_pred = model.predict(X_train).flatten()
y_train_pred_classes = (y_train_pred > 0.5).astype(int)
y_pred = model.predict(X_test).flatten()
y_pred_classes = (y_pred > 0.5).astype(int)

# Train metrics
train_roc_auc = roc_auc_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred_classes)
train_kappa = cohen_kappa_score(y_train, y_train_pred_classes)
train_precision = precision_score(y_train, y_train_pred_classes)
train_recall = recall_score(y_train, y_train_pred_classes)

# Test metrics
test_roc_auc = roc_auc_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred_classes)
test_kappa = cohen_kappa_score(y_test, y_pred_classes)
test_precision = precision_score(y_test, y_pred_classes)
test_recall = recall_score(y_test, y_pred_classes)

# Output the results
results = {
    "train_accuracy": train_acc,
    "train_roc_auc": train_roc_auc,
    "train_f1": train_f1,
    "train_kappa": train_kappa,
    "train_precision": train_precision,
    "train_recall": train_recall,
    "test_accuracy": test_acc,
    "test_roc_auc": test_roc_auc,
    "test_f1": test_f1,
    "test_kappa": test_kappa,
    "test_precision": test_precision,
    "test_recall": test_recall
}

print("\nModel Evaluation Metrics:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")


[I 2024-11-17 07:57:02,101] A new study created in memory with name: no-name-47791280-ab13-4c1e-ac5c-c1a02c421cef
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 07:57:19,625] Trial 0 finished with value: 0.8199999928474426 and parameters: {'n_layers': 4, 'units': 128, 'dropout_rate': 0.2, 'l2_reg': 0.00613117000369051, 'learning_rate': 3.0445345360974923e-05, 'batch_size': 64}. Best is trial 0 with value: 0.8199999928474426.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 07:57:29,379] Trial 1 finished with value: 0.8025000095367432 and parameters: {'n_layers': 2, 'units': 256, 'dropout_rate': 0.5, 'l2_reg': 5.00508760282657e-05, 'learning_rate': 7.603619929208971e-05, 'batch_size': 64}. Best is trial 0 with value: 0.8199999928474426.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 07:57:38,468] Trial 2 finished with value: 0.7887499928474426 and parameters: {'n_layers': 2, 'units': 224, 'dropout_rate': 0.4, 'l2_reg': 0.0005271686746807884, 'learning_rate': 0.001811928657497483, 'batch_size': 128}. Best is trial 0 with value: 0.8199999928474426.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step   


[I 2024-11-17 07:57:47,305] Trial 3 finished with value: 0.7887499928474426 and parameters: {'n_layers': 3, 'units': 256, 'dropout_rate': 0.4, 'l2_reg': 8.78084765381797e-05, 'learning_rate': 0.007951949127645082, 'batch_size': 64}. Best is trial 0 with value: 0.8199999928474426.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step   


[I 2024-11-17 07:57:56,943] Trial 4 finished with value: 0.8187500238418579 and parameters: {'n_layers': 4, 'units': 256, 'dropout_rate': 0.2, 'l2_reg': 0.0022214389467747238, 'learning_rate': 0.00031640887816355973, 'batch_size': 32}. Best is trial 0 with value: 0.8199999928474426.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step   


[I 2024-11-17 07:58:10,758] Trial 5 finished with value: 0.7724999785423279 and parameters: {'n_layers': 3, 'units': 192, 'dropout_rate': 0.1, 'l2_reg': 0.006186017439482903, 'learning_rate': 0.0014109223175600963, 'batch_size': 64}. Best is trial 0 with value: 0.8199999928474426.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 07:58:30,033] Trial 6 finished with value: 0.8174999952316284 and parameters: {'n_layers': 3, 'units': 192, 'dropout_rate': 0.2, 'l2_reg': 0.006548393161534661, 'learning_rate': 2.8611757901139377e-05, 'batch_size': 32}. Best is trial 0 with value: 0.8199999928474426.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 07:58:39,026] Trial 7 finished with value: 0.8187500238418579 and parameters: {'n_layers': 3, 'units': 128, 'dropout_rate': 0.5, 'l2_reg': 1.2639695052745843e-05, 'learning_rate': 0.00013860870002292214, 'batch_size': 64}. Best is trial 0 with value: 0.8199999928474426.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 07:58:50,328] Trial 8 finished with value: 0.8050000071525574 and parameters: {'n_layers': 2, 'units': 160, 'dropout_rate': 0.1, 'l2_reg': 9.766071234939887e-05, 'learning_rate': 1.3589368015365794e-05, 'batch_size': 32}. Best is trial 0 with value: 0.8199999928474426.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step   


[I 2024-11-17 07:59:00,596] Trial 9 finished with value: 0.793749988079071 and parameters: {'n_layers': 3, 'units': 160, 'dropout_rate': 0.5, 'l2_reg': 0.00032458815688287226, 'learning_rate': 0.0031481771161035554, 'batch_size': 32}. Best is trial 0 with value: 0.8199999928474426.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 07:59:15,226] Trial 10 finished with value: 0.731249988079071 and parameters: {'n_layers': 4, 'units': 32, 'dropout_rate': 0.30000000000000004, 'l2_reg': 0.0014577195976956153, 'learning_rate': 1.0514396091705045e-05, 'batch_size': 128}. Best is trial 0 with value: 0.8199999928474426.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step   


[I 2024-11-17 07:59:25,765] Trial 11 finished with value: 0.8237500190734863 and parameters: {'n_layers': 4, 'units': 96, 'dropout_rate': 0.2, 'l2_reg': 0.0019386779877959854, 'learning_rate': 0.0004069095802872238, 'batch_size': 32}. Best is trial 11 with value: 0.8237500190734863.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 07:59:35,477] Trial 12 finished with value: 0.8149999976158142 and parameters: {'n_layers': 4, 'units': 96, 'dropout_rate': 0.2, 'l2_reg': 0.0014069094276978702, 'learning_rate': 0.00048595316395452516, 'batch_size': 32}. Best is trial 11 with value: 0.8237500190734863.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 07:59:52,244] Trial 13 finished with value: 0.824999988079071 and parameters: {'n_layers': 4, 'units': 64, 'dropout_rate': 0.30000000000000004, 'l2_reg': 0.008067222122029354, 'learning_rate': 8.194807693408146e-05, 'batch_size': 64}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:00:03,411] Trial 14 finished with value: 0.8012499809265137 and parameters: {'n_layers': 4, 'units': 64, 'dropout_rate': 0.30000000000000004, 'l2_reg': 0.0030130761301285568, 'learning_rate': 0.00013613760513210877, 'batch_size': 128}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:00:14,030] Trial 15 finished with value: 0.7850000262260437 and parameters: {'n_layers': 4, 'units': 32, 'dropout_rate': 0.30000000000000004, 'l2_reg': 0.0007011207429540752, 'learning_rate': 0.0005412726700465679, 'batch_size': 64}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:00:35,656] Trial 16 finished with value: 0.8075000047683716 and parameters: {'n_layers': 4, 'units': 96, 'dropout_rate': 0.4, 'l2_reg': 0.009751460474883658, 'learning_rate': 9.025677524340041e-05, 'batch_size': 32}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:00:44,337] Trial 17 finished with value: 0.8162500262260437 and parameters: {'n_layers': 4, 'units': 64, 'dropout_rate': 0.1, 'l2_reg': 0.0030320576911370537, 'learning_rate': 0.00022214418237994263, 'batch_size': 32}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:00:53,705] Trial 18 finished with value: 0.8050000071525574 and parameters: {'n_layers': 4, 'units': 96, 'dropout_rate': 0.2, 'l2_reg': 0.0008787359550847693, 'learning_rate': 0.0009051107346186635, 'batch_size': 64}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:01:05,761] Trial 19 finished with value: 0.8174999952316284 and parameters: {'n_layers': 3, 'units': 64, 'dropout_rate': 0.30000000000000004, 'l2_reg': 0.003507392238956847, 'learning_rate': 4.3584565272110385e-05, 'batch_size': 128}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:01:15,441] Trial 20 finished with value: 0.8125 and parameters: {'n_layers': 3, 'units': 32, 'dropout_rate': 0.4, 'l2_reg': 0.00028538185208063876, 'learning_rate': 0.0002603560258403787, 'batch_size': 32}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:01:32,328] Trial 21 finished with value: 0.8237500190734863 and parameters: {'n_layers': 4, 'units': 128, 'dropout_rate': 0.2, 'l2_reg': 0.005884177456543147, 'learning_rate': 2.9177539229891745e-05, 'batch_size': 64}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:01:49,944] Trial 22 finished with value: 0.8125 and parameters: {'n_layers': 4, 'units': 128, 'dropout_rate': 0.2, 'l2_reg': 0.009723412899617907, 'learning_rate': 7.011575596023955e-05, 'batch_size': 64}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:02:07,172] Trial 23 finished with value: 0.8087499737739563 and parameters: {'n_layers': 4, 'units': 96, 'dropout_rate': 0.30000000000000004, 'l2_reg': 0.004060661744867665, 'learning_rate': 2.4338287063547195e-05, 'batch_size': 64}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:02:19,484] Trial 24 finished with value: 0.8100000023841858 and parameters: {'n_layers': 4, 'units': 64, 'dropout_rate': 0.2, 'l2_reg': 0.0015981711416874456, 'learning_rate': 4.32369968356205e-05, 'batch_size': 64}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:02:30,416] Trial 25 finished with value: 0.8025000095367432 and parameters: {'n_layers': 4, 'units': 128, 'dropout_rate': 0.1, 'l2_reg': 0.004973710025808831, 'learning_rate': 0.00017987309205662433, 'batch_size': 64}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step   


[I 2024-11-17 08:02:42,788] Trial 26 finished with value: 0.8187500238418579 and parameters: {'n_layers': 4, 'units': 160, 'dropout_rate': 0.30000000000000004, 'l2_reg': 0.0017719697851798916, 'learning_rate': 0.000491051485210933, 'batch_size': 64}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:02:58,531] Trial 27 finished with value: 0.8149999976158142 and parameters: {'n_layers': 3, 'units': 96, 'dropout_rate': 0.2, 'l2_reg': 0.008922524592342064, 'learning_rate': 1.800806582457366e-05, 'batch_size': 64}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:03:08,057] Trial 28 finished with value: 0.7975000143051147 and parameters: {'n_layers': 4, 'units': 64, 'dropout_rate': 0.1, 'l2_reg': 0.0009793389900293237, 'learning_rate': 5.031478648618523e-05, 'batch_size': 32}. Best is trial 13 with value: 0.824999988079071.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


[I 2024-11-17 08:03:23,948] Trial 29 finished with value: 0.8112499713897705 and parameters: {'n_layers': 4, 'units': 128, 'dropout_rate': 0.2, 'l2_reg': 0.00556319846504643, 'learning_rate': 0.00011187754354679655, 'batch_size': 128}. Best is trial 13 with value: 0.824999988079071.


Best hyperparameters: {'n_layers': 4, 'units': 64, 'dropout_rate': 0.30000000000000004, 'l2_reg': 0.008067222122029354, 'learning_rate': 8.194807693408146e-05, 'batch_size': 64}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 35ms/step - accuracy: 0.5328 - loss: 2.3513 - val_accuracy: 0.6938 - val_loss: 2.1955
Epoch 2/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5810 - loss: 2.2211 - val_accuracy: 0.7262 - val_loss: 2.1503
Epoch 3/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6124 - loss: 2.1521 - val_accuracy: 0.7513 - val_loss: 2.0999
Epoch 4/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6350 - loss: 2.1145 - val_accuracy: 0.7688 - val_loss: 2.0581
Epoch 5/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6452 - loss: 2.0769 - val_accuracy: 0.7788 - val_loss: 2.0170
Epoch 6/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6493 - loss: 2.0135 - val_accuracy: 0.7875 - val_loss: 1.9684
Epoch 7/50
[1m50/50[0m [32m━━━━━━━━━

below code with old ones

In [None]:
# Step 4: Build the model
model = Sequential()


In [None]:
# Input layer with L2 regularization
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))  # Dropout for regularization


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Hidden layers with L2 regularization
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))

model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))

In [None]:
# Output layer (binary classification)
model.add(Dense(1, activation='sigmoid'))


In [None]:
# Compile the model with Adam optimizer and binary cross-entropy loss
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Add callbacks for early stopping and model checkpoint
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


In [None]:
# Early stopping to prevent overfitting, model checkpoint to save the best model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('/content/drive/MyDrive/Jyothi Research Paper/best_model_fulldescriptors_aid1239.keras', save_best_only=True)


In [None]:
# Step 5: Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=64,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping, model_checkpoint])

Epoch 1/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 76ms/step - accuracy: 0.6455 - loss: 4.4548 - val_accuracy: 0.7788 - val_loss: 3.5156
Epoch 2/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7659 - loss: 3.3761 - val_accuracy: 0.7987 - val_loss: 2.7593
Epoch 3/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8290 - loss: 2.5753 - val_accuracy: 0.7950 - val_loss: 2.1734
Epoch 4/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8600 - loss: 1.9603 - val_accuracy: 0.8050 - val_loss: 1.8138
Epoch 5/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8813 - loss: 1.6092 - val_accuracy: 0.8050 - val_loss: 1.6028
Epoch 6/50
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8898 - loss: 1.4058 - val_accuracy: 0.8200 - val_loss: 1.4967
Epoch 7/50
[1m50/50[0m [32m━━━━━━━━━

In [None]:
# Step 6: Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc * 100:.2f}%")

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8195 - loss: 0.9831 
Test Accuracy: 81.12%


In [None]:
# Step 7: Make predictions
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int)  # Use 0.5 as threshold for binary classification


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


In [None]:
# Assuming you have 'df_copy' (original dataframe) and 'y_pred_classes' available

# Predict on the entire dataset to get predictions for all rows
# (Assuming 'X' contains features for all rows in 'df_copy')
y_pred_all = model.predict(X)
y_pred_classes_all = (y_pred_all > 0.5).astype(int)

# Create a new DataFrame with the specified columns
new_df = pd.DataFrame()

# Assuming 'df_copy' contains 'PUBCHEM_CID' and 'PUBCHEM_ACTIVITY_OUTCOME'
new_df['PUBCHEM_CID'] = df_copy['PUBCHEM_CID']  # Use values from the original df
new_df['PUBCHEM_ACTIVITY_OUTCOME'] = df_copy['PUBCHEM_ACTIVITY_OUTCOME'] # Use values from original df

# Add the predicted classes for all rows and map to "Active"/"Inactive"
new_df['y_pred_classes'] = y_pred_classes_all.flatten()
new_df['y_pred_classes'] = new_df['y_pred_classes'].map({1: 'Active', 0: 'Inactive'}) # Map 1 to 'Active' and 0 to 'Inactive'

# Display or save the new DataFrame as needed
print(new_df.head())
#new_df.to_csv('predicted_outcomes.csv', index=False)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
   PUBCHEM_CID PUBCHEM_ACTIVITY_OUTCOME y_pred_classes
0      1449342                   Active       Inactive
1      3242114                   Active       Inactive
2      9594900                   Active       Inactive
3       823601                   Active       Inactive
4      3239831                   Active       Inactive


In [None]:
# Step 8: Evaluate the results
compound_ids = df_copy['PUBCHEM_CID'].values
compound_ids_train, compound_ids_test = train_test_split(compound_ids, test_size=0.2, random_state=42)


In [None]:
results = [
   f"Compound {compound_id}: This is active" if pred == 1 else f"Compound {compound_id}: This is inactive"
   for compound_id, pred in zip(compound_ids_test, y_pred_classes.flatten())
]

results

['Compound 2986465: This is inactive',
 'Compound 1319487: This is active',
 'Compound 2999802: This is inactive',
 'Compound 16190573: This is active',
 'Compound 666181: This is active',
 'Compound 135472859: This is inactive',
 'Compound 711992: This is inactive',
 'Compound 6898097: This is inactive',
 'Compound 2986519: This is inactive',
 'Compound 135490211: This is inactive',
 'Compound 1867778: This is active',
 'Compound 5762397: This is active',
 'Compound 1481967: This is inactive',
 'Compound 2831188: This is active',
 'Compound 2515723: This is active',
 'Compound 4120298: This is active',
 'Compound 1481828: This is inactive',
 'Compound 684082: This is inactive',
 'Compound 5730347: This is inactive',
 'Compound 3237354: This is inactive',
 'Compound 752122: This is inactive',
 'Compound 5721129: This is inactive',
 'Compound 5724809: This is active',
 'Compound 755827: This is inactive',
 'Compound 4248181: This is active',
 'Compound 704968: This is inactive',
 'Compo

In [None]:
# Confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred_classes)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[332  90]
 [ 61 317]]


In [None]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes))



Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.79      0.81       422
           1       0.78      0.84      0.81       378

    accuracy                           0.81       800
   macro avg       0.81      0.81      0.81       800
weighted avg       0.81      0.81      0.81       800



In [None]:
# Optional: Save the model for future use
model.save('/content/drive/MyDrive/Jyothi Research Paper/final_deep_classifier_model.keras')


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score, cohen_kappa_score, precision_score, recall_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import random

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Load your dataset
df = pd.read_csv('/content/drive/MyDrive/Datasets/AID_1239/concatenated_AID_1239_Active_Inactive.csv')

# Drop unnecessary columns
columns_to_drop = ['Unnamed: 0', 'PUBCHEM_CID', 'PUBCHEM_SID', 'SMILES', 'SMILES.1', 'MOLECULEID']
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Encode target column
df['PUBCHEM_ACTIVITY_OUTCOME'] = df['PUBCHEM_ACTIVITY_OUTCOME'].apply(lambda x: 1.0 if x == 'Active' else 0.0)

# Split features and target
X = df.drop(columns=['PUBCHEM_ACTIVITY_OUTCOME']).values
y = df['PUBCHEM_ACTIVITY_OUTCOME'].values

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with early stopping
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64, callbacks=[early_stopping])

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

# Make predictions
y_pred = model.predict(X_test).flatten()
y_pred_classes = (y_pred > 0.5).astype(int)

# Calculate metrics
test_roc_auc = roc_auc_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred_classes)
test_kappa = cohen_kappa_score(y_test, y_pred_classes)
test_precision = precision_score(y_test, y_pred_classes)
test_recall = recall_score(y_test, y_pred_classes)

# Output the results
results = {
    "accuracy": test_acc,
    "roc_auc": test_roc_auc,
    "f1": test_f1,
    "kappa": test_kappa,
    "precision": test_precision,
    "recall": test_recall
}

print("Model Evaluation Metrics:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")


ValueError: Input X contains infinity or a value too large for dtype('float64').