In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout, Dense, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, SeparableConv1D, MaxPooling1D, Dropout, Flatten, Dense, BatchNormalization, Add, Activation, GlobalAveragePooling1D

from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau


from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization


from tensorflow.keras.layers import LSTM, Dense, Dropout



In [6]:
# Define input-output file names
datasets = {
    "B": {"input": "./Training_data/input_cathepsin_B.csv", "output": "./Training_data/output_cathepsin_B.csv"},
    "S": {"input": "./Training_data/input_cathepsin_S.csv", "output": "./Training_data/output_cathepsin_S.csv"},
    "D": {"input": "./Training_data/input_cathepsin_D.csv", "output": "./Training_data/output_cathepsin_D.csv"},
    "K": {"input": "./Training_data/input_cathepsin_K.csv", "output": "./Training_data/output_cathepsin_K.csv"},
}

In [7]:
# Function to preprocess data
def preprocess_data(input_path, output_path):
    # Load input and output data
    input_data = pd.read_csv(input_path, header=None, skiprows=1)
    output_data = pd.read_csv(output_path, header=None, skiprows=1)

    # Normalize input data
    scaler = MinMaxScaler()
    input_data_normalized = scaler.fit_transform(input_data)

    # Handle NaN values
    imputer = SimpleImputer(strategy='mean')
    input_data_imputed = imputer.fit_transform(input_data_normalized)

    # One-hot encode output data
    output_labels = output_data[0].values
    output_data_encoded = pd.get_dummies(output_labels, dtype=np.float32).values

    # Reshape input data for CNN
    input_data_reshaped = input_data_imputed.reshape((input_data_imputed.shape[0], input_data_imputed.shape[1], 1))

    return input_data_reshaped, output_data_encoded

# Function to build the model



def build_model(input_shape, num_classes):
    model = Sequential()
    model.add(Conv1D(64, 3, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.2))
    model.add(Conv1D(128, 3, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model





In [8]:
# Process each dataset
for key, paths in datasets.items():
    print(f"Processing dataset {key}...")

    # Preprocess the data
    input_data, output_data = preprocess_data(paths['input'], paths['output'])

    # Apply SMOTE to balance classes
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = smote.fit_resample(
        input_data.reshape(input_data.shape[0], -1), output_data
    )
    X_resampled = X_resampled.reshape((X_resampled.shape[0], input_data.shape[1], 1))

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, y_resampled, test_size=0.2, random_state=42
    )


    # Build the model
    model = build_model(X_train.shape[1:], num_classes=output_data.shape[1])

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model
    print(f"Training model for dataset {key}...")
    history = model.fit(
        X_train, y_train, 
        epochs=20, 
        batch_size=16, 
        validation_data=(X_test, y_test), 
        # callbacks=[early_stopping],
        verbose=1
    )

    # Evaluate the model
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
    print(f"Test accuracy for dataset {key}: {test_acc}")
    from lime import lime_tabular

    # Define the LIME explainer
    explainer = lime_tabular.LimeTabularExplainer(
        X_train.reshape(X_train.shape[0], -1),  # Flatten for LIME
        mode="classification",
        training_labels=np.argmax(y_train, axis=1),  # Convert one-hot to class labels
        feature_names=[f"Feature_{i}" for i in range(X_train.shape[1])],
        discretize_continuous=True
    )

    # Select a test sample to explain
    sample_index = 0  # Change this to any test sample index you want
    sample = X_test[sample_index].reshape(1, -1)  # Flatten for LIME

    # Generate LIME explanation
    exp = explainer.explain_instance(
        sample[0],  # Flattened sample
        model.predict,  # Your trained model's prediction function
        num_features=10  # Number of top features to show
    )

    # Visualize the explanation
    exp.show_in_notebook()  # Shows in Jupyter Notebook (or)
    exp.as_pyplot_figure()


    # Predict and generate classification report
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    print(f"Classification Report for dataset {key}:")
    print(classification_report(np.argmax(y_test, axis=1), y_pred_classes))
    # Save the trained model
    model.save(f'./Saved_Models/model_{key}.h5')
    # Plot training history
    plt.figure(figsize=(12, 5))
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f"Accuracy for Dataset {key}")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

Processing dataset B...
Training model for dataset B...
Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6057 - loss: 0.8969 - val_accuracy: 0.8104 - val_loss: 0.5123
Epoch 2/20
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8021 - loss: 0.4863 - val_accuracy: 0.8428 - val_loss: 0.4090
Epoch 3/20
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8403 - loss: 0.3964 - val_accuracy: 0.8572 - val_loss: 0.3516
Epoch 4/20
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8655 - loss: 0.3418 - val_accuracy: 0.8760 - val_loss: 0.3590
Epoch 5/20
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8949 - loss: 0.2975 - val_accuracy: 0.8645 - val_loss: 0.3277
Epoch 6/20
[1m347/347[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9026 - loss: 0.2558 - val_accuracy: 0.8904 - val_loss: 0.2955
Epoch 7/20
[1m347/347[0m [32m━━━━━━━

ModuleNotFoundError: No module named 'lime'