In [4]:
# Final code for Protein Secondary Structure Prediction using Multi Scale CNN + Windowed Transformer Encoder

# === Import necessary libraries ===
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
import tensorflow as tf


# === Transformer Encoder over Windowed Sequences ===
def windowed_transformer_encoder(x, window_size=16, num_heads=8, key_dim=64):
    """Applies a transformer encoder over non-overlapping windows of the input sequence."""

    # Pad the sequence to make its length divisible by the window size
    def pad_to_window(x_in, win_size=window_size):
        seq_len = tf.shape(x_in)[1]
        padding = (win_size - (seq_len % win_size)) % win_size
        return tf.pad(x_in, [[0, 0], [0, padding], [0, 0]])

    x = layers.Lambda(pad_to_window)(x)

    # Reshape input to separate into windows of size `window_size`
    def dynamic_reshape(x_in, win_size=window_size):
        batch_size = tf.shape(x_in)[0]
        seq_len = tf.shape(x_in)[1]
        feature_dim = tf.shape(x_in)[2]
        num_windows = seq_len // win_size
        return tf.reshape(x_in, (batch_size, num_windows, win_size, feature_dim))

    x_reshaped = layers.Lambda(dynamic_reshape)(x)

    # Apply multi-head self-attention within each window
    attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim, dropout=0.3)(x_reshaped, x_reshaped)

    # Reshape the attention output back to the original shape
    def reshape_back(x_in):
        batch_size = tf.shape(x_in)[0]
        seq_len = tf.shape(x_in)[1] * tf.shape(x_in)[2]
        feature_dim = tf.shape(x_in)[3]
        return tf.reshape(x_in, (batch_size, seq_len, feature_dim))

    attn_output = layers.Lambda(reshape_back)(attn_output)

    # Add residual connection and normalization
    x = layers.Add()([x, attn_output])
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    return x


# === Build the complete model architecture ===
def build_model(input_shape, num_classes):
    """Builds the CNN + Transformer model for protein secondary structure prediction."""
    inputs = layers.Input(shape=input_shape)

    # Multi-scale CNNs to capture various local contexts
    x1 = layers.Conv1D(128, 3, activation='relu', padding='same')(inputs)
    x2 = layers.Conv1D(128, 5, activation='relu', padding='same')(inputs)
    x3 = layers.Conv1D(128, 7, activation='relu', padding='same')(inputs)

    # Combine multi-scale CNN outputs
    x = layers.Concatenate()([x1, x2, x3])
    x = layers.MaxPooling1D(pool_size=2, padding='valid')(x)
    x = layers.BatchNormalization()(x)

    # Apply the custom windowed transformer encoder
    x = windowed_transformer_encoder(x, window_size=16)

    # Global pooling and final dense layers for classification
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.4)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    # Compile the model with categorical crossentropy loss
    model = models.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=optimizers.Adam(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


# === Load and preprocess the input protein sequence data ===
def load_and_process_data(file_path):
    """Loads protein sequences and structure labels, encodes them for training."""
    df = pd.read_excel(file_path)

    # Amino acid mapping to integers
    amino_acids = 'ACDEFGHIKLMNPQRSTVWYU'
    aa_dict = {aa: i for i, aa in enumerate(amino_acids)}

    # Convert each protein sequence to list of integer-encoded amino acids
    X_seq = np.array([[aa_dict.get(aa, 20) for aa in seq] for seq in df['Primary_S'].values])
    X_onehot = np.eye(21)[X_seq]  # One-hot encode sequences (21 symbols)

    # Encode secondary structure labels to categorical format
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['Secondary_S'].values)
    y = to_categorical(y)
    return X_onehot, y, label_encoder.classes_


# === Helper function to truncate float values for readability ===
def truncate(x):
    """Truncates floating point values to 6 decimal places for cleaner output."""
    return np.floor(x * 10**6) / 10**6


# === Train and evaluate the model using 5-fold cross-validation ===
def train_and_evaluate(file_path):
    """Trains the model using stratified 5-fold cross-validation and evaluates it with multiple metrics."""
    X, y, class_names = load_and_process_data(file_path)

    # Initialize metrics for each class and overall
    metrics = {cls: {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'mcc': []} for cls in range(y.shape[1])}
    q3_scores = []
    overall_precisions, overall_recalls, overall_f1s, overall_mccs = [], [], [], []

    # Perform stratified 5-fold cross-validation
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, test_idx in kf.split(X, np.argmax(y, axis=1)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Build and train model with early stopping
        model = build_model((X_train.shape[1], X_train.shape[2]), y.shape[1])
        early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        model.fit(X_train, y_train, validation_split=0.1, epochs=30, batch_size=32, callbacks=[early_stop], verbose=0)

        # Predict class labels
        y_pred = np.argmax(model.predict(X_test), axis=1)
        y_test_classes = np.argmax(y_test, axis=1)

        # Store overall evaluation metrics
        q3_scores.append(accuracy_score(y_test_classes, y_pred))
        overall_precisions.append(precision_score(y_test_classes, y_pred, average='macro'))
        overall_recalls.append(recall_score(y_test_classes, y_pred, average='macro'))
        overall_f1s.append(f1_score(y_test_classes, y_pred, average='macro'))
        overall_mccs.append(matthews_corrcoef(y_test_classes, y_pred))

        # Compute metrics for each class individually
        for i in range(y.shape[1]):
            class_mask = (y_test_classes == i)
            if np.sum(class_mask) > 0:
                metrics[i]['accuracy'].append(accuracy_score(class_mask, (y_pred == i)))
                metrics[i]['precision'].append(precision_score((y_test_classes == i), (y_pred == i), zero_division=0))
                metrics[i]['recall'].append(recall_score((y_test_classes == i), (y_pred == i), zero_division=0))
                metrics[i]['f1'].append(f1_score((y_test_classes == i), (y_pred == i), zero_division=0))
                binary_true = (y_test_classes == i).astype(int)
                binary_pred = (y_pred == i).astype(int)
                metrics[i]['mcc'].append(matthews_corrcoef(binary_true, binary_pred))

    # Compile metrics into results dictionary
    results = { 'Metric': ['Mean', 'Std Dev'] }
    results['Q3_Accuracy'] = [np.mean(q3_scores), np.std(q3_scores, ddof=1)]
    results['Overall_Precision'] = [np.mean(overall_precisions), np.std(overall_precisions, ddof=1)]
    results['Overall_Recall'] = [np.mean(overall_recalls), np.std(overall_recalls, ddof=1)]
    results['Overall_F1'] = [np.mean(overall_f1s), np.std(overall_f1s, ddof=1)]
    results['Overall_MCC'] = [np.mean(overall_mccs), np.std(overall_mccs, ddof=1)]

    # Add per-class metrics to the results
    for i, cls_name in enumerate(class_names):
        results[f'{cls_name}_Accuracy'] = [np.mean(metrics[i]['accuracy']), np.std(metrics[i]['accuracy'], ddof=1)]
        results[f'{cls_name}_Precision'] = [np.mean(metrics[i]['precision']), np.std(metrics[i]['precision'], ddof=1)]
        results[f'{cls_name}_Recall'] = [np.mean(metrics[i]['recall']), np.std(metrics[i]['recall'], ddof=1)]
        results[f'{cls_name}_F1'] = [np.mean(metrics[i]['f1']), np.std(metrics[i]['f1'], ddof=1)]
        results[f'{cls_name}_MCC'] = [np.mean(metrics[i]['mcc']), np.std(metrics[i]['mcc'], ddof=1)]

    results_df = pd.DataFrame(results)

    # Truncate decimals for better readability
    for col in results_df.select_dtypes(include=['float64', 'float32']).columns:
        results_df[col] = results_df[col].apply(truncate)

    # Save final metrics to Excel file
    results_df.to_excel('result_pathogen1.xlsx', index=False)


# === Main execution trigger ===
if __name__ == "__main__":
    train_and_evaluate("Filename.xlsx")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 99ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 123ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 90ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 94ms/step
