# Alanine Model


### 1. Install Dependencies and Setup

In [28]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import os
import cv2
import yaml as yml
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.utils.class_weight import compute_class_weight
import openpyxl
from openpyxl.drawing.image import Image
from io import BytesIO


### 2. YAML version

In [29]:
# Function to load configuration
def load_config(version, config_path='config_alanine_model.yml'):
    """Load one versioned configuration from the YAML configuration file.

    Args:
        version: Key of the entry to select under the top-level ``versions``
            mapping of the YAML document.
        config_path: Path of the YAML file to read. Defaults to the file name
            that used to be hard-coded in this function.

    Returns:
        The object stored at ``versions[version]`` in the parsed document.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        KeyError: If ``versions`` or ``version`` is missing from the document.
    """
    # Explicit encoding so parsing does not depend on the platform's locale default
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yml.safe_load(file)
    return config['versions'][version]

# Load the specific version of the configuration
version = 11  # Change this to the desired version
# NOTE(review): config_path is not passed to load_config() below, so changing
# it here does not change which file is read.
config_path = 'config_alanine_model.yml'
config = load_config(version)

### 3. Load Data

In [30]:
# Define paths
# Absolute Windows paths; raw strings keep the backslashes literal.
path_na = r'C:\Users\PC\Documents\BIOSFER\data\alanine\alanine_NA'  # loaded with label 0
path_normal = r'C:\Users\PC\Documents\BIOSFER\data\alanine\alanine_normal'  # loaded with label 1


def load_images(path, label):
    """Load every 600x800 PNG found directly in ``path`` as an RGB array.

    Files that cannot be decoded, that are not colour images, or whose size
    differs from 600x800 are skipped.

    Args:
        path: Directory to scan (its sub-directories are not visited).
        label: Class label attached to every image loaded from ``path``.

    Returns:
        A tuple ``(images, labels, df)``: ``images`` is the list of RGB arrays
        of shape (600, 800, 3), ``labels`` repeats ``label`` once per image,
        and ``df`` is a DataFrame indexed by ``Filename`` with the columns
        ``Image`` and ``Label``.
    """
    images = []
    labels = []
    data = []
    # Sorted so the row order does not depend on the directory listing order
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.png'):
            continue
        img_path = os.path.join(path, filename)
        # Read PNG with all channels
        img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
        # cv2.imread returns None for unreadable files, and the colour
        # conversion below only accepts 3- or 4-channel arrays, so both cases
        # must be filtered out BEFORE converting (previously they crashed the load).
        if img is None or img.ndim != 3 or img.shape[2] not in (3, 4):
            continue
        # BGR(A) -> RGB. For 4-channel PNGs this also drops the alpha channel,
        # which is why the shape check below expects exactly 3 channels.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if img.shape == (600, 800, 3):
            images.append(img)
            labels.append(label)
            data.append((img, label, filename))
    df = pd.DataFrame(data, columns=['Image', 'Label', 'Filename'])
    df.set_index('Filename', inplace=True)

    return images, labels, df


# Load NA (invalid) images
na_images, na_labels, df_0 = load_images(path_na, 0)  # 0 for invalid

# Load normal (valid) images
normal_images, normal_labels, df_1 = load_images(path_normal, 1)  # 1 for valid

# Combine the data (NA samples first, then normal samples)
X = na_images + normal_images
Y = na_labels + normal_labels

# Convert lists to numpy arrays
# The image array used to be bound to lowercase `x`, which left `X` a plain
# list while `Y` became an array. Both upper-case names are arrays now.
X = np.array(X)
Y = np.array(Y)
x = X  # alias kept for any cell that still refers to the old lowercase name

# Merging the two dfs
df = pd.concat([df_0, df_1])

### 4. Data Pre-processing

In [31]:
# Pull the preprocessing settings out of the loaded configuration
coords, crop_size = (
    config['preprocessing'][key] for key in ('coords', 'crop_size')
)
# The resize target is converted to a tuple for cv2
resize_shape = tuple(config['preprocessing']['resize_shape'])

In [32]:
# Preprocess images using the configuration
def preprocess_images(images, config):
    """Crop, resize and scale images as described by ``config``.

    The settings are read from the ``config`` argument. Previously the
    argument was ignored and module-level globals were used instead.

    Args:
        images: Iterable of image arrays.
        config: Mapping with a ``preprocessing`` entry that provides
            ``coords`` (crop start on axis 0 and axis 1), ``crop_size``
            (crop extent on axis 0 and axis 1) and ``resize_shape`` (passed
            to ``cv2.resize`` as the target size).

    Returns:
        A list with one float32 array per input image, with pixel values
        divided by 255.

    Raises:
        KeyError: If ``config`` lacks one of the preprocessing settings.
    """
    settings = config['preprocessing']
    coords = settings['coords']
    crop_size = settings['crop_size']
    # NOTE(review): cv2.resize takes its target size as (width, height);
    # confirm that resize_shape in the YAML file is written in that order.
    resize_shape = tuple(settings['resize_shape'])

    # The crop window is the same for every image, so compute it once
    row_start, col_start = coords[0], coords[1]
    row_stop = row_start + crop_size[0]
    col_stop = col_start + crop_size[1]

    processed_images = []
    for img in images:
        # Crop the image
        cropped_img = img[row_start:row_stop, col_start:col_stop]

        # Resize the image
        resized_img = cv2.resize(cropped_img, resize_shape, interpolation=cv2.INTER_AREA)

        # Normalize pixel values
        processed_images.append(resized_img.astype(np.float32) / 255.0)

    return processed_images

In [33]:
# Preprocess the images using the configuration
X_processed = preprocess_images(df['Image'].tolist(), config)
# Store the processed arrays next to the raw images, one per row of df
df['Processed'] = X_processed

### 5. Split data

In [None]:
# Draw a reproducible random subset so configuration versions can be tried quickly.
# The sample size is capped because DataFrame.sample raises when n exceeds the
# number of rows.
df_small = df.sample(n=min(200, len(df)), random_state=42)
# pd.concat() expects an iterable of pandas objects and raised a TypeError when
# it was handed the sampled DataFrame itself. reset_index() gives the intended
# 0..n-1 index and keeps 'Filename' as a regular column.
df_small = df_small.reset_index()
df_small.shape

In [35]:
# Ensure splits contain at least one NA sample
def ensure_na_in_split(X, Y, na_label=0):
    """Shuffle the data and split it into stratified train/validation/test sets.

    The split is roughly 70% / 20% / 10%. Both splits are stratified by label.
    After splitting, each label set is checked for ``na_label`` and a warning
    is issued for any split that contains none.

    Args:
        X: Samples, as a NumPy array or a pandas Series/DataFrame.
        Y: Labels aligned with ``X`` (same length, same container kinds).
        na_label: Label value that every split is expected to contain.

    Returns:
        ``(X_train, X_val, X_test, Y_train, Y_val, Y_test)``.
    """
    import warnings

    def take(data, positions):
        # pandas objects are selected by position; plain integer-array
        # indexing on a Series is label-based and only matches positions
        # when the index happens to be 0..n-1.
        return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]

    # Shuffle the data (this reseeds NumPy's global generator, as before)
    np.random.seed(42)
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    X = take(X, indices)
    Y = take(Y, indices)

    # Split data into training (70%), validation (20%), and test (10%)
    # First, split into training (70%) and temporary (30%)
    X_train, X_temp, Y_train, Y_temp = train_test_split(
        X, Y, test_size=0.3, random_state=42, stratify=Y)
    # Second, split the temporary set into validation (20% of total) and test (10% of total)
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_temp, Y_temp, test_size=0.3333, random_state=42, stratify=Y_temp)

    # Actually verify what the function name promises
    for split_name, split_labels in (('training', Y_train),
                                     ('validation', Y_val),
                                     ('test', Y_test)):
        if not np.any(np.asarray(split_labels) == na_label):
            warnings.warn(
                f"The {split_name} split contains no sample with label {na_label!r}.",
                stacklevel=2)

    return X_train, X_val, X_test, Y_train, Y_val, Y_test


# Split and ensure each set has at least one NA sample | use the df that you want
# Inputs are the 'Processed' (features) and 'Label' (targets) columns of df_small
X_train, X_val, X_test, Y_train, Y_val, Y_test = ensure_na_in_split(
    df_small['Processed'], df_small['Label'])

### 6. Convert data

In [36]:
# Stack each split's per-image arrays into one NumPy array per split
X_train_array, X_val_array, X_test_array = (
    np.stack(split.values) for split in (X_train, X_val, X_test)
)

In [None]:
# Convert to TensorFlow tensors
# Features: the stacked image arrays of each split, as float32
X_train_tensor = tf.convert_to_tensor(X_train_array, dtype=tf.float32)
X_val_tensor = tf.convert_to_tensor(X_val_array, dtype=tf.float32)
X_test_tensor = tf.convert_to_tensor(X_test_array, dtype=tf.float32)

# Labels: converted to float32 as well
Y_train_tensor = tf.convert_to_tensor(Y_train, dtype=tf.float32)
Y_val_tensor = tf.convert_to_tensor(Y_val, dtype=tf.float32)
Y_test_tensor = tf.convert_to_tensor(Y_test, dtype=tf.float32)

# Print the shapes of the arrays
print(f'Shape of X_train_tensor: {X_train_tensor.shape}')
print(f'Shape of X_val_tensor: {X_val_tensor.shape}')
print(f'Shape of X_test_tensor: {X_test_tensor.shape}')
print(f'Shape of Y_train_tensor: {Y_train_tensor.shape}')
print(f'Shape of Y_val_tensor: {Y_val_tensor.shape}')
print(f'Shape of Y_test_tensor: {Y_test_tensor.shape}')

# Print the number of NA samples in each split
# (label 0 is the NA / invalid class, see the image loading step)
print(f'Number of NA samples in Y_train: {np.sum(Y_train == 0)}')
print(f'Number of NA samples in Y_val: {np.sum(Y_val == 0)}')
print(f'Number of NA samples in Y_test: {np.sum(Y_test == 0)}')

### 7. Build Deep Learning Model

In [None]:
def build_adaptable_model(config):
    """Build and compile a Keras ``Sequential`` model from a configuration.

    Supported layer ``type`` values:

    * ``Conv2D`` / ``SeparableConv2D``: ``filters``, ``kernel_size``,
      ``activation``; optional ``strides`` (default 1), ``padding`` (default
      ``'valid'``) and ``pool_size`` (adds a ``MaxPooling2D`` afterwards).
    * ``BatchNormalization``, ``Flatten``, ``GlobalAveragePooling2D``.
    * ``Dense``: ``units``, ``activation``; optional ``dropout`` (adds a
      ``Dropout`` layer afterwards).

    Unsupported layer types, unsupported learning-rate schedule types and
    unknown optimizer names keep their previous fallback behaviour (skipped /
    constant rate / Adam) but now emit a warning instead of passing silently.

    Args:
        config: Mapping with a ``model`` entry (``input_shape``, ``layers``)
            and a ``training`` entry (``loss``; optional ``optimizer``,
            ``learning_rate`` and ``metrics``).

    Returns:
        The compiled ``tf.keras.Sequential`` model.

    Raises:
        KeyError: If a required configuration entry is missing.
    """
    import warnings

    model = tf.keras.Sequential()

    # Input layer
    model.add(tf.keras.layers.Input(shape=tuple(config['model']['input_shape'])))

    # Layer types that take no configuration beyond their name
    plain_layers = ('BatchNormalization', 'Flatten', 'GlobalAveragePooling2D')

    # Iterate through layers defined in the YAML
    for layer in config['model']['layers']:
        layer_type = layer['type']
        if layer_type in ('Conv2D', 'SeparableConv2D'):
            conv_layer = getattr(tf.keras.layers, layer_type)
            model.add(conv_layer(
                filters=layer['filters'],
                kernel_size=tuple(layer['kernel_size']),
                strides=layer.get('strides', 1),
                padding=layer.get('padding', 'valid'),
                activation=layer['activation'],
            ))
            if 'pool_size' in layer:
                model.add(tf.keras.layers.MaxPooling2D(pool_size=tuple(layer['pool_size'])))
        elif layer_type in plain_layers:
            model.add(getattr(tf.keras.layers, layer_type)())
        elif layer_type == 'Dense':
            model.add(tf.keras.layers.Dense(
                units=layer['units'],
                activation=layer['activation'],
            ))
            if 'dropout' in layer:
                model.add(tf.keras.layers.Dropout(layer['dropout']))
        else:
            warnings.warn(
                f"Ignoring unsupported layer type {layer_type!r} in model configuration.",
                stacklevel=2)

    training = config['training']

    # Handle learning rate schedule (version 10 feature)
    lr_config = training.get('learning_rate')
    if isinstance(lr_config, dict):
        if lr_config['type'] == 'ExponentialDecay':
            lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
                initial_learning_rate=lr_config.get('initial_learning_rate', 0.001),
                decay_steps=lr_config.get('decay_steps', 100000),
                decay_rate=lr_config.get('decay_rate', 0.96),
                staircase=lr_config.get('staircase', True),
            )
        else:
            warnings.warn(
                f"Unsupported learning rate schedule {lr_config['type']!r}; "
                "using its initial_learning_rate as a constant rate.",
                stacklevel=2)
            lr_schedule = lr_config.get('initial_learning_rate', 0.001)
    else:
        lr_schedule = training.get('learning_rate', 0.001)

    # Handle different optimizers. Classes are looked up by name only when
    # requested, so only the selected optimizer has to exist in the installed
    # TensorFlow version.
    optimizer_name = training.get('optimizer', 'Adam').lower()
    optimizer_classes = {
        'adam': 'Adam',
        'rmsprop': 'RMSprop',
        'adamax': 'Adamax',
        'adamw': 'AdamW',
    }
    if optimizer_name not in optimizer_classes:
        warnings.warn(
            f"Unknown optimizer {optimizer_name!r}; falling back to Adam.",
            stacklevel=2)
    optimizer_class = getattr(
        tf.keras.optimizers, optimizer_classes.get(optimizer_name, 'Adam'))
    optimizer = optimizer_class(learning_rate=lr_schedule)

    # Compile the model
    model.compile(
        optimizer=optimizer,
        loss=training['loss'],
        metrics=training.get('metrics', ['accuracy'])
    )

    return model

# Create the model
model = build_adaptable_model(config)

# Print the layer-by-layer overview of the model
model.summary()

# Prepare callbacks
callbacks = []

# Handle early stopping (version 8 feature)
# Only added when the training configuration has an 'early_stopping' entry;
# 'monitor' defaults to 'val_loss' and 'patience' to 5 when not given there.
if 'early_stopping' in config['training']:
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor=config['training']['early_stopping'].get('monitor', 'val_loss'),
        patience=config['training']['early_stopping'].get('patience', 5),
        restore_best_weights=True
    )
    callbacks.append(early_stopping)

### 8. Train the model

In [None]:
# Train the model
# The number of epochs comes from the configuration; the validation split is
# evaluated during training and `callbacks` holds the optional early stopping.
# The returned history object is used for the performance plots.
history = model.fit(
    X_train_tensor, 
    Y_train_tensor,
    epochs=config['training']['epochs'],
    validation_data=(X_val_tensor, Y_val_tensor),
    verbose=1,
    callbacks=callbacks
)


# # Train the model with class weights
# early_stopping = tf.keras.callbacks.EarlyStopping(
#     monitor='val_loss', patience=10, restore_best_weights=True)

# Save the model


# model_save_path = 'C:/Users/PC/Documents/BIOSFER/trained_models/alanine_model'


# os.makedirs(model_save_path, exist_ok=True)


# model.save(model_save_path)


# print(f"Model saved to {model_save_path}")



# Load the model


# loaded_model = tf.keras.models.load_model('C:/Users/PC/Documents/BIOSFER/trained_models/alanine_model')

### 9. Plot Performance

In [None]:
# Plot Performance
# Two panels side by side: accuracy on the left, loss on the right.
# NOTE(review): the 'accuracy' / 'val_accuracy' keys are only present in the
# history when accuracy is among the compiled metrics -- confirm in the config.
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.tight_layout()
plt.show()

# Evaluate the model on the test set
# NOTE(review): this unpacking assumes evaluate() returns exactly two values
# (the loss and one metric) -- confirm against config['training']['metrics'].
test_loss, test_accuracy = model.evaluate(
    X_test_tensor, Y_test_tensor, verbose=2)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Predict and generate the classification report
y_pred = model.predict(X_test_tensor)
# Threshold the predictions at 0.5 and flatten them to a 1-D vector of 0/1 classes
y_pred_classes = (y_pred > 0.5).astype(int).reshape(-1)

print("Classification Report:")
print(classification_report(Y_test, y_pred_classes))

print("Confusion Matrix:")
print(confusion_matrix(Y_test, y_pred_classes))

In [None]:
# Predict probabilities
# NOTE(review): this repeats the model.predict(X_test_tensor) call already made
# for y_pred in the evaluation cell.
y_pred_prob = model.predict(X_test_tensor)


def plot_images_with_probabilities(series, probabilities, save_path=None):
    """Plot images in a square grid, each annotated with its probability.

    Args:
        series: Images to draw, one per grid cell. A pandas Series as before,
            or any other iterable of arrays accepted by ``imshow``.
        probabilities: One value per image, for example with shape ``(n,)``
            or ``(n, 1)``. It is flattened before use.
        save_path: Optional path or file-like object. When given, the figure
            is written there as PNG and closed; otherwise it is shown.

    Raises:
        ValueError: If the number of probabilities differs from the number
            of images.
    """
    images = list(series)

    # Flatten with reshape(-1) rather than squeeze(): squeeze() turns a single
    # prediction into a 0-d array, which cannot be indexed per image.
    probabilities = np.asarray(probabilities).reshape(-1)

    # Determine the number of images
    num_images = len(images)
    if probabilities.size != num_images:
        raise ValueError(
            f"Got {probabilities.size} probabilities for {num_images} images.")

    # Smallest square grid that holds all images (at least 1x1, because a 0x0
    # grid is rejected by plt.subplots)
    grid_size = int(num_images**0.5)
    if grid_size**2 < num_images:
        grid_size += 1
    grid_size = max(grid_size, 1)

    # squeeze=False always returns a 2-D array of axes, also for a 1x1 grid
    fig, axes = plt.subplots(grid_size, grid_size, figsize=(12, 12), squeeze=False)

    # Flatten axes for easy iteration
    axes = axes.flatten()

    for ax, img, prob in zip(axes, images, probabilities):
        # Plot the image
        ax.imshow(img)

        # Add the predictive probability as text on top of the image
        prob_text = f"{float(prob) * 100:.4f}%"
        ax.text(10, 20, prob_text, color='white', fontsize=12,
                bbox=dict(facecolor='black', alpha=0.5))

    # Hide the axis decorations of every cell, used or empty
    for ax in axes:
        ax.axis('off')

    fig.tight_layout()

    if save_path:
        fig.savefig(save_path, format='png')
        # Release the figure; otherwise every saved grid stays in memory
        plt.close(fig)
    else:
        plt.show()

# Show the test images with their predicted probabilities (no save_path given,
# so the figure is displayed rather than written to a file)
plot_images_with_probabilities(X_test, y_pred_prob)

### 10. Create an Excel table to compile model results

In [44]:
# Switch matplotlib to the non-interactive 'Agg' backend; the figures created
# for the Excel report are rendered into in-memory PNG buffers, not shown.
matplotlib.use('Agg')  # or try 'TkAgg' or 'Qt5Agg'

In [None]:
def _plot_history_metric(history, key, label):
    """Draw the training and validation curves of one history metric on a new figure."""
    plt.figure(figsize=(7, 4))
    plt.plot(history.history[key], label=f'Training {label}')
    plt.plot(history.history[f'val_{key}'], label=f'Validation {label}')
    plt.title(f'Model {label}')
    plt.xlabel('Epoch')
    plt.ylabel(label)
    plt.legend()


def _add_current_figure_to_sheet(sheet, anchor):
    """Render the current pyplot figure to PNG, close it and anchor it on ``sheet``."""
    img_buf = BytesIO()
    plt.savefig(img_buf, format='png')
    plt.close()
    img_buf.seek(0)
    sheet.add_image(Image(img_buf), anchor)


def compile_model_results_to_excel(version, model, history, X_test, Y_test, X_test_tensor, Y_test_tensor, excel_path='model_results.xlsx'):
    """Write the results of one model version to its own sheet of an Excel workbook.

    The sheet ``Version <version>`` receives the model summary (from A1), the
    training history with a header row (from S1), the accuracy, loss and
    confusion-matrix plots (column Y) and the grid of test images with their
    predicted probabilities (from A55). An existing sheet of that name is
    replaced, so re-running a version does not stack up old images.

    Args:
        version: Configuration version; also used for the sheet name.
        model: Trained Keras model.
        history: History object returned by ``model.fit``.
        X_test: Test images passed to ``plot_images_with_probabilities``.
        Y_test: True test labels for the confusion matrix.
        X_test_tensor: Test images in the form accepted by ``model.predict``.
        Y_test_tensor: Unused; kept so existing calls keep working.
        excel_path: Workbook to update, created when it does not exist.

    Returns:
        None. If the configuration for ``version`` cannot be loaded, an error
        is printed and nothing is written.
    """
    # Load the specific version of the configuration (only to validate it)
    try:
        load_config(version)
    except KeyError:
        print(f"Error: Version {version} not found in the configuration file.")
        return
    except Exception as e:
        print(f"Error loading configuration: {e}")
        return

    # Load existing Excel file or create a new one
    try:
        workbook = openpyxl.load_workbook(excel_path)
        default_sheet = None
    except FileNotFoundError:
        workbook = openpyxl.Workbook()
        # A new workbook starts with one empty default sheet
        default_sheet = workbook.active

    # Get or create the sheet for this version
    sheet_name = f'Version {version}'
    if sheet_name in workbook.sheetnames:
        # Replace the sheet at the same position. Clearing only the cell
        # values would leave the images of the previous run in place.
        stale_sheet = workbook[sheet_name]
        position = workbook.index(stale_sheet)
        workbook.remove(stale_sheet)
        sheet = workbook.create_sheet(title=sheet_name, index=position)
    else:
        sheet = workbook.create_sheet(title=sheet_name)

    if default_sheet is not None:
        # Drop the empty default sheet now that the workbook has a real one
        workbook.remove(default_sheet)

    # 1. Model Summary: one whitespace-separated token per cell
    sheet['A1'] = 'Model Summary'
    stringlist = []
    model.summary(print_fn=lambda x: stringlist.append(x))
    summary_string = "\n".join(stringlist)
    for i, line in enumerate(summary_string.split('\n'), start=2):
        for j, value in enumerate(line.split(), start=1):
            cell = sheet.cell(row=i, column=j, value=value)
            if value.startswith('='):
                # openpyxl stores strings beginning with '=' as formulas;
                # separator tokens such as '=====' must stay plain text.
                cell.data_type = 's'

    # 2. Training History: header row with the metric names, then one row per epoch
    sheet['S1'] = 'Training History'
    history_df = pd.DataFrame(history.history)
    for c, column_name in enumerate(history_df.columns, start=19):
        sheet.cell(row=2, column=c, value=column_name)
    for r, row in enumerate(history_df.values, start=3):
        for c, value in enumerate(row, start=19):
            sheet.cell(row=r, column=c, value=value)

    # 3. Performance Plots
    sheet['Y1'] = 'Performance Plots'

    # Model Accuracy Plot
    _plot_history_metric(history, 'accuracy', 'Accuracy')
    _add_current_figure_to_sheet(sheet, 'Y3')

    # Model Loss Plot
    _plot_history_metric(history, 'loss', 'Loss')
    _add_current_figure_to_sheet(sheet, 'Y27')

    # Confusion Matrix
    y_pred = model.predict(X_test_tensor)
    # Flatten to a 1-D class vector, as in the evaluation cell
    y_pred_classes = (y_pred > 0.5).astype(int).reshape(-1)
    cm = confusion_matrix(Y_test, y_pred_classes)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    _add_current_figure_to_sheet(sheet, 'Y51')

    # 4. Image Probabilities
    sheet['A55'] = 'Image Probabilities'
    img_buf = BytesIO()
    figures_before = set(plt.get_fignums())
    plot_images_with_probabilities(X_test, y_pred, save_path=img_buf)
    # Close any figure the plotting helper left open
    for fig_num in set(plt.get_fignums()) - figures_before:
        plt.close(fig_num)
    img_buf.seek(0)
    sheet.add_image(Image(img_buf), 'A56')

    # Save the workbook
    workbook.save(excel_path)
    print(f"Results for version {version} compiled and saved to {excel_path}")

# Write this run's results to the default workbook ('model_results.xlsx')
compile_model_results_to_excel(version, model, history, X_test, Y_test, X_test_tensor, Y_test_tensor)