# Alanine Model:

#### 1. Install Dependencies and Setup

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib
import os
import cv2
import yaml
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import openpyxl
from openpyxl.drawing.image import Image
from openpyxl.utils import get_column_letter
from io import BytesIO
from PIL import Image as PILImage
import seaborn as sns
import tempfile

#### 2. Load Image Data

In [2]:
# Define paths
# Absolute, machine-specific folders that hold the source PNG files.
# path_na is loaded with label 0 (invalid / NA) and path_normal with label 1
# (valid) by the load_images calls later in this cell.
path_na = r'c:\Users\PC\Documents\BIOSFER\data\alanine\alanine_NA'
path_normal = r'c:\Users\PC\Documents\BIOSFER\data\alanine\alanine_normal'


def load_images(path, label, expected_shape=(600, 800, 3)):
    """Load every usable ``.png`` image found directly inside *path*.

    Each file is read with all of its channels, converted to 3-channel RGB
    and kept only when its final shape equals *expected_shape*.

    Args:
        path: Directory to scan (not recursive).
        label: Value stored as the label of every image loaded from *path*.
        expected_shape: Required ``(height, width, channels)`` of the RGB
            image. Pass ``None`` to accept any shape.

    Returns:
        A tuple ``(images, labels, df)`` where ``images`` is a list of RGB
        arrays, ``labels`` is a list with *label* repeated once per image and
        ``df`` is a DataFrame indexed by ``Filename`` with the columns
        ``Image`` and ``Label``.
    """
    images = []
    labels = []
    data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and anything that depends on it, such as positional lookups or
    # a seeded shuffle) reproducible between runs and machines.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.png'):
            continue
        img_path = os.path.join(path, filename)
        # Read PNG with all channels
        img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
        # cv2.imread signals an unreadable file by returning None, so this
        # must be checked *before* any further OpenCV call touches the image.
        if img is None:
            continue
        # Normalise every supported layout to 3-channel RGB. A 4-channel
        # (BGRA) image loses its alpha channel here, which is why the shape
        # check below expects 3 channels.
        if img.ndim == 2:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
        elif img.ndim == 3 and img.shape[2] == 4:
            img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGB)
        elif img.ndim == 3 and img.shape[2] == 3:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        else:
            # Unsupported channel layout: skip instead of crashing.
            continue
        if expected_shape is not None and img.shape != tuple(expected_shape):
            continue
        images.append(img)
        labels.append(label)
        data.append((img, label, filename))
    df = pd.DataFrame(data, columns=['Image', 'Label', 'Filename'])
    df.set_index('Filename', inplace=True)

    return images, labels, df


# Load NA (invalid) images
na_images, na_labels, df_0 = load_images(path_na, 0)  # 0 for invalid

# Load normal (valid) images
normal_images, normal_labels, df_1 = load_images(path_normal, 1)  # 1 for valid

# Combine the data (NA samples first, then the normal ones)
X = na_images + normal_images
Y = na_labels + normal_labels

# Convert lists to numpy arrays.
# The image array used to be bound to lowercase `x` only, which left `X` as a
# plain Python list while `Y` became an array. Both are arrays now.
X = np.array(X)
Y = np.array(Y)
x = X  # alias kept because earlier revisions exposed the array under this name

# Merging the two dfs (rows keep their 'Filename' index)
df = pd.concat([df_0, df_1])

In [None]:
# Better practice is to create dataframes, and have each row with its identifier, the image, and the label.
# Print the shape and column dtypes of each frame under its own heading.
for heading, frame in (
    ("Invalid df description", df_0),
    ("Valid df description", df_1),
    ("Valid complete df description", df),
):
    print(heading)
    print("----------------------------------")
    print(frame.shape)
    print(frame.dtypes)
    print(" ")

#### 3. Load YAML config

In [30]:
def load_config(version, config_path='config_pol.yml'):
    """Return the configuration stored for one version in the YAML file.

    Args:
        version: Key looked up inside the top-level ``versions`` mapping.
        config_path: Path of the YAML file to read. Defaults to
            ``config_pol.yml`` in the current working directory.

    Returns:
        Whatever is stored under ``versions[version]`` in the file.

    Raises:
        KeyError: If ``versions`` or the requested *version* is missing.
        OSError: If the file cannot be opened.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # text encoding (e.g. cp1252 on Windows).
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)
    return config['versions'][version]

# `version` is the key looked up under 'versions' in the YAML file; the
# resulting `config` supplies the preprocessing settings and the model and
# training code used by the cells below.
version = 7  # Change this to the desired version
config = load_config(version)

#### 4. Data Pre-processing

In [31]:
# Crop, resize and rescale a batch of images
def preprocess_images(images, coords, crop_size, resize_shape):
    """Return a cropped, resized and [0, 1]-scaled copy of every image.

    Args:
        images: Iterable of image arrays.
        coords: ``coords[0]`` is the first row and ``coords[1]`` the first
            column of the crop window.
        crop_size: ``crop_size[0]`` rows by ``crop_size[1]`` columns are kept.
        resize_shape: Target size handed to ``cv2.resize`` unchanged.

    Returns:
        A list of ``float32`` arrays, one per input image, in input order.
    """

    def _prepare(image):
        # Slice the region of interest out of the full frame.
        top, left = coords[0], coords[1]
        window = image[top:(top + crop_size[0]), left:(left + crop_size[1])]

        # Bring the crop to the requested size.
        shrunk = cv2.resize(
            window, resize_shape, interpolation=cv2.INTER_AREA)

        # Divide by 255 to scale the pixel values.
        return shrunk.astype(np.float32) / 255.0

    return [_prepare(image) for image in images]

In [32]:
# Crop/resize settings come from the 'preprocess' section of the active config.
preprocess_cfg = config['preprocess']
coords = preprocess_cfg['coords']
crop_size = preprocess_cfg['crop_size']
resize_shape = tuple(preprocess_cfg['resize_shape'])

# Run every raw image through the pipeline and store the result next to it.
X_processed = preprocess_images(
    list(df['Image']), coords, crop_size, resize_shape)
df['Processed'] = X_processed

#### 5. Display 2 images, original and processed images

In [7]:
# Side-by-side viewer, useful for tuning the crop coordinates
def display_images(img1, img2):
    """Show *img1* and *img2* next to each other with the axes hidden."""
    matplotlib.use('module://matplotlib_inline.backend_inline')

    _, panels = plt.subplots(1, 2, figsize=(10, 5))
    for panel, picture in zip(panels, (img1, img2)):
        panel.imshow(picture)
        panel.axis('off')

    plt.show()

In [None]:
# Compare one sample (positional row 378) before and after preprocessing.
sample_row = df.iloc[378]
display_images(sample_row['Image'], sample_row['Processed'])

#### 6. Split data

In [None]:
# Draw a reproducible random subset of at most 200 rows.
# DataFrame.sample already returns a DataFrame; passing that to pd.concat
# raises TypeError (concat expects a list of pandas objects), so the index is
# renumbered with reset_index instead. min() avoids the ValueError that
# sample() raises when asked for more rows than the frame holds.
df_small = df.sample(n=min(200, len(df)), random_state=42)
df_small = df_small.reset_index(drop=True)
df_small.shape

In [None]:
# Ensure splits contain at least one NA sample
def ensure_na_in_split(X, Y, na_label=0):
    """Shuffle the data and split it into stratified train/val/test sets.

    The data is split 70 % / 20 % / 10 % (train / validation / test), with
    both splits stratified on the labels. After splitting, every subset is
    checked for at least one sample labelled *na_label*; a ``RuntimeWarning``
    is issued for each subset that has none.

    Args:
        X: Samples, as a numpy array or a pandas Series/DataFrame.
        Y: Labels aligned with *X*, as a numpy array or a pandas Series.
        na_label: Label value that must be present in every subset.

    Returns:
        ``(X_train, X_val, X_test, Y_train, Y_val, Y_test)``.
    """
    import warnings

    def _take(values, positions):
        # pandas objects must be reordered by position with .iloc; plain
        # values[positions] on a Series is label-based and only worked for
        # non-integer indexes through a deprecated fallback.
        if hasattr(values, 'iloc'):
            return values.iloc[positions]
        return values[positions]

    # Shuffle the data (seeds numpy's global RNG, as before)
    np.random.seed(42)
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X = _take(X, indices)
    Y = _take(Y, indices)

    # Split data into training (70%), validation (20%), and test (10%)
    # First, split into training (70%) and temporary (30%)
    X_train, X_temp, Y_train, Y_temp = train_test_split(
        X, Y, test_size=0.3, random_state=42, stratify=Y)
    # Second, split the temporary set into validation (20% of total) and test (10% of total)
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_temp, Y_temp, test_size=0.3333, random_state=42, stratify=Y_temp)

    # Stratification keeps class proportions but can still leave a very rare
    # class out of a small subset, so verify the promise in the function name.
    for split_name, split_labels in (
            ('train', Y_train), ('validation', Y_val), ('test', Y_test)):
        if not np.any(np.asarray(split_labels) == na_label):
            warnings.warn(
                f"The {split_name} split contains no sample with label "
                f"{na_label!r}.",
                RuntimeWarning,
                stacklevel=2,
            )

    return X_train, X_val, X_test, Y_train, Y_val, Y_test

# Split the processed images and their labels into train / validation / test
X_train, X_val, X_test, Y_train, Y_val, Y_test = ensure_na_in_split(
    X=df['Processed'],
    Y=df['Label'],
)

#### 7. Convert data

In [34]:
# Stack the per-row image arrays of each split into one array per split.
X_train_array, X_val_array, X_test_array = (
    np.stack(split.values) for split in (X_train, X_val, X_test)
)

In [None]:
# Convert the image arrays to float32 TensorFlow tensors
X_train_tensor, X_val_tensor, X_test_tensor = (
    tf.convert_to_tensor(batch, dtype=tf.float32)
    for batch in (X_train_array, X_val_array, X_test_array)
)

# ... and the labels as well
Y_train_tensor, Y_val_tensor, Y_test_tensor = (
    tf.convert_to_tensor(targets, dtype=tf.float32)
    for targets in (Y_train, Y_val, Y_test)
)

# Report the shape of every tensor
for tensor_name, tensor in (
    ('X_train_tensor', X_train_tensor),
    ('X_val_tensor', X_val_tensor),
    ('X_test_tensor', X_test_tensor),
    ('Y_train_tensor', Y_train_tensor),
    ('Y_val_tensor', Y_val_tensor),
    ('Y_test_tensor', Y_test_tensor),
):
    print(f'Shape of {tensor_name}: {tensor.shape}')

# Report how many NA (label 0) samples ended up in each split
for split_name, targets in (
    ('Y_train', Y_train),
    ('Y_val', Y_val),
    ('Y_test', Y_test),
):
    print(f'Number of NA samples in {split_name}: {np.sum(targets == 0)}')

#### 8. Build Deep Learning Model

In [None]:
# Execute the Python source stored under the 'code' key of the active config.
# NOTE(review): exec() runs whatever the YAML file contains with full
# privileges -- only use configuration files you trust.
# The snippet is expected to define `model`, which is used right below and in
# later cells -- confirm against the config file.
exec(config['code'])
model.summary()  # type: ignore

#### 9. Train the model

In [None]:
# Execute the training snippet stored under the 'training' key of the config.
# NOTE(review): same trust caveat as any exec() on file content; the snippet
# presumably defines `history`, which the plotting cell reads -- confirm.
exec(config['training'])

#### 10. Plot Performance

In [None]:
matplotlib.use('module://matplotlib_inline.backend_inline')

# Training curves: accuracy on the left, loss on the right
plt.figure(figsize=(11, 4))
for position, (train_key, val_key, metric) in enumerate(
        (('accuracy', 'val_accuracy', 'Accuracy'),
         ('loss', 'val_loss', 'Loss')),
        start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label=f'Training {metric}')  # type: ignore
    plt.plot(history.history[val_key], label=f'Validation {metric}')  # type: ignore
    plt.title(f'Model {metric}')
    plt.xlabel('Epoch')
    plt.ylabel(metric)
    plt.legend()
plt.tight_layout()
plt.show()

# Score the model on the held-out test set
evaluation = model.evaluate(  # type: ignore
    X_test_tensor, Y_test_tensor, verbose=2)
test_loss, test_accuracy = evaluation
print(f"Test Accuracy: {test_accuracy:.4f}")

# Predict and generate the classification report.
# Outputs above 0.5 count as class 1, everything else as class 0.
y_pred = model.predict(X_test_tensor)  # type: ignore
y_pred_classes = (y_pred > 0.5).reshape(-1).astype(int)

print("Classification Report:")
print(classification_report(Y_test, y_pred_classes))
print("Confusion Matrix:")
print(confusion_matrix(Y_test, y_pred_classes))

In [47]:
def plot_images_with_probabilities(series, probabilities, save_path=None):
    """Draw the images on a square grid, each tagged with its probability.

    The probability is printed as a percentage on a coloured box: green for
    values >= 0.995, orange for values strictly between 0.05 and 0.995, and
    red otherwise.

    Args:
        series: Images to draw; a pandas Series or any sequence supporting
            ``len()`` and integer indexing.
        probabilities: One probability per image. Any array-like whose
            flattened length equals ``len(series)``, e.g. an ``(n, 1)``
            model output.
        save_path: Path or file-like object. When given, the figure is
            written there as PNG and then closed; otherwise it is shown.

    Raises:
        ValueError: If the number of probabilities differs from the number
            of images.
    """
    # reshape(-1) rather than squeeze(): squeezing a single prediction yields
    # a 0-d array that cannot be indexed.
    probabilities = np.asarray(probabilities, dtype=float).reshape(-1)
    num_images = len(series)
    if probabilities.size != num_images:
        raise ValueError(
            f"Expected {num_images} probabilities (one per image), "
            f"got {probabilities.size}."
        )

    # Smallest square grid that fits every image (at least 1x1 so an empty
    # input still produces a valid, blank figure).
    grid_size = int(num_images**0.5)
    if grid_size**2 < num_images:
        grid_size += 1
    grid_size = max(grid_size, 1)

    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid.
    fig, axes = plt.subplots(grid_size, grid_size, figsize=(100, 100),
                             squeeze=False)
    axes = axes.flatten()

    # Positional access: .iloc for pandas objects, plain indexing otherwise.
    by_position = getattr(series, 'iloc', series)

    for i in range(num_images):
        img = by_position[i]
        axes[i].imshow(img)
        axes[i].axis('off')  # Hide axis

        # Determine the color based on the probability
        probability = float(probabilities[i])
        if probability >= 0.995:
            color = 'green'
        elif 0.05 < probability < 0.995:
            color = 'orange'
        else:
            color = 'red'

        prob_text = f"{probability * 100:.4f}%"
        axes[i].text(10, 20, prob_text, color='white', fontsize=12,
                     bbox=dict(facecolor=color, alpha=0.5))

    # Blank out the unused cells of the grid
    for i in range(num_images, len(axes)):
        axes[i].axis('off')

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, format='png')
        # The figure is very large; release it once it has been written.
        plt.close(fig)
    else:
        plt.show()

In [None]:
# Predict probabilities for the test tensor, then draw the matching
# (untensorised) test images with their scores; no save_path, so the grid is shown.
y_pred_prob = model.predict(X_test_tensor)  # type: ignore
plot_images_with_probabilities(series=X_test, probabilities=y_pred_prob)

#### 11. Create an Excel table to compile model results

In [None]:
def _current_figure_to_image(**savefig_kwargs):
    """Render the current matplotlib figure to PNG, close it, and wrap it for openpyxl."""
    buffer = BytesIO()
    plt.savefig(buffer, format='png', **savefig_kwargs)
    plt.close()
    buffer.seek(0)
    return Image(buffer)


def _history_plot_image(history, train_key, val_key, metric_name):
    """Plot one training/validation metric pair and return it as an openpyxl image."""
    plt.figure(figsize=(7, 4))
    plt.plot(history.history[train_key], label=f'Training {metric_name}')
    plt.plot(history.history[val_key], label=f'Validation {metric_name}')
    plt.title(f'Model {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()
    return _current_figure_to_image()


def compile_model_results_to_excel(version, model, history, X_test, Y_test, X_test_tensor, Y_test_tensor,
                                   excel_path='model_results.xlsx'):
    """Write a full report for one model version into an Excel workbook.

    The sheet ``Version <version>`` receives the model summary (as an image),
    the per-epoch training history, accuracy/loss plots, the confusion
    matrix, the probability grid of the test images, and the model and
    training source code taken from the configuration. An existing sheet of
    the same name is replaced, so re-running a version does not pile up old
    images.

    Args:
        version: Configuration version; also used in the sheet name.
        model: Trained model providing ``summary`` and ``predict``.
        history: Training history object exposing a ``history`` dict with the
            keys ``accuracy``, ``loss``, ``val_accuracy`` and ``val_loss``.
        X_test: Test images handed to ``plot_images_with_probabilities``.
        Y_test: True test labels for the confusion matrix.
        X_test_tensor: Test images in the form accepted by ``model.predict``.
        Y_test_tensor: Unused; kept so existing calls keep working.
        excel_path: Workbook to update, created when it does not exist.

    Returns:
        None. When the configuration cannot be loaded, a message is printed
        and nothing is written.

    Note:
        Switches matplotlib to the non-interactive ``Agg`` backend and leaves
        it selected.
    """
    matplotlib.use('Agg')  # or try 'TkAgg' or 'Qt5Agg'
    # Load the specific version of the configuration
    try:
        config = load_config(version)
    except KeyError:
        print(f"Error: Version {version} not found in the configuration file.")
        return
    except Exception as e:
        print(f"Error loading configuration: {e}")
        return

    # Load existing Excel file or create a new one
    try:
        workbook = openpyxl.load_workbook(excel_path)
    except FileNotFoundError:
        workbook = openpyxl.Workbook()

    # Get a fresh sheet for this version. Blanking the cell values of an
    # existing sheet would leave its embedded images behind, so the sheet is
    # replaced at the same position instead.
    sheet_name = f'Version {version}'
    if sheet_name in workbook.sheetnames:
        stale_sheet = workbook[sheet_name]
        position = workbook.index(stale_sheet)
        workbook.remove(stale_sheet)
        sheet = workbook.create_sheet(title=sheet_name, index=position)
    else:
        sheet = workbook.create_sheet(title=sheet_name)

    # 1. Model Summary (rendered as an image to keep the monospace layout)
    sheet['A1'] = 'Model Summary'
    summary_lines = []
    # Extra keyword arguments are accepted and ignored in case the Keras
    # version in use passes any to print_fn.
    model.summary(print_fn=lambda text, **_: summary_lines.append(text))
    summary_string = "\n".join(summary_lines)

    plt.figure(figsize=(9, 6))
    plt.text(0.01, 0.99, summary_string, va='top', ha='left', wrap=True, fontsize=10, family='monospace')
    plt.axis('off')
    sheet.add_image(
        _current_figure_to_image(bbox_inches='tight', pad_inches=0.1), 'A2')

    # 2. Training History
    # Columns are written by key so each header always matches its values,
    # whatever order the history dict happens to use.
    sheet['M1'] = 'Training History'
    headers = {
        'accuracy': 'Accuracy',
        'loss': 'Loss',
        'val_accuracy': 'Val Accuracy',
        'val_loss': 'Val Loss',
    }
    extra_keys = [key for key in history.history if key not in headers]
    for column, key in enumerate(list(headers) + extra_keys, start=13):
        sheet.cell(row=2, column=column, value=headers.get(key, key))
        for row, value in enumerate(history.history[key], start=3):
            sheet.cell(row=row, column=column, value=float(value))

    # 3. Performance Plots
    sheet.add_image(
        _history_plot_image(history, 'accuracy', 'val_accuracy', 'Accuracy'),
        'R1')
    sheet.add_image(
        _history_plot_image(history, 'loss', 'val_loss', 'Loss'), 'R22')

    # Confusion Matrix (predictions flattened to 1-D class labels)
    y_pred = model.predict(X_test_tensor)
    y_pred_classes = (y_pred > 0.5).astype(int).reshape(-1)
    cm = confusion_matrix(Y_test, y_pred_classes)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    sheet.add_image(_current_figure_to_image(), 'R43')

    # 4. Image Probabilities
    sheet['AD1'] = 'Image Probabilities'
    grid_buffer = BytesIO()
    figures_before = set(plt.get_fignums())
    plot_images_with_probabilities(X_test, y_pred, save_path=grid_buffer)
    # Close any figure the call left open so the large grid is released.
    for figure_number in set(plt.get_fignums()) - figures_before:
        plt.close(figure_number)
    grid_buffer.seek(0)
    sheet.add_image(Image(grid_buffer), 'AD2')

    # 5. Model Code
    sheet['A35'] = 'Model Code'
    model_code_lines = config['code'].split('\n')
    for row, line in enumerate(model_code_lines, start=36):
        sheet.cell(row=row, column=1, value=line)

    # 6. Training Code
    # Starts at row 76 as before, but moves down when the model code is long
    # enough that it would otherwise be overwritten.
    training_header_row = max(76, 36 + len(model_code_lines) + 1)
    sheet.cell(row=training_header_row, column=1, value='Training Code')
    training_code_lines = config['training'].split('\n')
    for row, line in enumerate(training_code_lines,
                               start=training_header_row + 1):
        sheet.cell(row=row, column=1, value=line)

    # Save the workbook
    workbook.save(excel_path)
    print(f"Results for version {version} compiled and saved to {excel_path}")
    
# Write the results of the current version to the workbook (excel_path is left
# at its default, 'model_results.xlsx').
compile_model_results_to_excel(version, model, history, X_test, Y_test, X_test_tensor, Y_test_tensor)  # type: ignore

#### Save the final model

In [73]:
# Save the model
# model_save_path = 'C:/Users/PC/Documents/BIOSFER/trained_models/alanine_model'
# os.makedirs(model_save_path, exist_ok=True)
# model.save(model_save_path)
# print(f"Model saved to {model_save_path}")
# Load the model
# loaded_model = tf.keras.models.load_model('C:/Users/PC/Documents/BIOSFER/trained_models/alanine_model')