# 18S Model Training and Evaluation

**Objective:** To build, train, and evaluate a deep learning classifier for the 18S rRNA gene (Eukaryotes) using the pre-processed data.

**Methodology:**
1. Load the 18S-specific training/testing data and encoders from disk.
2. Define the neural network architecture.
3. Train the model on the training data, using the GPU when one is available (TensorFlow falls back to the CPU otherwise).
4. Save, reload, and evaluate the final model's accuracy on the unseen test set.

In [1]:
import numpy as np
import tensorflow as tf
from scipy.sparse import load_npz
import pickle
from pathlib import Path
import sys

# Set up project path: the project root is taken to be the parent of the
# kernel's current working directory (assumes the notebook is run from a
# folder one level below the root).
project_root = Path.cwd().parent

# --- Verification Step: Check for GPU ---
# Print the TensorFlow version, then report whether TensorFlow can see a GPU.
print("--- TensorFlow Setup ---", f"TensorFlow Version: {tf.__version__}", sep="\n")
gpu_devices = tf.config.list_physical_devices('GPU')
gpu_status = (
    f"GPU detected: {gpu_devices[0]}"
    if gpu_devices
    else "WARNING: No GPU detected. TensorFlow will run on CPU."
)
print(gpu_status)
print("-" * 26)

--- TensorFlow Setup ---
TensorFlow Version: 2.10.1
GPU detected: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
--------------------------


In [2]:
# --- Define 18S-specific file paths ---
PROCESSED_DATA_DIR = project_root / "data" / "processed"
MODELS_DIR = project_root / "models"

X_TRAIN_PATH = PROCESSED_DATA_DIR / "X_train_18s.npz"
X_TEST_PATH = PROCESSED_DATA_DIR / "X_test_18s.npz"
Y_TRAIN_PATH = PROCESSED_DATA_DIR / "y_train_18s.npy"
Y_TEST_PATH = PROCESSED_DATA_DIR / "y_test_18s.npy"

LABEL_ENCODER_PATH = MODELS_DIR / "18s_genus_label_encoder.pkl"

# --- Load the data and encoders ---
print("--- Loading 18S Data ---")

# Report every missing input at once instead of failing on the first one.
required_paths = [X_TRAIN_PATH, X_TEST_PATH, Y_TRAIN_PATH, Y_TEST_PATH, LABEL_ENCODER_PATH]
missing_paths = [str(path) for path in required_paths if not path.exists()]
if missing_paths:
    raise FileNotFoundError(
        "Missing required input file(s):\n  " + "\n  ".join(missing_paths)
    )

# Feature matrices are stored as SciPy sparse matrices, labels as NumPy arrays.
X_train = load_npz(X_TRAIN_PATH)
X_test = load_npz(X_TEST_PATH)
y_train = np.load(Y_TRAIN_PATH)
y_test = np.load(Y_TEST_PATH)

# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising.
# Only load encoder files that were produced by this project's own pipeline.
with open(LABEL_ENCODER_PATH, 'rb') as f:
    label_encoder = pickle.load(f)
print("Data loading complete.")

# --- Sanity checks ---
# Fail fast on inconsistent inputs rather than part-way through training.
if X_train.shape[0] != y_train.shape[0]:
    raise ValueError(
        f"X_train has {X_train.shape[0]} rows but y_train has {y_train.shape[0]} labels."
    )
if X_test.shape[0] != y_test.shape[0]:
    raise ValueError(
        f"X_test has {X_test.shape[0]} rows but y_test has {y_test.shape[0]} labels."
    )
if X_train.shape[1] != X_test.shape[1]:
    raise ValueError(
        f"Feature count mismatch: X_train has {X_train.shape[1]} columns, "
        f"X_test has {X_test.shape[1]}."
    )

# The labels are later used as sparse class indices, so they must lie in
# [0, number of encoder classes). TensorFlow documents that out-of-range
# sparse labels raise on CPU but yield NaN loss on GPU, so check here.
num_genera = len(label_encoder.classes_)
for split_name, split_labels in (("y_train", y_train), ("y_test", y_test)):
    if split_labels.size and not (0 <= split_labels.min() and split_labels.max() < num_genera):
        raise ValueError(
            f"{split_name} contains labels outside [0, {num_genera}); "
            "it does not match the loaded label encoder."
        )

# --- Verification Step ---
print("\n--- Loaded Data Shapes ---")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print("-" * 30)
print(f"Shape of X_test:  {X_test.shape}")
print(f"Shape of y_test:  {y_test.shape}")
print(f"Number of classes (genera): {len(label_encoder.classes_)}")

--- Loading 18S Data ---
Data loading complete.

--- Loaded Data Shapes ---
Shape of X_train: (6427, 14058)
Shape of y_train: (6427,)
------------------------------
Shape of X_test:  (1607, 14058)
Shape of y_test:  (1607,)
Number of classes (genera): 616


In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, Callback
from sklearn.model_selection import train_test_split

# --- PART 1: Define and Compile Model ---
print("--- Defining 18S Model Architecture ---")

# Seed the Python, NumPy and TensorFlow random number generators before any
# weights are created, so weight initialisation, dropout masks and batch
# shuffling repeat across runs. The value matches the random_state used for
# the validation split. (Some GPU ops may still be non-deterministic.)
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Output layer size comes from the label encoder; input size from the number
# of feature columns in the training matrix.
num_classes = len(label_encoder.classes_)
input_shape = X_train.shape[1]

# Two fully connected ReLU layers, each followed by 50% dropout, feeding a
# softmax over all classes.
model = Sequential([
    Dense(2048, activation='relu', input_shape=(input_shape,)),
    Dropout(0.5),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Labels are integer class indices, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


# --- PART 2: Prepare Data and Train ---
print("\n--- Preparing Data and Starting Training ---")

# Pre-flight check: drop every class that occurs exactly once in the training
# labels (presumably because the stratified split below cannot place a
# one-member class in both partitions).
class_ids, class_counts = np.unique(y_train, return_counts=True)
if class_counts.min() < 2:
    print("WARNING: Singletons found in y_train. Cleaning...")
    singleton_ids = class_ids[class_counts < 2]
    keep_rows = np.nonzero(~np.isin(y_train, singleton_ids))[0]
    X_train = X_train[keep_rows]
    y_train = y_train[keep_rows]

# Hold out 10% of the (cleaned) training data as a validation set, keeping
# class proportions via stratification and a fixed seed for repeatability.
split_options = dict(test_size=0.1, random_state=42, stratify=y_train)
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train, y_train, **split_options
)

# Define custom callback
class TrainingProgressCallback(Callback):
    """Print a compact, single-line progress summary after every epoch.

    The line shows the epoch counter, training/validation loss and accuracy,
    and a text bar for each accuracy value. It is written with a leading
    carriage return and no trailing newline so successive epochs overwrite
    each other in terminals that honour ``\\r``.

    Args:
        total_epochs: Total number of epochs to show after the slash. When
            omitted, the value is read from ``self.params['epochs']`` if Keras
            has populated it; if neither is available, ``??`` is shown.
        bar_width: Number of characters in each accuracy bar.
    """

    def __init__(self, total_epochs=None, bar_width=20):
        super().__init__()
        self.total_epochs = total_epochs
        self.bar_width = bar_width

    def _resolve_total_epochs(self):
        """Return the configured epoch count, or None when it is unknown."""
        if self.total_epochs is not None:
            return self.total_epochs
        params = getattr(self, 'params', None) or {}
        return params.get('epochs')

    def _render_bar(self, fraction):
        """Return a fixed-width bar for a value expected to lie in [0, 1]."""
        # Clamp so an out-of-range metric can never yield a negative repeat
        # count or a bar longer than bar_width.
        filled = int(max(0.0, min(1.0, fraction)) * self.bar_width)
        return '█' * filled + '·' * (self.bar_width - filled)

    def on_epoch_end(self, epoch, logs=None):
        """Print the progress line for the epoch that just finished.

        Args:
            epoch: Zero-based epoch index; displayed one-based.
            logs: Metric dict for the epoch. May be None; missing metrics are
                shown as 0.
        """
        logs = logs or {}
        acc = logs.get('accuracy', 0)
        val_acc = logs.get('val_accuracy', 0)
        loss = logs.get('loss', 0)
        val_loss = logs.get('val_loss', 0)

        acc_bar = self._render_bar(acc)
        val_acc_bar = self._render_bar(val_acc)

        total = self._resolve_total_epochs()
        total_label = f"{total:02d}" if total is not None else "??"
        print(f"\rEpoch {epoch+1:02d}/{total_label} | Loss: {loss:.4f} | Acc: {acc:.2%} [{acc_bar}] | Val_Loss: {val_loss:.4f} | Val_Acc: {val_acc:.2%} [{val_acc_bar}]", end='')

# Define early stopping: monitor validation accuracy with a patience of 3
# epochs, and restore the best weights when training stops.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    verbose=1,
    restore_best_weights=True,
)

# Start training. Keras' built-in progress output is disabled (verbose=0);
# the custom callback prints the per-epoch summary instead.
training_callbacks = [early_stopping, TrainingProgressCallback()]
fit_options = dict(
    epochs=50,
    batch_size=16,
    validation_data=(X_val, y_val),
    verbose=0,
    callbacks=training_callbacks,
)
history = model.fit(X_train_final, y_train_final, **fit_options)
print("\n\n--- Training complete. ---")


# --- PART 3: Save the Model ---
# The trained network is written into MODELS_DIR, the same folder the label
# encoder was loaded from.
MODEL_PATH = MODELS_DIR.joinpath("18s_genus_classifier.keras")
save_notice = f"\nSaving trained model to: {MODEL_PATH}"
print(save_notice)
model.save(MODEL_PATH)
print("✅ Model saved successfully.")

--- Defining 18S Model Architecture ---
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 2048)              28792832  
                                                                 
 dropout (Dropout)           (None, 2048)              0         
                                                                 
 dense_1 (Dense)             (None, 1024)              2098176   
                                                                 
 dropout_1 (Dropout)         (None, 1024)              0         
                                                                 
 dense_2 (Dense)             (None, 616)               631400    
                                                                 
Total params: 31,522,408
Trainable params: 31,522,408
Non-trainable params: 0
_________________________________________________________________

---



Epoch 18/50 | Loss: 2.4823 | Acc: 50.31% [██████████··········] | Val_Loss: 1.5871 | Val_Acc: 66.25% [█████████████·······]Restoring model weights from the end of the best epoch: 16.
Epoch 19/50 | Loss: 2.4734 | Acc: 50.67% [██████████··········] | Val_Loss: 1.6282 | Val_Acc: 68.58% [█████████████·······]Epoch 19: early stopping


--- Training complete. ---

Saving trained model to: C:\Users\jampa\Music\atlas-v3\models\18s_genus_classifier.keras
✅ Model saved successfully.


In [1]:
# =============================================================================
# STEP 5: FINAL MODEL EVALUATION
# =============================================================================
#
# OBJECTIVE:
#   To perform a definitive, unbiased evaluation of the trained 18S model on
#   the unseen test set. This is a memory-safe operation.
#
# =============================================================================

import tensorflow as tf
import gc
from tensorflow.keras.models import load_model
from pathlib import Path
from scipy.sparse import load_npz
import numpy as np

# --- Define all necessary file paths ---
# Paths are rebuilt from scratch so this cell does not rely on variables
# created by earlier cells.
project_root = Path.cwd().parent
MODELS_DIR = project_root.joinpath("models")
PROCESSED_DATA_DIR = project_root.joinpath("data", "processed")

MODEL_PATH = MODELS_DIR.joinpath("18s_genus_classifier.keras")
X_TEST_PATH = PROCESSED_DATA_DIR.joinpath("X_test_18s.npz")
Y_TEST_PATH = PROCESSED_DATA_DIR.joinpath("y_test_18s.npy")

# --- 1. Clean up memory ---
# Reset Keras' global state and run a garbage-collection pass before loading.
print("--- Starting Final Evaluation ---", "Clearing TensorFlow session...", sep="\n")
tf.keras.backend.clear_session()
gc.collect()
print("Memory cleared successfully.")

# --- 2. Load model and test data ---
# The model is read back from disk rather than reused from memory.
print(f"\nLoading model from: {MODEL_PATH}")
loaded_model = load_model(MODEL_PATH)
print("Model loaded successfully.")

print("\nLoading test data...")
X_test = load_npz(X_TEST_PATH)
y_test = np.load(Y_TEST_PATH)
print("Test data loaded successfully.")

# --- 3. Evaluate the model ---
# evaluate() returns the loss followed by the compiled metrics; two values
# are unpacked here (loss and accuracy).
print("\nEvaluating model on the test set...")
loss, accuracy = loaded_model.evaluate(X_test, y_test, verbose=1)

report_lines = [
    "\n--- Final 18S Model Evaluation ---",
    f"Test Set Loss:     {loss:.4f}",
    f"Test Set Accuracy: {accuracy:.2%}",
    "----------------------------------",
]
print("\n".join(report_lines))

--- Starting Final Evaluation ---
Clearing TensorFlow session...
Memory cleared successfully.

Loading model from: C:\Users\jampa\Music\atlas-v3\models\18s_genus_classifier.keras
Model loaded successfully.

Loading test data...
Test data loaded successfully.

Evaluating model on the test set...

--- Final 18S Model Evaluation ---
Test Set Loss:     1.9340
Test Set Accuracy: 64.09%
----------------------------------


In [2]:
# =============================================================================
# STEP 6: VISUALIZE TRAINING HISTORY (Optional)
# =============================================================================
#
# NOTE: This cell will only work if the kernel has NOT been restarted since
# the training cell was run, as it depends on the 'history' object in memory.
#
# =============================================================================

import matplotlib.pyplot as plt
import pandas as pd

# Announce the step up front so the output reads the same whether or not the
# training history is available.
print("--- Generating Training History Plots ---")

# Only the lookup of `history` is guarded. Wrapping the whole plotting code in
# `except NameError` would misreport any unrelated NameError (e.g. a typo in
# the plotting code) as "history not found".
try:
    history_df = pd.DataFrame(history.history)
except NameError:
    print("\nCould not generate plots because the 'history' object was not found in memory.")
    print("This is expected if the kernel was restarted after training.")
else:
    # One row per epoch; shift the zero-based index so the x-axis starts at 1.
    epoch_numbers = history_df.index + 1

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8), sharex=True)
    fig.suptitle('18S Model Training History', fontsize=18)

    # Plot Accuracy
    ax1.plot(epoch_numbers, history_df['accuracy'], label='Training Accuracy', marker='o')
    ax1.plot(epoch_numbers, history_df['val_accuracy'], label='Validation Accuracy', marker='o')
    ax1.set_title('Model Accuracy Over Epochs')
    ax1.set_ylabel('Accuracy')
    ax1.legend()
    ax1.grid(True, linestyle='--')

    # Plot Loss
    ax2.plot(epoch_numbers, history_df['loss'], label='Training Loss', marker='o')
    ax2.plot(epoch_numbers, history_df['val_loss'], label='Validation Loss', marker='o')
    ax2.set_title('Model Loss Over Epochs')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Loss')
    ax2.legend()
    ax2.grid(True, linestyle='--')

    # Leave room at the top of the figure for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

--- Generating Training History Plots ---

Could not generate plots because the 'history' object was not found in memory.
This is expected if the kernel was restarted after training.
