In [1]:
#
# -----------------------------------------------------------------------------
#
#                    ATLAS v3: ITS Model Training (Fungi)
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To build, train, and evaluate a deep learning classifier for the ITS
#       region using the pre-processed data from the UNITE database.
#
#   METHODOLOGY:
#
#       1.  Load the ITS-specific training/testing data and encoders from disk.
#       2.  Define the standard neural network architecture.
#       3.  Train the model on the training data using the GPU.
#       4.  Save, reload, and evaluate the final model's accuracy on the
#           unseen test set, following our memory-safe workflow.
#
# -----------------------------------------------------------------------------
#

# --- Imports ---
import numpy as np
import tensorflow as tf
from scipy.sparse import load_npz
import pickle
from pathlib import Path
import sys

# --- Setup Project Path ---
# Pick an anchor directory first: the folder containing this file when
# `__file__` is defined, otherwise (NameError, e.g. in an interactive kernel)
# the current working directory. The project root is one level above it.
try:
    _anchor_dir = Path(__file__).parent
except NameError:
    _anchor_dir = Path.cwd()
project_root = _anchor_dir.parent
print(f"Project Root: {project_root}")


# --- 1. Verification Step: Check for GPU ---
# Report the TensorFlow version and whether TensorFlow can see a GPU device.
print("\n--- TensorFlow Setup ---")
print(f"  - TensorFlow Version: {tf.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(
    f"  - GPU detected: {gpu_devices[0]}"
    if gpu_devices
    else "  - WARNING: No GPU detected. TensorFlow will run on CPU."
)
print("--------------------------")


# --- 2. Define ITS-specific file paths ---
# Both directories hang off the project root resolved earlier.
MODELS_DIR = project_root / "models"
PROCESSED_DATA_DIR = project_root.joinpath("data", "processed")

# Train/test feature files (.npz) and label files (.npy) in the processed-data folder.
X_TRAIN_PATH, X_TEST_PATH, Y_TRAIN_PATH, Y_TEST_PATH = (
    PROCESSED_DATA_DIR / file_name
    for file_name in (
        "X_train_its.npz",
        "X_test_its.npz",
        "y_train_its.npy",
        "y_test_its.npy",
    )
)

# Pickled label encoder stored alongside the models.
LABEL_ENCODER_PATH = MODELS_DIR.joinpath("its_genus_label_encoder.pkl")


# --- 3. Load the data and encoders ---
print("\n--- Loading ITS Data ---")
try:
    # Feature matrices are read with scipy's load_npz, label arrays with np.load.
    X_train = load_npz(X_TRAIN_PATH)
    X_test = load_npz(X_TEST_PATH)
    y_train = np.load(Y_TRAIN_PATH)
    y_test = np.load(Y_TEST_PATH)

    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only load encoder files that come from a trusted source.
    with LABEL_ENCODER_PATH.open('rb') as encoder_file:
        label_encoder = pickle.load(encoder_file)
except FileNotFoundError:
    print("\n[ERROR] Processed data not found.")
    print("        Please run `python src/pipeline_its/01_prepare_data.py` before starting this notebook.")
    # Re-raise so execution stops here instead of continuing without data.
    raise
else:
    print("  - Data loading complete.")

# --- 4. Verification Step ---
# Emit the full shape report through one print call, one entry per line.
print(
    "\n--- Loaded Data Shapes ---",
    f"  - Shape of X_train: {X_train.shape}",
    f"  - Shape of y_train: {y_train.shape}",
    "  ------------------------------",
    f"  - Shape of X_test:  {X_test.shape}",
    f"  - Shape of y_test:  {y_test.shape}",
    f"  - Number of classes (genera): {len(label_encoder.classes_)}",
    sep="\n",
)

Project Root: C:\Users\jampa\Music\atlas

--- TensorFlow Setup ---
  - TensorFlow Version: 2.10.1
  - GPU detected: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
--------------------------

--- Loading ITS Data ---
  - Data loading complete.

--- Loaded Data Shapes ---
  - Shape of X_train: (6696, 18837)
  - Shape of y_train: (6696,)
  ------------------------------
  - Shape of X_test:  (1674, 18837)
  - Shape of y_test:  (1674,)
  - Number of classes (genera): 634


In [2]:
#
# -----------------------------------------------------------------------------
#
#                     STEP 2: DEFINE AND TRAIN THE ITS MODEL
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To define the neural network architecture, train it on the prepared
#       ITS data, and save the resulting model artifact to disk.
#
#   WORKFLOW:
#
#       1.  Define the Keras Sequential model architecture, dynamically
#           sizing it to the input features and output classes of the ITS dataset.
#       2.  Create a validation set and define callbacks for Early Stopping
#           and clean progress reporting.
#       3.  Execute the training using `model.fit()`.
#       4.  Immediately save the best version of the trained model to disk.
#
# -----------------------------------------------------------------------------
#

# --- Imports for this cell ---
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, Callback
from sklearn.model_selection import train_test_split

# --- PART 1: Define and Compile Model ---
print("--- Defining ITS Model Architecture ---")
input_shape = X_train.shape[1]             # number of input features (columns of X_train)
num_classes = len(label_encoder.classes_)  # one output unit per encoded class

# Assemble the network layer by layer: two ReLU hidden layers, each followed
# by 50% dropout, then a softmax output sized to the number of classes.
model = Sequential()
model.add(Dense(2048, activation='relu', input_shape=(input_shape,)))
model.add(Dropout(0.5))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# The sparse categorical loss is paired with y_train used directly as the target.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


# --- PART 2: Prepare Data and Train ---
print("\n--- Preparing Data and Starting Training ---")

# Training hyperparameters: epoch cap, mini-batch size, and the split seed.
EPOCHS, BATCH_SIZE, RANDOM_STATE = 50, 16, 42

# Hold out 10% of the training rows as a validation set, stratified on the
# labels and seeded with RANDOM_STATE so the split is repeatable.
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=RANDOM_STATE,
)

# Define custom callback for clean, single-line progress
class TrainingProgressCallback(Callback):
    """Print a one-line, in-place training summary at the end of every epoch.

    Each summary starts with a carriage return and has no trailing newline,
    so successive epochs overwrite the same console line. The line shows the
    training/validation loss and accuracy plus a 20-character bar for each
    accuracy value.
    """

    BAR_WIDTH = 20  # number of characters in each accuracy bar

    @classmethod
    def _render_bar(cls, fraction):
        """Return a fixed-width text bar for `fraction`, clamped to [0, 1]."""
        if fraction != fraction:  # NaN compares unequal to itself
            fraction = 0.0
        filled = int(min(max(fraction, 0.0), 1.0) * cls.BAR_WIDTH)
        return '█' * filled + '·' * (cls.BAR_WIDTH - filled)

    def on_epoch_end(self, epoch, logs=None):
        """Print the progress line for the epoch that just finished.

        Args:
            epoch: Zero-based epoch index; displayed one-based.
            logs: Metric dict for the epoch. May be None, in which case every
                metric is shown as 0.
        """
        # `logs` defaults to None, so guard before calling .get on it.
        logs = logs or {}
        acc = logs.get('accuracy', 0)
        val_acc = logs.get('val_accuracy', 0)
        loss = logs.get('loss', 0)
        val_loss = logs.get('val_loss', 0)

        # Prefer the epoch count handed to this callback over the notebook
        # global, so the display stays correct if fit() is called with a
        # different `epochs` value; fall back to EPOCHS when it is unavailable.
        params = getattr(self, 'params', None) or {}
        total_epochs = params.get('epochs')
        if total_epochs is None:
            total_epochs = EPOCHS

        acc_bar = self._render_bar(acc)
        val_acc_bar = self._render_bar(val_acc)
        # flush=True so the line is shown immediately despite the missing newline.
        print(f"\rEpoch {epoch+1:02d}/{total_epochs} | Loss: {loss:.4f} | Acc: {acc:.2%} [{acc_bar}] | Val_Loss: {val_loss:.4f} | Val_Acc: {val_acc:.2%} [{val_acc_bar}]", end='', flush=True)

# Define early stopping to prevent overfitting: stop once val_accuracy has
# gone 3 epochs without improving, and restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

# Start training. Keras' own progress output is disabled (verbose=0) because
# TrainingProgressCallback prints the per-epoch line instead.
history = model.fit(
    x=X_train_final,
    y=y_train_final,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping, TrainingProgressCallback()],
    verbose=0,
)
print("\n\n--- Training complete. ---")


# --- PART 3: Save the Model ---
MODEL_PATH = MODELS_DIR / "its_genus_classifier.keras"
print(f"\nSaving trained model to: {MODEL_PATH}")
try:
    # Make sure the target directory exists before writing the artifact.
    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
    model.save(MODEL_PATH)
except Exception as e:
    print(f"[ERROR] Could not save the model: {e}")
    # Re-raise so execution stops here: continuing after a failed save would
    # leave no (or an outdated) model file on disk for anything that later
    # reads MODEL_PATH. This matches how the data-loading step handles errors.
    raise
else:
    print("  - Model saved successfully.")

--- Defining ITS Model Architecture ---
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 2048)              38580224  
                                                                 
 dropout (Dropout)           (None, 2048)              0         
                                                                 
 dense_1 (Dense)             (None, 1024)              2098176   
                                                                 
 dropout_1 (Dropout)         (None, 1024)              0         
                                                                 
 dense_2 (Dense)             (None, 634)               649850    
                                                                 
Total params: 41,328,250
Trainable params: 41,328,250
Non-trainable params: 0
_________________________________________________________________

---



Epoch 08/50 | Loss: 1.2294 | Acc: 73.42% [██████████████······] | Val_Loss: 1.8791 | Val_Acc: 64.63% [████████████········]Restoring model weights from the end of the best epoch: 6.
Epoch 09/50 | Loss: 1.2158 | Acc: 74.58% [██████████████······] | Val_Loss: 1.9036 | Val_Acc: 64.33% [████████████········]Epoch 9: early stopping


--- Training complete. ---

Saving trained model to: C:\Users\jampa\Music\atlas\models\its_genus_classifier.keras
  - Model saved successfully.
