In [1]:
#
# -----------------------------------------------------------------------------
#
#                    ATLAS v3: ITS Model Training (Fungi)
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To build, train, and evaluate a deep learning classifier for the ITS
#       region using the pre-processed data from the UNITE database.
#
#   METHODOLOGY:
#
#       1.  Load the ITS-specific training/testing data and encoders from disk.
#       2.  Define the standard neural network architecture.
#       3.  Train the model on the training data using the GPU.
#       4.  Save, reload, and evaluate the final model's accuracy on the
#           unseen test set, following our memory-safe workflow.
#
# -----------------------------------------------------------------------------
#

# --- Imports ---
import numpy as np
import tensorflow as tf
from scipy.sparse import load_npz
import pickle
from pathlib import Path
import sys

# --- Setup Project Path ---
# When `__file__` is defined (script execution) the root is two directory
# levels above this file; when it is undefined (e.g. an interactive kernel)
# the root is taken to be the parent of the current working directory.
project_root = (Path(__file__).parent if "__file__" in globals() else Path.cwd()).parent
print(f"Project Root: {project_root}")


# --- 1. Verification Step: Check for GPU ---
# Report the TensorFlow version and whether TensorFlow can see a GPU.
print("\n--- TensorFlow Setup ---")
print(f"  - TensorFlow Version: {tf.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
# Only the first listed device is reported; a missing GPU is a warning,
# not an error, so execution continues either way.
print(
    f"  - GPU detected: {gpu_devices[0]}"
    if gpu_devices
    else "  - WARNING: No GPU detected. TensorFlow will run on CPU."
)
print("--------------------------")


# --- 2. Define ITS-specific file paths ---
# Every location below is derived from `project_root`.
PROCESSED_DATA_DIR = project_root.joinpath("data", "processed")
MODELS_DIR = project_root.joinpath("models")

# Feature matrices (.npz) and label arrays (.npy) for the train/test splits.
X_TRAIN_PATH = PROCESSED_DATA_DIR.joinpath("X_train_its.npz")
X_TEST_PATH = PROCESSED_DATA_DIR.joinpath("X_test_its.npz")
Y_TRAIN_PATH = PROCESSED_DATA_DIR.joinpath("y_train_its.npy")
Y_TEST_PATH = PROCESSED_DATA_DIR.joinpath("y_test_its.npy")

# Pickled label encoder, kept in the models directory.
LABEL_ENCODER_PATH = MODELS_DIR.joinpath("its_genus_label_encoder.pkl")


# --- 3. Load the data and encoders ---
print("\n--- Loading ITS Data ---")
try:
    # Feature matrices are read with scipy's sparse .npz loader.
    X_train = load_npz(X_TRAIN_PATH)
    X_test = load_npz(X_TEST_PATH)
    # Label arrays are read as NumPy .npy files.
    y_train = np.load(Y_TRAIN_PATH)
    y_test = np.load(Y_TEST_PATH)

    # NOTE(review): pickle.load can execute code embedded in the file; only
    # load an encoder file that comes from a trusted source.
    with LABEL_ENCODER_PATH.open('rb') as f:
        label_encoder = pickle.load(f)
except FileNotFoundError:
    print(
        "\n[ERROR] Processed data not found.",
        "        Please run `python src/pipeline_its/01_prepare_data.py` before starting this notebook.",
        sep="\n",
    )
    # Re-raise so execution stops here when any input file is missing.
    raise
else:
    print("  - Data loading complete.")

# --- 4. Verification Step ---
# Print the shapes of everything that was loaded, then check that they are
# mutually consistent. Printing alone verifies nothing: a mismatched export
# would otherwise only surface later, inside model training or evaluation.
print("\n--- Loaded Data Shapes ---")
print(f"  - Shape of X_train: {X_train.shape}")
print(f"  - Shape of y_train: {y_train.shape}")
print("  ------------------------------")
print(f"  - Shape of X_test:  {X_test.shape}")
print(f"  - Shape of y_test:  {y_test.shape}")
print(f"  - Number of classes (genera): {len(label_encoder.classes_)}")

# Every feature row needs exactly one label.
if X_train.shape[0] != y_train.shape[0]:
    raise ValueError(
        f"X_train has {X_train.shape[0]} rows but y_train has "
        f"{y_train.shape[0]} labels; the training split is inconsistent."
    )
if X_test.shape[0] != y_test.shape[0]:
    raise ValueError(
        f"X_test has {X_test.shape[0]} rows but y_test has "
        f"{y_test.shape[0]} labels; the test split is inconsistent."
    )

# Train and test matrices must have the same number of feature columns.
if X_train.shape[1] != X_test.shape[1]:
    raise ValueError(
        f"X_train has {X_train.shape[1]} features but X_test has "
        f"{X_test.shape[1]}; the two splits were built with different feature sets."
    )

Project Root: C:\Users\jampa\Music\atlas

--- TensorFlow Setup ---
  - TensorFlow Version: 2.10.1
  - GPU detected: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
--------------------------

--- Loading ITS Data ---
  - Data loading complete.

--- Loaded Data Shapes ---
  - Shape of X_train: (6696, 18837)
  - Shape of y_train: (6696,)
  ------------------------------
  - Shape of X_test:  (1674, 18837)
  - Shape of y_test:  (1674,)
  - Number of classes (genera): 634
