In [1]:
# =============================================================================
# ATLAS v3 - COI PIPELINE - SETUP AND DATA LOADING
# =============================================================================
#
# OBJECTIVE:
#   To set up the environment for COI model training and load all
#   pre-processed data artifacts from disk.
#
# WORKFLOW:
#   1.  Import all necessary libraries.
#   2.  Set up the project's root path.
#   3.  Report whether TensorFlow detects a GPU (warn and continue on CPU if not).
#   4.  Define the file paths for the COI-specific artifacts.
#   5.  Load the training data, testing data, and the label encoder.
#   6.  Print a summary of the loaded data shapes for verification.
#
# =============================================================================

# --- Imports ---
import numpy as np
import tensorflow as tf
from scipy.sparse import load_npz
import pickle
from pathlib import Path
import sys

# --- Setup Project Path ---
# The project root is taken to be the parent of the current working
# directory, so this cell must be run from a direct child of the project
# root for the paths derived from it to resolve.
project_root = Path.cwd().parent

# --- 1. Verification Step: Check for GPU ---
# Purely informational: the cell reports what TensorFlow sees and carries on
# either way; nothing here forces or configures a device.
print("--- TensorFlow Setup ---")
print(f"TensorFlow Version: {tf.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(
    f"GPU detected: {gpu_devices[0]}"
    if gpu_devices
    else "WARNING: No GPU detected. TensorFlow will run on CPU."
)
print("-" * 26)

# --- 2. Define COI-specific file paths ---
# Directories, both resolved relative to the project root.
PROCESSED_DATA_DIR = project_root.joinpath("data", "processed")
MODELS_DIR = project_root.joinpath("models")

# Feature matrices are stored as .npz files, label arrays as .npy files.
X_TRAIN_PATH = PROCESSED_DATA_DIR.joinpath("X_train_coi.npz")
X_TEST_PATH = PROCESSED_DATA_DIR.joinpath("X_test_coi.npz")
Y_TRAIN_PATH = PROCESSED_DATA_DIR.joinpath("y_train_coi.npy")
Y_TEST_PATH = PROCESSED_DATA_DIR.joinpath("y_test_coi.npy")

# Pickled label encoder for the COI genus labels.
LABEL_ENCODER_PATH = MODELS_DIR.joinpath("coi_genus_label_encoder.pkl")

# --- 3. Load the data and encoders ---
print("\n--- Loading COI Data ---")

# Feature matrices are read with scipy.sparse.load_npz; label arrays are
# read with np.load.
X_train = load_npz(X_TRAIN_PATH)
X_test = load_npz(X_TEST_PATH)
y_train = np.load(Y_TRAIN_PATH)
y_test = np.load(Y_TEST_PATH)

# NOTE: unpickling can execute arbitrary code, so only load an encoder file
# that was produced by this project's own pipeline.
with LABEL_ENCODER_PATH.open('rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

print("Data loading complete.")

# --- 4. Verification Step ---
# Print the shapes of everything that was loaded so they can be checked by
# eye; this section only reports, it does not assert anything.
print("\n--- Loaded Data Shapes ---")
for summary_line in (
    f"Shape of X_train: {X_train.shape}",
    f"Shape of y_train: {y_train.shape}",
    "-" * 30,
    f"Shape of X_test:  {X_test.shape}",
    f"Shape of y_test:  {y_test.shape}",
    # The class count is read from the encoder's `classes_` attribute.
    f"Number of classes (genera): {len(label_encoder.classes_)}",
):
    print(summary_line)

--- TensorFlow Setup ---
TensorFlow Version: 2.10.1
GPU detected: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
--------------------------

--- Loading COI Data ---
Data loading complete.

--- Loaded Data Shapes ---
Shape of X_train: (7916, 41040)
Shape of y_train: (7916,)
------------------------------
Shape of X_test:  (1980, 41040)
Shape of y_test:  (1980,)
Number of classes (genera): 111
