# Cell 1 — Load memmaps + trained model

In [1]:
import os
from pathlib import Path

import numpy as np
import tensorflow as tf

# Seed both TensorFlow and NumPy so shuffling and sampling are repeatable.
SEED = 123
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Paths
# The dataset root can be overridden with the THESIS_DATASET_DIR environment
# variable; when it is not set, the original local location is used.
DATASET_ROOT = Path(
    os.environ.get(
        "THESIS_DATASET_DIR",
        r"C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset",
    )
)
LABELED_DIR = DATASET_ROOT / "features_mfcc_labeled"
UNLABELED_DIR = DATASET_ROOT / "features_mfcc_unlabeled"

# Labeled features are opened as read-only memmaps; label arrays are loaded fully.
X_train = np.load(LABELED_DIR / "X_train.npy", mmap_mode="r")
y_train = np.load(LABELED_DIR / "y_train.npy")
X_val   = np.load(LABELED_DIR / "X_val.npy",   mmap_mode="r")
y_val   = np.load(LABELED_DIR / "y_val.npy")
X_test  = np.load(LABELED_DIR / "X_test.npy",  mmap_mode="r")
y_test  = np.load(LABELED_DIR / "y_test.npy")

# Unlabeled features (read-only memmap) - this file must already exist.
X_unl = np.load(UNLABELED_DIR / "X_unlabeled.npy", mmap_mode="r")

# Fail early on misaligned splits or mismatched feature shapes instead of
# hitting an obscure error in the middle of training.
assert len(X_train) == len(y_train), "X_train / y_train length mismatch"
assert len(X_val) == len(y_val), "X_val / y_val length mismatch"
assert len(X_test) == len(y_test), "X_test / y_test length mismatch"
assert X_unl.shape[1:] == X_train.shape[1:], "unlabeled features do not match labeled feature shape"

print("Labeled X_train:", X_train.shape, X_train.dtype)
print("Unlabeled X_unl:", X_unl.shape, X_unl.dtype)

# Load the best model checkpoint (adjust if it was saved elsewhere).
MODEL_PATH = LABELED_DIR / "cnn_mfcc_best.keras"
model = tf.keras.models.load_model(MODEL_PATH)
print("Loaded model:", MODEL_PATH)

Labeled X_train: (52551, 3, 32, 201) float16
Unlabeled X_unl: (687350, 3, 32, 201) float16
Loaded model: C:\Users\leona\Documents\Thesis_Project_UACH\Temp\Dataset\features_mfcc_labeled\cnn_mfcc_best.keras


# Cell 2 — Build a prediction dataset for unlabeled (memmap-safe)

The goal is to stream batches from `X_unl` without ever loading the whole array into memory.

In [None]:
BATCH_PRED = 256  # increase if you have GPU VRAM


def _load_unl_batch(start):
    """Read one contiguous slice of X_unl, starting at row `start`, as float32."""
    return np.asarray(X_unl[start:start + BATCH_PRED], dtype=np.float32)


def _tf_load_unl_batch(start):
    """Graph wrapper around _load_unl_batch that restores the static shape."""
    batch = tf.numpy_function(_load_unl_batch, [start], tf.float32)
    batch.set_shape((None,) + tuple(X_unl.shape[1:]))
    return batch


# Dataset.from_tensor_slices(X_unl) would convert the entire memmap into a
# single in-memory tensor. Iterating over batch start offsets instead means
# only one slice of BATCH_PRED rows is read from disk per batch.
unl_ds = tf.data.Dataset.range(0, len(X_unl), BATCH_PRED)
# deterministic=True keeps batches in file order, so row j of the predictions
# corresponds to X_unl[j].
unl_ds = unl_ds.map(
    _tf_load_unl_batch,
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=True,
)
unl_ds = unl_ds.prefetch(tf.data.AUTOTUNE)

: 

# Cell 3 — Generate pseudo-labels with confidence filtering

This creates three small arrays: `selected_idx` (row indices into `X_unl`), `selected_y` (pseudo-labels), and `selected_conf` (prediction confidences).

In [None]:
import numpy as np

THRESH = 0.95  # confidence threshold (try 0.90–0.98)

probs = model.predict(unl_ds, verbose=1)  # shape: (N_unl, n_classes)
# selected_idx is later used to index X_unl, so predictions must align
# one-to-one with its rows.
assert len(probs) == len(X_unl), "prediction count does not match X_unl rows"

pseudo_y = probs.argmax(axis=1).astype(np.int64)
conf = probs.max(axis=1).astype(np.float32)

# Keep only predictions whose top-class probability reaches the threshold.
selected_mask = conf >= THRESH
selected_idx = np.where(selected_mask)[0].astype(np.int64)
selected_y = pseudo_y[selected_mask]
selected_conf = conf[selected_mask]

print("Unlabeled total:", len(probs))
print("Selected pseudo-labeled:", len(selected_idx))
print("Selection rate: {:.2f}%".format(100.0 * len(selected_idx) / max(1, len(probs))))

# Class distribution among selected (useful to detect collapse into class 3)
u, c = np.unique(selected_y, return_counts=True)
print("Pseudo-label class counts:", dict(zip(u, c)))
if len(selected_conf) > 0:
    print("Confidence stats: min/mean/max =", float(selected_conf.min()), float(selected_conf.mean()), float(selected_conf.max()))
else:
    # min()/max() raise on empty arrays; report the situation instead of crashing.
    print("Confidence stats: n/a (no sample reached THRESH = {}); consider lowering THRESH.".format(THRESH))

Optional: prevent the pseudo-labels from being dominated by a single class (e.g., 95% class "3"). If your selected set is overwhelmingly class 3, cap the number of samples per class to keep diversity:

In [None]:
MAX_PER_CLASS = 20000  # cap per pseudo class (adjust)
NUM_CLASSES = probs.shape[1]  # taken from the model output instead of hard-coding 4

kept = []
for k in range(NUM_CLASSES):
    class_mask = selected_y == k
    idx_k = selected_idx[class_mask]
    if len(idx_k) > MAX_PER_CLASS:
        # Keep the highest-confidence samples for this class. A stable sort makes
        # the choice among tied confidences reproducible (earliest rows win).
        conf_k = selected_conf[class_mask]
        top = np.argsort(-conf_k, kind="stable")[:MAX_PER_CLASS]
        idx_k = idx_k[top]
    kept.append(idx_k)

# NOTE: the concatenated indices are grouped by class (class 0 first, then 1, ...).
selected_idx = np.concatenate(kept).astype(np.int64)
# Recompute labels/confidences so they stay aligned with the capped indices.
selected_y = pseudo_y[selected_idx]
selected_conf = conf[selected_idx]

print("After per-class cap -> Selected:", len(selected_idx))
u, c = np.unique(selected_y, return_counts=True)
print("Capped pseudo-label counts:", dict(zip(u, c)))

# Cell 4 — Build ```tf.data``` datasets WITHOUT concatenating big arrays

This is the key part. We will not do ```np.concatenate([X_train, X_pseudo])```.
Instead we build two datasets and mix them.

## 4A) Labeled dataset (memmap streaming)

In [None]:
# Mini-batch size shared by the labeled and pseudo-labeled training pipelines.
BATCH_TRAIN = 64

def make_labeled_ds(X, y, training=False):
    """Build a batched (features, labels) dataset that reads X lazily.

    `Dataset.from_tensor_slices((X, y))` would convert a memory-mapped X into
    one in-memory tensor. This version shuffles and batches row indices, then
    gathers only the rows of the current batch from X.

    Args:
        X: Feature array (may be a NumPy memmap), indexed along axis 0.
        y: Label array with the same length as X.
        training: When True, row indices are reshuffled on every epoch.

    Returns:
        A prefetched `tf.data.Dataset` yielding `(float32 features, int64 labels)`
        batches of up to BATCH_TRAIN rows.

    Raises:
        ValueError: If X and y have different lengths.
    """
    n_rows = len(X)
    if len(y) != n_rows:
        raise ValueError(f"X has {n_rows} rows but y has {len(y)}")

    labels = np.asarray(y)
    feature_shape = tuple(X.shape[1:])
    label_shape = tuple(labels.shape[1:])

    def _gather(batch_idx):
        # Fancy indexing copies just the requested rows out of the memmap.
        xb = np.asarray(X[batch_idx], dtype=np.float32)
        yb = labels[batch_idx].astype(np.int64)
        return xb, yb

    def _tf_gather(batch_idx):
        xb, yb = tf.numpy_function(_gather, [batch_idx], [tf.float32, tf.int64])
        # numpy_function drops static shapes; restore them for Keras.
        xb.set_shape((None,) + feature_shape)
        yb.set_shape((None,) + label_shape)
        return xb, yb

    ds = tf.data.Dataset.range(n_rows)
    if training:
        # Indices are tiny, so the shuffle buffer can hold all of them.
        ds = ds.shuffle(max(n_rows, 1), seed=SEED, reshuffle_each_iteration=True)
    ds = ds.batch(BATCH_TRAIN)
    ds = ds.map(_tf_gather, num_parallel_calls=tf.data.AUTOTUNE)
    return ds.prefetch(tf.data.AUTOTUNE)

# Training pipeline is shuffled; the validation pipeline keeps file order.
train_ds_labeled = make_labeled_ds(X_train, y_train, training=True)
val_ds = make_labeled_ds(X_val, y_val, training=False)

## 4B) Pseudo-labeled dataset via indices (true memmap-safe, no copies)

We store only ```selected_idx``` and fetch ```X_unl[idx]``` on demand.

In [None]:
# Module-level aliases read by _get_pseudo_sample when it runs inside py_function.
# selected_y_global[i] is the pseudo-label for the row selected_idx[i].
X_unl_global, selected_y_global = X_unl, selected_y

def _get_pseudo_sample(i):
    """Fetch one pseudo-labeled example.

    `i` is a position in `selected_idx` (0 .. len(selected_idx) - 1), not a row
    of the unlabeled array. Returns the float32 features of the corresponding
    unlabeled row together with its int64 pseudo-label.
    """
    position = int(i)
    source_row = int(selected_idx[position])  # row inside the unlabeled memmap
    features = np.array(X_unl_global[source_row], dtype=np.float32)
    label = np.int64(selected_y_global[position])
    return features, label

def tf_get_pseudo_sample(i):
    """tf.data wrapper around `_get_pseudo_sample`.

    Args:
        i: Scalar int64 tensor, a position in `selected_idx`.

    Returns:
        `(x, y)` where `x` is a float32 tensor with the per-sample shape of
        `X_unl_global` and `y` is a scalar int64 pseudo-label.
    """
    x, y = tf.py_function(_get_pseudo_sample, [i], [tf.float32, tf.int64])
    # py_function outputs have unknown static shape. Take the feature shape from
    # the data instead of hard-coding (3, 32, 201), so other feature layouts work.
    x.set_shape(tuple(X_unl_global.shape[1:]))
    y.set_shape(())
    return x, y

n_pseudo = len(selected_idx)
if n_pseudo == 0:
    # shuffle() needs a positive buffer, and training on an empty dataset is meaningless.
    raise ValueError("No pseudo-labeled samples were selected; lower THRESH before building pseudo_ds.")

pseudo_ds = tf.data.Dataset.from_tensor_slices(np.arange(n_pseudo, dtype=np.int64))
# Shuffle over ALL positions. After the per-class cap, selected_idx is grouped by
# class, so a partial buffer would produce nearly single-class batches early in
# each epoch. Only int64 indices are buffered, so this costs little memory.
pseudo_ds = pseudo_ds.shuffle(n_pseudo, seed=SEED, reshuffle_each_iteration=True)
pseudo_ds = pseudo_ds.map(tf_get_pseudo_sample, num_parallel_calls=tf.data.AUTOTUNE)
pseudo_ds = pseudo_ds.batch(BATCH_TRAIN).prefetch(tf.data.AUTOTUNE)

print("Pseudo dataset batches ready.")

## 4C) Mix labeled + pseudo-labeled (no huge arrays)

Use ```sample_from_datasets``` so each batch is drawn from one source or the other.

In [None]:
# Weighting: start conservative (more real labels than pseudo)
W_LABELED = 0.7
W_PSEUDO  = 0.3

# Both inputs are finite. With the default stop_on_empty_dataset=False, sampling
# keeps going until BOTH are exhausted, so every epoch would contain all labeled
# and all pseudo-labeled batches and the weights would only affect their order.
# Stopping at the first exhausted input keeps the labeled/pseudo batch ratio
# close to the requested weights for the whole epoch.
train_ds_mixed = tf.data.Dataset.sample_from_datasets(
    [train_ds_labeled, pseudo_ds],
    weights=[W_LABELED, W_PSEUDO],
    seed=SEED,
    stop_on_empty_dataset=True,
)

# Cell 5 — Fine-tune (continue training) with EarlyStopping

In [None]:
# Re-compile with the same loss/metric as before, but with a reduced learning
# rate suited to fine-tuning an already-trained model.
model.compile(
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
)

# Both callbacks watch validation accuracy (higher is better).
callbacks = [
    # Stop after 10 epochs without improvement and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor="val_accuracy",
        mode="max",
        patience=10,
        restore_best_weights=True,
    ),
    # Halve the learning rate after 3 epochs without improvement.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_accuracy",
        mode="max",
        factor=0.5,
        patience=3,
    ),
]

history = model.fit(
    train_ds_mixed,
    epochs=30,
    validation_data=val_ds,
    callbacks=callbacks,
)

# Cell 6 — Evaluate on test

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Unshuffled test pipeline, so prediction order matches y_test.
test_ds = make_labeled_ds(X_test, y_test, training=False)

# Predicted class = index of the highest probability in each row.
probs_test = model.predict(test_ds, verbose=0)
y_pred = probs_test.argmax(axis=1)

print(
    "Confusion matrix:\n",
    confusion_matrix(y_test, y_pred),
)
print(
    "\nClassification report:\n",
    classification_report(y_test, y_pred, digits=4),
)