<a href="https://colab.research.google.com/github/JWiseman-git/ml_sandbox/blob/main/token_classifier_revisited.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import requests
import numpy as np
import tqdm
import requests
from io import BytesIO
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
# from tensorflow.keras.layers import Embedding # Removed as it's not suitable for this data type

# Download the serialized dataset. A browser-like User-Agent header is sent
# with the request.
url = 'https://nordy.cloud/dataset_lstm.npy'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
result = requests.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx instead of passing an error page to np.load, which
# would otherwise surface as an obscure "cannot load file" error.
result.raise_for_status()

# The payload is indexed by array name below, i.e. it is read as an archive of
# named arrays rather than a single array.
npzfile = np.load(BytesIO(result.content))
X_train = npzfile['X_train']
print(X_train)
X_test = npzfile['X_test']
y_train = npzfile['y_train']
y_test = npzfile['y_test']

print(f"Original X_train shape: {X_train.shape}")

# Number of leading feature columns to keep. It is taken from the data itself,
# so it can never exceed the available column count (no separate guard needed).
limited_features_count = X_train.shape[1]

# Keep only the first `limited_features_count` columns of each split.
X_train_limited = X_train[:, :limited_features_count]
X_test_limited = X_test[:, :limited_features_count]

number_of_features = limited_features_count
# Every row is presented to the LSTM as ONE timestep holding all features.
# This assignment used to be commented out, so the reshapes below raised
# NameError on a fresh kernel (or silently reused a stale value left behind by
# another cell).
sequence_length = 1

# Reshape to the (samples, timesteps, features) layout expected by an LSTM.
X_train_lstm = np.reshape(
    X_train_limited,
    (X_train_limited.shape[0], sequence_length, number_of_features),
)
X_test_lstm = np.reshape(
    X_test_limited,
    (X_test_limited.shape[0], sequence_length, number_of_features),
)

print(X_test_lstm)

# NOTE(review): reshape(-1, 1) flattens the labels into a single column. The
# notebook's recorded output reports y_train with shape (315, 39); if that is
# still the case, y_train_lstm ends up with 315 * 39 rows and no longer lines
# up with the 315 rows of X_train_lstm — confirm before fitting on these.
y_train_lstm = y_train.reshape(-1, 1)
y_test_lstm = y_test.reshape(-1, 1)

print(f"Reshaped X_train_lstm shape: {X_train_lstm.shape}")

# model = Sequential([
#     Input(shape=(sequence_length, number_of_features)), # Corrected Input shape for LSTM
#     LSTM(64),
#     Dense(1, activation='sigmoid')
# ])

# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Now you can fit the model with the limited features
# model.fit(X_train_lstm, y_train_lstm, epochs=5, batch_size=50, validation_split=0.2)

[[11227   128  6280 ...   185  1671   323]
 [    0     0     0 ...  4528  1699     9]
 [ 3544  1132   508 ...  6749   368  8604]
 ...
 [  920   810   746 ...   660  3692   617]
 [    0     0     0 ...   823  1308 28473]
 [  328 13572  2082 ...  6873  6127    69]]
Original X_train shape: (315, 500)
[[[ 3830   747   721 ...   236  8390 23023]]

 [[  213  1809    67 ...   505   224   330]]

 [[   19  9924    24 ...  4145  1404   307]]

 ...

 [[14942   169  7274 ...  2246  1058   453]]

 [[ 1096  1065    11 ...   135  1392   176]]

 [[  799   542  1849 ...   296  2157   170]]]
Reshaped X_train_lstm shape: (315, 1, 500)


In [None]:
"""LSTM text classifier with proper sequence handling and Embedding layer."""

from io import BytesIO

import numpy as np
import requests
from tensorflow.keras.layers import Bidirectional, Dense, Dropout, Embedding, LSTM
from tensorflow.keras.metrics import AUC, BinaryAccuracy, Precision, Recall
from tensorflow.keras.models import Sequential

# --- Configuration ---
# Model capacity.
EMBEDDING_DIM = 128
LSTM_UNITS = 128
DROPOUT = 0.3
VOCAB_SIZE = 30_000  # Has to be at least (largest token ID in the dataset) + 1

# Training schedule.
BATCH_SIZE = 32
EPOCHS = 30  # More headroom: loss had not flattened out by epoch 5
VALIDATION_SPLIT = 0.2

# --- Load dataset ---
# The file is fetched over HTTP with a browser-like User-Agent header and
# read from memory as an archive of named arrays.
url = "https://nordy.cloud/dataset_lstm.npy"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/50.0.2661.102 Safari/537.36"
    ),
}
result = requests.get(url, headers=headers, timeout=30)
# Stop here on an HTTP error (403/404/5xx): otherwise the error page would be
# passed to np.load and fail with a misleading "cannot load file" message.
result.raise_for_status()
npzfile = np.load(BytesIO(result.content))

X_train = npzfile["X_train"]
X_test = npzfile["X_test"]
y_train = npzfile["y_train"]
y_test = npzfile["y_test"]

# --- Inspect shapes ---
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape:  {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape:  {y_test.shape}")
print(f"Max token ID:  {X_train.max()}")

# --- Validate sample counts match ---
assert X_train.shape[0] == y_train.shape[0], "X_train and y_train sample count mismatch"
assert X_test.shape[0] == y_test.shape[0], "X_test and y_test sample count mismatch"

# --- Determine dimensions ---
# Each row IS a sequence of tokens — sequence_length = number of columns
sequence_length = X_train.shape[1]
# The test split is fed through the same model, so it must use the same length.
assert X_test.shape[1] == sequence_length, "X_train and X_test sequence length mismatch"

# The embedding table must cover every token ID the model will be given,
# including IDs that occur only in the test split; sizing it from X_train
# alone would make such IDs index past the end of the table at evaluation.
max_token_id = int(max(X_train.max(), X_test.max()))
vocab_size = max(VOCAB_SIZE, max_token_id + 1)

# --- Determine number of labels ---
# One output unit per label column of y_train.
num_labels = y_train.shape[1]
print(f"Number of labels: {num_labels}")

# --- Build model ---
# Multi-label classification: every output unit is an independent binary
# decision, so the head uses sigmoid (not softmax) with binary_crossentropy.
#
# Architecture choices:
# - Embedding: maps integer token IDs to dense vectors of size EMBEDDING_DIM
# - Bidirectional LSTM: reads the sequence forwards AND backwards
# - Dropout: regularization between the recurrent block and the dense layers
# - Dense(64, relu) -> Dense(num_labels, sigmoid): classification head
#
# NOTE(review): the X_train rows printed earlier in this notebook begin with
# runs of 0, which looks like left padding. If 0 really is the pad ID,
# consider Embedding(..., mask_zero=True) so padding is ignored — confirm first.
model = Sequential(
    [
        Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM),
        Bidirectional(LSTM(LSTM_UNITS)),
        Dropout(DROPOUT),
        Dense(64, activation="relu"),
        Dropout(DROPOUT),
        Dense(num_labels, activation="sigmoid"),
    ]
)

# Metrics that are meaningful for multi-label output:
# - BinaryAccuracy: average per-label accuracy (is each label right?)
# - Precision/Recall: how many predicted labels are correct / how many true labels are found
# - AUC: overall ranking quality across thresholds
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=[
        BinaryAccuracy(name="binary_acc"),
        Precision(name="precision"),
        Recall(name="recall"),
        AUC(name="auc", multi_label=True),
    ],
)

# Without a known input shape the model is still unbuilt at this point and
# summary() cannot report output shapes or parameter counts. Build it
# explicitly with (batch, sequence_length) so the summary is informative.
model.build(input_shape=(None, sequence_length))
model.summary()

# --- Train ---
# The last VALIDATION_SPLIT fraction of the training data is held out by Keras
# for the per-epoch val_* metrics.
model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
)

# --- Evaluate ---
# return_dict=True pairs every value with its own metric name. Zipping
# model.metrics_names against the list form is unreliable: on Keras 3 it is
# ['loss', 'compile_metrics'], which truncates the report and mislabels values.
eval_metrics = model.evaluate(X_test, y_test, return_dict=True)
# Keep the original names available (aligned lists) for any later cell.
metric_names = list(eval_metrics)
results = list(eval_metrics.values())
for name, value in eval_metrics.items():
    print(f"  {name}: {value:.4f}")


# ============================================================================
# WHY MULTI-LABEL (NOT MULTI-CLASS)?
# ============================================================================
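# y_train has shape (315, 39) — each sample carries a 39-element label vector.
# The shape alone does not tell multi-label apart from one-hot multi-class:
# it is multi-label only if a row may contain more than one positive entry
# (check with y_train.sum(axis=1)). Treated as multi-label, the 39 entries are
# independent binary labels, so a document can belong to MULTIPLE categories at once.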
#
# - sigmoid: each output is an independent probability (0 to 1)
# - binary_crossentropy: treats each of the 39 outputs as a separate binary task
#
# If instead each sample had exactly ONE class (mutually exclusive), you'd use:
#   Dense(num_classes, activation="softmax")
#   loss="categorical_crossentropy"
# ============================================================================

X_train shape: (315, 500)
X_test shape:  (35, 500)
y_train shape: (315, 39)
y_test shape:  (35, 39)
Max token ID:  28487
Number of labels: 39


Epoch 1/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1s/step - auc: 0.4855 - binary_acc: 0.6783 - loss: 0.6875 - precision: 0.0296 - recall: 0.3641 - val_auc: 0.4106 - val_binary_acc: 0.9007 - val_loss: 0.6439 - val_precision: 0.0160 - val_recall: 0.0476
Epoch 2/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1s/step - auc: 0.4321 - binary_acc: 0.8678 - loss: 0.5322 - precision: 0.0336 - recall: 0.1505 - val_auc: 0.4233 - val_binary_acc: 0.9630 - val_loss: 0.1868 - val_precision: 0.0333 - val_recall: 0.0159
Epoch 3/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1s/step - auc: 0.4400 - binary_acc: 0.9259 - loss: 0.2255 - precision: 0.0275 - recall: 0.0577 - val_auc: 0.4153 - val_binary_acc: 0.9744 - val_loss: 0.1326 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1s/step - auc: 0.4971 - binary_acc: 0.9650 - loss: 0.1658 - precision: 0.0592 - recall: 0.