# LSTM Model Training for Next-Page Prediction

This notebook trains an LSTM model to predict the next page a user will visit based on their clickstream history.

## Model Architecture and Training Configuration
- **Sequence Length**: 20
- **Embedding Size**: 64
- **LSTM Units**: 128
- **Batch Size**: 64
- **Train/Validation Split**: 80/20


## 1. Import Libraries


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import json
import os
from datetime import datetime

# Report the runtime environment: TensorFlow release and any visible GPUs
tf_version = tf.__version__
print(f"TensorFlow version: {tf_version}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU available: {gpu_devices}")


## 2. Load Preprocessed Data


In [None]:
# Configuration: hyperparameters and the location of the preprocessed data
DATA_DIR = './data'
SEQUENCE_LENGTH = 20
EMBEDDING_SIZE = 64
LSTM_UNITS = 128
BATCH_SIZE = 64
EPOCHS = 20
LEARNING_RATE = 0.001

# Load preprocessed data
print("Loading preprocessed data...")
X_train = np.load(f'{DATA_DIR}/X_train.npy')
X_val = np.load(f'{DATA_DIR}/X_val.npy')
y_train = np.load(f'{DATA_DIR}/y_train.npy')
y_val = np.load(f'{DATA_DIR}/y_val.npy')

# Load vocabulary (explicit encoding so the result does not depend on the OS locale)
with open(f'{DATA_DIR}/vocab.json', 'r', encoding='utf-8') as f:
    vocab = json.load(f)

vocab_size = len(vocab)
num_classes = vocab_size - 2  # Exclude <PAD> and <UNK>


def _check_split(split_name, sequences, labels):
    """Raise ValueError if a data split is inconsistent with the settings above.

    The embedding layer is sized with ``vocab_size`` and the output layer with
    ``num_classes``; ids outside those ranges would otherwise only surface as
    an opaque TensorFlow error (or be silently mishandled) during training.

    Args:
        split_name: Label used in error messages, e.g. ``'train'``.
        sequences: Array of input page-index sequences for the split.
        labels: Array of integer target ids for the split.

    Raises:
        ValueError: If sample counts differ, the sequences are not of shape
            ``(n, SEQUENCE_LENGTH)``, or any index falls outside its range.
    """
    if len(sequences) != len(labels):
        raise ValueError(
            f"{split_name}: {len(sequences)} sequences but {len(labels)} labels"
        )
    if sequences.ndim != 2 or sequences.shape[1] != SEQUENCE_LENGTH:
        raise ValueError(
            f"{split_name}: expected sequences of shape (n, {SEQUENCE_LENGTH}), "
            f"got {sequences.shape}"
        )
    # Empty arrays have no min/max, so only range-check when there is data
    if sequences.size and not (0 <= sequences.min() and sequences.max() < vocab_size):
        raise ValueError(
            f"{split_name}: sequence indices must lie in [0, {vocab_size}), "
            f"found range [{sequences.min()}, {sequences.max()}]"
        )
    if labels.size and not (0 <= labels.min() and labels.max() < num_classes):
        raise ValueError(
            f"{split_name}: labels must lie in [0, {num_classes}) to match the "
            f"output layer, found range [{labels.min()}, {labels.max()}]"
        )


# Fail fast, before any model is built, if the data and vocabulary disagree
_check_split('train', X_train, y_train)
_check_split('validation', X_val, y_val)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Vocabulary size: {vocab_size}")
print(f"Sequence length: {SEQUENCE_LENGTH}")


## 3. Define Model Architecture


In [None]:
def create_lstm_model(sequence_length, vocab_size, embedding_size, lstm_units, num_classes):
    """
    Build the (uncompiled) sequential network used for next-page prediction.

    The layers are stacked in this order:
    1. ``embedding`` - turns each page index into an ``embedding_size`` vector
    2. ``lstm``      - reads the sequence and emits only its final output
    3. ``dropout``   - drops 20% of the LSTM output for regularization
    4. ``output``    - softmax over ``num_classes``

    Args:
        sequence_length: Number of page indices per input sequence.
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of each embedding vector.
        lstm_units: Number of units in the LSTM layer.
        num_classes: Width of the softmax output layer.

    Returns:
        A ``keras.Sequential`` model; the caller is expected to compile it.
    """
    page_embedding = layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_size,
        input_length=sequence_length,
        name='embedding',
    )

    # return_sequences=False: keep only the output of the last time step
    sequence_encoder = layers.LSTM(lstm_units, return_sequences=False, name='lstm')

    regularizer = layers.Dropout(0.2, name='dropout')

    classifier = layers.Dense(num_classes, activation='softmax', name='output')

    return keras.Sequential([
        page_embedding,
        sequence_encoder,
        regularizer,
        classifier,
    ])

# Create model
model = create_lstm_model(
    sequence_length=SEQUENCE_LENGTH,
    vocab_size=vocab_size,
    embedding_size=EMBEDDING_SIZE,
    lstm_units=LSTM_UNITS,
    num_classes=num_classes
)

# Compile model
# The loss is the *sparse* cross-entropy, i.e. targets are integer class ids.
# The plain 'top_k_categorical_accuracy' metric expects one-hot targets and
# reports a wrong value on integer ids, so the sparse top-k metric is used.
# It keeps the previous metric name so history / CSV log keys are unchanged,
# and k=5 matches the default of the metric it replaces.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss='sparse_categorical_crossentropy',
    metrics=[
        'accuracy',
        keras.metrics.SparseTopKCategoricalAccuracy(
            k=5,
            name='top_k_categorical_accuracy'
        ),
    ]
)

# Display model summary
model.summary()


## 4. Define Callbacks


In [None]:
# Make sure the output folders used by the callbacks below exist
for output_dir in ('./models', './results'):
    os.makedirs(output_dir, exist_ok=True)

# Stop when val_loss has not improved for 5 epochs and restore the best weights
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# Write a checkpoint whenever val_loss improves; the file name records the
# epoch number and the val_loss reached
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='./models/lstm_model_epoch_{epoch:02d}_val_loss_{val_loss:.4f}.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

# Halve the learning rate after 3 epochs without val_loss improvement,
# never going below 1e-6
lr_on_plateau = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1,
)

# Log per-epoch metrics to CSV, overwriting any previous log (append=False)
csv_logger = keras.callbacks.CSVLogger(
    filename='./results/training_history.csv',
    append=False,
)

# Order matches the list passed to model.fit
callbacks = [early_stopping, checkpoint, lr_on_plateau, csv_logger]

print("Callbacks configured.")


## 5. Train Model


In [None]:
# Announce the run settings before the (long-running) fit call
run_banner = (
    f"Starting training at {datetime.now()}",
    f"Epochs: {EPOCHS}",
    f"Batch size: {BATCH_SIZE}",
    f"Learning rate: {LEARNING_RATE}",
)
for banner_line in run_banner:
    print(banner_line)

# Fit on the training split, validating on the held-out split every epoch;
# `history` is kept for the plotting cell further down
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1,
)

print(f"Training completed at {datetime.now()}")


## 6. Evaluate Model


In [None]:
# Score the trained model on the validation split.
# evaluate() returns the loss followed by the compiled metrics, in order.
print("Evaluating model on validation set...")
eval_results = model.evaluate(X_val, y_val, verbose=1)
val_loss, val_accuracy, val_top_k = eval_results

print("\nValidation Results:")
print(f"  Loss: {val_loss:.4f}")
print(f"  Accuracy: {val_accuracy:.4f}")
print(f"  Top-K Accuracy: {val_top_k:.4f}")


## 7. Save Final Model


In [None]:
# Destinations for the two copies of the trained model
model_path = './models/lstm_final_model.h5'
saved_model_path = './models/lstm_saved_model'

# Copy 1: single-file HDF5 model
model.save(model_path)
print(f"Model saved to {model_path}")

# Copy 2: SavedModel directory, intended as input for TensorFlow.js conversion
# NOTE(review): save_format='tf' may not be accepted by newer Keras releases -
# confirm against the installed TensorFlow/Keras version
model.save(saved_model_path, save_format='tf')
print(f"Model saved in SavedModel format to {saved_model_path}")


## 8. Plot Training History


In [None]:
import matplotlib.pyplot as plt

# Draw training vs. validation curves side by side: loss on the left,
# accuracy on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# (history key, display name) for each panel, left to right
panels = (
    ('loss', 'Loss'),
    ('accuracy', 'Accuracy'),
)

for ax, (metric_key, metric_name) in zip(axes, panels):
    ax.plot(history.history[metric_key], label=f'Training {metric_name}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_name}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_name)
    ax.set_title(f'Model {metric_name}')
    ax.legend()
    ax.grid(True)

plt.tight_layout()
# Save before show() so the figure is written while it is still current
plt.savefig('./results/training_history.png', dpi=150)
plt.show()

print("Training history plot saved to ./results/training_history.png")


## 9. Model Summary and Hyperparameters


In [None]:
# Collect the hyperparameters, dataset sizes and final validation scores of
# this run; values are cast to plain Python types so they serialize to JSON
config = dict(
    sequence_length=SEQUENCE_LENGTH,
    embedding_size=EMBEDDING_SIZE,
    lstm_units=LSTM_UNITS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    vocab_size=vocab_size,
    num_classes=num_classes,
    final_val_loss=float(val_loss),
    final_val_accuracy=float(val_accuracy),
    training_samples=int(len(X_train)),
    validation_samples=int(len(X_val)),
)

# Persist the record next to the other training artifacts
with open('./results/model_config.json', 'w') as f:
    f.write(json.dumps(config, indent=2))

# Echo the same record to the notebook output
print("Model configuration:")
for setting, setting_value in config.items():
    print(f"  {setting}: {setting_value}")

print("\nConfiguration saved to ./results/model_config.json")
