# Prepare Data for Next Word Prediction

This notebook prepares the data for training. It reads a text file, tokenizes it, creates sequences for training, and saves the processed data.

**Note:** To prevent memory errors with large vocabularies, we save the target labels `y` as integers (sparse) instead of one-hot encoded vectors. When training, use `sparse_categorical_crossentropy` loss.

In [1]:
import numpy as np
import pickle
import os
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Config
SEQ_LENGTH = 50  # Number of context tokens the model sees before the word it must predict
# Project root. Defaults to the absolute path used in the original file; set the
# NEXT_WORD_PROJECT_DIR environment variable to run on another machine without
# editing the code. An empty value is treated the same as unset.
BASE_DIR = (
    os.environ.get("NEXT_WORD_PROJECT_DIR")
    or r"C:\Users\LAPTOPS HUB\Desktop\projects_RNNs\rnn_lstm_next_word_project"
)
# The raw dataset and the processed artifacts live in the same "data" folder.
OUTPUT_DIR = os.path.join(BASE_DIR, "data")
DATA_PATH = os.path.join(OUTPUT_DIR, "dataset.txt")

def prepare_data(data_path=None, output_dir=None, seq_length=None):
    """Turn a raw text file into next-word-prediction training arrays and save them.

    Pipeline: read the text, lowercase it, strip everything except ASCII letters
    and whitespace, fit a Keras ``Tokenizer``, slice the token stream into
    fixed-size windows, then write ``X.npy``, ``y.npy`` and ``tokenizer.pkl``.

    Args:
        data_path: Path of the UTF-8 text file to read. Defaults to ``DATA_PATH``.
        output_dir: Directory that receives the three output files (created if
            missing). Defaults to ``OUTPUT_DIR``.
        seq_length: Number of context tokens per sample. Defaults to ``SEQ_LENGTH``.

    Returns:
        None. Results are written to disk. The function also returns early
        (after printing a message) when the input file does not exist or the
        text contains fewer than two tokens.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.

    Saved arrays:
        ``X`` has shape ``(n_samples, seq_length)`` and ``y`` has shape
        ``(n_samples,)``; both are int32 word ids. ``y`` is kept as integer
        labels (not one-hot), so train with ``sparse_categorical_crossentropy``.
        When the text has at most ``seq_length`` tokens, the growing prefixes
        are left-padded with 0 so that ``X`` keeps the same width.
    """
    # Resolve defaults at call time so the module-level config remains the
    # single source of truth for a plain ``prepare_data()`` call.
    data_path = DATA_PATH if data_path is None else data_path
    output_dir = OUTPUT_DIR if output_dir is None else output_dir
    seq_length = SEQ_LENGTH if seq_length is None else seq_length
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    print("Loading data...")
    try:
        with open(data_path, 'r', encoding='utf-8') as f:
            text = f.read()
    except FileNotFoundError:
        print(f"File not found: {data_path}")
        return

    # Preprocessing
    print("Preprocessing...")
    text = text.lower()
    # Remove special characters, keep only letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse every whitespace run (including newlines) into a single space
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenization
    print("Tokenizing...")
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    # +1 because word ids start at 1; id 0 is left free for padding
    total_words = len(tokenizer.word_index) + 1
    print(f"Total words: {total_words}")

    # Create sequences
    print("Creating sequences (Sliding Window)...")
    token_list = tokenizer.texts_to_sequences([text])[0]

    window = seq_length + 1  # seq_length context tokens + 1 target token
    is_short_text = len(token_list) <= seq_length
    if is_short_text:
        # Fallback for short text: growing prefixes of length 2, 3, ...
        sequences = [token_list[:end] for end in range(2, len(token_list) + 1)]
    else:
        # One fixed-size window per target position
        sequences = [
            token_list[end - window:end]
            for end in range(window, len(token_list) + 1)
        ]

    if not sequences:
        print("Not enough data to create sequences.")
        return

    if is_short_text:
        # The prefixes have different lengths, so they cannot be stacked into a
        # 2-D array directly. Left-pad them with 0 up to the common window size
        # so the target stays in the last column.
        sequences = pad_sequences(sequences, maxlen=window, padding='pre')
    # int32 matches the pad_sequences default and keeps the saved files
    # independent of the platform's default integer width.
    sequences = np.asarray(sequences, dtype=np.int32)

    # Split X and y
    # X is the context (first seq_length columns), y is the target (last column)
    X, y = sequences[:, :-1], sequences[:, -1]

    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")

    # We skip one-hot encoding for y to save memory. Use sparse_categorical_crossentropy loss during training.
    print("Keeping y as integer labels (sparse) to save memory...")

    # Save
    os.makedirs(output_dir, exist_ok=True)

    print(f"Saving processed data to {output_dir}...")
    np.save(os.path.join(output_dir, 'X.npy'), X)
    np.save(os.path.join(output_dir, 'y.npy'), y)

    # Save tokenizer using pickle (standard for Keras Tokenizer).
    # NOTE: only load this pickle back from a trusted location; unpickling
    # untrusted files can execute arbitrary code.
    with open(os.path.join(output_dir, 'tokenizer.pkl'), 'wb') as f:
        pickle.dump(tokenizer, f)

    print("Data preparation complete.")

# Entry point: run the whole preparation pipeline when this cell or script is
# executed directly. The guard keeps a plain import of this code from
# triggering the file reads and writes.
if __name__ == "__main__":
    prepare_data()


Loading data...
Preprocessing...
Tokenizing...
Total words: 12848
Creating sequences (Sliding Window)...
X shape: (202569, 50)
y shape: (202569,)
Keeping y as integer labels (sparse) to save memory...
Saving processed data to C:\Users\LAPTOPS HUB\Desktop\projects_RNNs\rnn_lstm_next_word_project\data...
Data preparation complete.
