In [1]:
###
# CELL 1: IMPORTS AND DATA LOADING SETUP
###

import tensorflow as tf
import numpy as np
import os
import re

print("TensorFlow Version:", tf.__version__)

# --- Define File Paths ---
# The paths are resolved from the kernel's current working directory: we are
# expected to be in 'notebooks', so one level up ('..') is 'ai_model', which
# holds the 'data' folder. Start Jupyter from 'notebooks' or these paths
# will point at the wrong place.
base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
data_dir = os.path.join(base_dir, 'data')
clean_file_path = os.path.join(data_dir, 'train_clean.txt')
noisy_file_path = os.path.join(data_dir, 'train_noisy.txt')

print(f"Clean data file: {clean_file_path}")
print(f"Noisy data file: {noisy_file_path}")

# --- Load a Small Sample to Verify ---
# Line k of the clean file is paired with line k of the noisy file.
num_samples_to_preview = 5
clean_lines_sample = []
noisy_lines_sample = []

try:
    with open(clean_file_path, 'r', encoding='utf-8') as f_clean, \
         open(noisy_file_path, 'r', encoding='utf-8') as f_noisy:

        print("\n--- Previewing first few lines ---")
        # Iterating the file objects (instead of readline().strip()) lets us
        # tell a genuinely blank line apart from end-of-file: zip() only stops
        # when one of the files is really exhausted.
        for raw_clean, raw_noisy in zip(f_clean, f_noisy):
            if len(clean_lines_sample) >= num_samples_to_preview:
                break

            clean_line = raw_clean.strip()
            noisy_line = raw_noisy.strip()
            if not clean_line or not noisy_line:
                # Blank on either side: skip the pair but keep the files aligned.
                continue

            clean_lines_sample.append(clean_line)
            noisy_lines_sample.append(noisy_line)
            sample_number = len(clean_lines_sample)
            print(f"Clean Sample {sample_number}: {clean_line}")
            print(f"Noisy Sample {sample_number}: {noisy_line}\n")

    print(f"\nSuccessfully loaded {len(clean_lines_sample)} sample lines.")

except FileNotFoundError:
    print(f"ERROR: Could not find training files in {data_dir}")
    print("Please make sure 'train_clean.txt' and 'train_noisy.txt' exist.")
except (OSError, UnicodeError) as e:
    # Only I/O and decoding problems are reported and tolerated here (this is
    # just a preview); anything else is a programming error and should surface.
    print(f"An error occurred: {e}")

TensorFlow Version: 2.20.0
Clean data file: C:\Users\jampa\Videos\Ghost Type Corrector\ai_model\data\train_clean.txt
Noisy data file: C:\Users\jampa\Videos\Ghost Type Corrector\ai_model\data\train_noisy.txt

--- Previewing first few lines ---
Clean Sample 1: waiting time tariff each period of one minute or part
Noisy Sample 1: waiting time tariuff each period of one minut or part

Clean Sample 2: christian catholic relabelled as christian roman catholic for consistency with census labels
Noisy Sample 2: christian catholic relabelled as christian rman catholic for cfnsistency with census labels

Clean Sample 3: per night per person
Noisy Sample 3: per night per person

Clean Sample 4: mile south of the town which is included in the heritage walk vale trail
Noisy Sample 4: mile south of the town which is includeyd in the heritage walk ale trial

Clean Sample 5: mile you will see the entrance to t cerrig woodland retreats on your left
Noisy Sample 5: mile you wull see the entrance to t jc

In [2]:
###
# CELL 2: TOKENIZATION AND DATA PREPARATION
###

import tensorflow as tf
import numpy as np
import os
import re
import json # To save our tokenizer configuration

print("--- Starting Data Preparation ---")

# --- Reload data (adjust NUM_LINES if needed for faster testing) ---
# NUM_LINES is the number of line pairs *read* from the files, not the number
# kept: blank and over-long pairs are read, counted, and then dropped.
# Set NUM_LINES to None to load ALL lines (will take longer)
NUM_LINES = 100000 # Let's start with 100k lines for faster processing initially
# NUM_LINES = None # Uncomment this to use the full dataset later

# A pair is kept only if BOTH sides are shorter than this many characters.
# Very long lines might be noise or headers, and they would also inflate the
# padded sequence length used for every sample.
MAX_LINE_CHARS = 100

print(f"Loading {NUM_LINES if NUM_LINES else 'all'} lines from files...")

clean_lines = []
noisy_lines = []
line_num = 0  # stays 0 if the files turn out to be empty
try:
    with open(clean_file_path, 'r', encoding='utf-8') as f_clean, \
         open(noisy_file_path, 'r', encoding='utf-8') as f_noisy:

        # zip() ends at the real end of the shorter file. Checking for an empty
        # string after strip() cannot be used for that, because a blank line in
        # the middle of the corpus would look like end-of-file and silently
        # truncate the training set.
        for line_num, (raw_clean, raw_noisy) in enumerate(zip(f_clean, f_noisy), start=1):
            clean_line = raw_clean.strip()
            noisy_line = raw_noisy.strip()

            is_blank = not clean_line or not noisy_line
            is_too_long = (len(clean_line) >= MAX_LINE_CHARS
                           or len(noisy_line) >= MAX_LINE_CHARS)
            if not is_blank and not is_too_long:
                clean_lines.append(clean_line)
                noisy_lines.append(noisy_line)

            if NUM_LINES is not None and line_num >= NUM_LINES:
                break # Stop after reaching NUM_LINES

    print(f"Loaded {len(clean_lines)} pairs of lines.")

except Exception as e:
    print(f"Error loading data: {e}")
    # Stop execution if data loading fails
    raise

if not clean_lines:
    # Fail here with a clear message instead of a cryptic error further down
    # (e.g. max() over an empty sequence when computing the padding length).
    raise ValueError(
        f"No usable training pairs were loaded from {clean_file_path} and {noisy_file_path}"
    )

# --- Character Tokenization ---
# We treat the problem as character-level seq2seq

# 1. Build Vocabulary
# We need START, END, and PADDING tokens in addition to our alphabet
START_TOKEN = '\t' # Indicates start of sequence (often used in seq2seq)
END_TOKEN = '\n'   # Indicates end of sequence
PAD_TOKEN = ''     # Placeholder entry that reserves index 0 for padding

# Find all unique characters in both clean and noisy text
all_text = " ".join(clean_lines + noisy_lines)

# The reserved tokens are removed from the alphabet before it is appended.
# Without this, a tab inside the corpus would put '\t' in the vocabulary twice:
# char_to_index['\t'] would point at the later slot (so START would no longer
# be 1) and vocab_size would count an index that is never produced.
# NOTE: a literal tab/newline inside a line therefore shares the START/END
# index; strip such characters from the data if that matters.
chars = sorted(set(all_text) - {START_TOKEN, END_TOKEN})
vocabulary = [PAD_TOKEN, START_TOKEN, END_TOKEN] + chars # Ensure PAD=0, START=1, END=2
char_to_index = {char: index for index, char in enumerate(vocabulary)}
index_to_char = {index: char for index, char in enumerate(vocabulary)}
vocab_size = len(vocabulary)

# Sanity checks: every vocabulary entry has its own index, and the reserved
# tokens sit at the positions the rest of the notebook assumes.
assert len(char_to_index) == vocab_size, "duplicate entries in vocabulary"
assert (char_to_index[PAD_TOKEN], char_to_index[START_TOKEN], char_to_index[END_TOKEN]) == (0, 1, 2)

print(f"\nVocabulary Size: {vocab_size}")
print(f"Sample vocabulary mapping: {list(char_to_index.items())[:10]}...") # Show first 10 mappings

# --- Vectorization and Padding ---
# We need to convert sentences to sequences of indices

def vectorize_text(text_list, mapping=None, start_token=None, end_token=None):
    """Encode each string as a list of character indices wrapped in START/END.

    Args:
        text_list: Iterable of strings to encode.
        mapping: Character -> index dictionary. Defaults to the notebook-level
            ``char_to_index``.
        start_token: Character whose index is prepended to every sequence.
            Defaults to the notebook-level ``START_TOKEN``.
        end_token: Character whose index is appended to every sequence.
            Defaults to the notebook-level ``END_TOKEN``.

    Returns:
        A list with one list of ints per input string, in input order. Each
        inner list has ``len(text) + 2`` entries (START + characters + END).

    Raises:
        KeyError: If a character (or one of the two tokens) is not in ``mapping``.
    """
    # The defaults are resolved at call time so the function picks up whatever
    # vocabulary is currently defined, while still being usable with an
    # explicit mapping that does not depend on notebook state.
    if mapping is None:
        mapping = char_to_index
    if start_token is None:
        start_token = START_TOKEN
    if end_token is None:
        end_token = END_TOKEN

    start_index = mapping[start_token]
    end_index = mapping[end_token]
    return [
        [start_index, *(mapping[char] for char in text), end_index]
        for text in text_list
    ]

print("\nVectorizing text...")
clean_vectors = vectorize_text(clean_lines)
noisy_vectors = vectorize_text(noisy_lines)

# Both sides are padded to one common width: the longest encoded sequence
# found on either side (the lengths already include START and END).
max_len_noisy = max(map(len, noisy_vectors))
max_len_clean = max(map(len, clean_vectors))
max_seq_length = max_len_clean if max_len_clean > max_len_noisy else max_len_noisy

print(f"Max sequence length (including START/END tokens): {max_seq_length}")

# Pad sequences
# padding='post' appends the padding after the real tokens.
print("Padding sequences...")
pad_options = {'maxlen': max_seq_length, 'padding': 'post'}
noisy_padded = tf.keras.preprocessing.sequence.pad_sequences(noisy_vectors, **pad_options)
clean_padded = tf.keras.preprocessing.sequence.pad_sequences(clean_vectors, **pad_options)

print("\n--- Data Preparation Complete ---")
# Both arrays have shape (number of kept pairs, max_seq_length). The row count
# is the number of pairs that survived filtering, which can be below NUM_LINES.
print(f"Shape of noisy_padded (Input X): {noisy_padded.shape}")
print(f"Shape of clean_padded (Target Y): {clean_padded.shape}")

# --- Save Tokenizer Config ---
# The vocabulary mappings, sequence length and special-token indices are written
# next to the training data so they can be reused later during inference
# (in the extension) and by the conversion script.
config_save_path = os.path.join(data_dir, 'tokenizer_config.json')

# Note: JSON object keys are always strings, so the integer keys of
# index_to_char come back as strings when this file is loaded.
tokenizer_config = {
    'char_to_index': char_to_index,
    'index_to_char': index_to_char,
    'max_seq_length': max_seq_length,
    'vocab_size': vocab_size,
    'start_token_index': char_to_index[START_TOKEN],
    'end_token_index': char_to_index[END_TOKEN],
    'pad_token_index': char_to_index[PAD_TOKEN] # Should be 0
}

try:
    with open(config_save_path, 'w', encoding='utf-8') as config_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(tokenizer_config, fp=config_file, ensure_ascii=False, indent=4)
    print(f"\nSaved tokenizer configuration to: {config_save_path}")
except Exception as e:
    # Best effort: a failed save is reported but does not stop the notebook.
    print(f"\nError saving tokenizer config: {e}")

--- Starting Data Preparation ---
Loading 100000 lines from files...
Loaded 42985 pairs of lines.

Vocabulary Size: 30
Sample vocabulary mapping: [('', 0), ('\t', 1), ('\n', 2), (' ', 3), ('a', 4), ('b', 5), ('c', 6), ('d', 7), ('e', 8), ('f', 9)]...

Vectorizing text...
Max sequence length (including START/END tokens): 101
Padding sequences...

--- Data Preparation Complete ---
Shape of noisy_padded (Input X): (42985, 101)
Shape of clean_padded (Target Y): (42985, 101)

Saved tokenizer configuration to: C:\Users\jampa\Videos\Ghost Type Corrector\ai_model\data\tokenizer_config.json


In [3]:
###
# CELL 3: DEFINE SEQ2SEQ MODEL ARCHITECTURE
###

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, TimeDistributed

print("--- Defining Model Architecture ---")

# This cell builds a character-level encoder-decoder (seq2seq) model:
#   encoder: Embedding -> LSTM, of which only the final states (h, c) are kept
#   decoder: Embedding -> LSTM initialised with those states -> per-timestep softmax
# It relies on `max_seq_length` and `vocab_size` defined in CELL 2.

# --- Hyperparameters ---
# These are settings we can tune later to improve the model
embedding_dim = 128  # Size of the vector for each character
latent_dim = 256     # Number of units in each LSTM layer (encoder and decoder use the same value)

# --- Encoder ---
# Takes the noisy sequence as input
encoder_inputs = Input(shape=(max_seq_length,), name='encoder_input') # one padded index sequence per sample

# Embedding layer: Turns character indices into dense vectors
# mask_zero=True tells LSTMs to ignore padding (0s)
encoder_embedding = Embedding(input_dim=vocab_size, # one row per vocabulary entry
                              output_dim=embedding_dim,
                              mask_zero=True,
                              name='encoder_embedding')(encoder_inputs)

# LSTM layer: Processes the sequence and outputs its final state
# return_state=True gives us the hidden state (h) and cell state (c)
encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
# Calling the layer returns three values: the output plus the two final states.
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# We discard encoder_outputs, keeping only the states (the "thought vector")
encoder_states = [state_h, state_c]

# --- Decoder ---
# Takes the *clean* sequence as input during training (teacher forcing)
# Note: the decoder input has the same width as the encoder input. In CELL 5 it
# is fed clean_padded, while the target (built in CELL 4) is that same sequence
# shifted one step to the left.
decoder_inputs = Input(shape=(max_seq_length,), name='decoder_input') # one padded index sequence per sample

# Embedding layer for the decoder (a separate layer here, not shared with the encoder)
decoder_embedding_layer = Embedding(input_dim=vocab_size, # one row per vocabulary entry
                                    output_dim=embedding_dim,
                                    mask_zero=True,
                                    name='decoder_embedding')
decoder_embedding = decoder_embedding_layer(decoder_inputs)

# Decoder LSTM:
# return_sequences=True makes it output at *each* timestep
# We initialize its state with the encoder's final state (encoder_states)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# We discard the decoder's final states during training

# --- Output Layer ---
# TimeDistributed applies a Dense layer to *each* timestep of the decoder output
# It predicts the probability of each character in our vocabulary using softmax
decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'), name='output_dense') # one output unit per vocabulary entry
# From here on decoder_outputs refers to the softmax output, not the raw LSTM output.
decoder_outputs = decoder_dense(decoder_outputs)

# --- Define the Model ---
# Connects the encoder input, decoder input, and the final decoder output
model = Model([encoder_inputs, decoder_inputs], decoder_outputs, name='seq2seq_autocorrect')

print("\n--- Model Architecture Defined ---")
model.summary() # Print a summary of the layers

--- Defining Model Architecture ---

--- Model Architecture Defined ---


In [4]:
###
# CELL 4: PREPARE TARGETS AND COMPILE MODEL
###

import tensorflow as tf
import numpy as np

print("--- Preparing Decoder Targets and Compiling Model ---")

# --- Prepare Decoder Target Data ---
# The decoder target is the clean sequence moved one step to the left, with a
# single PAD (0) appended so it keeps the same width as the decoder input.
# Example: clean_padded        = [START, c, a, t, END, PAD]
#          decoder_target_data = [c, a, t, END, PAD, PAD]
num_samples = clean_padded.shape[0]

# One extra column of zeros, shape (num_samples, 1), to restore the full width.
padding_column = np.zeros((num_samples, 1), dtype=np.int32)

# Drop the first column (START) and put the zero column at the end.
decoder_target_data = np.hstack((clean_padded[:, 1:], padding_column))

# The targets stay as integer indices: sparse_categorical_crossentropy consumes
# them directly, so no one-hot encoding is needed (that would only be required
# for categorical_crossentropy, e.g. via tf.keras.utils.to_categorical with
# num_classes=vocab_size).

print(f"\nShape of noisy_padded (Encoder Input): {noisy_padded.shape}")
print(f"Shape of clean_padded (Decoder Input): {clean_padded.shape}")
print(f"Shape of decoder_target_data (Decoder Target): {decoder_target_data.shape}")

# --- Compile the Model ---
# Loss works on integer targets; adam is a standard, effective optimizer.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

print("\n--- Model Compiled Successfully ---")

--- Preparing Decoder Targets and Compiling Model ---

Shape of noisy_padded (Encoder Input): (42985, 101)
Shape of clean_padded (Decoder Input): (42985, 101)
Shape of decoder_target_data (Decoder Target): (42985, 101)

--- Model Compiled Successfully ---


In [5]:
###
# CELL 5: TRAIN THE MODEL
###

import tensorflow as tf

print("--- Starting Model Training ---")

# --- Training Parameters ---
batch_size = 64
epochs = 5  # Start with a small number for testing

# --- Prepare Inputs and Targets for model.fit ---
# The model takes two inputs, in this order:
#   1. encoder input: the noisy sequences
#   2. decoder input: the clean sequences (teacher forcing)
# The target, decoder_target_data (clean sequences shifted left by one step),
# was already built in CELL 4.
encoder_input_data = noisy_padded
decoder_input_data = clean_padded

# --- Train the Model ---
# Fitting adjusts the model's weights to minimize the loss chosen at compile
# time (sparse_categorical_crossentropy) on the given inputs and targets.
history = model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,  # Use 20% of data for validation
)

print("\n--- Model Training Complete ---")

# --- Optional: Plot training history (requires matplotlib) ---
# You can uncomment this section later if you install matplotlib (`pip install matplotlib`)
# import matplotlib.pyplot as plt
#
# print("\n--- Plotting Training History ---")
# plt.figure(figsize=(12, 4))
#
# plt.subplot(1, 2, 1)
# plt.plot(history.history['loss'], label='Training Loss')
# plt.plot(history.history['val_loss'], label='Validation Loss')
# plt.title('Loss Over Epochs')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
#
# plt.subplot(1, 2, 2)
# plt.plot(history.history['accuracy'], label='Training Accuracy')
# plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
# plt.title('Accuracy Over Epochs')
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')
# plt.legend()
#
# plt.tight_layout()
# plt.show()

--- Starting Model Training ---
Epoch 1/5
[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m282s[0m 507ms/step - accuracy: 0.2679 - loss: 2.0398 - val_accuracy: 0.3113 - val_loss: 1.8541
Epoch 2/5
[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 517ms/step - accuracy: 0.3467 - loss: 1.6159 - val_accuracy: 0.3560 - val_loss: 1.6743
Epoch 3/5
[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 534ms/step - accuracy: 0.3769 - loss: 1.4762 - val_accuracy: 0.3757 - val_loss: 1.5942
Epoch 4/5
[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m316s[0m 524ms/step - accuracy: 0.3919 - loss: 1.4030 - val_accuracy: 0.3818 - val_loss: 1.5515
Epoch 5/5
[1m538/538[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m275s[0m 512ms/step - accuracy: 0.4039 - loss: 1.3446 - val_accuracy: 0.3935 - val_loss: 1.5099

--- Model Training Complete ---


In [6]:
###
# CELL 6: SAVE THE TRAINED MODEL
###

import os

print("--- Saving the Trained Model ---")

# The model file goes into the 'ai_model' folder (base_dir from CELL 1),
# not into 'notebooks'.
model_save_path = os.path.join(base_dir, 'autocorrect_model.h5')

try:
    # Save the entire model (architecture + weights + optimizer state)
    model.save(model_save_path)
    print(f"\nModel successfully saved to: {model_save_path}")

    # Report the size of the file on disk, converted from bytes to MB.
    size_in_mb = os.path.getsize(model_save_path) / (2 ** 20)
    print("File size:", size_in_mb, "MB")
except Exception as e:
    # Best effort: report the failure instead of stopping the notebook.
    print(f"\nError saving model: {e}")



--- Saving the Trained Model ---

Model successfully saved to: C:\Users\jampa\Videos\Ghost Type Corrector\ai_model\autocorrect_model.h5
File size: 9.245223999023438 MB
