# Importing libraries

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional , Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import nltk
import re

2025-11-01 14:14:26.348177: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-01 14:14:26.407349: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-01 14:14:28.350733: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

print(tf.config.list_logical_devices('GPU'))


[LogicalDevice(name='/device:GPU:0', device_type='GPU')]


I0000 00:00:1761995671.137925    3512 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1753 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Ti Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


# load Data

In [3]:
input_file = 'holmes.txt'

# Read the contents of the file
with open(input_file, 'r', encoding='utf-8') as infile:
    data = infile.read()

## View characteristics of data 

In [4]:
data[:100] # view first 100 characters

"*Project Gutenberg's Etext of Tom Swift And His Submarine Boat*\n\n#4 in the Victor Appleton's Tom Swi"

In [5]:
len(data)

236110709

In [6]:
data = data[:5000000] # Limit data to 5000000 characters

In [7]:
data



# Clean Data

In [8]:

def preprocessing_text(text):
    
    start_marker = "Chapter One"
    try:
        start_index = text.index(start_marker)
        text = text[start_index:]
    except ValueError:
        print("Warning: 'Chapter One' not found. Cleaning from the start.")
        pass

    text = re.sub(r'\n\n+', ' \n ', text) # منستخدم سطر جديد واحد بس للفصل بين الفقرات
    text = re.sub(r'\n', ' ', text)
    
    text = text.lower()
    
    
    # . توحيد كل علامات نهاية الجملة لـ 
    text = re.sub(r'[!?]', '.', text)
    
    #  إزالة كلشي ما عدا الاحرف والارقام والنقط والمسافات
    text = re.sub(r'[^a-z0-9\.\s]', '', text)

    # 5.  إضافة مسافات حول النقطة
    text = re.sub(r'\.', ' . ', text)
    
    # 6. إزالة المسافات الزايدة
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [9]:
data=preprocessing_text(data)
print(data[:500])

chapter one news of a treasure wreck there was a rushing whizzing throbbing noise in the air . a great body like that of some immense bird sailed along casting a grotesque shadow on the ground below . an elderly man who was seated on the porch of a large house started to his feet in alarm . gracious goodness . what was that mrs . baggert . he called to a motherlylooking woman who stood in the doorway . what happened . nothing much mr . swift was the calm reply i think that was tom and mr . sharp


# Tokenization

In [10]:
sentences = [s.strip() for s in data.split(' . ') if s.strip()]
print(f"Total sentences found: {len(sentences)}")
print(f"First sentence example: {sentences[0][:100]}...")

Total sentences found: 57369
First sentence example: chapter one news of a treasure wreck there was a rushing whizzing throbbing noise in the air...


In [11]:
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(sentences)
total_words = len(tokenizer.word_index) + 1

# Generate input sequences
input_sequences = []
for line in sentences:
    token_list = tokenizer.texts_to_sequences([line])[0]
    if len(token_list) <= 1:
        continue
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# Pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')


In [12]:
X,labels = input_sequences[:,:-1],input_sequences[:,-1]


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_val_test, y_train, y_val_test = train_test_split(X, labels, test_size=0.1, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

X_train shape: (740542, 603)
y_train shape: (740542,)


# Define rhe model

In [14]:
model = Sequential()
model.add(Embedding(total_words, 100)) 
model.add(Bidirectional(LSTM(150)))
model.add(Dropout(0.2)) 
model.add(Dense(total_words, activation='softmax'))

adam = Adam(learning_rate=0.001)

model.compile(loss='sparse_categorical_crossentropy', 
              optimizer=adam, 
              metrics=['accuracy'])

## Training The model

In [15]:
BATCH_SIZE = 64 

# إنشاء (dataset) للتدريب
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# إنشاء (dataset) للـ Validation
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_dataset = val_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [16]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
checkpoint_callback = ModelCheckpoint(
    filepath='best_model.keras', 
    save_best_only=True,
    monitor='val_loss',
    mode='min',  
    verbose=1
)

early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=3,
    mode='min',
    verbose=1,
    restore_best_weights=True
)

In [18]:
history = model.fit(
    train_dataset,
    epochs=30,
    validation_data=val_dataset,
    callbacks=[checkpoint_callback, early_stopping_callback],
    verbose=1
)

Epoch 1/30


2025-11-01 14:15:12.192127: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91400


[1m11571/11571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - accuracy: 0.1080 - loss: 6.4956
Epoch 1: val_loss improved from None to 5.64627, saving model to best_model.keras
[1m11571/11571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1559s[0m 133ms/step - accuracy: 0.1310 - loss: 6.1103 - val_accuracy: 0.1654 - val_loss: 5.6463
Epoch 2/30
[1m11571/11571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - accuracy: 0.1698 - loss: 5.5021
Epoch 2: val_loss improved from 5.64627 to 5.40488, saving model to best_model.keras
[1m11571/11571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1260s[0m 109ms/step - accuracy: 0.1775 - loss: 5.4201 - val_accuracy: 0.1916 - val_loss: 5.4049
Epoch 3/30
[1m11571/11571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step - accuracy: 0.1950 - loss: 5.1830
Epoch 3: val_loss improved from 5.40488 to 5.32263, saving model to best_model.keras
[1m11571/11571[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [


KeyboardInterrupt

