# Arabic Auto-Complete System

## Imports

In [1]:
import os  
import re   
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import re

## Text Cleaning Function

In [2]:
import re

def clean_arabic_text(text):
    # 1. Remove tashkeel including tatweel 
    text = re.sub(r'[ً-ْـ]', '', text)
    
    # 2. Remove punctuation and special characters (keeping Arabic letters, numbers, and basic punctuation)
    text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s٠-٩۰-۹.,،؛؟!]', ' ', text)

    # 3. Normalize Arabic characters
    text = re.sub(r'[آأإ]', 'ا', text)  
    text = re.sub(r'ى', 'ي', text)     
    text = re.sub(r'ة', 'ه', text)     

    # 4. Remove English letters and Western digits
    text = re.sub(r'[a-zA-Z]', '', text)
    text = re.sub(r'[0-9]', '', text)

    # 5. Normalize Arabic punctuation to standard ones
    text = text.replace('؟', '?')  # Arabic question mark
    text = text.replace('،', ',')  # Arabic comma
    text = text.replace('؛', ';')  # Arabic semicolon

    # 6. Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    # 7. Remove repeated characters 
    text = re.sub(r'(.)\1+', r'\1', text)

    return text


## Data Loading and Cleaning

In [3]:
file_names = ['file1.txt', 'file2.txt', 'file3.txt', 'file4.txt']  # data files
all_clean_lines = []  

for file in file_names:
    with open(file, 'r', encoding='utf-8') as f:
        lines = f.readlines()  
        for line in lines:
            if 'NULL' in line:
                continue  # skip lines with the word NULL
            clean_line = clean_arabic_text(line)
            if clean_line:  # don't save empty lines
                all_clean_lines.append(clean_line)


##  Save Cleaned Data


In [4]:
with open('cleaned_data.txt', 'w', encoding='utf-8') as f:
    for line in all_clean_lines:
        f.write(line + '\n')

## Read the data

In [5]:
with open('cleaned_data.txt', 'r', encoding='utf-8') as file:
    texts = file.readlines()

## Info about the data

In [6]:
print(f"Total texts: {len(texts)}")
print("Sample texts:")
for i in range(3):
    print(f"{i+1}. {texts[i].strip()}")

Total texts: 8367
Sample texts:
1. اهنئ الدكتور احمد جمال الدين, القيادي بحزب مصر, بمناسبه صدور اولي روايه
2. امير عيد هو الي فعلا يتقال عليه ستريكر صريح
3. الصداقه تزرع الحياه ازهارا


## Tokenization and Sequence Preparation

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
total_words = len(tokenizer.word_index) + 1
print(f"\nTotal unique words: {total_words}")
input_sequences = []
for text in texts:
    token_list = tokenizer.texts_to_sequences([text])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


Total unique words: 12415


## Padding Sequences

In [8]:
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
print("\nSample padded sequences:")
print(input_sequences[:3])



Sample padded sequences:
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0 4112  471]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0 4112  471  692]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0 4112  471  692  306]]


## Split Features (X) and Labels (y)

In [9]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)


##  Build and Train the Model

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Build the model
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=64, input_length=max_sequence_len - 1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()

# Train the model
history = model.fit(X, y, epochs=30, verbose=1)



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 41, 64)            794560    
_________________________________________________________________
lstm (LSTM)                  (None, 150)               129000    
_________________________________________________________________
dense (Dense)                (None, 12415)             1874665   
Total params: 2,798,225
Trainable params: 2,798,225
Non-trainable params: 0
_________________________________________________________________


2025-04-25 14:04:58.869297: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
