# Arabic Auto-Complete System

## Imports

In [1]:
import os  
import re   
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import re
import numpy as np

## Text Cleaning Function

In [2]:
import re

def clean_arabic_text(text):
    # 1. Remove tashkeel including tatweel 
    text = re.sub(r'[ً-ْـ]', '', text)
    
    # 2. Remove punctuation and special characters (keeping Arabic letters, numbers, and basic punctuation)
    text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s٠-٩۰-۹.,،؛؟!]', ' ', text)

    # 3. Normalize Arabic characters
    text = re.sub(r'[آأإ]', 'ا', text)  
    text = re.sub(r'ى', 'ي', text)     
    text = re.sub(r'ة', 'ه', text)     

    # 4. Remove English letters and Western digits
    text = re.sub(r'[a-zA-Z]', '', text)
    text = re.sub(r'[0-9]', '', text)

    # 5. Normalize Arabic punctuation to standard ones
    text = text.replace('؟', '?')  # Arabic question mark
    text = text.replace('،', ',')  # Arabic comma
    text = text.replace('؛', ';')  # Arabic semicolon

    # 6. Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    # 7. Remove repeated characters 
    text = re.sub(r'(.)\1+', r'\1', text)

    return text


## Data Loading and Cleaning

In [3]:
file_names = ['file1.txt', 'file2.txt', 'file3.txt', 'file4.txt']  # data files
all_clean_lines = []  

for file in file_names:
    with open(file, 'r', encoding='utf-8') as f:
        lines = f.readlines()  
        for line in lines:
            if 'NULL' in line:
                continue  # skip lines with the word NULL
            clean_line = clean_arabic_text(line)
            if clean_line:  # don't save empty lines
                all_clean_lines.append(clean_line)


##  Save Cleaned Data


In [4]:
with open('cleaned_data.txt', 'w', encoding='utf-8') as f:
    for line in all_clean_lines:
        f.write(line + '\n')

## Read the data

In [5]:
with open('cleaned_data.txt', 'r', encoding='utf-8') as file:
    texts = file.readlines()

## Info about the data

In [6]:
print(f"Total texts: {len(texts)}")
print("Sample texts:")
for i in range(3):
    print(f"{i+1}. {texts[i].strip()}")

Total texts: 8367
Sample texts:
1. اهنئ الدكتور احمد جمال الدين, القيادي بحزب مصر, بمناسبه صدور اولي روايه
2. امير عيد هو الي فعلا يتقال عليه ستريكر صريح
3. الصداقه تزرع الحياه ازهارا


## Tokenization and Sequence Preparation

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
total_words = len(tokenizer.word_index) + 1
print(f"\nTotal unique words: {total_words}")
input_sequences = []
for text in texts:
    token_list = tokenizer.texts_to_sequences([text])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


Total unique words: 12415


## Padding Sequences

In [8]:
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
print("\nSample padded sequences:")
print(input_sequences[:3])



Sample padded sequences:
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0 4112  471]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0 4112  471  692]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0 4112  471  692  306]]


## Split Features (X) and Labels (y)

In [9]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)


##  Build and Train the Model

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Build the model
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=64, input_length=max_sequence_len - 1))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()

# Train the model
history = model.fit(X, y, epochs=10, verbose=1)



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 41, 64)            794560    
_________________________________________________________________
lstm_1 (LSTM)                (None, 41, 128)           98816     
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 12415)             1601535   
Total params: 2,626,495
Trainable params: 2,626,495
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


##  Predection Function

In [13]:
def predict_sequence(model, tokenizer, start_word, next_words, max_len):
    result = start_word
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([result])[0]
        token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)[0]
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                result += " " + word
                break
    return result

# Try an example
start_word = "دي"
next_words = 9
generated_text = predict_sequence(model, tokenizer, start_word, next_words, max_sequence_len)
print("النص الناتج:", generated_text)

النص الناتج: دي دي بتكون في و في و في و في


##  Arabic Complete system's GUI

In [None]:
import tkinter as tk
from tkinter import ttk, scrolledtext
import numpy as np
from PIL import Image, ImageTk
import os
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences  

PRIMARY_COLOR = "#2196F3"
SECONDARY_COLOR = "#FFC107"
BG_COLOR = "#F5F5F5"
TEXT_COLOR = "#212121"
ACCENT_COLOR = "#E3F2FD"

HEADER_FONT = ("Segoe UI", 24, "bold")
LABEL_FONT = ("Segoe UI", 12)
BUTTON_FONT = ("Segoe UI", 12, "bold")
TEXT_FONT = ("Segoe UI", 12)

class ArabicAutocompleteGUI:
    def __init__(self, root):
        self.root = root
      
        
        self.root.title("نظام الإكمال التلقائي للغة العربية")
        self.root.geometry("800x600")
        self.root.configure(bg=BG_COLOR)
        
        self.main_frame = tk.Frame(self.root, bg=BG_COLOR)
        self.main_frame.pack(fill=tk.BOTH, expand=True, padx=20, pady=20)
        
        self.setup_ui()
        
    def setup_ui(self):
        header_frame = tk.Frame(self.main_frame, bg=PRIMARY_COLOR, height=60)
        header_frame.pack(fill=tk.X, pady=(0, 20))
        
        header_label = tk.Label(
            header_frame, 
            text="نظام الإكمال التلقائي للغة العربية", 
            font=HEADER_FONT, 
            bg=PRIMARY_COLOR, 
            fg="white",
            pady=10
        )
        header_label.pack()
        
        input_frame = tk.Frame(self.main_frame, bg=BG_COLOR)
        input_frame.pack(fill=tk.X, pady=10)

        input_label = tk.Label(
            input_frame, 
            text="أدخل النص:", 
            font=LABEL_FONT, 
            bg=BG_COLOR, 
            fg=TEXT_COLOR,
            anchor="e"
        )
        input_label.pack(anchor="e", padx=5, pady=5)

        self.input_text = tk.Entry(
            input_frame, 
            font=TEXT_FONT, 
            bg="white", 
            fg=TEXT_COLOR,
            bd=1,
            relief=tk.SOLID,
            justify="right"
        )
        self.input_text.pack(fill=tk.X, padx=10, pady=5, ipadx=15, ipady=8)
        
        options_frame = tk.Frame(self.main_frame, bg=BG_COLOR)
        options_frame.pack(fill=tk.X, pady=10)
        
        words_label = tk.Label(
            options_frame, 
            text="عدد الكلمات للتنبؤ:", 
            font=LABEL_FONT, 
            bg=BG_COLOR, 
            fg=TEXT_COLOR
        )
        words_label.pack(side=tk.RIGHT, padx=5)
        
        self.words_var = tk.StringVar(value="5")
        words_spinner = tk.Spinbox(
            options_frame, 
            from_=1, 
            to=10, 
            textvariable=self.words_var,
            width=5,
            font=TEXT_FONT
        )
        words_spinner.pack(side=tk.RIGHT, padx=5)
        
        generate_button = tk.Button(
            options_frame, 
            text="توليد النص", 
            font=BUTTON_FONT, 
            bg=SECONDARY_COLOR, 
            fg=TEXT_COLOR,
            padx=15,
            pady=5,
            relief=tk.RAISED,
            command=self.generate_text
        )
        generate_button.pack(side=tk.LEFT, padx=5)
        
        clear_button = tk.Button(
            options_frame, 
            text="مسح", 
            font=BUTTON_FONT, 
            bg=PRIMARY_COLOR, 
            fg="white",
            padx=15,
            pady=5,
            relief=tk.RAISED,
            command=self.clear_text
        )
        clear_button.pack(side=tk.LEFT, padx=5)
        
        results_frame = tk.Frame(self.main_frame, bg=ACCENT_COLOR, bd=1, relief=tk.SOLID)
        results_frame.pack(fill=tk.BOTH, expand=True, pady=10)
        
        results_label = tk.Label(
            results_frame, 
            text="النص الناتج:", 
            font=LABEL_FONT, 
            bg=ACCENT_COLOR, 
            fg=TEXT_COLOR,
            anchor="e"
        )
        results_label.pack(anchor="e", padx=10, pady=5)
        
        self.results_text = scrolledtext.ScrolledText(
            results_frame, 
            font=TEXT_FONT, 
            bg="white", 
            fg=TEXT_COLOR,
            wrap=tk.WORD,
            height=10
        )
        self.results_text.pack(fill=tk.BOTH, expand=True, padx=10, pady=(0, 10))
        self.results_text.tag_configure("rtl", justify="right")
        
    def generate_text(self):
        start_word = self.input_text.get().strip()
        if not start_word:
            return
        
        try:
            next_words = int(self.words_var.get())
            if next_words < 1:
                next_words = 1
            elif next_words > 20:
                next_words = 20
        except ValueError:
            next_words = 5
            self.words_var.set("5")
        
        try:
            generated_text = predict_sequence(model, tokenizer, start_word, next_words, max_sequence_len)
            self.results_text.delete(1.0, tk.END)
            self.results_text.insert(tk.END, generated_text, "rtl")
        except Exception as e:
            print("Error:", e)  
    
    def clear_text(self):
        self.input_text.delete(0, tk.END)
        self.results_text.delete(1.0, tk.END)
    
    


root = tk.Tk()
app = ArabicAutocompleteGUI(root)
root.mainloop()