# Arabic Auto-Complete System

## Imports

In [47]:
import os  
import re   
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import re

## Text Cleaning Function

In [48]:
import re

def clean_arabic_text(text):
    # 1. Remove tashkeel including tatweel 
    text = re.sub(r'[ً-ْـ]', '', text)
    
    # 2. Remove punctuation and special characters (keeping Arabic letters, numbers, and basic punctuation)
    text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s٠-٩۰-۹.,،؛؟!]', ' ', text)

    # 3. Normalize Arabic characters
    text = re.sub(r'[آأإ]', 'ا', text)  
    text = re.sub(r'ى', 'ي', text)     
    text = re.sub(r'ة', 'ه', text)     

    # 4. Remove English letters and Western digits
    text = re.sub(r'[a-zA-Z]', '', text)
    text = re.sub(r'[0-9]', '', text)

    # 5. Normalize Arabic punctuation to standard ones
    text = text.replace('؟', '?')  # Arabic question mark
    text = text.replace('،', ',')  # Arabic comma
    text = text.replace('؛', ';')  # Arabic semicolon

    # 6. Remove extra spaces and newlines
    text = re.sub(r'\s+', ' ', text).strip()

    # 7. Remove repeated characters 
    text = re.sub(r'(.)\1+', r'\1', text)

    return text


## Data Loading and Cleaning

In [49]:
file_names = ['file1.txt', 'file2.txt', 'file3.txt', 'file4.txt']  # data files
all_clean_lines = []  

for file in file_names:
    with open(file, 'r', encoding='utf-8') as f:
        lines = f.readlines()  
        for line in lines:
            if 'NULL' in line:
                continue  # skip lines with the word NULL
            clean_line = clean_arabic_text(line)
            if clean_line:  # don't save empty lines
                all_clean_lines.append(clean_line)


##  Save Cleaned Data


In [50]:
with open('cleaned_data.txt', 'w', encoding='utf-8') as f:
    for line in all_clean_lines:
        f.write(line + '\n')

## Read the data

In [51]:
with open('cleaned_data.txt', 'r', encoding='utf-8') as file:
    texts = file.readlines()

## Info about the data

In [52]:
print(f"Total texts: {len(texts)}")
print("Sample texts:")
for i in range(3):
    print(f"{i+1}. {texts[i].strip()}")

Total texts: 7904
Sample texts:
1. اهنئ الدكتور احمد جمال الدين, القيادي بحزب مصر, بمناسبه صدور اولي روايه
2. امير عيد هو الي فعلا يتقال عليه ستريكر صريح
3. الصداقه تزرع الحياه ازهارا


## Tokenization and Sequence Preparation

In [53]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
total_words = len(tokenizer.word_index) + 1
print(f"\nTotal unique words: {total_words}")
input_sequences = []
for text in texts:
    token_list = tokenizer.texts_to_sequences([text])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


Total unique words: 7868


## Padding Sequences

In [54]:
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
print("\nSample padded sequences:")
print(input_sequences[:3])



Sample padded sequences:
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0 3117  378]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0 3117  378  547]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0 3117  378  547  255]]


## Split Features (X) and Labels (y)

In [55]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)
