In [1]:
from datasets import load_dataset
# Load the Mixed Arabic Dataset through the Hugging Face `datasets` library.
# NOTE(review): `dataset` is not referenced by any later cell visible in this
# notebook — confirm it is still needed before paying for this load.
dataset = load_dataset("M-A-D/Mixed-Arabic-Dataset-Main")
# TODO: normalize Arabic text (remove diacritics, etc.), e.g. using a library like arabic-reshaper

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

# Environment check: show the installed PyTorch build and whether CUDA is
# usable, one value per line.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

2.7.0+cpu
False


In [3]:
from transformers.utils import is_torch_available
# Sanity check: print whether transformers reports PyTorch as available.
print(is_torch_available())


True


In [4]:
import tkinter as tk
from tkinter import scrolledtext
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import torch
import os

In [5]:

# Suppress huggingface_hub symlink warning.
# NOTE(review): huggingface_hub may read this variable when it is first
# imported; transformers is already imported by an earlier cell, so confirm the
# setting still takes effect here (otherwise set it before those imports).
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Single source of truth for the checkpoint so the tokenizer and the model
# can never be loaded from two different ids by accident.
MODEL_NAME = "aubmindlab/aragpt2-base"

# Load tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The model is only used for inference in this notebook; `.eval()` returns the
# model itself and makes the "no dropout" intent explicit.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).eval()

In [6]:


# Normalize Arabic text
def normalize_arabic(text):
    """Return ``text`` after applying the notebook's Arabic normalization rules.

    Three regex substitutions are applied one after another:

    1. the diacritic marks listed in the first character class are deleted,
    2. the tatweel (elongation) character is deleted,
    3. the alif variants إ, أ and آ are replaced by a bare alif ا.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; characters not matched by any rule are kept.
    """
    substitutions = (
        (r'[ًٌٍَُِّْ]', ''),  # Remove diacritics
        (r'ـ', ''),  # Remove tatweel
        (r'[إأآ]', 'ا'),  # Normalize alif variants
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def _disable_suggestions():
    """Blank out and disable every suggestion button."""
    for btn in suggestion_buttons:
        btn.config(text="", command=lambda: None, state="disabled")


# Autocomplete function: Predict top 3 next words
def autocomplete(candidate_pool=20):
    """Fill the suggestion buttons with the most likely continuations of the input.

    The text in ``input_field`` is stripped, normalized with
    ``normalize_arabic`` and fed to ``model``. The highest-scoring next tokens
    are decoded one by one; empty and duplicate decodes are skipped until there
    is one suggestion per button. Buttons without a suggestion are blanked and
    disabled.

    Note that every suggestion is the decode of a single tokenizer token, so it
    may be a sub-word fragment rather than a complete word.

    Args:
        candidate_pool: How many top-ranked tokens to inspect when looking for
            usable suggestions. It is raised to the number of buttons if
            smaller and capped at the vocabulary size.

    Returns:
        None. The function only updates the suggestion buttons.
    """
    # Check for emptiness *after* normalization: a prefix made only of
    # diacritics/tatweel normalizes to "" and must not reach the model.
    prefix = normalize_arabic(input_field.get().strip()).strip()
    if not prefix:
        _disable_suggestions()
        return

    # Tokenize input
    inputs = tokenizer(prefix, return_tensors="pt")
    if inputs["input_ids"].shape[-1] == 0:
        # Nothing to condition on; indexing the last position would fail.
        _disable_suggestions()
        return

    with torch.no_grad():
        # Logits for the next token of the single sequence in the batch.
        next_token_logits = model(**inputs).logits[0, -1, :]

    wanted = len(suggestion_buttons)
    pool_size = min(max(candidate_pool, wanted), next_token_logits.size(-1))
    # Softmax is monotonic, so ranking raw logits yields the same order as
    # ranking probabilities.
    candidate_ids = torch.topk(next_token_logits, k=pool_size).indices.tolist()

    # Decode candidates in rank order, skipping blanks and repeats, until every
    # button has a suggestion.
    top_words = []
    for token_id in candidate_ids:
        if len(top_words) >= wanted:
            break
        word = tokenizer.decode([token_id], skip_special_tokens=True).strip()
        if word and word not in top_words:
            top_words.append(word)

    # Pad with empty strings if fewer suggestions than buttons
    top_words += [""] * (wanted - len(top_words))

    # Update suggestion buttons
    for btn, word in zip(suggestion_buttons, top_words):
        if word:
            # Bind the word as a default argument so each button keeps its own.
            btn.config(text=word, state="normal", command=lambda w=word: append_word(w))
        else:
            btn.config(text="", command=lambda: None, state="disabled")

# Append selected word to input field
def append_word(word):
    """Add ``word`` to the end of the input field and reset the suggestions.

    The current entry text is stripped of surrounding whitespace. If anything
    remains, ``word`` is appended after a single space; otherwise ``word``
    becomes the whole content. Every suggestion button is then blanked and
    disabled.

    Args:
        word: The suggestion text to append.
    """
    existing = input_field.get().strip()
    combined = word if not existing else f"{existing} {word}"

    # Replace the entry content in one go.
    input_field.delete(0, tk.END)
    input_field.insert(0, combined)

    # Suggestions are stale once the text has changed.
    for suggestion in suggestion_buttons:
        suggestion.config(text="", command=lambda: None, state="disabled")

# Clear input field
def clear_input():
    """Reset the UI: blank and disable all suggestion buttons, then empty the entry."""
    for suggestion in suggestion_buttons:
        suggestion.config(text="", command=lambda: None, state="disabled")
    input_field.delete(0, tk.END)

# Create Tkinter GUI
root = tk.Tk()
root.title("Arabic Autocomplete")
root.geometry("600x400")

# Option-database defaults (font, right-justified entries for Arabic text).
# Tk consults the option database when a widget is created, so these must be
# registered BEFORE any widget below is built; added afterwards they would not
# apply to the widgets that already exist.
root.option_add("*Font", "Arial 12")
root.option_add("*Entry*justify", "right")

# One font definition shared by every widget in this window.
UI_FONT = ("Arial", 12)

# Input label and field
tk.Label(root, text="أدخل النص العربي", font=UI_FONT).pack(pady=10)
input_field = tk.Entry(root, width=50, font=UI_FONT, justify="right")
input_field.pack(pady=5)

# Buttons frame: "complete text" and "clear" actions
button_frame = tk.Frame(root)
button_frame.pack(pady=10)
tk.Button(button_frame, text="إكمال النص", command=autocomplete, font=UI_FONT).pack(side=tk.LEFT, padx=5)
tk.Button(button_frame, text="مسح", command=clear_input, font=UI_FONT).pack(side=tk.LEFT, padx=5)

# Suggestion buttons: created blank and disabled; autocomplete() fills them in
suggestion_frame = tk.Frame(root)
suggestion_frame.pack(pady=10)
suggestion_buttons = []
for _ in range(3):
    btn = tk.Button(suggestion_frame, text="", font=UI_FONT, width=15, state="disabled")
    btn.pack(side=tk.LEFT, padx=5)
    suggestion_buttons.append(btn)

# Run the GUI (blocks this cell until the window is closed)
root.mainloop()