In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

def load_arabic_data():
    """Load the training sentences used to build the corpus.

    Reads ``train.csv`` (UTF-8) from the current working directory and
    returns the values of its ``correct`` column.

    Returns:
        list[str]: One string per row of ``correct`` that has a value.

    Raises:
        FileNotFoundError: If ``train.csv`` does not exist.
        KeyError: If ``train.csv`` has no ``correct`` column.
    """
    train = pd.read_csv("train.csv", encoding='utf-8')
    # Empty cells are parsed as float NaN, which the regex-based
    # normalization cannot process. Drop them and guarantee that every
    # remaining entry is a str.
    return train['correct'].dropna().astype(str).tolist()

def preprocess_arabic(corpus):
    """Reduce a string to Arabic-script runs separated by single spaces.

    Any run of characters outside U+0600-U+06FF, U+0750-U+077F and
    U+08A0-U+08FF is replaced by one space; whitespace is then collapsed
    and leading/trailing whitespace is removed.

    Args:
        corpus: The text to normalize (a single string).

    Returns:
        str: The normalized text; empty if no Arabic-range characters remain.
    """
    non_arabic_run = r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+'
    whitespace_run = r'\s+'

    arabic_only = re.sub(non_arabic_run, ' ', corpus)
    collapsed = re.sub(whitespace_run, ' ', arabic_only)
    return collapsed.strip()

# Step 1: Load and normalize the corpus
MAX_SAMPLES = 1000  # Limit to 1000 samples for demonstration

# Truncate before normalizing so that only the sentences that are actually
# used go through the regex preprocessing.
corpus = [preprocess_arabic(text) for text in load_arabic_data()[:MAX_SAMPLES]]

# Step 2: Tokenize the Text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding token
sequences = tokenizer.texts_to_sequences(corpus)

# Create input-target pairs: every window of `seq_len` consecutive token
# indices is an input, and the token right after the window is its target.
input_sequences = []
target_words = []
seq_len = 3  # Sequence length

for seq in sequences:
    # Sentences with seq_len tokens or fewer yield an empty range and
    # therefore contribute no training pairs.
    for i in range(len(seq) - seq_len):
        input_sequences.append(seq[i:i + seq_len])
        target_words.append(seq[i + seq_len])

# Every window already holds exactly `seq_len` indices, so this call mainly
# turns the list of windows into a 2-D array of uniform width.
input_sequences = pad_sequences(input_sequences, maxlen=seq_len, padding='pre')
target_words = np.array(target_words)

# Step 3: Define the Model
embedding_dim = 50  # Size of embeddings
hidden_dim = 64     # Size of GRU hidden state

# model = Sequential([
#     Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=seq_len),
#     GRU(hidden_dim, return_sequences=False),  # Return only the last hidden state
#     Dense(vocab_size, activation='softmax')   # Output layer with softmax for vocabulary
# ])

# # Compile the model
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# # Step 4: Train the Model
# epochs = 10
# batch_size = 2

# model.fit(input_sequences, target_words, epochs=epochs, batch_size=batch_size)

# Step 5: Generate Predictions



In [3]:
def predict_next_word(model, tokenizer, text, seq_len):
    """Predict the word most likely to follow ``text``.

    Args:
        model: Object whose ``predict`` method takes the padded index array
            and returns one row of scores per input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``; its ``index_word`` mapping is used when present.
        text: The prompt whose continuation is wanted.
        seq_len: Number of trailing tokens of ``text`` given to the model.

    Returns:
        str: The word whose index has the highest score, or ``"<UNK>"`` when
        that index has no word assigned to it.
    """
    # Tokenize and keep only the last `seq_len` tokens, left-padded to length.
    tokenized_text = tokenizer.texts_to_sequences([text])[0]
    padded_text = pad_sequences([tokenized_text[-seq_len:]], maxlen=seq_len, padding='pre')

    # Predict the next word
    predictions = model.predict(padded_text)
    # Convert the NumPy integer to a plain int so it is a clean dict key.
    predicted_index = int(np.argmax(predictions, axis=-1)[0])

    # Map index back to word with a dictionary lookup instead of scanning
    # the whole vocabulary. Fall back to inverting `word_index` when the
    # tokenizer does not expose a populated `index_word`.
    index_to_word = getattr(tokenizer, "index_word", None)
    if not index_to_word:
        index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    return index_to_word.get(predicted_index, "<UNK>")


In [18]:
# Example usage: ask for the word that follows a short Arabic prompt.
# NOTE(review): no uncommented code above this cell assigns `model` (the
# model definition and training code are commented out), so `model` must
# already exist in the kernel — TODO confirm the intended execution order.
test_sentence = "السلام عليكم و"
next_word = predict_next_word(model, tokenizer, test_sentence, seq_len)
print(f"Next word after '{test_sentence}': {next_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 340ms/step
Next word after 'السلام عليكم و': عليه


In [24]:
# Save the model to 'arabic_gru_model.h5' in the current working directory.
# Only the model is written by this cell; the fitted `tokenizer` and
# `seq_len` used for prediction are not persisted here.
model.save('arabic_gru_model.h5')



In [1]:
from tensorflow.keras.models import load_model
# Load the model that was saved to 'arabic_gru_model.h5'.
loaded_model = load_model('arabic_gru_model.h5')
# Example prompt for the loaded model (the literal ends with a space);
# this cell only defines it, the prediction itself happens in a later cell.
test_sentence = "السلام عليكم "



In [6]:
# Rebind `model` to the reloaded model and run the same prediction helper.
# `tokenizer` and `seq_len` are not loaded from disk: they are the kernel
# globals created by the data-preparation cell, so that cell must have run.
model = loaded_model
next_word = predict_next_word(model, tokenizer, test_sentence, seq_len)
print(f"Next word after '{test_sentence}': {next_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 332ms/step
Next word after 'السلام عليكم ': السلام
