## imports

In [None]:
!pip install tensorflow
!pip install scikit-learn
!pip install matplotlib
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers.schedules import ExponentialDecay
import unicodedata
import json
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt



## Load Dataset

In [29]:
with open("./youm7_articles.json", encoding="utf-8") as f:
    clean_data = json.load(f)

with open("./noisy_output.json", encoding="utf-8") as f:
    noisy_data = json.load(f)

# Directly align by order
x = [item["text"] for item in noisy_data]
y = [item["text"] for item in clean_data]


In [30]:
# Sample size limit (optional)
MAX_SAMPLES = 500
x = x[:MAX_SAMPLES]
y = y[:MAX_SAMPLES]

## Split Train and Test

In [31]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)


## Step 2: Tokenization

In [32]:
VOCAB_SIZE = 10000
SEQ_LEN = 256

vectorizer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE, output_sequence_length=SEQ_LEN, standardize=None
)
vectorizer.adapt(xtrain + ytrain)

In [33]:
def prepare_dataset(input_texts, target_texts):
    input_ds = tf.data.Dataset.from_tensor_slices(input_texts)
    target_ds = tf.data.Dataset.from_tensor_slices(target_texts)
    ds = tf.data.Dataset.zip((input_ds, target_ds))

    def encode(noisy, clean):
        noisy = vectorizer(noisy)
        clean = vectorizer(clean)
        noisy = tf.cast(noisy, tf.int64)
        clean = tf.cast(clean, tf.int64)

        return {
            "encoder_inputs": noisy,
            "decoder_inputs": clean[:-1]
        }, clean[1:]

    return ds.map(encode).padded_batch(
        32,
        padded_shapes=(
            {
                "encoder_inputs": [None],
                "decoder_inputs": [None]
            },
            [None]
        ),
        padding_values=(
            {
                "encoder_inputs": tf.constant(0, dtype=tf.int64),
                "decoder_inputs": tf.constant(0, dtype=tf.int64)
            },
            tf.constant(0, dtype=tf.int64)
        ),
        drop_remainder=True
    )


train_ds = prepare_dataset(xtrain, ytrain)
val_ds = prepare_dataset(xtest, ytest)

## Positional Encoding Layer

In [34]:
class PositionalEncoding(layers.Layer):
    def __init__(self, sequence_length, d_model):
        super().__init__()
        self.pos_encoding = self.positional_encoding(tf.cast(sequence_length, tf.float32), d_model)

    def get_angles(self, pos, i, d_model):
        angles = pos / tf.pow(10000, (2 * (i//2)) / tf.cast(d_model, tf.float32))
        return angles

    def positional_encoding(self, pos, d_model):
        angle_rads = self.get_angles(
            tf.cast(tf.range(tf.cast(pos, tf.int32))[:, tf.newaxis], tf.float32), tf.cast(tf.range(d_model)[tf.newaxis, :], tf.float32), d_model
        )
        sines = tf.math.sin(angle_rads[:, 0::2])
        cosines = tf.math.cos(angle_rads[:, 1::2])
        pos_encoding = tf.concat([sines, cosines], axis=-1)
        return tf.expand_dims(pos_encoding, axis=0)

    def call(self, x):
        return x + self.pos_encoding[:, :tf.shape(x)[1], :]

## Transformer Encoder-Decoder

In [35]:
def transformer_model():
    d_model = 256
    num_heads = 4
    dff = 512
    num_layers = 4

    encoder_inputs = layers.Input(shape=(None,), name="encoder_inputs")
    decoder_inputs = layers.Input(shape=(None,), name="decoder_inputs")

    # Shared embedding
    embedding = layers.Embedding(VOCAB_SIZE, d_model)
    encoder_emb = embedding(encoder_inputs)
    decoder_emb = embedding(decoder_inputs)

    # Positional encoding
    encoder_emb = PositionalEncoding(SEQ_LEN, d_model)(encoder_emb)
    decoder_emb = PositionalEncoding(SEQ_LEN, d_model)(decoder_emb)

    # Encoder
    for _ in range(num_layers):
        encoder_emb = layers.LayerNormalization()(encoder_emb)
        encoder_emb = layers.MultiHeadAttention(num_heads, d_model)(encoder_emb, encoder_emb)
        encoder_emb = layers.Dense(dff, activation='relu')(encoder_emb)

    # Decoder
    for _ in range(num_layers):
        decoder_emb = layers.LayerNormalization()(decoder_emb)
        decoder_emb = layers.MultiHeadAttention(num_heads, d_model)(decoder_emb, decoder_emb)
        decoder_emb = layers.MultiHeadAttention(num_heads, d_model)(decoder_emb, encoder_emb)
        decoder_emb = layers.Dense(dff, activation='relu')(decoder_emb)

    outputs = layers.Dense(VOCAB_SIZE, activation='softmax')(decoder_emb)

    return tf.keras.Model([encoder_inputs, decoder_inputs], outputs)

## Compile & Train

In [27]:
model = transformer_model()
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

model.fit(train_ds, validation_data=val_ds, epochs=10)
model.save("arabic_text_correction_model.keras")


# Save model

Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 39s/step - accuracy: 0.0873 - loss: 9.2046  
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 35s/step - accuracy: 0.2297 - loss: 8.8652
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 37s/step - accuracy: 0.2297 - loss: 8.9868
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 36s/step - accuracy: 0.1546 - loss: 8.3614
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 37s/step - accuracy: 0.1134 - loss: 7.0779
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 39s/step - accuracy: 0.2297 - loss: 7.0673
Epoch 7/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 39s/step - accuracy: 0.2297 - loss: 7.2950
Epoch 8/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 83s/step - accuracy: 0.2297 - loss: 6.9575
Epoch 9/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0