# Lib

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd

In [3]:
import re

def extract_arabic_text(text):
    # Remove emojis and latin text
    text = re.sub(r"[^\u0600-\u06FF\s]+", "", text)
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)
    # Return the remaining Arabic text
    return text.strip()

# Example usage
text = "Hello 👋, مرحبا بالعالم"
arabic_text = extract_arabic_text(text)
print(arabic_text)  # Output: "مرحبا بالعالم"


مرحبا بالعالم


# Model

In [5]:
# Prepare the training data
with open("/content/drive/MyDrive/ads.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [44]:
# Prepare the training data
with open("/content/drive/MyDrive/arabic_text.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [7]:
databn = pd.read_csv("/content/drive/MyDrive/MTCD.csv")

In [46]:
text

'\ufeffسلام لباس كدير بخير\nأجي تشرب قهوة\nسر تلعب'

In [6]:
len(text)

30051

In [15]:
databn = pd.read_csv("/content/drive/MyDrive/MTCD.csv")
databn["text"] = databn["text"].apply(extract_arabic_text)
databn = databn.drop(columns= "labels")

In [4]:
# Load the dataset
dataa = pd.read_csv("/content/drive/MyDrive/ads.csv", encoding="utf-8")

In [5]:
data = dataa[:30]

In [7]:
data.head(20)

Unnamed: 0,ads_clean
0,حصرياً و غير عند اورنج، عيش لفيبر فدارك ونتا ه...
1,تعرفوا معنا على سر الحرفة مع فاطمة، عمران و هش...
2,بفضل الخطوات ديالكم وزعنا الاأنترنت على عدة جم...
3,الساعة جديدة هادي ولا القديمة ؟ شكون فعائلتك و...
4,حيت عندنا ديما نتا لول، تبرع بالماكس ديال السخ...
5,اتبعوا معنا تجربة المقاول عمران سينو رئيس تعاو...
6,ماكين غير فورفي تبرع باللامحدود، تمزك، تفرج و ...
7,شارك ف مع أورنج و جمع الخطوات ديالك باش تساهم ...
8,أورنج، ريزو لي فين ماكنتي يجري بيك للقدام الري...
9,غير عند أورنج هاد رمضان ماكين غير لفراجة قسيمة...


In [6]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelWithLMHead
import pandas as pd

# Load the pre-trained GPT-2 model and tokenizer for Arabic
model_name = "aubmindlab/aragpt2-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelWithLMHead.from_pretrained(model_name)


# Preprocess the data
max_length = 512
input_ids = []
for text in data["ads_clean"]:
    encoded_text = tokenizer.encode(text, add_special_tokens=True, max_length=max_length)
    input_ids.append(encoded_text)
input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, maxlen=max_length, dtype="long", value=0, truncating="post", padding="post")

# Define the training parameters
batch_size = 4
learning_rate = 5e-5
num_epochs = 6

# Define the optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Define the training loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    for i in range(0, len(input_ids), batch_size):
        batch = input_ids[i:i+batch_size]
        with tf.GradientTape() as tape:
            outputs = model(batch, return_dict=True)
            loss_value = loss(batch[:, 1:], outputs.logits[:, :-1, :])
        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        print(f"Step {i//batch_size+1}/{len(input_ids)//batch_size+1}, Loss: {loss_value.numpy():.4f}")
    
    # Generate text after each epoch
    prompt = "حيت عند أورنج نتوما"
    input_ids = tokenizer.encode(prompt, return_tensors="tf")
    generated = model.generate(
        input_ids=input_ids,
        max_length=50,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        temperature=1.0,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )
    text = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(f"Generated text: {text}")

# Save the model
model.save_pretrained("arabic_gpt2")


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at aubmindlab/aragpt2-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/6




Step 1/8, Loss: 0.5983
Step 2/8, Loss: 0.5320
Step 3/8, Loss: 0.4973
Step 4/8, Loss: 0.3482
Step 5/8, Loss: 0.4478
Step 6/8, Loss: 0.4164
Step 7/8, Loss: 0.2110


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Step 8/8, Loss: 0.4326
Generated text: حيت عند أورنج نتوما بغيتي نديرو على هاد الهضرة
Epoch 2/6


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Step 1/1, Loss: 3.3114
Generated text: حيت عند أورنج نتوما حتال على هاد الهضرة.
Epoch 3/6


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Step 1/1, Loss: 1.3382
Generated text: حيت عند أورنج نتوما لقدامو نجيوكم معنا على الفيسبوك [رابط]
Epoch 4/6


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Step 1/1, Loss: 0.0636
Generated text: حيت عند أورنج نتوما في
Epoch 5/6


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Step 1/1, Loss: 0.0084
Generated text: حيت عند أورنج نتوما في
Epoch 6/6


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Step 1/1, Loss: 0.0004
Generated text: حيت عند أورنج نتوما كتوما إلعبوا أورسكوا معنا على اليوتوب و غادي نتواصل معكم على الفيسبوك


In [35]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelWithLMHead

# Load the pre-trained GPT-2 model and tokenizer for Arabic
model_name = "aubmindlab/aragpt2-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelWithLMHead.from_pretrained(model_name)

# Tokenize the text
input_ids = tokenizer.encode(text, return_tensors="tf")

# Define the training parameters
batch_size = 4
learning_rate = 5e-5
num_epochs = 5

# Define the optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Define the training loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    for i in range(0, len(input_ids), batch_size):
        batch = input_ids[i:i+batch_size]
        with tf.GradientTape() as tape:
            outputs = model(batch, return_dict=True)
            loss_value = loss(batch[:, 1:], outputs.logits[:, :-1, :])
        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        print(f"Step {i//batch_size+1}/{len(input_ids)//batch_size+1}, Loss: {loss_value.numpy():.4f}")
    
    # Generate text after each epoch
    prompt = "اتاي"
    input_ids = tokenizer.encode(prompt, return_tensors="tf")
    generated = model.generate(
        input_ids=input_ids,
        max_length=50,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        temperature=1.0,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )
    text = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(f"Generated text: {text}")

# Save the model
model.save_pretrained("arabic_gpt2")


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at aubmindlab/aragpt2-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


FileNotFoundError: ignored