In [1]:
import pandas as pd
df = pd.read_csv("/kaggle/input/arabic-cybersecurity-texts/arabic_cybersecurity_texts.csv")
df.head()


Unnamed: 0,Text,Score
0,إذا كانت بيئة العمل لا تستند إلى نظام تشغيل آم...,9.5
1,احتمال أن يشكل تهديدا سوف تستخدم من التعرض للض...,8.9
2,برمجيات خبيثةأيضا في بعض الأحيانالبرامج الضارة...,5.2
3,من الأساليب الشائعة بشكل متزايد (2015) برامج ا...,2.4
4,وبالأستناد إلى نتائج المشروع المشترك لمعهد الم...,2.1


In [2]:
import re

def clean_arabic(text):
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)   # keep Arabic only
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df["Clean_Text"] = df["Text"].apply(clean_arabic)
df.head()


Unnamed: 0,Text,Score,Clean_Text
0,إذا كانت بيئة العمل لا تستند إلى نظام تشغيل آم...,9.5,إذا كانت بيئة العمل لا تستند إلى نظام تشغيل آم...
1,احتمال أن يشكل تهديدا سوف تستخدم من التعرض للض...,8.9,احتمال أن يشكل تهديدا سوف تستخدم من التعرض للض...
2,برمجيات خبيثةأيضا في بعض الأحيانالبرامج الضارة...,5.2,برمجيات خبيثةأيضا في بعض الأحيانالبرامج الضارة...
3,من الأساليب الشائعة بشكل متزايد (2015) برامج ا...,2.4,من الأساليب الشائعة بشكل متزايد برامج الإعلانا...
4,وبالأستناد إلى نتائج المشروع المشترك لمعهد الم...,2.1,وبالأستناد إلى نتائج المشروع المشترك لمعهد الم...


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


MAX_WORDS = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(df["Clean_Text"])

sequences = tokenizer.texts_to_sequences(df["Clean_Text"])
X = pad_sequences(sequences, maxlen=MAX_LEN)

y = df["Score"].values


2025-12-18 11:22:55.088173: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766056975.287713      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766056975.342338      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766056975.812951      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766056975.812992      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766056975.812995      55 computation_placer.cc:177] computation placer alr

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_train.shape, X_test.shape)


(115, 100) (29, 100)


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader

X_train_t = torch.tensor(X_train, dtype=torch.long)
X_test_t  = torch.tensor(X_test, dtype=torch.long)

y_train_t = torch.tensor(y_train, dtype=torch.float32)
y_test_t  = torch.tensor(y_test, dtype=torch.float32)


In [6]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [7]:
BATCH_SIZE = 16

train_dataset = TextDataset(X_train_t, y_train_t)
test_dataset  = TextDataset(X_test_t, y_test_t)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [8]:
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)


torch.Size([16, 100]) torch.Size([16])


In [9]:
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.embedding(x)
        _, (h, _) = self.lstm(x)
        out = self.fc(h[-1])
        return out.squeeze()


In [10]:
VOCAB_SIZE = MAX_WORDS
EMBED_DIM = 128
HIDDEN_DIM = 64

model = LSTMModel(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [11]:
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for xb, yb in train_loader:
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{EPOCHS}] - Loss: {avg_loss:.4f}")


Epoch [1/10] - Loss: 32.2294
Epoch [2/10] - Loss: 28.9638
Epoch [3/10] - Loss: 25.7435
Epoch [4/10] - Loss: 26.0406
Epoch [5/10] - Loss: 21.7397
Epoch [6/10] - Loss: 11.5885
Epoch [7/10] - Loss: 8.1794
Epoch [8/10] - Loss: 8.3451
Epoch [9/10] - Loss: 7.9673
Epoch [10/10] - Loss: 7.6731


In [12]:
model.eval()
preds_list = []
true_list = []

with torch.no_grad():
    for xb, yb in test_loader:
        preds = model(xb)
        preds_list.extend(preds.numpy())
        true_list.extend(yb.numpy())


In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

mse = mean_squared_error(true_list, preds_list)
rmse = np.sqrt(mse)
mae = mean_absolute_error(true_list, preds_list)

print("MSE :", mse)
print("RMSE:", rmse)
print("MAE :", mae)


MSE : 9.814581711168664
RMSE: 3.132823281190413
MAE : 2.7177101543237425


In [14]:
import torch.nn as nn

class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.embedding(x)
        _, h = self.gru(x)          # h: (1, batch, hidden)
        out = self.fc(h[-1])
        return out.squeeze()


In [15]:
VOCAB_SIZE = MAX_WORDS
EMBED_DIM = 128
HIDDEN_DIM = 64

gru_model = GRUModel(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(gru_model.parameters(), lr=0.001)


In [16]:
EPOCHS = 10

for epoch in range(EPOCHS):
    gru_model.train()
    total_loss = 0

    for xb, yb in train_loader:
        optimizer.zero_grad()
        preds = gru_model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"GRU Epoch [{epoch+1}/{EPOCHS}] - Loss: {avg_loss:.4f}")


GRU Epoch [1/10] - Loss: 31.3884
GRU Epoch [2/10] - Loss: 30.4850
GRU Epoch [3/10] - Loss: 26.9185
GRU Epoch [4/10] - Loss: 24.0528
GRU Epoch [5/10] - Loss: 16.7767
GRU Epoch [6/10] - Loss: 9.3798
GRU Epoch [7/10] - Loss: 8.0451
GRU Epoch [8/10] - Loss: 7.7911
GRU Epoch [9/10] - Loss: 7.7377
GRU Epoch [10/10] - Loss: 7.1807


In [17]:
gru_model.eval()
preds_list = []
true_list = []

with torch.no_grad():
    for xb, yb in test_loader:
        preds = gru_model(xb)
        preds_list.extend(preds.numpy())
        true_list.extend(yb.numpy())


In [18]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

mse_gru = mean_squared_error(true_list, preds_list)
rmse_gru = np.sqrt(mse_gru)
mae_gru = mean_absolute_error(true_list, preds_list)

print("GRU MSE :", mse_gru)
print("GRU RMSE:", rmse_gru)
print("GRU MAE :", mae_gru)


GRU MSE : 10.133272044264933
GRU RMSE: 3.1832800763151416
GRU MAE : 2.7647724773349434


In [19]:
import pandas as pd

results = pd.DataFrame({
    "Model": ["LSTM", "GRU"],
    "MAE": [mae, mae_gru],
    "RMSE": [rmse, rmse_gru],
    "MSE": [mse, mse_gru]
})

results


Unnamed: 0,Model,MAE,RMSE,MSE
0,LSTM,2.71771,3.132823,9.814582
1,GRU,2.764772,3.18328,10.133272


**2ème partie :**

In [20]:
!pip install transformers datasets




In [21]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50257, 768)

In [22]:
texts = [
    "Cybersecurity is an important field in modern information systems.",
    "Deep learning models are widely used for text analysis.",
    "Transformers have revolutionized natural language processing.",
    "GPT models can generate coherent and meaningful text.",
    "Data security is a major challenge in the digital world."
]


In [23]:
from torch.utils.data import Dataset
import torch

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=64):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_len,
            return_tensors="pt"
        )

    def __len__(self):
        return self.encodings["input_ids"].size(0)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.encodings["input_ids"][idx]
        }

dataset = TextDataset(texts, tokenizer)


In [24]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=10,
    save_steps=100,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=6, training_loss=4.76703421274821, metrics={'train_runtime': 4.5442, 'train_samples_per_second': 3.301, 'train_steps_per_second': 1.32, 'total_flos': 91860480000.0, 'train_loss': 4.76703421274821, 'epoch': 3.0})

In [26]:
import torch

# mettre le modèle en mode évaluation
model.eval()

# choisir le device du modèle
device = model.device

prompt = "Cybersecurity is"

# tokenization AVEC attention mask
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding=True
)

# déplacer les tensors sur le même device que le modèle
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# génération
with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=60,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id
    )

# décodage
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


Cybersecurity is the fundamental concept behind the Internet, and the future is not just about government control of the Internet.
