In [None]:
import pandas as pd 
import numpy as np
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
import joblib
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from torch import nn,optim
from torch.utils.data import Dataset, DataLoader
from torchmetrics.classification import F1Score
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm



# 📥 Load & Prepare Arabic Summarization Dataset:
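**In this step, we load the BBC Arabic news summarization CSV, drop the `id`, `url` and `title` columns, and rename the `text` column to `paragraph`, so that each row holds one article and its reference summary.**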


In [None]:
# Load the BBC Arabic summarization CSV directly. The previous version
# concatenated the file onto an empty placeholder frame, which added nothing
# and triggers pandas' "concatenation with empty entries" deprecation warning.
DATASET_CSV = '/kaggle/input/arabic-summarization-bbc-news/bbc_news_arabic_summarization.csv'

xl_sum = (
    pd.read_csv(DATASET_CSV)
    # Metadata columns are not used for summarization.
    .drop(columns=['id', 'url', 'title'])
    # The rest of the notebook refers to the article body as 'paragraph'.
    .rename(columns={'text': 'paragraph'})
)
xl_sum.head()

# 🧹 Data Cleaning: Removing Duplicates
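Removes rows whose `paragraph` or `summary` duplicates an earlier row, then calls `.describe()` to summarize what is left. For text columns, `.describe()` reports the row count, the number of unique values, and the most frequent value, which is a quick way to confirm the dataset size after deduplication.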

In [4]:
# Two passes: first discard rows with a repeated article body, then rows with
# a repeated summary.
for dedup_column in ('paragraph', 'summary'):
    xl_sum = xl_sum.drop_duplicates(subset=dedup_column)
xl_sum.describe()

NameError: name 'xl_sum' is not defined

# 🧼 Text Preprocessing Functions for Arabic Summarization
To prepare our dataset for training, we define a set of functions for cleaning and standardizing Arabic text:

In [2]:
# Running count of text_prepare() calls; text_prepare reads and increments it
# through a `global` declaration.
counter = 0


def delete_links(input_text):
    """Return ``input_text`` with every URL-like span replaced by one space.

    The pattern is case-insensitive and recognises spans starting with
    ``http://`` / ``https://``, ``www.`` (optionally with up to three digits),
    or a ``domain.tld/`` prefix.
    """
    link_regex = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    return re.sub(link_regex, ' ', input_text)

def delete_repeated_characters(input_text):
    """Shorten every run of three or more identical characters to two.

    Runs of one or two characters are left untouched.
    """
    return re.sub(r'(.)\1{2,}', r"\1\1", input_text)

def remove_extra_spaces(input_text):
    """Squeeze runs of spaces, then keep only the purely alphabetic tokens.

    The text is tokenized with ``nltk.word_tokenize``; any token for which
    ``str.isalpha()`` is false (numbers, punctuation, mixed tokens) is dropped
    and the survivors are joined with single spaces.
    """
    squeezed = re.sub(' +', " ", input_text)
    alphabetic_tokens = filter(str.isalpha, nltk.word_tokenize(squeezed))
    return ' '.join(alphabetic_tokens)

def replace_letters(input_text):
    """Normalize common Arabic letter variants.

    Alef with hamza above/below and alef with madda are mapped to bare alef,
    and teh marbuta is mapped to heh. All other characters pass through
    unchanged.

    The earlier implementation built a regex alternation from a dict that also
    contained an empty-string key, producing an empty branch that matched at
    every position of the input; a translation table avoids that and does not
    need to be recompiled into a regex on each call.
    """
    letter_map = {
        "\u0623": "\u0627",  # alef with hamza above -> alef
        "\u0629": "\u0647",  # teh marbuta -> heh
        "\u0625": "\u0627",  # alef with hamza below -> alef
        "\u0622": "\u0627",  # alef with madda above -> alef
    }
    return input_text.translate(str.maketrans(letter_map))

def clean_text(input_text):
    """Blank out everything except Arabic letters and digits.

    Each character outside the ranges U+0621-U+064A, U+0660-U+0669,
    U+06F0-U+06F9 and ASCII ``0-9`` is replaced by a single space, one for
    one, so the result has the same length as the input. Runs of spaces are
    not collapsed here.
    """
    disallowed = re.compile(r'[^\u0621-\u064A\u0660-\u0669\u06F0-\u06F90-9]')
    return disallowed.sub(" ", input_text)

def remove_vowelization(input_text):
    """Delete the Arabic diacritic marks and the tatweel listed in the pattern.

    The pattern is compiled with ``re.VERBOSE``, so the space inside the
    pattern literal is ignored and ordinary spaces in the input are kept.
    """
    diacritics = re.compile(""" ّ|َ|ً|ُ|ٌ|ِ|ٍ|ْ|ـ""", re.VERBOSE)
    return diacritics.sub('', input_text)

def delete_stopwords(input_text):
    """Lemmatize whitespace-separated tokens and drop stop words.

    Tokens are split on whitespace, passed through NLTK's WordNet lemmatizer,
    and removed when the lemma appears in the combined Arabic + English NLTK
    stop-word lists. The remaining lemmas are joined with single spaces.
    """
    blocked = set(nltk.corpus.stopwords.words("arabic") + nltk.corpus.stopwords.words("english"))
    lemmatizer = nltk.WordNetLemmatizer()
    raw_tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(input_text)
    lemmas = [lemmatizer.lemmatize(token) for token in raw_tokens]
    return ' '.join(lemma for lemma in lemmas if lemma not in blocked)

def stem_text(input_text):
    """Stem every whitespace-separated token with NLTK's ISRI Arabic stemmer.

    Returns the stemmed tokens joined with single spaces.
    """
    # ISRIStemmer was referenced without ever being imported, so calling this
    # function raised NameError. Importing it here keeps the fix local.
    from nltk.stem.isri import ISRIStemmer

    stemmer = ISRIStemmer()
    tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(input_text)
    return ' '.join(stemmer.stem(token) for token in tokens)


def text_prepare(input_text, ar_text):
    """Clean one text with ``clean_text`` and report progress every 100 calls.

    ``ar_text`` is accepted but not used. Only ``clean_text`` is applied; the
    link, repeated-character, stop-word and extra-space helpers defined in
    this notebook are not called here.

    Side effects: increments the module-level ``counter`` on every call and,
    whenever it reaches a multiple of 100, prints the counter and the cleaned
    text.
    """
    global counter
    counter = counter + 1

    cleaned = clean_text(input_text)
    if not counter % 100:
        print(counter,'\n',cleaned)
    return cleaned

# 🧹 Applying Preprocessing to the Dataset
* Applies the text_prepare() function to both the paragraph and summary columns.

* args=(True,) passes the ar_text parameter (not used in the function logic currently).

* This step cleans and standardizes all text data in preparation for model training.

In [3]:
# Clean the article bodies first, then the summaries; `True` fills the
# (unused) ar_text argument of text_prepare.
for text_column in ('paragraph', 'summary'):
    xl_sum[text_column] = xl_sum[text_column].apply(text_prepare, args=(True,))

NameError: name 'xl_sum' is not defined

# 📊 Splitting the Dataset
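* Splits the dataset `xl_sum` into two parts:

    * 90% for training (`train`)

    * 10% for validation (`val`)

* `random_state=42` ensures the split is reproducible — using the same seed will always return the same split.

* The training part is then randomly subsampled to 50% of its rows (also with `random_state=42`), so roughly 45% of the original dataset is actually used for training.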



In [None]:
# Hold out 10% of the rows for validation (fixed seed for reproducibility).
train_full, val = train_test_split(xl_sum, random_state=42, test_size=0.1)
# Keep a reproducible random half of the training rows and renumber them from 0.
train = train_full.sample(random_state=42, frac=0.5).reset_index(drop=True)


# 📦 SummaryDataset Class – Custom Dataset for Model Input

This class extends torch.utils.data.Dataset to format and encode the data for training the summarization model.

In [12]:
class SummaryDataset(Dataset):
    """Dataset that tokenizes (paragraph, summary) rows for seq2seq training.

    ``data`` must support ``len()`` and ``.iloc[index]`` and expose the
    ``'paragraph'`` and ``'summary'`` fields on each row (the notebook passes
    pandas DataFrames). Both texts are truncated and padded to their
    respective maximum token lengths.
    """

    def __init__(
        self,
        data,
        text_max_token_len = 3000,
        summary_max_token_len = 400
    ):
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len
        self.tokenizer = AutoTokenizer.from_pretrained("Jezia/AraBART-finetuned-wiki-ar")

    def __len__(self):
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

    def __getitem__(self, index: int):
        """Return the encoded tensors for row ``index``.

        The result has the keys ``input_ids``, ``attention_mask``, ``labels``
        and ``decoder_attention_mask``, each flattened to one dimension.
        """
        row = self.data.iloc[index]
        source = self._encode(row['paragraph'], self.text_max_token_len)
        target = self._encode(row['summary'], self.summary_max_token_len)

        # Padding positions in the target are set to -100 so they can be told
        # apart from real tokens (the value the loss is expected to ignore).
        labels = target['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': source['input_ids'].flatten(),
            'attention_mask': source['attention_mask'].flatten(),
            'labels': labels.flatten(),
            'decoder_attention_mask': target['attention_mask'].flatten(),
        }


# 📦 SummaryDataModule – PyTorch Lightning Data Module

This class is used to organize and manage data loading during training and validation with PyTorch Lightning.

In [13]:
class SummaryDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the training and validation data.

    Despite their names, ``train_path`` and ``val_path`` are handed straight
    to ``SummaryDataset`` as its ``data`` argument; nothing is read from disk
    here.
    """

    def __init__(self,train_path,val_path,batch_size=12, text_max_token_len = 3000, summary_max_token_len = 400):
        super().__init__()
        self.train_path = train_path
        self.val_path = val_path
        self.batch_size = batch_size
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def _build_dataset(self, data):
        """Wrap ``data`` in a SummaryDataset using this module's token limits."""
        return SummaryDataset(
            data=data,
            text_max_token_len=self.text_max_token_len,
            summary_max_token_len=self.summary_max_token_len,
        )

    def _build_loader(self, dataset, shuffle):
        """Create a DataLoader with this module's batch size and two workers."""
        return DataLoader(dataset,batch_size=self.batch_size,shuffle=shuffle,num_workers=2)

    def setup(self,stage=None):
        """Create the train and validation datasets (``stage`` is ignored)."""
        self.train_dataset = self._build_dataset(self.train_path)
        self.val_dataset = self._build_dataset(self.val_path)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return self._build_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the validation dataset."""
        return self._build_loader(self.val_dataset, shuffle=False)

# 🤖 AraBart – Classe du Modèle de Résumé Automatique

Cette classe encapsule le modèle AraBART pour l'entraînement et la validation en utilisant PyTorch Lightning.

In [15]:
class AraBart(pl.LightningModule):
    """Lightning wrapper around the pretrained AraBART seq2seq model.

    Args:
        lr: learning rate handed to the AdamW optimizer.
    """

    def __init__(self, lr=0.0001):
        super().__init__()
        self.lr = lr
        self.model = AutoModelForSeq2SeqLM.from_pretrained("Jezia/AraBART-finetuned-wiki-ar")

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``."""
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )
        return output.loss, output.logits

    def configure_optimizers(self):
        """Optimize every parameter with AdamW at ``self.lr``."""
        return optim.AdamW(self.parameters(), lr=self.lr)

    def _shared_step(self, batch, stage):
        """Compute the loss for one batch and log it as ``<stage>_loss``.

        Training and validation previously duplicated this code and never
        logged the loss, so the validation pass produced no visible metric.
        """
        loss, _ = self(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            decoder_attention_mask=batch['decoder_attention_mask'],
            labels=batch['labels']
        )
        self.log(f"{stage}_loss", loss, prog_bar=True)
        return loss

    # NOTE: Lightning passes the batch index as the second positional argument;
    # the parameter keeps its original name `batch_size` for compatibility and
    # is not used.
    def training_step(self, batch, batch_size):
        """Return the training loss for ``batch`` (logged as ``train_loss``)."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_size):
        """Return the validation loss for ``batch`` (logged as ``val_loss``)."""
        return self._shared_step(batch, "val")

Installe gdown, un outil Python permettant de télécharger des fichiers à partir de Google Drive via un lien direct.


In [19]:
# Install gdown, the command-line tool used below to download the model
# weights from Google Drive.
!pip install gdown




🔹 Téléchargement des Poids du Modèle

In [18]:

# Install gdown (only needed the first time)
!pip install -q gdown

# Download the model_weights.pth file directly from Google Drive
!gdown --id 1fCA-ECnHykbTaueCsfQ-N2mf2YboYI7A -O model_weights.pth


Downloading...
From (original): https://drive.google.com/uc?id=1fCA-ECnHykbTaueCsfQ-N2mf2YboYI7A
From (redirected): https://drive.google.com/uc?id=1fCA-ECnHykbTaueCsfQ-N2mf2YboYI7A&confirm=t&uuid=c91889de-5a88-4b02-b340-e6032f67582e
To: /kaggle/working/model_weights.pth
100%|████████████████████████████████████████| 557M/557M [00:05<00:00, 98.1MB/s]


🔹 Chargement du Modèle Pré-entraîné
  * Crée une instance du modèle AraBart que tu as défini précédemment.

  * Charge les poids pré-entraînés dans le modèle avec `load_state_dict`, en spécifiant `weights_only=True` pour ne charger que les poids, et non la structure complète du modèle.

In [20]:

# First Time only

# Data module over the train/validation frames; source texts are capped at
# 1000 tokens and batches hold 2 examples.
dm = SummaryDataModule(train_path=train,
                 val_path = val,
                 text_max_token_len = 1000,
                 batch_size=2)

trained_model = AraBart()
# weights_only=True restricts unpickling to tensors and plain containers.
# map_location='cpu' lets the checkpoint load on machines without a GPU even
# if it was saved from GPU tensors; the Trainer moves the model to the right
# device afterwards.
state_dict = torch.load('model_weights.pth', map_location='cpu', weights_only=True)
trained_model.load_state_dict(state_dict)




<All keys matched successfully>

* Configure le trainer PyTorch Lightning avec 3 époques sur un seul GPU (`accelerator="gpu"`, `devices=1`), sans sauvegarde automatique de checkpoints (`enable_checkpointing=False`).

* Lance l'entraînement du modèle chargé ci-dessus en utilisant les données préparées (`dm`).

In [None]:
# Training: three epochs on a single GPU, with Lightning's automatic
# checkpointing switched off.
trainer = pl.Trainer(
    max_epochs=3,
    accelerator="gpu",
    devices=1,
    enable_checkpointing=False,
)
trainer.fit(trained_model, dm)

tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.78M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

**🔹 Sauvegarde des Poids du Modèle**
> Explication :

   1.  trained_model.state_dict() : Récupère les poids et paramètres du modèle (c'est-à-dire l'état actuel du modèle) sous forme de dictionnaire.

   2. torch.save() : Sauvegarde ce dictionnaire dans un fichier. Ici, tu sauves les poids du modèle sous le nom model_weights.pth.

In [None]:
# Persist only the parameter tensors (not the Python object) to disk.
trained_weights = trained_model.state_dict()
torch.save(trained_weights, 'model_weights.pth')

In [None]:
from IPython.display import HTML

def create_download_link(title = "Download JSON file", filename = "data.csv"):
    """Return an IPython ``HTML`` object rendering a link to ``filename``.

    Args:
        title: text shown for the link.
        filename: path used as the link target.

    The target is emitted as a quoted, HTML-escaped attribute and the title is
    escaped as well; previously both were interpolated raw into an unquoted
    ``href``, which broke for names containing spaces or markup characters.
    """
    from html import escape

    anchor = '<a href="{href}">{label}</a>'.format(
        href=escape(filename, quote=True),
        label=escape(title),
    )
    return HTML(anchor)

# Render a link to the model weights file written by torch.save above.
create_download_link(title='Download model weights', filename='model_weights.pth')


    * La fonction prend un texte comme entrée et utilise un modèle pré-entraîné AraBART pour générer un résumé.

    * Elle vérifie si un GPU est disponible et déplace le modèle et les données vers le périphérique approprié (GPU ou CPU).

    *  Le texte est tokenisé avec des paramètres qui définissent la longueur maximale et d'autres options nécessaires à la génération.

    *  La génération du résumé est effectuée en utilisant la méthode generate() avec des options pour contrôler la longueur, les répétitions et le processus de génération.

    * Le résumé est ensuite décodé à partir des IDs générés et renvoyé sous forme de texte lisible.

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Jezia/AraBART-finetuned-wiki-ar")

# Put the Lightning module into inference mode before generating.
trained_model.freeze()

def summarizeText(text, mymodel):
    """Generate a summary of ``text`` with ``mymodel`` using beam search.

    Args:
        text: input passed to the module-level ``tokenizer``.
        mymodel: model exposing ``.to(device)`` and ``.generate(...)``.

    Returns:
        The decoded output sequences concatenated into one string (no
        separator is inserted between sequences).

    The model and the encoded input are moved to CUDA when it is available,
    otherwise to the CPU. The input is truncated/padded to 1000 tokens and
    generation is capped at 1000 tokens with 5 beams.
    """
    if torch.cuda.is_available():
        target_device = torch.device('cuda')
    else:
        target_device = torch.device('cpu')

    mymodel = mymodel.to(target_device)

    encoded = tokenizer(
        text,
        max_length=1000,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )
    # Every tensor of the encoding must live on the same device as the model.
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    output_ids = mymodel.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=1000,
        num_beams=5,
        repetition_penalty=1.0,
        length_penalty=0.8,
        early_stopping=True
    )

    return "".join(
        tokenizer.decode(sequence, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for sequence in output_ids
    )

In [None]:
# Install the `rouge` package and import its Rouge scorer, which the
# evaluation cell below instantiates.
!pip install rouge
from rouge import Rouge

**charger les données JSONL et les afficher sous forme de DataFrame :**

In [None]:
# Read the JSONL test file: one JSON object per line. The encoding is given
# explicitly so the Arabic text decodes the same way on every platform, and
# blank lines are skipped instead of crashing json.loads.
with open('/kaggle/input/arabic-summarization-bbc-news/arabic_test.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]
xl_test = pd.DataFrame(data, columns=['text', 'summary'])
xl_test.head()

1. Charge les données de test à partir d'un fichier JSONL, puis en conserve un échantillon aléatoire de 10 %.

2. Pour chaque exemple, génère un résumé du texte à l'aide du modèle AraBART.

3. Affiche le texte et le résumé généré pour chaque exemple.

4. Sauvegarde les résultats (ID de l'exemple et résumé) dans un DataFrame pour analyse ou évaluation.

In [None]:
# Load the full test set (one JSON object per line). UTF-8 is requested
# explicitly for the Arabic text and blank lines are skipped.
with open('/kaggle/input/arabic-summarization-bbc-news/arabic_test.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Create DataFrame
xl_test = pd.DataFrame(data, columns=['text', 'summary'])

# Take only 10% of the dataset (fixed seed), renumbering rows from 0 so that
# example_id below matches the row position in xl_test.
xl_test = xl_test.sample(frac=0.1, random_state=42).reset_index(drop=True)

# ---- SUMMARIZATION LOOP STARTS HERE ----

# Collect plain records and build the DataFrame once at the end; growing a
# DataFrame with pd.concat inside the loop copies it on every iteration.
summary_rows = []
for i, text in enumerate(xl_test['text']):
    summary = summarizeText(text, trained_model.model)

    print("Text\n", text, "\nSummary\n", summary, "\n\n")

    summary_rows.append({'example_id': i, 'summary': summary})

model_summaries = pd.DataFrame(summary_rows, columns=['example_id', 'summary'])


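**Note: inference takes a long time. The cell below is a disabled copy of the summarization loop (wrapped in a string so it does not run), kept only for reference.**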

In [None]:
# Disabled copy of the summarization loop: the code is wrapped in a string
# literal, so none of it executes; the cell only evaluates to that string.
'''model_summaries = pd.DataFrame(columns=['example_id','summary'])
for i in range(len(xl_test)):
    sample_row = xl_test.iloc[i]
    text = sample_row['text']
    summary = summarizeText(text, trained_model.model)
    print("Text\n", text, "\nSummary\n", summary, "\n\n")
    
    # Use pd.concat instead of append
    new_row = pd.DataFrame({'example_id': [i], 'summary': [summary]})
    model_summaries = pd.concat([model_summaries, new_row], ignore_index=True)'''




1. Chargement des données : Les données de test sont chargées à partir d'un fichier JSONL, puis converties en un DataFrame contenant des textes et leurs résumés correspondants.

2. Génération des résumés : Le modèle est utilisé pour générer des résumés pour chaque texte dans le jeu de test.

3. Calcul des scores Rouge-L : Le score Rouge-L est calculé pour chaque paire de résumé généré et résumé de référence, puis affiché.

4. Calcul de la moyenne : La moyenne des scores Rouge-L est calculée pour évaluer la performance globale du modèle sur les résumés générés.

In [None]:
# Score every generated summary against its reference with ROUGE.
rouge = Rouge()
system_summaries = model_summaries["summary"].tolist()
reference_summaries = xl_test["summary"].tolist()

# One score dict per (generated, reference) pair, in row order.
scores = rouge.get_scores(system_summaries, reference_summaries)

# Keep only the ROUGE-L F-measure of each pair and report it (1-based).
rouge_l_score = [pair_scores['rouge-l']['f'] for pair_scores in scores]
for position, f_measure in enumerate(rouge_l_score, start=1):
    print("RougeL Score for Summary", position, ":", f_measure)

# Mean ROUGE-L F-measure over all pairs.
print(sum(rouge_l_score)/len(rouge_l_score))

In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-summary ROUGE-L F-measures (100 bins).
fig, ax = plt.subplots()
ax.hist(rouge_l_score, bins=100, color='blue', edgecolor='black')

# Axis labels and title
ax.set_xlabel('Rouge L Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Rouge L Scores')

# Render the figure
plt.show()


# préparer votre environnement pour utiliser des modèles de sentence-transformers pour transformer des textes en vecteurs et effectuer des comparaisons ou des analyses sémantiques.

In [None]:
# Install sentence_transformers and import the embedding model class plus the
# `util` helpers used below for cosine similarity.
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, util


**Le but principal de Sentence Transformers dans ce projet est de fournir une méthode pour évaluer la qualité des résumés générés par le modèle de résumé. Ce modèle ne génère pas de texte lui-même : il sert à mesurer à quel point les résumés générés sont proches des résumés de référence en termes de contenu sémantique.**


In [None]:
# The summaries are Arabic, so a multilingual sentence-embedding model is
# required. The previously used 'stsb-roberta-large' is an English-only model
# and its similarity scores on Arabic text are not meaningful.
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Compare the first generated summary with the first reference summary.
sentence1 = model_summaries["summary"].iloc[0]
sentence2 = xl_test["summary"].iloc[0]
# encode sentences to get their embeddings
embedding1 = model.encode(sentence1, convert_to_tensor=True)
embedding2 = model.encode(sentence2, convert_to_tensor=True)
# compute similarity scores of two embeddings
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
print("Sentence 1:", sentence1)
print("Sentence 2:", sentence2)
print("Similarity score:", cosine_scores.item())

In [None]:
# The two names were previously swapped (references were called "system" and
# vice versa). Cosine similarity is symmetric, so the scores are unaffected,
# but the names now match their contents.
reference_summaries = xl_test["summary"].tolist()
system_summaries = model_summaries["summary"].tolist()

# Encode each list in one batched call instead of one sentence at a time.
reference_embeddings = model.encode(reference_summaries, convert_to_tensor=True)
system_embeddings = model.encode(system_summaries, convert_to_tensor=True)

semantic_test = []
for i, (reference_embedding, system_embedding) in enumerate(zip(reference_embeddings, system_embeddings)):
    # compute similarity score of the i-th (reference, generated) pair
    semantic = util.pytorch_cos_sim(reference_embedding, system_embedding).item()
    semantic_test.append(semantic)
    print("Test",i,"Similarity score: ", semantic)
# Mean similarity over all pairs.
print(sum(semantic_test)/len(semantic_test))

In [None]:
# Histogram of the per-summary cosine similarities (100 bins).
fig, ax = plt.subplots()
ax.hist(semantic_test, bins=100, color='blue', edgecolor='black')

# Axis labels and title
ax.set_xlabel('Semantic Similarity Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Semantic Similarity Scores')

# Render the figure
plt.show()
