In [None]:
import pandas as pd 
import numpy as np
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
import joblib
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from torch import nn,optim
from torch.utils.data import Dataset, DataLoader
from torchmetrics.classification import F1Score
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split

from tqdm.auto import tqdm



# 📥 Load & Prepare Arabic Summarization Dataset:
**In this step, we load the Arabic news summarization dataset: the BBC CSV file is read, the `id`, `url` and `title` columns are dropped, and the `text` column is renamed to `paragraph`.**


In [None]:
# Empty two-column frame that the CSV rows are appended to below.
data = []
xl_sum = pd.DataFrame(data, columns=['text', 'summary'])

# Read the BBC Arabic news CSV and discard the metadata columns.
second = pd.read_csv('/kaggle/input/arabic-summarization-bbc-news/bbc_news_arabic_summarization.csv')
second = second.drop(columns=['id', 'url', 'title'])
xl_sum = pd.concat([xl_sum, second])

# Rename the article column to 'paragraph'; 'summary' keeps its name.
new_column_names = {'text': 'paragraph', 'summary': 'summary'}
xl_sum = xl_sum.rename(columns=new_column_names)
xl_sum.head()

# 🧹 Data Cleaning: Removing Duplicates
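Rows with a duplicated `paragraph` or a duplicated `summary` are dropped. `.describe()` then provides a summary of the remaining data; for text columns it reports the count of non-null entries, the number of unique values, the most frequent value and its frequency, which is useful to confirm the dataset size after de-duplication.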

In [None]:
# Drop rows that repeat an earlier paragraph, then rows that repeat an earlier summary.
for column_name in ('paragraph', 'summary'):
    xl_sum = xl_sum.drop_duplicates(subset=column_name)
xl_sum.describe()

# 🧼 Text Preprocessing Functions for Arabic Summarization
To prepare our dataset for training, we define a set of functions for cleaning and standardizing Arabic text:

In [None]:
# Number of text_prepare() calls made so far; text_prepare() increments it
# and prints a sample of its output on every 100th call.
counter = 0
def delete_links(input_text):
    """Replace every URL-like span in ``input_text`` with a single space.

    The pattern (case-insensitive) recognises spans that start with
    ``http://`` / ``https://``, with ``www.`` (optionally ``www`` followed by
    up to three digits), or with a ``domain.tld/`` prefix. Trailing
    punctuation such as ``.`` or ``,`` is not considered part of the link.

    Args:
        input_text: Text that may contain links.

    Returns:
        The text with each matched link replaced by ``' '``.
    """
    url_regex = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    return re.sub(url_regex, ' ', input_text)

def delete_repeated_characters(input_text):
    """Shorten any run of three or more identical characters to exactly two.

    Runs of one or two characters are left untouched. Newlines are never
    part of a run because ``.`` does not match them.

    Args:
        input_text: Text to normalise.

    Returns:
        The text with elongated character runs collapsed.
    """
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(r"\1\1", input_text)

def remove_extra_spaces(input_text):
    """Collapse repeated spaces and keep only purely alphabetic tokens.

    Besides squeezing runs of spaces, this tokenises the text with
    ``nltk.word_tokenize`` and discards every token for which
    ``str.isalpha()`` is false (numbers, punctuation, mixed tokens).

    Args:
        input_text: Text to normalise.

    Returns:
        The surviving tokens joined by single spaces.
    """
    squeezed = re.sub(' +', " ", input_text)
    alphabetic_tokens = [token for token in nltk.word_tokenize(squeezed) if token.isalpha()]
    return ' '.join(alphabetic_tokens)

def replace_letters(input_text):
    """Normalise Arabic letter variants to a single canonical form.

    Alef with hamza above, alef with hamza below and alef with madda are
    mapped to bare alef; teh marbuta is mapped to heh. All other characters
    are returned unchanged.

    The previous implementation built a regex alternation from a mapping that
    also contained an empty-string key, which made the pattern match the empty
    string at every position of the input. A translation table performs the
    same one-to-one substitutions without that pointless work.

    Args:
        input_text: Text to normalise.

    Returns:
        The text with the letter variants replaced.
    """
    letter_map = str.maketrans({
        "\u0623": "\u0627",  # alef with hamza above -> alef
        "\u0629": "\u0647",  # teh marbuta -> heh
        "\u0625": "\u0627",  # alef with hamza below -> alef
        "\u0622": "\u0627",  # alef with madda above -> alef
    })
    return input_text.translate(letter_map)

def clean_text(input_text):
    """Replace every character that is not an Arabic letter or a digit with a space.

    Characters kept: U+0621-U+064A, the digit ranges U+0660-U+0669 and
    U+06F0-U+06F9, and ASCII ``0-9``. Every other character (Latin letters,
    punctuation, whitespace, diacritics, ...) is replaced one-for-one by a
    space, so the output has the same length as the input.

    Args:
        input_text: Text to filter.

    Returns:
        The filtered text.
    """
    disallowed = r'[^\u0621-\u064A\u0660-\u0669\u06F0-\u06F90-9]'
    return re.sub(disallowed, " ", input_text)

def remove_vowelization(input_text):
    """Remove Arabic diacritics (tashkeel) and the tatweel elongation mark.

    Removed code points: U+064B-U+0652 (fathatan, dammatan, kasratan, fatha,
    damma, kasra, shadda, sukun) and U+0640 (tatweel). Everything else,
    including whitespace, is left untouched.

    The marks are written as explicit code-point escapes: the previous pattern
    embedded the invisible combining characters literally and depended on
    ``re.VERBOSE`` to ignore a stray leading space, which is easy to corrupt
    when the file is edited or re-encoded.

    Args:
        input_text: Text that may contain diacritics.

    Returns:
        The text without diacritics and tatweel.
    """
    marks = re.compile('[\u064B-\u0652\u0640]')
    return marks.sub('', input_text)

def delete_stopwords(input_text):
    """Lemmatise whitespace-separated tokens and drop Arabic/English stop words.

    Each token is passed through NLTK's ``WordNetLemmatizer`` and the
    resulting lemma is kept only if it is not in the union of NLTK's Arabic
    and English stop-word lists.

    Args:
        input_text: Text to filter.

    Returns:
        The kept lemmas joined by single spaces.
    """
    arabic_stops = nltk.corpus.stopwords.words("arabic")
    english_stops = nltk.corpus.stopwords.words("english")
    stop_words = set(arabic_stops + english_stops)

    tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(input_text)
    lemmatizer = nltk.WordNetLemmatizer()

    kept = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token)
        if lemma not in stop_words:
            kept.append(lemma)
    return ' '.join(kept)

def stem_text(input_text):
    """Stem every whitespace-separated token with NLTK's ISRI stemmer.

    Args:
        input_text: Text to stem.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # ISRIStemmer is not imported anywhere in the visible notebook, so the
    # bare name raised NameError; import it here so the function is usable.
    from nltk.stem.isri import ISRIStemmer

    st = ISRIStemmer()
    tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(input_text)
    return ' '.join(st.stem(w) for w in tokens)


def text_prepare(input_text, ar_text):
    """Clean one text value and occasionally print a progress sample.

    Only ``clean_text`` is applied; link removal, repeated-character
    collapsing, stop-word removal and extra-space removal are defined in this
    notebook but are not part of the active pipeline.

    Args:
        input_text: Raw text to clean.
        ar_text: Accepted for the caller's convenience; not used here.

    Returns:
        The cleaned text.
    """
    global counter
    counter = counter + 1

    out_text = clean_text(input_text)

    # Show every 100th cleaned text so progress is visible during .apply().
    if counter % 100 == 0:
        print(counter, '\n', out_text)
    return out_text

# 🧹 Applying Preprocessing to the Dataset
* Applies the text_prepare() function to both the paragraph and summary columns.

* args=(True,) passes the ar_text parameter (not used in the function logic currently).

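* This step cleans all text data in preparation for model training. Only `clean_text()` is currently active inside `text_prepare()`; the other cleaning functions are defined but commented out of the pipeline.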

In [None]:
# Clean both text columns; the extra positional argument fills text_prepare's ar_text parameter.
for text_column in ('paragraph', 'summary'):
    xl_sum[text_column] = xl_sum[text_column].apply(text_prepare, args=(True,))

# mBART Model: Fine-Tuning

In [None]:
pip install -U accelerate

In [None]:
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    MBartForConditionalGeneration,
    MBartTokenizer,
    Trainer,
    TrainingArguments,
)
from sklearn.model_selection import train_test_split

MODEL_NAME = "facebook/mbart-large-50"
LANG_CODE = "ar_AR"        # articles and summaries are both Arabic
MAX_INPUT_LENGTH = 512     # tokens kept per article
MAX_TARGET_LENGTH = 128    # tokens kept per summary

# Hold out 10% of the cleaned data (xl_sum comes from the cells above);
# only the training part is used in this cell.
train_df, _ = train_test_split(xl_sum, test_size=0.1, random_state=42)

# Keep only 30% of the training rows to shorten the run.
train_df = train_df.sample(frac=0.3, random_state=42).reset_index(drop=True)

# Convert to a Hugging Face Dataset.
train_dataset = Dataset.from_pandas(train_df)

# Load the tokenizer through AutoTokenizer so the class matching this
# checkpoint is used, as the evaluation cell of this notebook already does.
# MBartTokenizer is a different tokenizer class from the one this checkpoint
# is published with.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The task is Arabic -> Arabic summarization, so source and target language
# codes are both Arabic (the source was previously set to English).
tokenizer.src_lang = LANG_CODE
tokenizer.tgt_lang = LANG_CODE


def preprocess_function(examples):
    """Tokenize a batch of paragraphs and summaries for seq2seq training.

    Args:
        examples: Batch mapping with "paragraph" and "summary" lists
            (``Dataset.map`` is called with ``batched=True``).

    Returns:
        The tokenized inputs with an added "labels" entry. Padding positions
        in the labels are set to -100 so the loss ignores them.
    """
    # Encoder inputs (texts to summarize).
    model_inputs = tokenizer(
        examples["paragraph"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # Decoder targets (summaries); text_target replaces the deprecated
    # as_target_tokenizer() context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # Without this masking the model is also trained to predict padding tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


# Tokenize the training data.
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=["paragraph", "summary"])

# Load the mBART model for conditional generation.
model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)

# Training arguments.
# NOTE(review): learning_rate=2e-4 is kept from the original cell; confirm it is
# intended for full fine-tuning of a model this size with fp16.
training_args = TrainingArguments(
    output_dir="./results",  # where checkpoints are written
    do_train=True,
    do_eval=False,  # no evaluation during training
    per_device_train_batch_size=6,  # small batch to fit in memory
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_dir="./logs",  # where logs are written
    logging_steps=50,
    save_total_limit=2,
    save_strategy="epoch",
    fp16=True,

    report_to="none"  # disable reporting to wandb
)


# Initialise the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
)

# Start training.
trainer.train()


# Model 2: Saving and Evaluating the Fine-Tuned Model

In [None]:
#model.save_pretrained("/kaggle/working/mbart_summary_model")
#tokenizer.save_pretrained("/kaggle/working/mbart_summary_model")



In [None]:
# Save only the model weights
#!rm -rf /kaggle/working/results

In [None]:
#torch.save(model.state_dict(), "model_weights.pth")


In [5]:
!pip install rouge-score




In [7]:
from transformers import MBartForConditionalGeneration, AutoTokenizer
import torch
import json
import pandas as pd
from rouge_score import rouge_scorer

# Load the tokenizer.
# NOTE(review): no source-language code is set here, so the tokenizer's default
# is used; confirm it matches the language codes used during fine-tuning.
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")

# Load the base mBART model first, then overwrite its weights with the fine-tuned ones.
model_path = "/kaggle/input/model-weights/model_weights(2).pth"
trained_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")

# map_location="cpu" lets weights saved from a GPU session load on a CPU-only
# machine (summarizeText moves the model to the right device later).
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a saved state_dict needs.
trained_model.load_state_dict(
    torch.load(model_path, map_location="cpu", weights_only=True)
)

# Switch to inference mode (disables dropout); this does not freeze parameters.
trained_model.eval()

# Define summarizeText function for mBART
def summarizeText(text, mymodel, max_input_length=1000, max_summary_length=1000):
    """Generate a summary of ``text`` with ``mymodel`` using beam search.

    Uses the module-level ``tokenizer`` for encoding and decoding.

    Args:
        text: The text to summarize.
        mymodel: Model exposing ``.to(device)`` and ``.generate(...)``.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_summary_length: Upper bound passed to ``generate`` as ``max_length``.
            NOTE(review): the training cell of this notebook tokenizes with
            512 input / 128 target tokens; consider passing those values here.

    Returns:
        The decoded summary as a single string.
    """
    # Use a GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    mymodel = mymodel.to(device)

    # A single sequence needs no padding; padding it to the maximum length only
    # makes the encoder process positions that the attention mask hides anyway.
    text_encoding = tokenizer(
        text,
        max_length=max_input_length,
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # Move the encoded tensors to the same device as the model.
    text_encoding = {key: val.to(device) for key, val in text_encoding.items()}

    generated_ids = mymodel.generate(
        input_ids=text_encoding['input_ids'],
        attention_mask=text_encoding['attention_mask'],
        max_length=max_summary_length,
        num_beams=5,
        repetition_penalty=1.0,
        length_penalty=0.8,
        early_stopping=True
    )

    # Decode every returned sequence and concatenate the results.
    preds = [
        tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]

    return "".join(preds)

# Load the test dataset (one JSON object per line).
# The encoding is given explicitly because the file holds Arabic text and
# open() would otherwise fall back to the platform's default encoding.
with open('/kaggle/input/arabic-summarization-bbc-news/arabic_test.jsonl', 'r', encoding='utf-8') as f:
    # Skip blank lines so a trailing newline does not break json.loads.
    data = [json.loads(line) for line in f if line.strip()]

# Create DataFrame
xl_test = pd.DataFrame(data, columns=['text', 'summary'])

# Take only 10% of the dataset
xl_test = xl_test.sample(frac=0.1, random_state=42).reset_index(drop=True)

# ---- SUMMARIZATION LOOP STARTS HERE ----

# Collect one record per example and build the DataFrame once at the end;
# concatenating onto a DataFrame inside the loop copies it on every iteration.
summary_records = []

# Generate summaries using the model
for i in range(len(xl_test)):
    sample_row = xl_test.iloc[i]
    text = sample_row['text']
    summary = summarizeText(text, trained_model)

    # Printing the text and summary for debugging
    print("Text\n", text, "\nSummary\n", summary, "\n\n")

    summary_records.append({'example_id': i, 'summary': summary})

# DataFrame holding the model summaries
model_summaries = pd.DataFrame(summary_records, columns=['example_id', 'summary'])

import re


class _UnicodeWordTokenizer:
    """Tokenizer for RougeScorer that keeps words of any script.

    rouge-score's default tokenizer keeps only lowercase ASCII letters and
    digits, so Arabic text is reduced to an empty token list and every ROUGE
    score comes out as 0.
    """

    def tokenize(self, text):
        # \w matches Unicode word characters, including Arabic letters and digits.
        return re.findall(r"\w+", text)


# Create the ROUGE scorer with the Unicode-aware tokenizer. The built-in
# stemmer is not requested: it only applies to the default tokenizer.
scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"],
    tokenizer=_UnicodeWordTokenizer(),
)

# Extract the system and reference summaries
system_summaries = model_summaries["summary"].tolist()
reference_summaries = xl_test["summary"].tolist()

# Calculate ROUGE scores for the summaries (reference first, then prediction)
scores = [
    scorer.score(reference, candidate)
    for reference, candidate in zip(reference_summaries, system_summaries)
]

# Print the ROUGE scores for each summary
for i, score in enumerate(scores):
    print(f"Summary {i + 1} ROUGE scores: {score}")

# Calculate the average Rouge-L score (0.0 when there is nothing to average)
rouge_l_score = [score['rougeL'].fmeasure for score in scores]
average_rouge_l = sum(rouge_l_score) / len(rouge_l_score) if rouge_l_score else 0.0
print(f"Average Rouge-L F-Score: {average_rouge_l}")


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Text
 وستحتفظ ماكينزي لنفسها بحصة 4% من أسهم شركة أمازون التي أسسها بيزوس قبل 25 عاما. وأعلنت ماكينزي الاتفاق في تغريدة نشرتها على حسابها على موقع تويتر وهي التغريدة الأولى والوحيدة لها على الموقع منذ اشتركت فيه قبل أيام. وعاش جيف وماكينزي بيزوس معا منذ مطلع التسعينات من القرن الماضي وأنجبا 4 أبناء. وتعتبر امازون واحدة من أقوى مواقع المبيعات على شبكة الإنترنت وبلغ حجم مبيعاتها العام الماضي نحو 232 مليار دولار وقد ساهمت الشركة في تضخم ثروة بيزوس وأسرته منذ أسسها عام 1994 لتصبح أكثر من 131 مليار دولار حسب مجلة فوربس الامريكية. أما ماكينزي بيزوس فهي مؤلفة وروائية ناجحة ولها روايتان منشورتان حيث تعلمت فن القص في جامعة برينستون على يدي توني موريسون الفائزة بجائزة نوبل. ووصفتها موريسون قائلة إنها "بالفعل واحدة من أفضل الطلبة الذين درست لهم فن الكتابة الإبداعية". وتعد هذه التسوية هي الأعلى في التاريخ منذ انفصال تاجر التحف أليك ويلدرستاين وزوجته السابقة عام 1999 والتي دفع بموجبها لزوجته 3.8 مليار دولار. 
Summary
 أعربت شركة غوغل عن قلقها الشديد إزاء ما وصفته بـ الاختراق الواسع الذي تعرضت له شر