# Requirements

In [1]:
!pip install transformers datasets sentencepiece evaluate accelerate




In [2]:
!pip install evaluate



In [3]:
!pip install rouge-score



In [10]:
!pip install peft accelerate bitsandbytes sentencepiece


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [15]:
!pip install -U bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [19]:
!pip install bert-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13


# Libraries

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


In [2]:
from datasets import load_dataset, concatenate_datasets
import pandas as pd
from transformers import AutoTokenizer
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset
import evaluate
from transformers import BitsAndBytesConfig
import torch
from peft import get_peft_model, LoraConfig, TaskType
import numpy as np
from collections import defaultdict
import numpy as np
import evaluate
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


# Dataset

In [3]:

# Load only the required languages
xl_sum_en = pd.DataFrame(load_dataset("csebuetnlp/xlsum", "english", split="train"))[:10000]
xl_sum_es = pd.DataFrame(load_dataset("csebuetnlp/xlsum", "spanish", split="train"))[:10000]
xl_sum_zh =  pd.DataFrame(load_dataset("csebuetnlp/xlsum", "chinese_simplified", split="train"))[:10000]



In [4]:
dataset = pd.concat([xl_sum_en, xl_sum_es,xl_sum_zh], ignore_index=True)

In [5]:
train,test = train_test_split(dataset, test_size=0.1, random_state=42)

In [6]:


len(train),len(test)

(27000, 3000)

In [7]:
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)

# Tokenizer

In [8]:

model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_input_length = 512
max_target_length = 128


def preprocess(example):
    inputs = example["text"]
    targets = example["summary"]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train = train.map(preprocess, batched=True, remove_columns=["text", "summary", "title", "url", "id"])
test = test.map(preprocess, batched=True, remove_columns=["text", "summary", "title", "url", "id"])


Map: 100%|██████████| 27000/27000 [00:11<00:00, 2408.21 examples/s]
Map: 100%|██████████| 3000/3000 [00:01<00:00, 2587.52 examples/s]


In [9]:
train

Dataset({
    features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 27000
})

In [10]:
test

Dataset({
    features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3000
})

# Model

In [11]:
"""
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",     # best for QLoRA
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
"""

'\nbnb_config = BitsAndBytesConfig(\n    load_in_4bit=True,\n    bnb_4bit_quant_type="nf4",     # best for QLoRA\n    bnb_4bit_use_double_quant=True,\n    bnb_4bit_compute_dtype=torch.float16,\n)\n'

In [None]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)



# Rouge Metrics

In [None]:

rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Use the same tokenizer used during training
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Compute BERTScore
    bertscore_result = bertscore.compute(
        predictions=decoded_preds,
        references=decoded_labels,
           lang="multilingual"
    )

    result = {key: round(value * 100, 4) for key, value in result.items()}
    result["gen_len"] = np.mean([len(tokenizer.tokenize(pred)) for pred in decoded_preds])
    result["bertscore_f1"] = round(np.mean(bertscore_result["f1"]) * 100, 4)
    result["bertscore_precision"] = round(np.mean(bertscore_result["precision"]) * 100, 4)
    result["bertscore_recall"] = round(np.mean(bertscore_result["recall"]) * 100, 4)
    return result




In [14]:
# Disable P2P and IB communication
import os
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"


In [15]:
"""lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],  # or ["k", "q", "v"] depending on attention blocks
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Should show only a few LoRA layers trainable"""


'lora_config = LoraConfig(\n    r=8,\n    lora_alpha=16,\n    target_modules=["q", "v"],  # or ["k", "q", "v"] depending on attention blocks\n    lora_dropout=0.05,\n    bias="none",\n    task_type=TaskType.SEQ_2_SEQ_LM\n)\n\nmodel = get_peft_model(model, lora_config)\nmodel.print_trainable_parameters()  # Should show only a few LoRA layers trainable'

# Training

In [16]:
from transformers import DataCollatorForSeq2Seq

from transformers import DataCollatorForSeq2Seq
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [None]:



training_args = Seq2SeqTrainingArguments(
    output_dir="./bart-multilingual-summarizer-new-new",
    eval_strategy="steps",
    learning_rate=5e-5,
    eval_steps=1000,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="steps",
    save_steps = 1000,
    fp16=True,
    push_to_hub=False,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,            
    metric_for_best_model="rougeL",         
greater_is_better=True,   
    generation_max_length=128,   # ✅ sets generation limits
    generation_num_beams=4       # ✅ sets beam search for better outputs
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator = collator

)


  trainer = Seq2SeqTrainer(


In [None]:

trainer.train()




Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bertscore F1,Bertscore Precision,Bertscore Recall
1000,1.2739,1.276647,20.526,6.0543,14.396,14.3842,79.094333,69.886,69.2053,70.7021
2000,1.2406,1.217741,20.7074,6.1922,14.577,14.549,63.158667,69.6827,69.701,69.8179
3000,1.2133,1.179448,20.5273,5.9163,14.1318,14.1205,69.398,69.4686,69.2205,69.8453
4000,1.0149,1.157119,20.9487,6.3573,14.7509,14.7154,70.332333,70.1071,69.7391,70.6246
5000,0.9631,1.143168,21.277,6.4355,14.992,14.9601,75.328,70.2324,69.5812,71.0324
6000,0.9819,1.134357,21.6028,6.7639,15.1874,15.1845,68.886333,70.151,69.7871,70.6791
7000,0.8592,1.136339,21.4206,6.5721,15.018,15.0003,70.765333,70.4074,69.8865,71.0988
8000,0.8233,1.136148,21.4654,6.5732,15.0938,15.0959,67.596,70.0732,69.7467,70.5626
9000,0.8681,1.12289,21.8756,6.8647,15.4539,15.4252,71.496667,70.3941,69.8577,71.0993
10000,0.8587,1.122647,21.9283,7.0082,15.5159,15.5029,72.574,70.4786,69.8933,71.2287


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=10125, training_loss=1.0448691087887612, metrics={'train_runtime': 6248.3017, 'train_samples_per_second': 12.964, 'train_steps_per_second': 1.62, 'total_flos': 8.7767736385536e+16, 'train_loss': 1.0448691087887612, 'epoch': 3.0})

# Save model

In [20]:
trainer.save_model("./continue-finetuned-model")
trainer.tokenizer.save_pretrained("./continue-finetuned-model")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('./continue-finetuned-model/tokenizer_config.json',
 './continue-finetuned-model/special_tokens_map.json',
 './continue-finetuned-model/vocab.json',
 './continue-finetuned-model/merges.txt',
 './continue-finetuned-model/added_tokens.json',
 './continue-finetuned-model/tokenizer.json')

In [None]:
model.save_pretrained("./bart-multilingual-final")
tokenizer.save_pretrained("./bart-multilingual-final")

# Push to Huggingface

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_path = "/home/SSD2/mahmoudreda/bart-multilingual-final"


model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)





In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_path = "/home/SSD2/mahmoudreda/bart-multilingual-final"


model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)





In [5]:
model.push_to_hub("Mahmoud3899/bart-multilingual-final")
tokenizer.push_to_hub("Mahmoud3899/bart-multilingual-final")




model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Mahmoud3899/bart-multilingual-final/commit/4e88fea7f30959f023c2e47df280d6eb2f7cd409', commit_message='Upload tokenizer', commit_description='', oid='4e88fea7f30959f023c2e47df280d6eb2f7cd409', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Mahmoud3899/bart-multilingual-final', endpoint='https://huggingface.co', repo_type='model', repo_id='Mahmoud3899/bart-multilingual-final'), pr_revision=None, pr_num=None)

# Evaluation

In [1]:
from transformers import pipeline
summarizer = pipeline("summarization", model="bart-multilingual-final", tokenizer="bart-multilingual-final",device=-1)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [3]:
import pandas as pd
from datasets import load_dataset
test =  pd.DataFrame(load_dataset("csebuetnlp/xlsum", "english", split="train"))
len(test)

Generating train split: 100%|██████████| 306522/306522 [00:01<00:00, 187657.48 examples/s]
Generating test split: 100%|██████████| 11535/11535 [00:00<00:00, 213038.68 examples/s]
Generating validation split: 100%|██████████| 11535/11535 [00:00<00:00, 205068.06 examples/s]


11535

In [4]:
test["text"][0]

'By Kate DaileyBBC News Earlier this week, Trump posted a photo of himself sitting at a desk at Mar-a-Largo, a permanent marker hovering over a notepad. "Writing my inaugural address at the Winter White House, Mar-a-Lago, three weeks ago. Looking forward to Friday," he tweeted. Trump vows to end \'American carnage\' Trump\'s angry call to arms Full text of Trump\'s inauguration speech It\'s unclear whether the president-elect actually wrote the speech himself, but the content was pure Trump: the same populist message that resonated throughout the primaries and the campaign. "Today, we are not merely transferring power from one administration to another, or from one party to another, but we are transferring power from Washington, DC, and giving it back to you, the people," he said at the beginning of his remarks. For some on Twitter, it bore an eerie similarity to the Batman villain Bane\'s speech in The Dark Night Rises, so much so that someone posted a 10-second mash-up of the two. Bu

In [5]:
test["summary"][0]

"Donald Trump campaigned on becoming a president unlike any Washington has ever seen. With his inauguration speech, he's already set the tone."

In [6]:
summarizer(test["text"][0])

[{'summary_text': 'Donald Trump\'s inaugural speech was a call to arms for Americans to come together and fight for what he described as "the cause" of America\'s problems. It was an unusually bleak speech for a president who campaigned as an outsider, and was met with boos and boos from the crowd that greeted his arrival at the Lincoln Memorial. But what did his supporters make of it?'}]

In [25]:
import pandas as pd
from datasets import load_dataset
test =  pd.DataFrame(load_dataset("csebuetnlp/xlsum", "spanish", split="train"))
len(test)

38110

In [30]:
test["text"][9000]

'El ejército le había dado un ultimátum al presidente Morsi: 48 horas para escuchar las demandas de la gente. No está claro que rol tomará el actual presidente Mohamed Morsi durante este proceso y en el período previo a una elección presidencial libre. La agencia de noticias Reuters ha recibido información similar de fuentes militares, que subrayan que todavía se está discutiendo el plan y que podría cambiar. Las Fuerzas Armadas de Egipto advirtieron el lunes que intervendrían si el presidente Morsi no escuchaba las demandas de la gente dentro de las 48 horas. Esto después de que millones de personas salieran a las calles para exigir la renuncia del presidente.'

In [27]:
test["summary"][9000]

'La directora de ayuda humanitaria de Naciones Unidas, Valerie Amos, advirtió que de no invertir más dinero, el Programa Mundial de Alimentos tendrá que detener sus operaciones en Siria en dos meses.'

In [31]:
summarizer(test["text"][9000])

[{'summary_text': 'os en Egipto informan que el ejército le había dado un ultimátum al presidente Mohamed Morsi: 48 horas para escuchar las demandas de la gente, según fuentes.'}]

In [35]:
import pandas as pd
from datasets import load_dataset
test =  pd.DataFrame(load_dataset("csebuetnlp/xlsum", "chinese_simplified", split="train"))
len(test)

37362

In [48]:
test["text"][0]

'软件巨头微软公司的IE6网络浏览器在美国的用户不到美国互联网用户的1%，该公司为此举行了一个随意的庆祝仪式。 微软公司急于终止使用老一代的网络搜索器，鼓励用户升级使用IE8或9网络浏览器。 与此同时，微软公司的对手谷歌被迫减少在网络宣传他们自己的Chrome网络搜索器。'

In [47]:
test["summary"][0]

'微软公司做蛋糕庆祝该公司的IE6网络浏览器终止使用。'

In [46]:
summarizer(test["text"][0])

[{'summary_text': '网络搜索器（IE6）的微软公司，提出的鼓励用户。'}]

# Human Feedback & cultural/linguistic biases

## English

In [49]:
english_text = "Donald Trump delivered a speech that focused on the struggles of ordinary Americans and promised to return power to the people."
spanish_text = "Donald Trump pronunció un discurso centrado en las luchas de los estadounidenses comunes y prometió devolver el poder al pueblo."
mandarin_text = "唐纳德·特朗普发表演讲，强调普通美国人的困境，并承诺将权力还给人民。"


In [50]:
summary_en = summarizer(english_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']
summary_es = summarizer(spanish_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']
summary_zh = summarizer(mandarin_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']


Your max_length is set to 60, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 60, but your input_length is only 42. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)


In [None]:
#compare English vs Spanish summary
bert_result_en_es = bertscore.compute(predictions=[summary_en], references=[summary_es], lang="multilingual")
bert_result_en_zh = bertscore.compute(predictions=[summary_en], references=[summary_zh], lang="multilingual")


In [52]:
print("\n=== BERTSCORE SIMILARITY ===")
print("EN ↔ ES BERTScore F1:", round(bert_result_en_es['f1'][0] * 100, 2))
print("EN ↔ ZH BERTScore F1:", round(bert_result_en_zh['f1'][0] * 100, 2))


=== BERTSCORE SIMILARITY ===
EN ↔ ES BERTScore F1: 69.08
EN ↔ ZH BERTScore F1: 67.35


## Spanish

In [53]:

english_text = "The Spanish government has announced a new economic plan to reduce youth unemployment. The Economy Minister stated that the goal is to create 250,000 new jobs in the next two years through tax incentives for small and medium-sized businesses. There is also a plan for significant investment in vocational training programs for youth."
spanish_text = "El gobierno español ha anunciado un nuevo plan económico para reducir el desempleo juvenil. El ministro de Economía declaró que el objetivo es crear 250.000 nuevos puestos de trabajo en los próximos dos años mediante incentivos fiscales para las pequeñas y medianas empresas. También se prevé una inversión significativa en programas de formación profesional para jóvenes."
mandarin_text = "西班牙政府宣布了一项新的经济计划，旨在降低青年失业率。经济部长表示，目标是在未来两年内通过对中小企业的税收激励措施创造25万个新的就业岗位。该计划还包括对青年职业培训项目的大规模投资。"

In [54]:
summary_en = summarizer(english_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']
summary_es = summarizer(spanish_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']
summary_zh = summarizer(mandarin_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']


In [None]:
#compare English vs Spanish summary
bert_result_es_en = bertscore.compute(predictions=[summary_es], references=[summary_en], lang="multilingual")
bert_result_es_zh = bertscore.compute(predictions=[summary_es], references=[summary_zh], lang="multilingual")


In [56]:
print("\n=== BERTSCORE SIMILARITY ===")
print("ES ↔ EN BERTScore F1:", round(bert_result_es_en['f1'][0] * 100, 2))
print("ES ↔ ZH BERTScore F1:", round(bert_result_es_zh['f1'][0] * 100, 2))


=== BERTSCORE SIMILARITY ===
ES ↔ EN BERTScore F1: 67.3
ES ↔ ZH BERTScore F1: 67.27


## Chinese 

In [57]:

english_text = "China's Cyberspace Administration has recently issued a new regulation requiring all major tech companies to report to regulators before launching new algorithmic services. The measure aims to strengthen oversight of AI and recommendation systems to protect user privacy and prevent information manipulation. This regulation is seen as an important step in China's efforts to enhance internet governance."
spanish_text = "La Administración del Ciberespacio de China ha emitido recientemente una nueva regulación que exige a todas las principales empresas tecnológicas informar a los reguladores antes de lanzar nuevos servicios basados en algoritmos. La medida tiene como objetivo reforzar la supervisión de la inteligencia artificial y los sistemas de recomendación para proteger la privacidad de los usuarios y prevenir la manipulación de la información. Esta regulación se considera un paso importante en los esfuerzos de China por mejorar la gobernanza de Internet."
mandarin_text = "中国国家网信办近日发布了一项新的规定，要求所有大型科技公司在推出新的算法服务前，必须向监管机构报告。该措施旨在加强对人工智能和推荐算法的监管，以保护用户隐私并防止信息操控。这一规定被认为是中国在加强互联网治理方面迈出的重要一步。"

In [58]:
summary_en = summarizer(english_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']
summary_es = summarizer(spanish_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']
summary_zh = summarizer(mandarin_text, max_length=60, min_length=20, do_sample=False)[0]['summary_text']


In [59]:
bert_result_zh_es = bertscore.compute(predictions=[summary_zh], references=[summary_en], lang="multilingual")
bert_result_eh_en = bertscore.compute(predictions=[summary_zh], references=[summary_es], lang="multilingual")


In [60]:
print("\n=== BERTSCORE SIMILARITY ===")
print("ES ↔ EN BERTScore F1:", round(bert_result_zh_es['f1'][0] * 100, 2))
print("ES ↔ ZH BERTScore F1:", round(bert_result_eh_en['f1'][0] * 100, 2))


=== BERTSCORE SIMILARITY ===
ES ↔ EN BERTScore F1: 66.24
ES ↔ ZH BERTScore F1: 61.66
