In [None]:
import os
import logging

import nltk
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Restrict TensorFlow's Python logger to ERROR-level messages to keep cell output readable.
tf.get_logger().setLevel(logging.ERROR)

# Set TOKENIZERS_PARALLELISM to "false" in this process's environment.
# NOTE(review): presumably this silences the Hugging Face tokenizers parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
import json

from datasets import Dataset

file_path = 'arabic_train.jsonl'

# Parse the training split: one JSON object per line.
# The file is decoded explicitly as UTF-8 so the text does not depend on the
# platform's default encoding, and blank lines (e.g. a trailing newline at the
# end of the file) are skipped instead of crashing json.loads.
with open(file_path, 'r', encoding='utf-8') as file:
    json_list = [json.loads(line) for line in file if line.strip()]

# Column-oriented view of the records; the JSON 'text' field is exposed as 'document'.
data_dict = {
    'document': [item['text'] for item in json_list],
    'summary': [item['summary'] for item in json_list],
    'id': [item['id'] for item in json_list],
}

# Build the training Dataset from the three columns.
train = Dataset.from_dict(data_dict)

# Print the dataset information
print(train)

In [None]:
import json

from datasets import Dataset

file_path = 'arabic_val.jsonl'

# Parse the validation split: one JSON object per line.
# The file is decoded explicitly as UTF-8 so the text does not depend on the
# platform's default encoding, and blank lines (e.g. a trailing newline at the
# end of the file) are skipped instead of crashing json.loads.
with open(file_path, 'r', encoding='utf-8') as file:
    json_list = [json.loads(line) for line in file if line.strip()]

# Column-oriented view of the records; the JSON 'text' field is exposed as 'document'.
data_dict = {
    'document': [item['text'] for item in json_list],
    'summary': [item['summary'] for item in json_list],
    'id': [item['id'] for item in json_list],
}

# Build the validation Dataset from the three columns.
validation = Dataset.from_dict(data_dict)

# Print the dataset information
print(validation)

In [None]:
from datasets import DatasetDict

# Collect both splits in a single DatasetDict.
# The validation data is registered under the 'test' key, which is the key the
# later cells read from.
data_dict = DatasetDict()
data_dict['train'] = train
data_dict['test'] = validation

# Show the resulting splits.
print(data_dict)


In [None]:
# Fraction of the dataset to hold out when splitting into train and test.
# NOTE(review): not referenced anywhere in the visible notebook — confirm it is still needed.
TRAIN_TEST_SPLIT = 0.1

MAX_INPUT_LENGTH = 1024  # Truncation limit passed to the tokenizer for input documents
MIN_TARGET_LENGTH = 5  # min_length passed to the summarization pipeline at inference time
MAX_TARGET_LENGTH = 128  # Truncation limit for tokenized summaries; also the pipeline's max_length
BATCH_SIZE = 4  # Batch size used when building the tf.data datasets
# Learning rate handed to the Adam optimizer.
# NOTE(review): the lr_decay schedule defined further down starts from 0.0001 instead;
# confirm which of the two values is meant to apply during training.
LEARNING_RATE = 5e-2  # Learning-rate for training our model
# NOTE(review): the only fit call in view is commented out and hard-codes epochs=1,
# so MAX_EPOCHS is currently unused.
MAX_EPOCHS = 1  # Maximum number of epochs we will train the model for

# Checkpoint on the Hugging Face Model Hub from which the tokenizer and the model are loaded.
model_name = "UBC-NLP/Arat5-msa-small"
MODEL_CHECKPOINT = model_name

In [None]:
from datasets import load_dataset
# Earlier variant that loaded the public "xsum" dataset, kept for reference:
# raw_datasets = load_dataset("xsum", split="train")
# Use the DatasetDict assembled from the local JSONL files instead.
raw_datasets = data_dict

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for MODEL_CHECKPOINT; the same checkpoint name is used to
# load the model further down.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Task prefix prepended to every input document: "summarize: " for the listed
# T5-style checkpoints, and an empty string for any other checkpoint.
prefix = (
    "summarize: "
    if MODEL_CHECKPOINT
    in [
        "t5-small",
        "t5-base",
        "t5-large",
        "t5-3b",
        "t5-11b",
        "UBC-NLP/AraT5-base",
        "UBC-NLP/Arat5-msa-small",
    ]
    else ""
)

In [None]:
def preprocess_function(examples):
    """Tokenize one batch of documents and summaries for seq2seq training.

    Args:
        examples: mapping with a "document" and a "summary" entry, each an
            iterable of strings (one item per example in the batch).

    Returns:
        The tokenizer output for the prefixed documents, extended with a
        "labels" entry holding the token ids of the tokenized summaries.

    NOTE(review): `as_target_tokenizer` is reported as deprecated in recent
    transformers releases in favour of the `text_target` argument — consider
    migrating once the installed version is confirmed.
    """
    # Prepend the module-level task prefix to every document.
    prefixed_documents = []
    for document in examples["document"]:
        prefixed_documents.append(prefix + document)

    encoded = tokenizer(prefixed_documents, max_length=MAX_INPUT_LENGTH, truncation=True)

    # Summaries are tokenized with the tokenizer switched to target mode.
    with tokenizer.as_target_tokenizer():
        encoded_summaries = tokenizer(
            examples["summary"], max_length=MAX_TARGET_LENGTH, truncation=True
        )

    encoded["labels"] = encoded_summaries["input_ids"]
    return encoded

In [None]:
# Apply preprocess_function to every split in batched mode. Later cells read both the
# original text columns and the tokenizer outputs (input_ids, attention_mask, labels)
# from the result.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)


In [None]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Load the TensorFlow sequence-to-sequence model for MODEL_CHECKPOINT.
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator used as collate_fn when the tokenized splits are converted to tf.data
# datasets; configured to return TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
def _as_tf_dataset(split, shuffle):
    """Convert one tokenized split into a batched tf.data dataset of model inputs."""
    return split.to_tf_dataset(
        batch_size=BATCH_SIZE,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
    )


train_dataset = _as_tf_dataset(tokenized_datasets["train"], shuffle=True)
test_dataset = _as_tf_dataset(tokenized_datasets["test"], shuffle=False)

# Random subset (at most 200 examples) of the test split, used for generation-based
# evaluation. The size is capped at the split length so that a split with fewer than
# 200 rows no longer makes `select` fail on out-of-range indices.
# NOTE(review): the shuffle is unseeded, so the subset differs between runs —
# pass a seed if reproducible metric values are required.
generation_subset_size = min(200, len(tokenized_datasets["test"]))
generation_dataset = _as_tf_dataset(
    tokenized_datasets["test"].shuffle().select(list(range(generation_subset_size))),
    shuffle=False,
)

In [None]:
# Length statistics of the training split, gathered in a single pass over the data.
# The lengths are len() of the raw "document" and "summary" fields (presumably
# strings, i.e. character counts), not token counts of input_ids/labels.
input_lengths = []
target_lengths = []
for example in tokenized_datasets["train"]:
    input_lengths.append(len(example["document"]))
    target_lengths.append(len(example["summary"]))

num_examples = len(input_lengths)
if num_examples == 0:
    # Fail with an explicit message instead of a bare ZeroDivisionError on the mean.
    raise ValueError("The training split is empty; length statistics are undefined.")

max_input_length = max(input_lengths)
max_target_length = max(target_lengths)

print("Max Input Length:", max_input_length)
print("Max Target Length:", max_target_length)

min_input_length = min(input_lengths)
min_target_length = min(target_lengths)

print("Min Input Length:", min_input_length)
print("Min Target Length:", min_target_length)

total_input_length = sum(input_lengths)
total_target_length = sum(target_lengths)

mean_input_length = total_input_length / num_examples
mean_target_length = total_target_length / num_examples

print("Mean Input Length:", mean_input_length)
print("Mean Target Length:", mean_target_length)


In [None]:
# Adam optimizer at the configured LEARNING_RATE.
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# No loss argument is passed to compile.
# NOTE(review): presumably the Hugging Face TF model falls back to its built-in loss — confirm.
model.compile(optimizer=optimizer)

In [None]:
import keras_nlp

# Module-level ROUGE-L metric object; metric_fn calls it with the decoded label
# and prediction strings.
rouge_l = keras_nlp.metrics.RougeL()


def metric_fn(eval_predictions):
    """Compute the ROUGE-L F1 score for one batch of generated summaries.

    Args:
        eval_predictions: a ``(predictions, labels)`` pair of token-id
            sequences. Each row of ``labels`` must support NumPy boolean
            comparison (``row < 0``); negative ids mark masked positions.

    Returns:
        A dict with a single key ``"RougeL"`` holding the ``f1_score`` entry
        of the metric result.
    """
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace masked (negative) label ids with the pad id so they can be decoded.
    # np.where builds new arrays, so the caller's label arrays are left untouched.
    pad_token_id = tokenizer.pad_token_id
    restored_labels = [np.where(row < 0, pad_token_id, row) for row in labels]
    decoded_labels = tokenizer.batch_decode(restored_labels, skip_special_tokens=True)

    # The metric object lives at module level and accumulates state across calls;
    # reset it so each evaluation reports only its own score.
    rouge_l.reset_state()
    result = rouge_l(decoded_labels, decoded_predictions)

    # Only the F1 score is reported; other aggregates are available in `result`.
    return {"RougeL": result["f1_score"]}

In [None]:
from transformers.keras_callbacks import KerasMetricCallback
from keras.callbacks import ModelCheckpoint, LearningRateScheduler
import math

# Define your learning rate decay function
def lr_decay(epoch, *, initial_learning_rate=0.0001, decay_rate=0.1, decay_steps=10):
    """Step-decay learning-rate schedule.

    The rate starts at ``initial_learning_rate`` and is multiplied by
    ``decay_rate`` once for every ``decay_steps`` completed epochs.

    The tuning parameters are keyword-only on purpose: the function still
    accepts exactly one positional argument, so a caller that passes a second
    positional value (such as the current learning rate) gets a TypeError
    rather than silently overriding ``initial_learning_rate``.

    Args:
        epoch: epoch index (0-based).
        initial_learning_rate: learning rate used before the first decay.
        decay_rate: multiplicative factor applied at each decay step.
        decay_steps: number of epochs between two decays; must be positive.

    Returns:
        The learning rate for ``epoch`` as a float.

    Raises:
        ValueError: if ``decay_steps`` is not positive.
    """
    if decay_steps <= 0:
        raise ValueError("decay_steps must be a positive number")
    completed_decays = math.floor(epoch / decay_steps)
    return initial_learning_rate * math.pow(decay_rate, completed_decays)

# Callback that evaluates metric_fn on generation_dataset using generated
# sequences (predict_with_generate=True).
metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=generation_dataset, predict_with_generate=True
)

# File the ModelCheckpoint callback writes the model weights to.
checkpoint_path = 'model_weightslllll.h5'

# Checkpoint callback: weights only, verbose output.
checkpoint_callback = ModelCheckpoint(checkpoint_path, save_weights_only=True, verbose=1)

# Scheduler callback driven by lr_decay.
# NOTE(review): lr_decay starts from 0.0001 while the optimizer was created with
# LEARNING_RATE — confirm which value is intended to apply during training.
lr_scheduler_callback = LearningRateScheduler(lr_decay)

callbacks = [metric_callback, checkpoint_callback, lr_scheduler_callback]

# Restore previously saved weights into the model. The file must exist in the
# working directory; the save_weights call that would produce it is commented
# out further down in the notebook.
model.load_weights('decay_model_weights_052.h5')


In [None]:
# model.fit(
#     train_dataset, validation_data=test_dataset, epochs=1, callbacks=callbacks
# )

In [None]:
# model.save_weights('decay_model_weights_052.h5')

In [None]:
import json
from transformers import pipeline

# Read the evaluation records: one JSON object per line, decoded as UTF-8.
file_path = 'validation_data.jsonl'
json_list = []
with open(file_path, 'r', encoding='utf-8') as file:
    for raw_line in file:
        json_list.append(json.loads(raw_line))

# Key under which the generated summary is stored in each output record.
new_summary_column = 'summary'

# Output records: a copy of each input record plus its generated summary.
updated_json_list = []

# Summarization pipeline built on the notebook's current model and tokenizer.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework="tf")

for i, record in enumerate(json_list):
    print(f"start {i}")
    print(f"paragraph:- {record['paragraph']}")

    # The pipeline returns a list of results; the first one carries the summary text.
    generated = summarizer(
        record['paragraph'],
        min_length=MIN_TARGET_LENGTH,
        max_length=MAX_TARGET_LENGTH,
    )
    summary = generated[0]['summary_text']
    print(f"new summary:- {summary}")

    # Copy the record so the input list stays unmodified, then attach the summary.
    updated_dict = dict(record)
    updated_dict[new_summary_column] = summary
    updated_json_list.append(updated_dict)
    print(f"finished {i}")

# Write the results as JSONL; ensure_ascii=False keeps non-ASCII text readable.
# NOTE(review): later cells write to this same 'predictions.jsonl' path and will
# overwrite this output when the notebook is run top to bottom.
output_file_path = 'predictions.jsonl'
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.writelines(
        json.dumps(entry, ensure_ascii=False) + '\n' for entry in updated_json_list
    )

print(f"Updated JSON data saved to: {output_file_path}")


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from arabert.preprocess import ArabertPreprocessor
import json
import jsonlines

# NOTE: this cell rebinds the notebook-level `model_name`, `tokenizer` and `model`
# to the AraBART checkpoint. Functions defined earlier that read `tokenizer`
# (preprocess_function, metric_fn) will use the AraBART tokenizer afterwards.
model_name="abdalrahmanshahrour/arabartsummarization"
preprocessor = ArabertPreprocessor(model_name="")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# The pipeline instance gets its own name so the imported `pipeline` factory is
# not shadowed by its own result.
text2text_pipeline = pipeline("text2text-generation",model=model,tokenizer=tokenizer)


file_path = 'validation_data.jsonl'

# One JSON object per line. Decode explicitly as UTF-8 (the text must not depend
# on the platform's default encoding) and skip blank lines instead of crashing.
with open(file_path, 'r', encoding='utf-8') as file:
    json_list = [json.loads(line) for line in file if line.strip()]

for record in json_list:
    text = preprocessor.preprocess(record['paragraph'])

    # Generation settings are unchanged; the first result carries the summary text.
    result = text2text_pipeline(text,
                pad_token_id=tokenizer.eos_token_id,
                num_beams=3,
                repetition_penalty=3.0,
                max_length=200,
                length_penalty=1.0,
                no_repeat_ngram_size = 3)[0]['generated_text']

    # The summary is stored next to the original paragraph, which is kept in the record.
    record['summary'] = result
    print(f'paragraph : {record["paragraph"]}')
    print(f'our_summary : {record["summary"]}')

# NOTE(review): other cells write to this same 'predictions.jsonl' path, so the
# cell executed last determines the file's final content.
with jsonlines.open('predictions.jsonl', mode='w') as writer:
    writer.write_all(json_list)
    

In [None]:
import json
import re

import jsonlines

file_path = 'validation_data.jsonl'

summary_list = []  # Per record: candidate summaries made of the first 1, 2 and 3 sentences
nnn = []  # Per record: the three-sentence candidate, used as the final summary
json_list = []  # Parsed records, rewritten below into the output format

# Single pass over the input. The file is decoded as UTF-8 and blank lines are
# skipped instead of crashing json.loads.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue
        json_obj = json.loads(line)

        # Split after '.', ':' or ';' when followed by whitespace; computed once per record.
        sentences = re.split(r'(?<=[.:;])\s', json_obj['paragraph'])
        candidate_summaries = [' '.join(sentences[:count]) for count in (1, 2, 3)]

        summary_list.append(candidate_summaries)
        nnn.append(candidate_summaries[2])
        json_list.append(json_obj)

# Replace each paragraph with its lead-3 summary.
for record, lead3_summary in zip(json_list, nnn):
    record['summary'] = lead3_summary
    del record["paragraph"]

# NOTE(review): other cells write to this same 'predictions.jsonl' path, so the
# cell executed last determines the file's final content.
with jsonlines.open('predictions.jsonl', mode='w') as writer:
    writer.write_all(json_list)
    
