In [1]:
!pip install transformers
!pip install torch
!pip install sentencepiece
!pip install jsonlines



In [2]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig
from torch.utils.data import Dataset, DataLoader
import jsonlines
import torch
from torch.utils.data import Dataset, DataLoader
#from transformers import AutoTokenizer, MBartForConditionalGeneration
from sklearn.model_selection import train_test_split

# Specify the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and preprocess the labeled dataset
def load_labeled_dataset(file_path):
    dataset = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            example_id = data['example_id']
            paragraph = data['paragraph']
            summary = data['summary']
            dataset.append((example_id, paragraph, summary))
    return dataset

# Custom dataset class
class LabeledDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained('moussaKam/AraBART')


    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        example_id, paragraph, summary = self.data[index]
        inputs = self.tokenizer.encode_plus(paragraph, truncation=True, padding='max_length', max_length=512,
                                            return_tensors='pt')
        labels = self.tokenizer.encode_plus(summary, truncation=True, padding='max_length', max_length=110,
                                            return_tensors='pt')
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels['input_ids'].squeeze()
        }

# Load and preprocess the labeled dataset
labeled_dataset = load_labeled_dataset('/content/labeled_validation_dataset.jsonl')

# Create an instance of the labeled dataset
dataset = LabeledDataset(labeled_dataset)

# Configure the model
model_config = AutoConfig.from_pretrained('moussaKam/AraBART')
model_config.num_labels = 2  # Binary classification for sequence-to-sequence model

# Create the model
model = AutoModelForSeq2SeqLM.from_pretrained('moussaKam/AraBART', config=model_config)

# Move the model to the device
model.to(device)

# Define training parameters
batch_size = 8
num_epochs = 12
learning_rate = 1e-5

# Create a data loader for training
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer.step()

        # Print the loss or any other metrics

    # Evaluate the model on validation set after each epoch
    model.eval()
    # Run validation and calculate metrics

# Save the trained model
model.save_pretrained('trained_model')


In [6]:
import json
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from torch.utils.data import Dataset, DataLoader

# Load the trained model
model_path = '/content/trained_model'
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Specify the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the device
model.to(device)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('moussaKam/AraBART')

# Load the unlabeled validation dataset
def load_validation_dataset(file_path):
    dataset = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            example_id = data['example_id']
            paragraph = data['paragraph']
            dataset.append((example_id, paragraph))
    return dataset

# Custom dataset class for validation dataset
class ValidationDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.tokenizer = AutoTokenizer.from_pretrained('moussaKam/AraBART')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        example_id, paragraph = self.data[index]
        inputs = self.tokenizer.encode_plus(paragraph, truncation=True, padding='max_length', max_length=512,
                                            return_tensors='pt')
        return {
            'example_id': example_id,
            'paragraph': paragraph,
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze()
        }

# Load the unlabeled validation dataset
validation_dataset = load_validation_dataset('/content/validation_data.jsonl')

# Create an instance of the validation dataset
dataset = ValidationDataset(validation_dataset)

# Define batch size
batch_size = 8

# Create a data loader for validation dataset
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Generate predictions
predictions = []
original_word_counts = []  # To store the word counts of original paragraphs
model.eval()
for batch in data_loader:
    example_ids = batch['example_id']
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    for example_id, inp_id, att_mask, paragraph in zip(example_ids, input_ids, attention_mask, batch['paragraph']):
        paragraph_word_count = len(tokenizer.tokenize(paragraph))

        # Calculate the target length as a percentage of the paragraph's length
        target_length = int(0.47 * paragraph_word_count)  # Adjust the percentage as needed

        # Generate a new summary with the adjusted length
        outputs = model.generate(input_ids=inp_id.unsqueeze(0), attention_mask=att_mask.unsqueeze(0),
                                 max_length=target_length)

        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        prediction = {'example_id': example_id.item(), 'summary': summary}
        predictions.append(prediction)
        original_word_counts.append(paragraph_word_count)

# Save the predictions to predictions.jsonl
with open('predictions.jsonl', 'w', encoding='utf-8') as file:
    for prediction in predictions:
        file.write(json.dumps(prediction, ensure_ascii=False) + '\n')

# Compute the Compression Ratio
target_ratio_lower = 0.3
target_ratio_upper = 0.4

compression_ratios = []
for prediction, word_count in zip(predictions, original_word_counts):
    summary = prediction['summary']
    summary_word_count = len(tokenizer.tokenize(summary))
    compression_ratio = summary_word_count / word_count
    compression_ratios.append(compression_ratio)

# Calculate the score based on the Compression Ratio
scores = []
for compression_ratio in compression_ratios:
    if target_ratio_lower <= compression_ratio <= target_ratio_upper:
        scores.append(1.0)
    else:
        score = max(0, (target_ratio_upper - compression_ratio) / (target_ratio_upper - target_ratio_lower))
        scores.append(score)

# Print the Compression Ratios and Scores
for i, prediction in enumerate(predictions):
    example_id = prediction['example_id']
    original_paragraph_length = original_word_counts[i]
    summary_length = len(tokenizer.tokenize(prediction['summary']))
    print("Example ID:", example_id)
    print("Original Paragraph Length:", original_paragraph_length)
    print("Summary Length:", summary_length)
    print("Compression Ratio:", compression_ratios[i])
    print("Score:", scores[i])
    print()

# Compute the average Compression Ratio
average_compression_ratio = np.mean(compression_ratios)
print("Average Compression Ratio:", average_compression_ratio)

# Compute the average score
average_score = np.mean(scores)
print("Average Score:", average_score)


Example ID: 0
Original Paragraph Length: 460
Summary Length: 204
Compression Ratio: 0.4434782608695652
Score: 0

Example ID: 1
Original Paragraph Length: 173
Summary Length: 77
Compression Ratio: 0.44508670520231214
Score: 0

Example ID: 2
Original Paragraph Length: 125
Summary Length: 52
Compression Ratio: 0.416
Score: 0

Example ID: 3
Original Paragraph Length: 268
Summary Length: 86
Compression Ratio: 0.3208955223880597
Score: 1.0

Example ID: 4
Original Paragraph Length: 116
Summary Length: 48
Compression Ratio: 0.41379310344827586
Score: 0

Example ID: 5
Original Paragraph Length: 249
Summary Length: 108
Compression Ratio: 0.43373493975903615
Score: 0

Example ID: 6
Original Paragraph Length: 254
Summary Length: 83
Compression Ratio: 0.32677165354330706
Score: 1.0

Example ID: 7
Original Paragraph Length: 298
Summary Length: 132
Compression Ratio: 0.4429530201342282
Score: 0

Example ID: 8
Original Paragraph Length: 227
Summary Length: 102
Compression Ratio: 0.44933920704845814
Sc