In [1]:
import pandas as pd
from transformers import RobertaForSequenceClassification, RobertaTokenizer, BartForConditionalGeneration, BartTokenizer,MBartForConditionalGeneration, MBart50TokenizerFast
import torch

# Summarisation model: weights come from the local fine-tuned checkpoint,
# while the tokenizer is loaded from the base 'facebook/bart-large-cnn' files.
# NOTE(review): this presumes fine-tuning did not alter the vocabulary -- confirm.
bart_model_name = "fine_tuned_bart_model_eng_to_engSummary"
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)

# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# generate_summary() sends its input ids to `device`; the model weights must
# live on that same device or generation fails with a device-mismatch error
# on any machine where CUDA is available.
bart_model = bart_model.to(device)


def generate_summary(text, max_length=100):
    """Summarise ``text`` with the module-level BART model and tokenizer.

    Args:
        text: The document to summarise.
        max_length: Maximum number of *input* tokens kept by the tokenizer;
            longer inputs are truncated. This does not bound the summary
            length, which is fixed by the generation settings below
            (between 10 and 150 tokens).

    Returns:
        The decoded summary string, with special tokens removed.
    """
    inputs = bart_tokenizer(text, max_length=max_length, return_tensors="pt", truncation=True)

    # Follow the model's own device instead of a separately tracked global:
    # the inputs then always match the weights, whether or not the model has
    # been moved to the GPU (or has been re-loaded on the CPU since).
    model_device = bart_model.device
    input_ids = inputs.input_ids.to(model_device)
    attention_mask = inputs.attention_mask.to(model_device)

    summary_ids = bart_model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=150,
        min_length=10,
        length_penalty=2.0,
        num_beams=5,
        early_stopping=True,
    )

    # Only one sequence was submitted, so only the first result is decoded.
    return bart_tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

# Source table for summarisation; the row-wise helper below reads its "Text" column.
df = pd.read_csv(filepath_or_buffer="TOSDR_labeled_with_summaries.csv")

def generate_summary_for_row(row):
    """Return the generated summary for the ``"Text"`` field of ``row``.

    Thin adapter so ``generate_summary`` can be used with a row-wise
    ``DataFrame.apply``.
    """
    return generate_summary(row["Text"])


# Run the summariser once per row and keep the result next to the source text.
df["gen_summary"] = df.apply(generate_summary_for_row, axis="columns")

# Persist the augmented table (without the index) for the evaluation step.
df.to_csv(path_or_buf="TOSDR_labeled_with_summaries_and_gen.csv", index=False)


  from .autonotebook import tqdm as notebook_tqdm


In [30]:
import pandas as pd
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from transformers import BartTokenizer, BartForConditionalGeneration

# Read back the table that holds both reference and generated summaries.
dataset_path = "TOSDR_labeled_with_summaries_and_gen.csv"
data = pd.read_csv(dataset_path)

# Pull both summary columns out as plain Python lists (reference first).
eng_summaries, gen_summaries = (
    data[column].tolist() for column in ("eng_summary", "gen_summary")
)

# Fine-tuned BART weights plus the base-checkpoint tokenizer.
bart_model_name = "fine_tuned_bart_model_eng_to_engSummary"
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Tokenize the summaries
def tokenize_summaries(summaries, tokenizer):
    """Encode ``summaries`` with ``tokenizer`` as a padded PyTorch batch.

    Truncation is requested without an explicit ``max_length``, so whether
    anything is actually cut depends on the tokenizer's own configured limit.

    Args:
        summaries: The texts to encode.
        tokenizer: Callable tokenizer applied to the whole batch at once.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    encode_options = {"return_tensors": "pt", "truncation": True, "padding": True}
    return tokenizer(summaries, **encode_options)

# Encode the reference summaries first, then the generated ones.
eng_tokenized, gen_tokenized = (
    tokenize_summaries(texts, bart_tokenizer)
    for texts in (eng_summaries, gen_summaries)
)


def calculate_rouge(references, candidates):
    """Compute per-pair ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    References and candidates are paired positionally with ``zip``, so any
    surplus items in the longer sequence are ignored.

    Args:
        references: Reference summaries.
        candidates: Generated summaries, aligned with ``references``.

    Returns:
        A tuple of three lists ``(rouge1, rouge2, rougeL)``, each holding one
        F-measure per scored pair.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)

    # Score every pair once, then split the results out per metric.
    pair_scores = [
        scorer.score(reference, candidate)
        for reference, candidate in zip(references, candidates)
    ]
    return tuple(
        [scores[name].fmeasure for scores in pair_scores]
        for name in metric_names
    )

rouge1, rouge2, rougeL = calculate_rouge(eng_summaries, gen_summaries)

# Report the mean F-measure of each ROUGE variant over all scored pairs.
for label, values in (
    ("ROUGE-1 Score:", rouge1),
    ("ROUGE-2 Score:", rouge2),
    ("ROUGE-L Score:", rougeL),
):
    print(label, sum(values)/len(values))


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ROUGE-1 Score: 0.405466266173145
ROUGE-2 Score: 0.19073618897552436
ROUGE-L Score: 0.33643418896114047
