### *Evaluating the fine-tuned model's outputs*

### *Installing the requirements*

In [None]:
! pip install -qqq datasets rouge-score bert-score

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the followi

In [None]:
import pandas as pd
import torch
import bert_score
from datasets import Dataset
from rouge_score import rouge_scorer

### *Loading the dataset and computing ROUGE scores*

In [None]:
#@title Function to calculate ROUGE scores

# Initialize the ROUGE scorer once; calculate_rouge_scores reuses it for every pair
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)


def _as_text(value):
    """Return ``value`` as a string, mapping None and NaN to the empty string."""
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def calculate_rouge_scores(reference, hypothesis):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L for one reference/hypothesis pair.

    Args:
        reference: The gold text to score against.
        hypothesis: The generated text being evaluated.

    Missing values (``None`` or a NaN float, which is what an empty CSV cell
    becomes after ``pd.read_csv``) are scored as empty text instead of being
    passed to the scorer as non-strings.

    Returns:
        A dict with the keys ``<metric>_precision``, ``<metric>_recall`` and
        ``<metric>_f1`` for each of ``rouge1``, ``rouge2`` and ``rougeL``.
    """
    scores = scorer.score(_as_text(reference), _as_text(hypothesis))
    return {
        'rouge1_precision': scores['rouge1'].precision,
        'rouge1_recall': scores['rouge1'].recall,
        'rouge1_f1': scores['rouge1'].fmeasure,
        'rouge2_precision': scores['rouge2'].precision,
        'rouge2_recall': scores['rouge2'].recall,
        'rouge2_f1': scores['rouge2'].fmeasure,
        'rougeL_precision': scores['rougeL'].precision,
        'rougeL_recall': scores['rougeL'].recall,
        'rougeL_f1': scores['rougeL'].fmeasure
    }

In [None]:
# Read the saved model outputs, then wrap the same frame as a `datasets.Dataset`
test_df = pd.read_csv(filepath_or_buffer="Model_output.csv")
test_df_dataset = Dataset.from_pandas(df=test_df)

In [None]:
from tqdm import tqdm

# Take the predictions from `test_df`: `df` is not assigned until the final
# section of the notebook, so referencing it here fails on a fresh kernel.
predictions = test_df["model_predictions"].tolist()

# Calculate ROUGE scores: one dict of metrics per (reference, prediction) pair
results = []

for row, prediction in tqdm(zip(test_df_dataset, predictions), total=len(predictions), desc="Calculating ROUGE scores"):
    summary = row["Perspective-based Summary"]
    rouge_scores = calculate_rouge_scores(summary, prediction)
    results.append(rouge_scores)

In [None]:
# Turn the list of per-pair score dicts into a table, one row per pair
results_df = pd.DataFrame(data=results)

In [None]:
# Attach the ROUGE score columns to the original rows by joining on the index
merged_df = test_df.merge(results_df, left_index=True, right_index=True)

### *BERT Score*

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [None]:
# References and candidates are read from the same frame so that element i of
# each list corresponds to row i of merged_df, where the scores are stored later.
summary = merged_df["Perspective-based Summary"].tolist() # actual summary
candidates = merged_df["model_predictions"].tolist() # model output

# Compute BERTScore. bert_score.score expects the candidates first and the
# references second; passing them the other way round swaps precision and recall.
precision, recall, f1 = bert_score.score(cands=candidates, refs=summary, lang='en', verbose=True, device=device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 5.07 seconds, 19.74 sentences/sec


In [None]:
# Store each BERTScore tensor on the frame as a NumPy-backed column
for column_name, score_tensor in (
    ('BERTScore_Precision', precision),
    ('BERTScore_Recall', recall),
    ('BERTScore_F1', f1),
):
    merged_df[column_name] = score_tensor.numpy()

### *Saving the scores*

In [None]:
# Write the rows plus all score columns to disk, leaving out the index column
merged_df.to_csv(path_or_buf='Evaluation_scores.csv', index=False)

### *Overall Average Scores*

In [None]:
# Reload the saved per-row scores for aggregation
df = pd.read_csv(filepath_or_buffer='Evaluation_scores.csv')

In [None]:
# Average every per-row metric column into one overall value per metric
metric_columns = [
    'rouge1_precision',
    'rouge1_recall',
    'rouge1_f1',
    'rouge2_precision',
    'rouge2_recall',
    'rouge2_f1',
    'rougeL_precision',
    'rougeL_recall',
    'rougeL_f1',
    'BERTScore_Precision',
    'BERTScore_Recall',
    'BERTScore_F1',
]
aggregated_results = {column: df[column].mean() for column in metric_columns}

# Single-row table (row label 'Value') with one column per metric
aggregated_rouge_scores = pd.DataFrame(aggregated_results, index=['Value'])
aggregated_rouge_scores