BERTScore

In [None]:
import pandas as pd
from bert_score import BERTScorer

# Initialize the BERT scorer once at module level (model 'roberta-large',
# language "en"); the row-scoring function below reads it as a global.
scorer = BERTScorer(model_type='roberta-large', lang="en")

# Row-wise BERTScore helper (one scorer call per DataFrame row)
def calculate_bertscore(row):
    """Return the BERTScore F1 for a single row as a Python float.

    The row's 'ground_truth' and 'llm_generated' values are each wrapped in a
    one-element list and handed, in that order, to the module-level `scorer`.

    NOTE(review): 'ground_truth' is passed in the first position, which
    BERTScorer.score names `cands`. Only F1 is kept here, which should not
    depend on the argument order -- confirm before reusing precision/recall.
    """
    first_texts = [row['ground_truth']]
    second_texts = [row['llm_generated']]
    # The scorer returns three tensors; only the last one (F1) is used.
    _precision, _recall, f1 = scorer.score(first_texts, second_texts)
    # One pair was scored, so the mean is that single value as a float.
    return f1.mean().item()

# Apply the function to each row in the DataFrame (axis=1 passes one row at a
# time) and store the per-row F1 in a new 'bert_score' column
df_filtered['bert_score'] = df_filtered.apply(calculate_bertscore, axis=1)


In [None]:
import re

# Extract the leading integer from a text, or from the first list item that has one
def extract_first_number_from_list(row):
    """Return the first leading integer found in *row*, or None.

    *row* may be a list (or other iterable) of strings, or a single string.
    Each text is checked for a run of digits at its very start; the first
    text that has one determines the result.

    A bare string is treated as a one-item list. Iterating it directly would
    test every character on its own, so "12 Intro" would yield 1 and
    "Intro 12" would wrongly yield a digit taken from the middle of the text.
    Items that are not strings (e.g. missing values) are skipped, and a
    non-iterable cell yields None.
    """
    if isinstance(row, str):
        row = [row]
    try:
        texts = iter(row)
    except TypeError:
        # Not a list/string at all (e.g. None or NaN): nothing to search.
        return None
    for text in texts:
        if not isinstance(text, str):
            continue
        match = re.match(r'^(\d+)', text)
        if match:
            return int(match.group(1))
    return None  # Return None if no number is found

# Apply the updated function to the 'ground_truth' column and keep the
# extracted number (or None) in a new 'section' column
df_filtered['section'] = df_filtered['ground_truth'].apply(extract_first_number_from_list)

# Mean of the per-row values stored in the 'bert_score' column
average_bert_score = df_filtered['bert_score'].mean()


ROUGE score

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import pandas as pd

# Initialize ROUGE scorer (ROUGE-1, ROUGE-2 and ROUGE-L, with stemming enabled).
# NOTE: this assignment rebinds the name `rouge_scorer` from the imported
# module to the scorer instance, so after this line the module is no longer
# reachable under that name.
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Function to calculate similarity metrics for each row
def calculate_metrics(row):
    """Compute lexical similarity metrics between a row's two texts.

    Reads row['ground_truth'] and row['llm_generated'] (both strings) and
    returns a dict with the keys 'rouge1', 'rouge2', 'rougeL' (F-measures),
    'cosine_similarity', 'jaccard_similarity' and 'bleu_score'.

    Cosine similarity is reported as 0.0 when CountVectorizer cannot build a
    vocabulary from the two texts, instead of aborting the whole DataFrame
    apply with a ValueError.
    """
    reference = row['ground_truth']
    candidate = row['llm_generated']
    metrics = {}

    # ROUGE scores; `rouge_scorer` here is the module-level RougeScorer instance
    rouge_scores = rouge_scorer.score(reference, candidate)
    for name in ('rouge1', 'rouge2', 'rougeL'):
        metrics[name] = rouge_scores[name].fmeasure

    # Cosine similarity of the two bag-of-words count vectors
    try:
        counts = CountVectorizer().fit_transform([reference, candidate])
    except ValueError:
        # Raised for an empty vocabulary, i.e. neither text produced a token
        # (both empty, or nothing matching the vectorizer's token pattern).
        # Mirror the Jaccard guard below and treat that as no similarity.
        metrics['cosine_similarity'] = 0.0
    else:
        metrics['cosine_similarity'] = cosine_similarity(counts.toarray())[0, 1]

    # Whitespace tokens, shared by the Jaccard and BLEU computations
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()

    # Jaccard similarity over the sets of distinct tokens
    set1 = set(reference_tokens)
    set2 = set(candidate_tokens)
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    metrics['jaccard_similarity'] = intersection / union if union > 0 else 0

    # BLEU score with the ground truth as the single reference
    metrics['bleu_score'] = sentence_bleu([reference_tokens], candidate_tokens)

    return metrics

# Apply the function to each row in the DataFrame; the result is a Series of
# per-row metric dicts that shares df_filtered's index
metric_results = df_filtered.apply(calculate_metrics, axis=1)

# Expand the dictionaries into separate columns. The new frame must reuse the
# original index: pd.concat(axis=1) aligns on index labels, so a fresh 0..n-1
# index would pair metrics with the wrong rows (and add NaN-filled rows)
# whenever df_filtered does not carry a default RangeIndex, e.g. after
# filtering.
metric_results_df = pd.DataFrame(metric_results.tolist(), index=metric_results.index)
df_filtered = pd.concat([df_filtered, metric_results_df], axis=1)


In [None]:
# Column-wise mean of every metric, keyed by a human-readable label
average_metrics = {
    label: df_filtered[column].mean()
    for label, column in (
        ('Average ROUGE-1', 'rouge1'),
        ('Average ROUGE-2', 'rouge2'),
        ('Average ROUGE-L', 'rougeL'),
        ('Average Cosine Similarity', 'cosine_similarity'),
        ('Average Jaccard Similarity', 'jaccard_similarity'),
        ('Average BLEU Score', 'bleu_score'),
    )
}

# Last expression of the cell, so the notebook displays the dict
average_metrics


Topic based evaluation

In [None]:
from keybert import KeyBERT

# Initialize KeyBERT model (no arguments, so the library defaults apply);
# it is shared as a module-level global by the keyword extraction below
kw_model = KeyBERT()

# Keyword extraction helper built on the shared KeyBERT model
def extract_keywords(text):
    """Return the keywords KeyBERT extracts from *text*, without their scores.

    Only single-word keyphrases are requested (n-gram range (1, 1)) and
    English stop words are excluded. Uses the module-level `kw_model`.
    """
    scored_keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        stop_words='english',
    )
    # Keep element 0 of every returned entry (the keyword itself).
    return [entry[0] for entry in scored_keywords]

# Apply KeyBERT to the 'ground_truth' and 'llm_generated' columns; the keyword
# lists go into 'ground_truth_words' and 'LLM_generated_words' respectively
df_filtered['ground_truth_words'] = df_filtered['ground_truth'].apply(extract_keywords)
df_filtered['LLM_generated_words'] = df_filtered['llm_generated'].apply(extract_keywords)


In [None]:
# Jaccard similarity between the two keyword lists of a row
def jaccard_similarity(row):
    """Return |A & B| / |A | B| for the row's two keyword collections.

    A is row['ground_truth_words'] and B is row['LLM_generated_words'];
    duplicates are ignored because both are converted to sets. Returns 0
    when both collections are empty.
    """
    truth_words = set(row['ground_truth_words'])
    generated_words = set(row['LLM_generated_words'])
    combined = truth_words | generated_words
    if not combined:
        # Nothing to compare: avoid dividing by zero.
        return 0
    shared = truth_words & generated_words
    return len(shared) / len(combined)

# Apply Jaccard Similarity to each row (axis=1) and store it per row
df_filtered['jaccard_similarity_topic'] = df_filtered.apply(jaccard_similarity, axis=1)
# Last expression of the cell: the mean keyword-level Jaccard similarity
df_filtered['jaccard_similarity_topic'].mean()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to compute Cosine Similarity between the keyword lists of a row
def cosine_sim(row):
    """Return the cosine similarity of the row's two keyword lists.

    The lists in row['ground_truth_words'] and row['LLM_generated_words'] are
    each joined with spaces into one document, turned into token-count
    vectors with CountVectorizer, and compared.

    Returns 0.0 when no vocabulary can be built (for example both keyword
    lists are empty); CountVectorizer raises ValueError in that case, which
    would otherwise abort the whole DataFrame apply.
    """
    documents = [
        ' '.join(row['ground_truth_words']),
        ' '.join(row['LLM_generated_words']),
    ]
    try:
        counts = CountVectorizer().fit_transform(documents)
    except ValueError:
        # Empty vocabulary: neither document produced a token.
        return 0.0
    # Entry [0, 1] of the pairwise matrix is document 0 versus document 1.
    return cosine_similarity(counts.toarray())[0, 1]

# Apply Cosine Similarity to each row (axis=1) and store it per row
df_filtered['cosine_similarity_topic'] = df_filtered.apply(cosine_sim, axis=1)
# Last expression of the cell: the mean keyword-level cosine similarity
df_filtered['cosine_similarity_topic'].mean()


In [None]:
import re

# Collect every span wrapped in double asterisks
def extract_text_between_asterisks(text):
    """Return the substrings of *text* enclosed in a pair of double asterisks.

    Matching is non-greedy and left to right, so each ``**`` opener pairs with
    the nearest following ``**``. The result is a list of the enclosed texts
    (possibly empty strings) and is empty when nothing matches. Because ``.``
    does not match a newline, a span broken across lines is not captured.
    """
    return [found.group(1) for found in re.finditer(r'\*\*(.*?)\*\*', text)]

# Apply the function to both text columns and store the resulting lists of
# matches in new '*_extracted' columns
df_filtered['ground_truth_extracted'] = df_filtered['ground_truth'].apply(extract_text_between_asterisks)
df_filtered['llm_generated_extracted'] = df_filtered['llm_generated'].apply(extract_text_between_asterisks)


BERTScore (heading)

In [None]:
from bert_score import BERTScorer

# Initialize the BERT scorer (model 'roberta-large', language "en").
# NOTE: this rebinds the module-level name `scorer`, constructing a new
# BERTScorer each time the cell runs.
scorer = BERTScorer(model_type='roberta-large', lang="en")

# BERTScore on the texts extracted from each row
def calculate_bertscore(row):
    """Return the BERTScore F1 (as a float) for a row's extracted texts.

    The lists in row['ground_truth_extracted'] and
    row['llm_generated_extracted'] are each joined with single spaces into one
    string, and the two strings are scored as a single pair with the
    module-level `scorer`.
    """
    truth_joined = ' '.join(row['ground_truth_extracted'])
    generated_joined = ' '.join(row['llm_generated_extracted'])

    # The scorer returns three tensors; only the last one (F1) is used.
    _precision, _recall, f1 = scorer.score([truth_joined], [generated_joined])
    # One pair was scored, so the mean is that single value as a float.
    return f1.mean().item()

# Apply the function to each row (axis=1) and store the results in a new
# 'bert_score_heading' column
df_filtered['bert_score_heading'] = df_filtered.apply(calculate_bertscore, axis=1)


In [None]:
# Calculate the average of the 'bert_score_heading' column in df_filtered
average_bert_score = df_filtered['bert_score_heading'].mean()
