In [None]:
import evaluate
import json
import matplotlib.pyplot as plt
import os
import statistics
import itertools
import warnings

from transformers import logging
from tqdm import tqdm

# Silence noisy model-loading output: ignore the Python warning whose message
# starts with the RobertaModel "weights not initialized" text, and restrict the
# transformers logger to errors only.
warnings.filterwarnings("ignore", message="Some weights of RobertaModel were not initialized from the model checkpoint")
logging.set_verbosity_error()

# Make the project's local `prompt` module importable by adding its directory
# to the module search path.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# import `prompt` on another machine unless the path is adjusted.
import sys
sys.path.append('/nfs/home/tzulinglin/LLM_reviewer/prompt')

from prompt import PromptTemplate

Path configuration

In [None]:
# --- Experiment configuration -------------------------------------------------
attack_name = 'StyleAdv'
model_name = "gpt-4o-mini"
dataset_name = "PeerRead_iclr_2017"
# dataset_name = "AgentReview"

# Cleaned LLM-generated reviews for the selected dataset/model.
data_path = f"../data/clean_review/API/{dataset_name}/{model_name}/"
# Human-annotated reviews.
# NOTE(review): this root is specific to PeerRead ICLR 2017 and does not follow
# `dataset_name`; adjust it when switching datasets.
gt_root_dir = "../data/dataset/PeerRead/data/iclr_2017/"

# Aspect tags a review can cover; coverage = (#tags present) / len(aspect_tag_types).
aspect_tag_types=["NONE", "SUMMARY", "MOTIVATION POSITIVE", "MOTIVATION NEGATIVE", "SUBSTANCE POSITIVE", "SUBSTANCE NEGATIVE", 
                  "ORIGINALITY POSITIVE", "ORIGINALITY NEGATIVE", "SOUNDNESS POSITIVE", "SOUNDNESS NEGATIVE", 
                  "CLARITY POSITIVE", "CLARITY NEGATIVE", "REPLICABILITY POSITIVE", "REPLICABILITY NEGATIVE", 
                  "MEANINGFUL COMPARISON POSITIVE", "MEANINGFUL COMPARISON NEGATIVE"]

# Dataset-level output directory (shared by all models) and the per-model
# directory below it. Both end with '/' because other cells build file paths by
# plain string concatenation; without the trailing separator on `outfile_base`
# the shared file would be written as
# 'result_review/PeerRead_iclr_2017similarity_humanpair.jsonl'.
outfile_base = f'result_review/{dataset_name}/'
outfile_path = f'{outfile_base}{model_name}/'
os.makedirs(outfile_path, exist_ok=True)


print(f"Attack Name: {attack_name}\nModel Name: {model_name}\nDataset Name: {dataset_name}\nData Path: {data_path}\nOutput Path: {outfile_path}")

Read LLM-generated reviews (loop over every `.txt` file in the data directory)

In [None]:
# Per-paper aspect coverage ratios and parsed reviews of the LLM output.
llmAspectCoverage = []
llm_review = {}

# Sorted so that the order of papers (and of every result file derived from
# `llm_review`) is reproducible across runs, matching the ground-truth cell.
for file in sorted(os.listdir(data_path)):
    if not file.endswith('.txt'):
        continue
    # Paper id = file name up to the first dot.
    filename = os.path.basename(file).split('.')[0]
    with open(os.path.join(data_path, file), 'r') as f:
        content = f.read()
    content = PromptTemplate.parseManualReview(content)
    # presumably the first element is a mapping {aspect tag: text}; the
    # similarity cell calls `.values()` on it -- verify against PromptTemplate.
    original_output_review = content[0]
    covered = sum(1 for aspect in aspect_tag_types if aspect in original_output_review)
    llmAspectCoverage.append(covered / len(aspect_tag_types)) # for Aspect Coverage calculation
    llm_review[filename] = original_output_review # for Similarity calculation

# Show how many reviews were loaded instead of echoing the last parsed review,
# which was undefined (NameError) when the directory held no .txt file.
len(llm_review)

Read ground-truth reviews written by humans

In [None]:
# Per-paper aspect coverage ratios and raw texts of the human reviews.
gtAspectCoverage = []
gt_review = {}

for split_name in ('train', 'dev', 'test'):
    split_dir = os.path.join(gt_root_dir, split_name, "reviews_annotated/result")
    for paper_id in sorted(os.listdir(split_dir)):
        seen_aspects = set()   # distinct known aspect tags found for this paper
        review_texts = []      # one entry per line of result.jsonl

        with open(os.path.join(split_dir, paper_id, 'result.jsonl'), 'r') as f:
            for raw_line in f:
                record = json.loads(raw_line)

                # Aspect coverage: each label is a (start, end, name) triple;
                # names are normalised to the "WORD WORD" upper-case form used
                # in aspect_tag_types before being matched.
                for span in record['labels']:
                    _, _, tag = span
                    tag = tag.replace("_", " ").upper()
                    if tag in aspect_tag_types:
                        seen_aspects.add(tag)

                # Similarity: keep the text of every review line.
                review_texts.append(record['text'])

        gtAspectCoverage.append(len(seen_aspects) / len(aspect_tag_types))
        gt_review[paper_id] = review_texts

### Aspect Coverage

In [None]:
print("Aspect Coverage calculation start!")


def _coverage_summary(kind, values):
    """Summarise one list of per-paper aspect coverage ratios.

    Args:
        kind: value stored under "type" in the record ("original" or "gt").
        values: list of coverage ratios, one per paper.

    Returns:
        dict with the number of papers and the rounded mean, sample variance
        and median. "var" is None when fewer than two papers are available,
        because the sample variance is undefined in that case.

    Raises:
        ValueError: if `values` is empty.
    """
    if not values:
        raise ValueError(f"No aspect coverage values available for '{kind}'")
    variance = round(statistics.variance(values), 4) if len(values) > 1 else None
    return {
        "type": kind,
        'numbers': len(values),
        "avg": round(sum(values) / len(values), 4),
        "var": variance,
        "median": round(statistics.median(values), 4),
    }


# Coverage of the original (LLM generated) reviews.
LLM_data = _coverage_summary("original", llmAspectCoverage)

# Coverage of the ground truth (human labeled) reviews: for each paper the
# covered aspect set is divided by the total aspect set, then summarised over
# all papers.
gt_data = _coverage_summary("gt", gtAspectCoverage)


with open(outfile_path+'aspect_coverage.jsonl', 'w') as outfile:
    for record in (LLM_data, gt_data):
        json.dump(record, outfile)
        outfile.write('\n')

# Box plot of LLM vs. human aspect coverage. Tick labels are set on the axes
# rather than through boxplot(labels=...), which is deprecated since
# Matplotlib 3.9.
fig, ax = plt.subplots()
ax.boxplot([llmAspectCoverage, gtAspectCoverage])
ax.set_xticklabels([f'{model_name}', 'Human'])
ax.set_ylabel('Aspect Coverage')
ax.set_title(f'Aspect Coverage of {attack_name}')
fig.savefig(outfile_path+'aspect_coverage.png')
plt.show()

print("Aspect Coverage calculation done!")

### Similarity

Compute Similarity Metric

In [None]:
# Metrics are loaded once and reused for every comparison.
metric_rouge = evaluate.load("rouge", rouge_types=["rouge1", "rouge2", "rougeL"], keep_in_memory=True)
metric_bertscore = evaluate.load("bertscore", keep_in_memory=True)

def compute_similarity_metrics(pred_str, labels_str):
    """Compute ROUGE and BERTScore between predictions and references.

    Args:
        pred_str: list of predicted texts.
        labels_str: list of reference texts, aligned with `pred_str`.

    Returns:
        Tuple (rouge1, rouge2, rougeL, bertscore_f1). The ROUGE values are the
        scores returned by the metric for the whole batch; the BERTScore F1 is
        averaged over all pairs so that it is aggregated the same way instead
        of reporting only the first pair. For a single pair the result equals
        that pair's F1.
    """
    # Compute ROUGE scores
    rouge_output = metric_rouge.compute(predictions=pred_str, references=labels_str, use_stemmer=True)
    bertscore = metric_bertscore.compute(predictions=pred_str, references=labels_str, lang="en")
    f1_scores = bertscore["f1"]
    mean_f1 = sum(f1_scores) / len(f1_scores)
    return rouge_output["rouge1"], rouge_output["rouge2"], rouge_output["rougeL"], mean_f1

In [None]:
print("Similarity calculation start!")

# Human-vs-human similarity does not depend on the evaluated model, so it is
# stored at the dataset level and only computed when that file is missing.
# os.path.join supplies the separator whether or not `outfile_base` ends in '/'.
humanpair_path = os.path.join(outfile_base, 'similarity_humanpair.jsonl')
calculate_human_pairwise_sim = not os.path.exists(humanpair_path)

print(len(llm_review), len(gt_review))

METRIC_KEYS = ("rouge1", "rouge2", "rougeL", "bertscore")


def _best_scoring_pair(text_pairs):
    """Score every (prediction, reference) pair and keep the best one.

    Returns:
        Tuple (index, rouge1, rouge2, rougeL, bertscore) of the pair with the
        highest sum of the four metrics. The index is -1 and all scores are 0
        when no pair has a total above 0 (including an empty input).
    """
    best_index, best_total, best_scores = -1, 0, (0, 0, 0, 0)
    for index, (pred_text, ref_text) in enumerate(text_pairs):
        scores = compute_similarity_metrics([pred_text], [ref_text])
        total = sum(scores)
        if total > best_total:
            best_index, best_total, best_scores = index, total, scores
    return (best_index, *best_scores)


def _rounded_scores(scores):
    """Map the four metric values to their record keys, rounded to 4 digits."""
    return {key: round(value, 4) for key, value in zip(METRIC_KEYS, scores)}


def _write_similarity_report(path, tag, records):
    """Print the metric averages over `records` and save them with the records.

    The file starts with one plain-text line of averages, followed by one JSON
    object per record. Nothing is written when there are no records.
    """
    if not records:
        print(f"[similarity {tag}] no records to report, {path} not written")
        return
    avg = {key: round(sum(rec[key] for rec in records) / len(records), 4) for key in METRIC_KEYS}
    summary = (f"avg rouge1: {avg['rouge1']}, avg rouge2: {avg['rouge2']}, "
               f"avg rougeL: {avg['rougeL']}, avg bertscore: {avg['bertscore']}")
    print(f"[similarity {tag}] {summary}")
    with open(path, 'w') as outfile:
        outfile.write(f"{summary} \n")
        for rec in records:
            json.dump(rec, outfile)
            outfile.write('\n')


output = []          # one record per paper: LLM review vs. its closest human review
human_eval = []      # one record per paper: closest pair among its human reviews
papers_without_gt = []

for paper_id in tqdm(llm_review.keys()):
    human_reviews = gt_review.get(paper_id)
    if not human_reviews:
        # No human review to compare against; reported after the loop.
        papers_without_gt.append(paper_id)
        continue

    llm_review_text = " ".join(llm_review[paper_id].values())

    # Compare the LLM review with every human review of the paper and keep the
    # human review with the highest total score. Exactly one record is emitted
    # per paper, after all its reviews have been scored.
    chosen_review_id, *best_scores = _best_scoring_pair(
        (llm_review_text, human_text) for human_text in human_reviews
    )
    output.append({
        "paper_id": paper_id,
        "chosen_review_id": chosen_review_id,
        **_rounded_scores(best_scores),
    })

    # Human pairwise evaluation within the paper, computed once per paper.
    # Papers with fewer than two human reviews have no pair and are left out
    # rather than contributing an all-zero record to the average.
    if calculate_human_pairwise_sim and len(human_reviews) >= 2:
        _, *best_pair_scores = _best_scoring_pair(itertools.combinations(human_reviews, 2))
        human_eval.append({
            "paper_id": paper_id,
            **_rounded_scores(best_pair_scores),
        })

if papers_without_gt:
    print(f"Skipped {len(papers_without_gt)} paper(s) without ground-truth reviews: {papers_without_gt}")

# avg among all papers
_write_similarity_report(outfile_path+'similarity.jsonl', 'pred/gt', output)

if calculate_human_pairwise_sim:
    _write_similarity_report(humanpair_path, 'human', human_eval)

print("Similarity calculation done!")