In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np

# Names of the nine quality characteristics that are scored. The order of this
# list fixes the column order of the per-fold score matrices and the order of
# the x-axis ticks in the plots.
chars = ['Relevancy', 'Correctness', 'Completeness', 'Informativeness', 'Integration', 'Cohesion', 'Coherence', 'Readability', 'Conciseness']

# Number of evaluation folds per setting; fold keys are built as
# f'{fold_prefix}{setting}-1' .. f'{fold_prefix}{setting}-{FOLD_NO}'.
FOLD_NO = 3

def read_json(input_path):
    """
    Load a UTF-8 encoded ``json`` file and return its parsed content.

    :param input_path: Location of the json file on disk.
    :return: The deserialized json object.
    """
    with open(input_path, encoding="utf-8") as handle:
        return json.load(handle)

def word_counter_calculator(column):
    """
    Flag, for every text in ``column``, whether it is too long or too short.

    Words are obtained by splitting the stripped text on whitespace.

    :param column: Iterable of strings.
    :return: Dict with two parallel 0/1 lists: ``"wc_upper"`` is 1 where a text
        has more than 200 words, ``"wc_lower"`` is 1 where it has fewer than 50.
    """
    word_counts = [len(text.strip().split()) for text in column]
    return {
        "wc_upper": [int(count > 200) for count in word_counts],
        "wc_lower": [int(count < 50) for count in word_counts],
    }

def evaluation_report(result_list, model, setting, fold_prefix='gpt-4-eval-', fold_no=None):
    """
    Print summary statistics for one model/setting and return its averaged scores.

    For every entry, the ratings stored under the fold keys
    ``f'{fold_prefix}{setting}-1'`` .. ``f'{fold_prefix}{setting}-{fold_no}'`` are
    averaged per characteristic; those per-entry averages are then averaged over
    all entries of ``result_list``.

    :param result_list: Non-empty list of result entries. Each entry must hold a
        ``'basic-eval'`` dict (with ``'word-count'`` and ``'paper-structure'``)
        and one dict per fold key. A fold dict maps characteristic ->
        ``{'rating': ...}`` either directly or nested under ``'eval-result'``;
        both layouts are accepted for every fold.
    :param model: Model name; only used in the printed header.
    :param setting: Setting name used to build the fold keys (e.g. ``'s1'``).
    :param fold_prefix: Prefix of the fold keys.
    :param fold_no: Number of folds to average over. ``None`` (default) uses the
        notebook-level ``FOLD_NO``.
    :return: Tuple ``(scores, results_dict)``: ``scores`` maps each characteristic
        to its mean rating, ``results_dict`` holds the raw per-entry word-count
        buckets, word counts and paper-structure scores.
    :raises ValueError: If ``result_list`` is empty or ``fold_no`` is smaller than 1.
    """
    if not result_list:
        raise ValueError("result_list must contain at least one entry")
    if fold_no is None:
        fold_no = FOLD_NO
    if fold_no < 1:
        raise ValueError("fold_no must be a positive integer")

    print("MODEL:", model)
    print("SETTING:", setting)

    characteristics = (
        "Relevancy", "Correctness", "Completeness",
        "Informativeness", "Integration", "Cohesion",
        "Coherence", "Readability", "Conciseness",
    )
    # Word-count buckets copied verbatim from each entry's 'basic-eval'.
    bucket_names = (
        "WC>200", "WC<50", "50<=WC<150", "150<=WC<=220",
        "220<=WC<=250", "150<=WC<=250", "WC>250",
    )

    results_dict = {"word-count": []}
    for bucket in bucket_names:
        results_dict[bucket] = []
    results_dict["paper-structure-score"] = []

    fold_keys = [f'{fold_prefix}{setting}-{fold}' for fold in range(1, fold_no + 1)]
    scores = dict.fromkeys(characteristics, 0)

    for inf in result_list:
        word_count = inf['basic-eval']['word-count']
        for bucket in bucket_names:
            results_dict[bucket].append(word_count[bucket])
        results_dict['word-count'].append(word_count['count'])
        results_dict['paper-structure-score'].append(inf['basic-eval']['paper-structure'])

        fold_totals = dict.fromkeys(characteristics, 0)
        for fold_key in fold_keys:
            fold = inf[fold_key]
            # Some folds store the ratings directly, others nest them under
            # 'eval-result'; detect the layout instead of special-casing one
            # model/fold combination.
            ratings = fold['eval-result'] if 'eval-result' in fold else fold
            for key, score in ratings.items():
                # An empty rating string is counted as 5 (presumably the top of
                # the rating scale — TODO confirm).
                fold_totals[key] += 5 if score['rating'] == '' else int(score['rating'])

        for key, total in fold_totals.items():
            scores[key] += total / fold_no

    for metric in scores:
        scores[metric] /= len(result_list)

    # (printed label, results_dict key) pairs, in the order they are reported.
    report_lines = (
        ("averaged WC>200:", "WC>200"),
        ("averaged word counts:", "word-count"),
        ("averaged 150<=WC<=250:", "150<=WC<=250"),
        ("averaged WC<50:", "WC<50"),
        ("averaged 50<=WC<150:", "50<=WC<150"),
        ("averaged 150<=WC<=220:", "150<=WC<=220"),
        ("averaged 220<=WC<=250:", "220<=WC<=250"),
        ("averaged WC>250:", "WC>250"),
        ("averaged paper-structure:", "paper-structure-score"),
    )
    for label, key in report_lines:
        print(label, sum(results_dict[key]) / len(results_dict[key]))

    return scores, results_dict

def get_avg_scores_per_fold(result_list, setting):
    """
    Average the ratings of a single fold over all entries of ``result_list``.

    :param result_list: Non-empty list of result entries. ``entry[setting]`` must
        map characteristic -> ``{'rating': ...}``, either directly or nested
        under an ``'eval-result'`` key.
    :param setting: Full fold key to read from each entry (e.g. ``'gpt-4-eval-s1-1'``).
    :return: Dict mapping each of the nine characteristics to its mean rating.
    :raises ValueError: If ``result_list`` is empty.
    """
    if not result_list:
        raise ValueError("result_list must contain at least one entry")

    avg_scores = {
        "Relevancy": 0, "Correctness": 0, "Completeness": 0,
        "Informativeness": 0, "Integration": 0, "Cohesion": 0,
        "Coherence": 0, "Readability": 0, "Conciseness": 0
    }
    for inf in result_list:
        fold = inf[setting]
        # Pick the layout explicitly rather than via a bare ``except``, so that
        # unrelated errors are not silently swallowed.
        ratings = fold['eval-result'] if 'eval-result' in fold else fold
        for key, score in ratings.items():
            # An empty rating string is counted as 5 (presumably the top of the
            # rating scale — TODO confirm).
            avg_scores[key] += 5 if score['rating'] == '' else int(score['rating'])

    for metric in avg_scores:
        avg_scores[metric] /= len(result_list)

    return avg_scores

def build_setting_per_fold_matrix(result_list, setting, chars, fold_prefix='gpt-4-eval-'):
    """
    Collect the per-fold average scores of one setting into a list of rows.

    :param result_list: Result entries, forwarded to ``get_avg_scores_per_fold``.
    :param setting: Setting name used to build the fold keys
        ``f'{fold_prefix}{setting}-1'`` .. ``f'{fold_prefix}{setting}-{FOLD_NO}'``.
    :param chars: Characteristic names; fixes the column order of every row.
    :param fold_prefix: Prefix of the fold keys.
    :return: List with ``FOLD_NO`` rows, each holding one average per entry of ``chars``.
    """
    matrix = []
    for fold_index in range(1, FOLD_NO + 1):
        fold_key = f'{fold_prefix}{setting}-{fold_index}'
        fold_avg = get_avg_scores_per_fold(result_list, fold_key)
        matrix.append([fold_avg[name] for name in chars])
    return matrix

def plot_s1_vs_s2(setting1_scores, setting2_scores, model, chars):
    """
    Scatter-plot the per-fold and mean scores of two evaluation settings.

    For each characteristic, Setting-1 points are drawn slightly left of the tick
    (blue per fold, green for the mean) and Setting-2 points slightly right (red
    per fold, black for the mean). The figure is saved to
    ``images/{model}-s1-vs-s2.jpg`` (the ``images`` directory is created when
    missing) and then shown.

    :param setting1_scores: 2-D array-like, one row per fold and one column per
        entry of ``chars``.
    :param setting2_scores: Same layout for the second setting.
    :param model: Model name; only used in the output file name.
    :param chars: Characteristic names in column order; used as x tick labels.
    """
    # dtype=float keeps the fractional x offsets intact even when integer-valued
    # score matrices are passed in.
    setting1_scores = np.asarray(setting1_scores, dtype=float)
    setting2_scores = np.asarray(setting2_scores, dtype=float)

    fig, ax = plt.subplots(figsize=(12, 4))

    # Every characteristic owns a slot two units wide, centred on its tick.
    tick_positions = np.arange(len(chars)) * 2

    # (scores, x offset from the tick, colour of fold points, colour of mean points)
    series = (
        (setting1_scores, -0.2, 'blue', 'green'),
        (setting2_scores, 0.2, 'red', 'black'),
    )
    for scores, offset, fold_color, mean_color in series:
        scores = scores[:, :len(chars)]
        xs = tick_positions + offset
        # np.tile repeats the x positions once per fold, matching the row-major
        # order produced by ravel().
        ax.scatter(np.tile(xs, scores.shape[0]), scores.ravel(), color=fold_color, alpha=0.7)
        ax.scatter(xs, scores.mean(axis=0), color=mean_color, alpha=0.7)

    def _legend_dot(color):
        # Stand-alone marker used only as a legend handle.
        return plt.Line2D([0], [0], marker='o', color=color, linestyle='None', markersize=5)

    # Customizing the plot
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(chars)
    ax.set_xlabel('Characteristics')
    ax.set_ylabel('Scores')

    ax.legend(
        handles=[_legend_dot('blue'), _legend_dot('red'), _legend_dot('green'), _legend_dot('black')],
        labels=['Setting-1', 'Setting-2', "Setting-1 Avg. Scores", "Setting-2 Avg. Scores"],
        loc='lower right',
    )

    plt.tight_layout()

    # savefig does not create missing directories.
    os.makedirs("images", exist_ok=True)
    plt.savefig(f"images/{model}-s1-vs-s2.jpg")
    plt.show()

def plot_s1_vs_s2_vs_prolofic(setting1_scores, setting2_scores, prolofic_scores, model, chars):
    """
    Scatter-plot the mean scores of Setting-1, Setting-2 and the Prolific human ratings.

    Only the per-characteristic means (averaged over the rows of each matrix) are
    drawn. The three series get distinct x offsets around each tick so that no
    two of them share an x position: Setting-1 (blue) left, Setting-2 (black)
    centred, human ratings (red) right. The figure is saved to
    ``images/{model}-s1-vs-s2-vs-prolofic.jpg`` (the ``images`` directory is
    created when missing) and then shown.

    :param setting1_scores: 2-D array-like, one row per fold and one column per
        entry of ``chars``.
    :param setting2_scores: Same layout for the second setting.
    :param prolofic_scores: Same layout for the human ratings.
    :param model: Model name; only used in the output file name.
    :param chars: Characteristic names in column order; used as x tick labels.
    """
    # Every characteristic owns a slot two units wide, centred on its tick.
    tick_positions = np.arange(len(chars)) * 2

    fig, ax = plt.subplots(figsize=(12, 4))

    # (scores, x offset from the tick, colour), listed in legend order.
    series = (
        (setting1_scores, -0.3, 'blue'),
        (setting2_scores, 0.0, 'black'),
        (prolofic_scores, 0.3, 'red'),
    )
    for scores, offset, color in series:
        # dtype=float so that integer-valued inputs still yield float means.
        means = np.asarray(scores, dtype=float)[:, :len(chars)].mean(axis=0)
        ax.scatter(tick_positions + offset, means, color=color, alpha=0.7)

    def _legend_dot(color):
        # Stand-alone marker used only as a legend handle.
        return plt.Line2D([0], [0], marker='o', color=color, linestyle='None', markersize=5)

    # Customizing the plot
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(chars)
    ax.set_xlabel('Characteristics')
    ax.set_ylabel('Scores')

    ax.legend(
        handles=[_legend_dot('blue'), _legend_dot('black'), _legend_dot('red')],
        labels=["Setting-1 Avg. Scores", "Setting-2 Avg. Scores", "Prolofic-Human Avg. Scores"],
        loc='lower right',
    )

    plt.tight_layout()

    # savefig does not create missing directories.
    os.makedirs("images", exist_ok=True)
    plt.savefig(f"images/{model}-s1-vs-s2-vs-prolofic.jpg")
    plt.show()

# Prolific Human Evaluation Comparison

In [None]:
import pandas as pd
from scripts.dataset import format_context
from scripts.utils import paper_structure_score, word_count_score
from scripts.configs import BaseConfig
from scripts import io

# Project configuration; this cell only reads ``args.reward_vocab`` from it.
args = BaseConfig().get_args()

# One lower-cased vocabulary entry per line of the reward-vocabulary file.
reward_vocab = [
    vocab.lower()
    for vocab in io.read_text(args.reward_vocab).split("\n")
]

# Prolific human-evaluation tables, one CSV per synthesis type
# (methodological, paper-wise, thematic).
mdf = pd.read_csv("dataset/split/Mistral-7B/ProlificHuman/prolific_mistral_methodological.csv")
pdf = pd.read_csv("dataset/split/Mistral-7B/ProlificHuman/prolific_mistral_paper-wise.csv")
tdf = pd.read_csv("dataset/split/Mistral-7B/ProlificHuman/prolific_mistral_thematic.csv")

In [None]:
# Collects one record per row of the Prolific human-evaluation tables; filled by
# the loop further down in this cell.
prolofic_dataset = []


def format_context(row, num_papers=5):
    """
    Build the numbered title/abstract context block for one dataset row.

    Paper ``i`` is read from ``row['paper_<i>_title']`` and
    ``row['paper_<i>_abstract']``. Runs of whitespace (including newlines) inside
    a title or abstract are collapsed to single spaces. Each paper is rendered
    as ``"<i>. <title>\\n<abstract>\\n\\n"``.

    NOTE(review): an earlier cell imports ``format_context`` from
    ``scripts.dataset``; this definition rebinds that name in the notebook
    namespace — confirm the shadowing is intended.

    :param row: Mapping-like object (e.g. a DataFrame row) holding string
        titles and abstracts under the keys described above.
    :param num_papers: Number of papers to include, starting at paper 1.
    :return: The concatenated context string (empty when ``num_papers`` is 0).
    """
    entries = []
    for number in range(1, num_papers + 1):
        # str.split() without arguments already splits on newlines, so a single
        # split/join normalises all whitespace.
        title = ' '.join(row[f'paper_{number}_title'].split())
        abstract = ' '.join(row[f'paper_{number}_abstract'].split())
        entries.append(f'{number}. {title}\n{abstract}\n\n')
    return ''.join(entries)

synthesis_types = ["methodological", "paperwise", "thematic"]
ph_dfs = [mdf, pdf, tdf]

# Walk the three tables in lock-step with their synthesis-type label and turn
# every row into a record shaped like the model result files: one 'eval-result'
# per annotator (P-1..P-3) plus the automatic 'basic-eval' scores.
for synthesis_type, ph_df in zip(synthesis_types, ph_dfs):
    for _, row in ph_df.iterrows():
        # NOTE(review): ``context`` and ``research_problem`` are computed here
        # but are not stored in the record appended below.
        context = format_context(row)
        research_problem = row['research_problem']

        # annotator -> characteristic -> {"rating", "rationale"}
        eval_results = {
            person: {
                char: {
                    "rating": row[f'{char.lower()}_rating_{person}'],
                    "rationale": row[f'{char.lower()}_comment_{person}'],
                }
                for char in chars
            }
            for person in ["P1", "P2", "P3"]
        }

        synthesis_text = row['synthesis_text']
        prolofic_dataset.append({
            "synthesis": synthesis_type,
            "split": 'test-prolific',
            "sample_id": row['sample_id'],
            "prompt": "",
            "inference": synthesis_text,
            "P-1": {"eval-result": eval_results['P1']},
            "P-2": {"eval-result": eval_results['P2']},
            "P-3": {"eval-result": eval_results['P3']},
            "basic-eval": {
                "paper-structure": paper_structure_score(synthesis_text, reward_vocab),
                "word-count": word_count_score(synthesis_text),
            },
        })

len(prolofic_dataset)

In [None]:
# Human ratings: the "folds" are the annotators P-1..P-3, hence setting='P'
# with an empty fold prefix.
prolofic_dataset_score, prolofic_dataset_reults_dict = evaluation_report(
    prolofic_dataset, model='prolofic_dataset', setting='P', fold_prefix=''
)
print(json.dumps(prolofic_dataset_score, indent=4))

print("-"*50)

# Vanilla model outputs, restricted to the 'test-prolific' split.
vanila = read_json("assets/vanila.json")
vanila_prolific = [entry for entry in vanila if entry['split'] == 'test-prolific']

vanila_prolific_score_s1, vanila_prolific_reults_dict_s1 = evaluation_report(
    vanila_prolific, model='vanila', setting='s1', fold_prefix='gpt-4-eval-'
)
print(json.dumps(vanila_prolific_score_s1, indent=4))

print("-"*50)

vanila_prolific_score_s2, vanila_prolific_reults_dict_s2 = evaluation_report(
    vanila_prolific, model='vanila', setting='s2', fold_prefix='gpt-4-eval-'
)
print(json.dumps(vanila_prolific_score_s2, indent=4))

# Per-fold score matrices (rows: folds, columns: characteristics) for the plot.
setting1_scores_vanila_prolific = build_setting_per_fold_matrix(
    result_list=vanila_prolific, setting='s1', chars=chars, fold_prefix='gpt-4-eval-'
)
setting2_scores_vanila_prolific = build_setting_per_fold_matrix(
    result_list=vanila_prolific, setting='s2', chars=chars, fold_prefix='gpt-4-eval-'
)
prolific_scores_vanila_prolific = build_setting_per_fold_matrix(
    result_list=prolofic_dataset, setting='P', chars=chars, fold_prefix=''
)

plot_s1_vs_s2_vs_prolofic(
    setting1_scores=setting1_scores_vanila_prolific,
    setting2_scores=setting2_scores_vanila_prolific,
    prolofic_scores=prolific_scores_vanila_prolific,
    model='vanila-prolific',
    chars=chars,
)

In [None]:
# Human ratings: the "folds" are the annotators P-1..P-3, hence setting='P'
# with an empty fold prefix.
prolofic_dataset_score, prolofic_dataset_reults_dict = evaluation_report(
    prolofic_dataset, model='prolofic_dataset', setting='P', fold_prefix=''
)
print(json.dumps(prolofic_dataset_score, indent=4))

print("-"*50)

# Warm-up (fine-tuned) model outputs, restricted to the 'test-prolific' split.
warmup = read_json("assets/warmup-inf.json")
warmup_prolific = [entry for entry in warmup if entry['split'] == 'test-prolific']

warmup_prolific_score_s1, warmup_prolific_reults_dict_s1 = evaluation_report(
    warmup_prolific, model='warmup', setting='s1', fold_prefix='gpt-4-eval-'
)
print(json.dumps(warmup_prolific_score_s1, indent=4))

print("-"*50)

warmup_prolific_score_s2, warmup_prolific_reults_dict_s2 = evaluation_report(
    warmup_prolific, model='warmup', setting='s2', fold_prefix='gpt-4-eval-'
)
print(json.dumps(warmup_prolific_score_s2, indent=4))

# Per-fold score matrices (rows: folds, columns: characteristics) for the plot.
setting1_scores_warmup_prolific = build_setting_per_fold_matrix(
    result_list=warmup_prolific, setting='s1', chars=chars, fold_prefix='gpt-4-eval-'
)
setting2_scores_warmup_prolific = build_setting_per_fold_matrix(
    result_list=warmup_prolific, setting='s2', chars=chars, fold_prefix='gpt-4-eval-'
)
prolific_scores_warmup_prolific = build_setting_per_fold_matrix(
    result_list=prolofic_dataset, setting='P', chars=chars, fold_prefix=''
)

plot_s1_vs_s2_vs_prolofic(
    setting1_scores=setting1_scores_warmup_prolific,
    setting2_scores=setting2_scores_warmup_prolific,
    prolofic_scores=prolific_scores_warmup_prolific,
    model='finetuned-prolific',
    chars=chars,
)

# Vanilla

In [None]:
# Full vanilla result file (all splits).
vanila = read_json("assets/vanila.json")

vanila_score_s1, vanila_reults_dict_s1 = evaluation_report(
    vanila, model='vanila', setting='s1', fold_prefix='gpt-4-eval-'
)
print(json.dumps(vanila_score_s1, indent=4))

print("-"*50)

vanila_score_s2, vanila_reults_dict_s2 = evaluation_report(
    vanila, model='vanila', setting='s2', fold_prefix='gpt-4-eval-'
)
print(json.dumps(vanila_score_s2, indent=4))

# Per-fold score matrices (rows: folds, columns: characteristics) for the plot.
setting1_scores_vanila = build_setting_per_fold_matrix(
    result_list=vanila, setting='s1', chars=chars, fold_prefix='gpt-4-eval-'
)
setting2_scores_vanila = build_setting_per_fold_matrix(
    result_list=vanila, setting='s2', chars=chars, fold_prefix='gpt-4-eval-'
)
plot_s1_vs_s2(
    setting1_scores=setting1_scores_vanila,
    setting2_scores=setting2_scores_vanila,
    model='vanila',
    chars=chars,
)

# Finetuned

In [None]:
# Full warm-up (fine-tuned) result file (all splits).
warmup = read_json("assets/warmup-inf.json")

warmup_score_s1, warmup_reults_dict_s1 = evaluation_report(
    warmup, model='warmup', setting='s1', fold_prefix='gpt-4-eval-'
)
print(json.dumps(warmup_score_s1, indent=4))

print("-"*50)

warmup_score_s2, warmup_reults_dict_s2 = evaluation_report(
    warmup, model='warmup', setting='s2', fold_prefix='gpt-4-eval-'
)
print(json.dumps(warmup_score_s2, indent=4))

# Per-fold score matrices (rows: folds, columns: characteristics) for the plot.
setting1_scores_warmup = build_setting_per_fold_matrix(
    result_list=warmup, setting='s1', chars=chars, fold_prefix='gpt-4-eval-'
)
setting2_scores_warmup = build_setting_per_fold_matrix(
    result_list=warmup, setting='s2', chars=chars, fold_prefix='gpt-4-eval-'
)
plot_s1_vs_s2(
    setting1_scores=setting1_scores_warmup,
    setting2_scores=setting2_scores_warmup,
    model='warmup',
    chars=chars,
)

# Finetuned + RLHF

In [None]:
# Result file of the fine-tuned + RLHF model.
finetuned_rlhf = read_json("assets/rlhf-style-with-warmup-inf.json")

finetuned_rlhf_score_s1, finetuned_rlhf_reults_dict_s1 = evaluation_report(
    finetuned_rlhf, model='finetuned_rlhf', setting='s1', fold_prefix='gpt-4-eval-'
)
print(json.dumps(finetuned_rlhf_score_s1, indent=4))

print("-"*50)

finetuned_rlhf_score_s2, finetuned_rlhf_reults_dict_s2 = evaluation_report(
    finetuned_rlhf, model='finetuned_rlhf', setting='s2', fold_prefix='gpt-4-eval-'
)
print(json.dumps(finetuned_rlhf_score_s2, indent=4))

# Per-fold score matrices (rows: folds, columns: characteristics) for the plot.
setting1_scores_finetuned_rlhf = build_setting_per_fold_matrix(
    result_list=finetuned_rlhf, setting='s1', chars=chars, fold_prefix='gpt-4-eval-'
)
setting2_scores_finetuned_rlhf = build_setting_per_fold_matrix(
    result_list=finetuned_rlhf, setting='s2', chars=chars, fold_prefix='gpt-4-eval-'
)
plot_s1_vs_s2(
    setting1_scores=setting1_scores_finetuned_rlhf,
    setting2_scores=setting2_scores_finetuned_rlhf,
    model='finetuned_rlhf',
    chars=chars,
)

# RLHF

In [None]:
# Result file of the RLHF-only model.
rlhf = read_json('assets/rlhf-style-inf.json')

rlhf_score_s1, rlhf_reults_dict_s1 = evaluation_report(
    rlhf, model='rlhf', setting='s1', fold_prefix='gpt-4-eval-'
)
print(json.dumps(rlhf_score_s1, indent=4))

print("-"*50)

rlhf_score_s2, rlhf_reults_dict_s2 = evaluation_report(
    rlhf, model='rlhf', setting='s2', fold_prefix='gpt-4-eval-'
)
print(json.dumps(rlhf_score_s2, indent=4))

# Per-fold score matrices (rows: folds, columns: characteristics) for the plot.
setting1_scores_rlhf = build_setting_per_fold_matrix(
    result_list=rlhf, setting='s1', chars=chars, fold_prefix='gpt-4-eval-'
)
setting2_scores_rlhf = build_setting_per_fold_matrix(
    result_list=rlhf, setting='s2', chars=chars, fold_prefix='gpt-4-eval-'
)
plot_s1_vs_s2(
    setting1_scores=setting1_scores_rlhf,
    setting2_scores=setting2_scores_rlhf,
    model='rlhf',
    chars=chars,
)

# Finetuned + RLHF (GPT4-Features)

In [None]:
# Result file of the fine-tuned + RLHF (GPT4-features) model.
# NOTE(review): this cell rebinds the ``rlhf*`` / ``setting*_scores_rlhf`` names
# that the "RLHF" cell also assigns, so their values depend on which cell ran
# last — consider model-specific names.
rlhf = read_json('assets/rlhf-style-gpt4-with-warmup-inf.json')

rlhf_score_s1, rlhf_reults_dict_s1 = evaluation_report(
    rlhf, model='finetuned_rlhf_gpt4', setting='s1', fold_prefix='gpt-4-eval-'
)
print(json.dumps(rlhf_score_s1, indent=4))

print("-"*50)

rlhf_score_s2, rlhf_reults_dict_s2 = evaluation_report(
    rlhf, model='finetuned_rlhf_gpt4', setting='s2', fold_prefix='gpt-4-eval-'
)
print(json.dumps(rlhf_score_s2, indent=4))

# Per-fold score matrices (rows: folds, columns: characteristics) for the plot.
setting1_scores_rlhf = build_setting_per_fold_matrix(
    result_list=rlhf, setting='s1', chars=chars, fold_prefix='gpt-4-eval-'
)
setting2_scores_rlhf = build_setting_per_fold_matrix(
    result_list=rlhf, setting='s2', chars=chars, fold_prefix='gpt-4-eval-'
)
plot_s1_vs_s2(
    setting1_scores=setting1_scores_rlhf,
    setting2_scores=setting2_scores_rlhf,
    model='finetuned_rlhf_gpt4',
    chars=chars,
)

# RLHF (GPT4-Features)

In [None]:
# Result file of the RLHF (GPT4-features) model; only setting 's1' is reported.
# NOTE(review): this cell rebinds ``rlhf``, ``rlhf_score_s1`` and
# ``rlhf_reults_dict_s1``, which other cells also assign — consider
# model-specific names.
rlhf = read_json('assets/rlhf-style-gpt4-inf.json')

rlhf_score_s1, rlhf_reults_dict_s1 = evaluation_report(
    rlhf, model='rlhf_gpt4', setting='s1', fold_prefix='gpt-4-eval-'
)
print(json.dumps(rlhf_score_s1, indent=4))

print("-"*50)

# Setting 's2' report and the s1-vs-s2 plot are disabled for this model.
# rlhf_score_s2, rlhf_reults_dict_s2 =  evaluation_report(rlhf, model='rlhf_gpt4', setting='s2', fold_prefix='gpt-4-eval-')
# print(json.dumps(rlhf_score_s2, indent=4))

# setting1_scores_rlhf = build_setting_per_fold_matrix(result_list=rlhf, setting='s1', chars=chars, fold_prefix='gpt-4-eval-')        
# setting2_scores_rlhf = build_setting_per_fold_matrix(result_list=rlhf, setting='s2', chars=chars, fold_prefix='gpt-4-eval-')
# plot_s1_vs_s2(setting1_scores=setting1_scores_rlhf, setting2_scores=setting2_scores_rlhf, model='rlhf_gpt4', chars=chars)

# GPT-4 Vanilla

In [None]:
# Result file of the GPT-4 vanilla baseline; only setting 's1' is reported.
# NOTE(review): the scores are stored under the ``rlhf_*`` names although they
# belong to GPT-4 vanilla — consider model-specific names.
result_list = read_json('assets/gpt4-vanilla.json')

rlhf_score_s1, rlhf_reults_dict_s1 = evaluation_report(
    result_list, model='gpt4_vanila', setting='s1', fold_prefix='gpt-4-eval-'
)
print(json.dumps(rlhf_score_s1, indent=4))