In [1]:
# NOTE(review): the autoreload extension is loaded here, but no `%autoreload` mode is
# selected in this cell — confirm whether `%autoreload 2` was intended.
%load_ext autoreload
import os
import sys

# Add ./src-py (relative to the working directory) to the module search path.
sys.path.append('./src-py')

In [2]:
import datasets
import json
import os
import numpy as np
import pandas as pd
import torch
import re
from collections import Counter

# Remove pandas' display width and column-width limits (None = no limit) so long
# conversation/reasoning strings are not truncated when frames are displayed.
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

#### Write Conversations to Label Studio

This shows ChatGPT's scores and its reasoning for each conversation in the Label Studio interface.

In [3]:
def write_dialogues_to_label_studio_format(eval_results, model_name, output_path):
    """Convert evaluated conversations into Label Studio tasks and write them as JSON.

    Args:
        eval_results: Iterable of dict-like records. Each record is read for
            'paper_id', 'generated_conversation' (a sequence of turns, each with
            'role' and 'content') and, for every aspect in
            ('scientific', 'societal', 'clarity'), the key
            '<aspect>_eval_prompt_scoring_parsed' holding 'score' and 'reasons'.
        model_name: Value stored under 'gen-model' in every task.
        output_path: Path of the JSON file to (over)write with the task list.

    Returns:
        The list of task dicts that was written to ``output_path``.

    Raises:
        KeyError / IndexError: If a record lacks one of the keys above or its
            conversation has fewer than two turns.
    """
    aspects = ['scientific', 'societal', 'clarity']
    output = []
    for item in eval_results:
        conversation = item['generated_conversation']
        # Look each aspect's parsed evaluation up once instead of re-indexing per field.
        parsed = {aspect: item['{}_eval_prompt_scoring_parsed'.format(aspect)] for aspect in aspects}

        # The turn at index 1 is used as the abstract; the dialogue is everything
        # from index 2 onward. 'assistant' turns are labelled as the journalist,
        # every other role as the researcher.
        dlg = [
            {'text': turn['content'], 'author': 'Journalist' if turn['role'] == 'assistant' else 'Researcher'}
            for turn in conversation[2:]
        ]

        # NOTE(review): the scientific/clarity templates say 'Reason' without a colon
        # while the societal one says 'Reason:'. Kept byte-for-byte so already
        # exported tasks stay comparable — confirm before normalising.
        obj = {
            'prompt_name': '',
            'sc_title': item['paper_id'],
            'sc_abstract': conversation[1]['content'],
            'dialogue': dlg,
            'llm_eval': '\n\n'.join(
                '{}: {} \n {}'.format(aspect, parsed[aspect]['score'], parsed[aspect]['reasons'])
                for aspect in aspects
            ),
            'llam_societal_impact_eval': 'Score: {}\nReason: {}'.format(
                parsed['societal']['score'], parsed['societal']['reasons']),
            'llam_scientific_context_eval': 'Score: {}\nReason {}'.format(
                parsed['scientific']['score'], parsed['scientific']['reasons']),
            'llam_clarity_eval': 'Score: {}\nReason {}'.format(
                parsed['clarity']['score'], parsed['clarity']['reasons']),
            'gen-model': model_name,
        }

        # Also expose each raw score under the bare aspect name.
        for aspect in aspects:
            obj[aspect] = parsed[aspect]['score']
        output.append(obj)

    # Use a context manager so the file is flushed and closed deterministically.
    with open(output_path, 'w') as output_file:
        json.dump(output, output_file)
    return output

In [4]:
# Root directory that the evaluated-dataset paths loaded in this notebook are built from.
ds_path = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public'

In [5]:
# Load the evaluated baseline datasets (Llama 3 and Qwen) from disk.
# The fine-tuned (`ft-40k-*`) loads are commented out, so `ft_llama3_evaluated_ds`
# and `ft_qwen_evaluated_ds` are NOT defined by this cell.
baseline_llama3_evaluated_ds = datasets.load_from_disk(ds_path + '/baseline-advanced-prompt-llama3-test-conv-ds/ds_eval')
#ft_llama3_evaluated_ds = datasets.load_from_disk(ds_path + '/ft-40k-llama3-test-conv-ds/ds_eval')
baseline_qwen_evaluated_ds = datasets.load_from_disk(ds_path + '/baseline-qwen-test-conv-ds/ds_eval')
#ft_qwen_evaluated_ds = datasets.load_from_disk(ds_path + '/ft-40k-qwen-test-conv-ds/ds_eval')

In [11]:
# Display the dataset summary (feature names and row count).
baseline_qwen_evaluated_ds

Dataset({
    features: ['paper_id', 'paper_title', 'paper_text', 'prompt', 'completion', '__index_level_0__', 'pr-article', 'generated_conversation', 'conversation', 'clarity_eval_prompt_scoring_parsed', 'scientific_eval_prompt_scoring_parsed', 'societal_eval_prompt_scoring_parsed'],
    num_rows: 500
})

In [6]:
# Write one Label Studio task file per evaluated baseline run. `json_output` is
# reassigned by each call, so afterwards it holds the tasks of the last active
# call (the Qwen baseline). The fine-tuned exports are disabled.
json_output  = write_dialogues_to_label_studio_format(baseline_llama3_evaluated_ds, 'baseline-llama3', './data/llama3_baseline_label_studio_tasks.json')
#json_output  = write_dialogues_to_label_studio_format(ft_llama3_evaluated_ds, 'ft-llama3', './data/llama3_ft_label_studio_tasks.json')
json_output  = write_dialogues_to_label_studio_format(baseline_qwen_evaluated_ds, 'baseline-qwen', './data/qwen_baseline_label_studio_tasks.json')
#json_output  = write_dialogues_to_label_studio_format(ft_qwen_evaluated_ds, 'ft-qwen', './data/qwen_ft_label_studio_tasks.json')

### Compare ChatGPT's ranking with the human ranking

In [113]:
# Disabled cell: the commented-out code below concatenated the GPT-3 and Llama 3
# evaluation results (tagging each with a 'gen-source') and wrote them to
# all_sample_generated_conversations_with_eval_100.json.
# all_prompts = get_prompt_compositions()

# gpt3_eval_results = get_prompts_conversations('/mnt/swordfish-pool2/milad/communicating-science-to-the-public/gpt3-gen-conv/', 100, all_prompts, w_eval_res=True)
# llama3_eval_results = get_prompts_conversations('/mnt/swordfish-pool2/milad/communicating-science-to-the-public/llama3-gen-conv/', 100, all_prompts, w_eval_res=True)

# gpt3_eval_results['composite-na-generic-guidelines'] = gpt3_eval_results['composite-na-generic-guidelines'].add_column('gen-source', ['gpt3-generic-guidelines']* 50)
# gpt3_eval_results['composite-na-pr-guided']          = gpt3_eval_results['composite-na-pr-guided'].add_column('gen-source', ['gpt3-pr-guided']* 50)
# llama3_eval_results['composite-na-generic-guidelines'] = llama3_eval_results['composite-na-generic-guidelines'].add_column('gen-source', ['llama3-generic-guidelines']* 49)
# llama3_eval_results['composite-na-pr-guided'] = llama3_eval_results['composite-na-pr-guided'].add_column('gen-source', ['llama3-pr-guided']* 50)

# dataset_df = datasets.concatenate_datasets([gpt3_eval_results['composite-na-generic-guidelines'], gpt3_eval_results['composite-na-pr-guided'], llama3_eval_results['composite-na-generic-guidelines'], llama3_eval_results['composite-na-pr-guided']]).to_pandas()

# dataset_df.to_json('/mnt/swordfish-pool2/milad/communicating-science-to-the-public/all_sample_generated_conversations_with_eval_100.json')

Now we have two datasets with evaluation results: the old one, '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/all_sample_generated_conversations_with_eval.json', and the new one, '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/all_sample_generated_conversations_with_eval_100.json'.

In [None]:
# Load the newer evaluation export (the `_100` file) into a DataFrame.
dataset_df = pd.read_json('/mnt/swordfish-pool2/milad/communicating-science-to-the-public/all_sample_generated_conversations_with_eval_100.json')

In [None]:
# Count how many rows carry each distinct 'Topic' value.
dataset_df[['Topic']].value_counts()

In [None]:
def _parse_dialogue_turns(conversation):
    """Split a conversation transcript into ``{'text', 'author'}`` turn dicts.

    Turns are separated by blank lines. A chunk is kept only if it mentions
    'Journalist' or 'Researcher' and contains a colon; the part before the first
    colon is the author and everything after it is the text, with markdown bold
    markers ('**') removed from both.
    """
    turns = []
    for utterance in conversation.split('\n\n'):
        if 'Journalist' not in utterance and 'Researcher' not in utterance:
            continue
        # Split on the FIRST colon only, so colons inside the utterance itself
        # (ratios, times, "note: ...") do not truncate the text.
        author, separator, text = utterance.partition(':')
        if not separator:
            continue
        turns.append({'text': text.replace("**", ""), 'author': author.replace("**", "")})
    return turns


def generate_label_studio_eval(dataset_df, evaluation_aspect):
    """Build best-vs-worst conversation pairs per press-release title for one aspect.

    For every 'pr-title' group, the conversations are ranked by the score stored
    in ``'<evaluation_aspect>_eval_prompt_scoring_parsed'``. When the top score is
    strictly greater than the bottom score, one comparison task is emitted that
    pairs the highest-ranked conversation (item 1) with the lowest-ranked one
    (item 2). Groups whose top and bottom scores tie are skipped.

    Args:
        dataset_df: DataFrame with the columns 'pr-title', 'conversation',
            'gen-source', 'Topic', 'sc-intro', 'pr-summary', 'pr-article' and
            ``'<evaluation_aspect>_eval_prompt_scoring_parsed'`` (values indexable
            by 'score' and 'reasons'). The frame is not modified.
        evaluation_aspect: Aspect name used to pick the parsed-evaluation column;
            it is also stored in each task under 'eval_aspect'.

    Returns:
        A list of task dicts, one per title with a strict score difference.

    Raises:
        KeyError: If a required column is missing.
    """
    parsed_column = evaluation_aspect + '_eval_prompt_scoring_parsed'
    # Work on a derived frame so the caller's DataFrame does not silently gain
    # 'score'/'reasons' columns that change with every aspect.
    scored_df = dataset_df.assign(
        score=dataset_df[parsed_column].apply(lambda parsed: parsed['score']),
        reasons=dataset_df[parsed_column].apply(
            lambda parsed: str(parsed['score']) + ' : ' + str(parsed['reasons'])),
    )

    grouped = scored_df.groupby(['pr-title']).agg({
        'conversation': list,
        'score': list,
        'reasons': list,
        'gen-source': list,
        'Topic': lambda x: list(x)[0],
    }).reset_index()

    # First row per title, indexed by title: replaces a full-frame scan per group.
    first_rows = scored_df.drop_duplicates(subset='pr-title').set_index('pr-title')

    # creating pairs for comparison
    pairs_ds = []
    for _, row in grouped.iterrows():
        # Entries are (conversation, gen-source, reasons, score), highest score first.
        ranked_conv = sorted(
            zip(row['conversation'], row['gen-source'], row['reasons'], row['score']),
            key=lambda entry: -entry[3],
        )
        best, worst = ranked_conv[0], ranked_conv[-1]
        if not best[3] > worst[3]:
            # No strict preference between best and worst: nothing to compare.
            continue

        source_row = first_rows.loc[row['pr-title']]
        pairs_ds.append({
            'item_1_conv': _parse_dialogue_turns(best[0]),
            'item_1_source': best[1],
            'item_1_scoring': best[2],
            'item_2_conv': _parse_dialogue_turns(worst[0]),
            'item_2_source': worst[1],
            'item_2_scoring': worst[2],
            'pr_title': row['pr-title'],
            'topic': row['Topic'],
            'eval_aspect': evaluation_aspect,
            'sc_abstract': source_row['sc-intro'],
            'pr_article': source_row['pr-summary'] + '\n=======\n\n' + source_row['pr-article'],
        })
    return pairs_ds

In [121]:
# Write one pairwise-comparison task file (suffix `_new`) per evaluation aspect.
# The pairs are built before the file is opened, so a failure while generating
# them does not truncate an existing file, and `with` closes every handle.
for eval_aspect_name in ['faithfull', 'scientific', 'societal', 'relevancy', 'clarity', 'factuality']:
    eval_pairs = generate_label_studio_eval(dataset_df, evaluation_aspect=eval_aspect_name)
    eval_pairs_path = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/evaluate-gpt-4-{}_new.json'.format(eval_aspect_name)
    with open(eval_pairs_path, 'w') as eval_pairs_file:
        json.dump(eval_pairs, eval_pairs_file)

In [146]:
# Write one pairwise-comparison task file per evaluation aspect.
# The pairs are built before the file is opened, so a failure while generating
# them does not truncate an existing file, and `with` closes every handle.
for eval_aspect_name in ['faithfull', 'scientific', 'societal', 'relevancy', 'clarity']:
    eval_pairs = generate_label_studio_eval(dataset_df, evaluation_aspect=eval_aspect_name)
    eval_pairs_path = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/evaluate-gpt-4-{}.json'.format(eval_aspect_name)
    with open(eval_pairs_path, 'w') as eval_pairs_file:
        json.dump(eval_pairs, eval_pairs_file)

In [45]:
# NOTE(review): the only line in this notebook that defines `ft_llama3_evaluated_ds`
# is a commented-out `datasets.load_from_disk(...)` call in the dataset-loading cell,
# so on a fresh kernel this call raises NameError unless that load is re-enabled.
json_output  = write_dialogues_to_label_studio_format(ft_llama3_evaluated_ds, 'ft-llama3', './data/ft_llama3_for_label_studio.json')