In [1]:
%load_ext autoreload
import os
import sys

sys.path.append('./src-py')

In [2]:
import datasets
import json
import os
import numpy as np
import pandas as pd
import torch
import re
from collections import Counter

pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

#### Write Coversations to Label-studio

This shows the scores of ChatGPT and the reasoning in the label-studio interface

In [125]:
import markdown
from torch.utils.data import DataLoader


def render_conversation_html(conversation):
  """
  Renders a conversation (list of role-message pairs) as HTML.

  Args:
    conversation: A list of tuples, where each tuple is (role, message).
                  Messages can contain Markdown.

  Returns:
    A string containing the HTML representation of the conversation.
  """
  html_output = "<div>\n"  # Start with a container div
  for item in conversation:
    role, message = item['author'], item['text']
    # Convert Markdown message to HTML
    rendered_message = markdown.markdown(message)
    # Define inline styles based on the role and index for alternating colors
    inline_styles = "border: 1px solid #ccc; padding: 10px; margin-bottom: 10px; border-radius: 5px;"
    if role.lower() == "journalist":
      inline_styles += " background-color: #f0f0f0;"  # Light grey background for user messages
    elif role.lower() == "researcher":
      inline_styles += " background-color: #e0e0e0;"  # Slightly darker grey for assistant messages

    # Add role and rendered message to the HTML output, with inline styles
    html_output += f"  <div style='{inline_styles}'>\n"  # Use style attribute for inline styling
    html_output += f"    <strong>{role}:</strong>\n"
    html_output += f"    {rendered_message}\n"
    html_output += "  </div>\n"
  html_output += "</div>"
  return html_output

def write_dialogues_to_label_studio_format(eval_results1, eval_results2, model_name, output_path):
    
    output = []
    for item1 in eval_results1:
        
        item2 = eval_results2.filter(lambda row: row['paper_id'] == item1['paper_id'])[0]

        print(item1['paper_id'], item2['paper_id'])
        dlg = [{'text': turn['content'], 'author': 'Journalist' if turn['role']=='assistant' else 'Researcher'} 
               for turn in item1['generated_conversation'][2:]]
        
        obj = {
            'prompt_name': '',
            'sc_title' : item1['paper_id'],
            'topic': item1['topic'],
            'sc_abstract':item1['generated_conversation'][1]['content'],
            'html_conv': render_conversation_html(dlg),
            'dialogue' : dlg,

            'llm1_societal_impact_eval': 'Score: {}\nReason: {}'.format(item1['societal_eval_prompt_scoring_parsed']['score'], 
                                                                       item1['societal_eval_prompt_scoring_parsed']['reasons']
                                                                ),
            'llm1_scientific_context_eval': 'Score: {}\nReason: {}'.format(item1['scientific_eval_prompt_scoring_parsed']['score'], 
                                                                       item1['scientific_eval_prompt_scoring_parsed']['reasons']
                                                                ),
            'llm1_clarity_eval': 'Score: {}\nReason: {}'.format(item1['clarity_eval_prompt_scoring_parsed']['score'], 
                                                                       item1['clarity_eval_prompt_scoring_parsed']['reasons']
                                                                ) ,

            'llm2_societal_impact_eval': 'Score: {}\nReason: {}'.format(item2['societal_eval_prompt_scoring_parsed']['score'], 
                                                                       item2['societal_eval_prompt_scoring_parsed']['reasons']
                                                                ),
            'llm2_scientific_context_eval': 'Score: {}\nReason: {}'.format(item2['scientific_eval_prompt_scoring_parsed']['score'], 
                                                                       item2['scientific_eval_prompt_scoring_parsed']['reasons']
                                                                ),
            'llm2_clarity_eval': 'Score: {}\nReason: {}'.format(item2['clarity_eval_prompt_scoring_parsed']['score'], 
                                                                       item2['clarity_eval_prompt_scoring_parsed']['reasons']
                                                                ) ,
            'gen-model': model_name
        }

        #obj['dialogue'] = [x for turn in obj['dialogue'] for x in turn]
        
        # for aspect in ['scientific', 'societal', 'clarity']:
        #     obj[aspect] = item['{}_eval_prompt_scoring_parsed'.format(aspect)]['score']
        output.append(obj)
    
    json.dump(output, open(output_path, 'w'))
    return output

In [104]:
ds_path = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/eval_experiment_500/'

In [126]:
# Load datasets
gpt_baseline_llama3_evaluated_ds = datasets.load_from_disk(ds_path + '/baseline_llama3_gen_conv/ds_eval/gpt-4.1-2025-04-14/')
gpt_ft_llama3_evaluated_ds = datasets.load_from_disk(ds_path + '/ft_llama3_gen_conv/ds_eval/gpt-4.1-2025-04-14/')
gpt_baseline_qwen_evaluated_ds = datasets.load_from_disk(ds_path + '/baseline_qwen_gen_conv/ds_eval/gpt-4.1-2025-04-14/')
gpt_ft_qwen_evaluated_ds = datasets.load_from_disk(ds_path + '/ft_qwen_gen_conv/ds_eval/gpt-4.1-2025-04-14/')

baseline_llama3_evaluated_ds = datasets.load_from_disk(ds_path + '/baseline_llama3_gen_conv/ds_eval/deepseek-ai/')
ft_llama3_evaluated_ds = datasets.load_from_disk(ds_path + '/ft_llama3_gen_conv/ds_eval/deepseek-ai/')
baseline_qwen_evaluated_ds = datasets.load_from_disk(ds_path + '/baseline_qwen_gen_conv/ds_eval/deepseek-ai/')
ft_qwen_evaluated_ds = datasets.load_from_disk(ds_path + '/ft_qwen_gen_conv/ds_eval/deepseek-ai/')


In [127]:
paper_ids = baseline_llama3_evaluated_ds.shuffle().select(range(15))['paper_id']

In [128]:
json_output  = write_dialogues_to_label_studio_format(baseline_llama3_evaluated_ds.filter(lambda row: row['paper_id'] in paper_ids), gpt_baseline_llama3_evaluated_ds.filter(lambda row: row['paper_id'] in paper_ids), 'baseline-llama3', './data/llama3_baseline_label_studio_tasks.json')
json_output  = write_dialogues_to_label_studio_format(ft_llama3_evaluated_ds.filter(lambda row: row['paper_id'] in paper_ids), gpt_ft_llama3_evaluated_ds.filter(lambda row: row['paper_id'] in paper_ids), 'ft-llama3', './data/llama3_ft_label_studio_tasks.json')
json_output  = write_dialogues_to_label_studio_format(baseline_qwen_evaluated_ds.filter(lambda row: row['paper_id'] in paper_ids), gpt_baseline_qwen_evaluated_ds.filter(lambda row: row['paper_id'] in paper_ids), 'baseline-qwen', './data/qwen_baseline_label_studio_tasks.json')
json_output  = write_dialogues_to_label_studio_format(ft_qwen_evaluated_ds.filter(lambda row: row['paper_id'] in paper_ids), gpt_ft_qwen_evaluated_ds.filter(lambda row: row['paper_id'] in paper_ids), 'ft-qwen', './data/qwen_ft_label_studio_tasks.json')

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41586-019-1290-4 10.1038/s41586-019-1290-4


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41529-019-0092-3 10.1038/s41529-019-0092-3


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41467-019-11902-6 10.1038/s41467-019-11902-6


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41598-017-08692-6 10.1038/s41598-017-08692-6


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41467-023-36431-1 10.1038/s41467-023-36431-1


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41563-018-0258-3 10.1038/s41563-018-0258-3


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1136/bmjopen-2022-066702 10.1136/bmjopen-2022-066702


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41586-018-0276-y 10.1038/s41586-018-0276-y


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1186/s12889-016-3074-1 10.1186/s12889-016-3074-1


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

www.bmj.com/cgi/doi/10.1136/bmj.e8707 www.bmj.com/cgi/doi/10.1136/bmj.e8707


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41598-021-84762-0 10.1038/s41598-021-84762-0


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41598-021-02901-z 10.1038/s41598-021-02901-z


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/srep39589 10.1038/srep39589


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41592-021-01155-x 10.1038/s41592-021-01155-x


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/ncb3149 10.1038/ncb3149


Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41586-019-1290-4 10.1038/s41586-019-1290-4


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41529-019-0092-3 10.1038/s41529-019-0092-3


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41467-019-11902-6 10.1038/s41467-019-11902-6


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41598-017-08692-6 10.1038/s41598-017-08692-6


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41467-023-36431-1 10.1038/s41467-023-36431-1


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41563-018-0258-3 10.1038/s41563-018-0258-3


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1136/bmjopen-2022-066702 10.1136/bmjopen-2022-066702


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41586-018-0276-y 10.1038/s41586-018-0276-y


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1186/s12889-016-3074-1 10.1186/s12889-016-3074-1


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

www.bmj.com/cgi/doi/10.1136/bmj.e8707 www.bmj.com/cgi/doi/10.1136/bmj.e8707


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41598-021-84762-0 10.1038/s41598-021-84762-0


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41598-021-02901-z 10.1038/s41598-021-02901-z


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/srep39589 10.1038/srep39589


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41592-021-01155-x 10.1038/s41592-021-01155-x


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/ncb3149 10.1038/ncb3149


Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41586-019-1290-4 10.1038/s41586-019-1290-4


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41529-019-0092-3 10.1038/s41529-019-0092-3


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41467-019-11902-6 10.1038/s41467-019-11902-6


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41598-017-08692-6 10.1038/s41598-017-08692-6


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41467-023-36431-1 10.1038/s41467-023-36431-1


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41563-018-0258-3 10.1038/s41563-018-0258-3


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1136/bmjopen-2022-066702 10.1136/bmjopen-2022-066702


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41586-018-0276-y 10.1038/s41586-018-0276-y


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1186/s12889-016-3074-1 10.1186/s12889-016-3074-1


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

www.bmj.com/cgi/doi/10.1136/bmj.e8707 www.bmj.com/cgi/doi/10.1136/bmj.e8707


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41598-021-84762-0 10.1038/s41598-021-84762-0


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41598-021-02901-z 10.1038/s41598-021-02901-z


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/srep39589 10.1038/srep39589


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41592-021-01155-x 10.1038/s41592-021-01155-x


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/ncb3149 10.1038/ncb3149


Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41586-019-1290-4 10.1038/s41586-019-1290-4


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41529-019-0092-3 10.1038/s41529-019-0092-3


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41467-019-11902-6 10.1038/s41467-019-11902-6


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41598-017-08692-6 10.1038/s41598-017-08692-6


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41467-023-36431-1 10.1038/s41467-023-36431-1


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41563-018-0258-3 10.1038/s41563-018-0258-3


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1136/bmjopen-2022-066702 10.1136/bmjopen-2022-066702


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41586-018-0276-y 10.1038/s41586-018-0276-y


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1186/s12889-016-3074-1 10.1186/s12889-016-3074-1


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

www.bmj.com/cgi/doi/10.1136/bmj.e8707 www.bmj.com/cgi/doi/10.1136/bmj.e8707


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41598-021-84762-0 10.1038/s41598-021-84762-0


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41598-021-02901-z 10.1038/s41598-021-02901-z


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/srep39589 10.1038/srep39589


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/s41592-021-01155-x 10.1038/s41592-021-01155-x


Filter:   0%|          | 0/15 [00:00<?, ? examples/s]

10.1038/ncb3149 10.1038/ncb3149


### Analyzing human evaluation of LLM-judge

In [3]:
import json

In [44]:
eval_results = json.load(open('./data/export_163509_project-163509-at-2025-07-15-10-21-6754b765.json'))

In [45]:
len(eval_results)

60

In [46]:
side_to_model = {'right': 'gpt-4', 'left': 'deepseek'}

In [47]:
data = []
for task in eval_results:
    for annotations in task['annotations']:
        item = {}
        for x in task['data'].items():
            item[x[0]] = x[1]

        item['annotator'] = annotations['completed_by']['email']
        for ann_res in annotations['result']:
            if ann_res['type'] == 'pairwise':
                item[ann_res['to_name']] = side_to_model[ann_res['value']['selected']]
            else:
                item[ann_res['to_name'] + '-comment'] = ann_res['value']['text']
        data.append(item)

results_df = pd.DataFrame(data)

In [48]:
results_df[['topic', 'gen-model', 'llm1_clarity_eval','llm2_clarity_eval','llm1_societal_impact_eval','llm2_societal_impact_eval','llm1_scientific_context_eval','llm2_scientific_context_eval','annotator','pw_llm_societal_impact','pw_llm_scientific_context','pw_llm_clarity']].head()

Unnamed: 0,topic,gen-model,llm1_clarity_eval,llm2_clarity_eval,llm1_societal_impact_eval,llm2_societal_impact_eval,llm1_scientific_context_eval,llm2_scientific_context_eval,annotator,pw_llm_societal_impact,pw_llm_scientific_context,pw_llm_clarity
0,Biology,baseline-qwen,"Score: 5\nReason: The conversation successfully balances technical terms with clear explanations. The researcher uses accessible analogies, like comparing the sponges' cell behavior to human stem cells, and provides concrete examples with specific sponge species. While some technical terms are present, such as 'transdifferentiate' and 'pluripotent stem cells,' they are explained thoroughly, making the content understandable to the general public. The记者 also ensures clarity by asking for examples and clarifications, which helps in breaking down complex ideas.","Score: 5\nReason: The conversation covers complex concepts such as 'transient metastable state,' 'transdifferentiation,' and 'pluripotent stem cells,' but each of these is clearly explained in accessible language. The researcher uses concrete examples, analogies to stem cells, and a specific species (*Amphimedon queenslandica*) to clarify the concepts. The responses build background, provide descriptive explanations, and offer 'imagine' scenarios to help the journalist (and by extension the public) understand the concepts. The journalist repeatedly asks for clarifications and examples, ensuring nothing technical is left unexplained. Overall, the conversation is engaging, informative, and appropriately accessible to a public audience.","Score: 1\nReason: The conversation focuses on the scientific details of cellular transdifferentiation in sponges, discussing how choanocytes can change into archaeocytes and the implications for the evolution of multicellularity. However, it does not explore the broader societal implications of this research. There is no mention of how these findings might impact human understanding, technology, policy, or public perceptions, nor does it discuss positive or negative societal outcomes.","Score: 3\nReason: The conversation does a strong job of contextualizing the research within broader societal and conceptual frameworks. The researcher thoroughly explains how the flexible cell-state transitions observed in sponges could have impacted the evolution of multicellularity—a major transition in the history of life, with lasting implications for understanding animal complexity and evolution. They provide concrete examples (e.g., scaling up feeding apparatus, tissue repair) and describe the relevance of this flexibility by linking it to environmental adaptability and tissue maintenance, which are themes connected to resilience and complexity in multicellular organisms. While the discussion is somewhat focused on evolutionary and biological aspects, it also addresses conceptual impacts (e.g., challenging traditional views of animal evolution) and highlights the importance of cellular plasticity mechanisms found in early animals and their relevance to current biological understanding. Negative aspects aren't explicitly discussed, but the detailed explanation, real-world examples, and evolutionary framing show a high level of societal contextualization.","Score: 3\nReason: The conversation provides detailed examples of how the research on sponge cell plasticity compares to both early multicellular organisms and modern holozoans, highlighting the novelty of the findings in understanding the evolutionary pathways of cell differentiation.","Score: 3\nReason: The conversation does a good job of situating the current research in the broader scientific context. The Researcher explicitly compares the findings to those in modern unicellular holozoans, discussing how the transient metastable states and transdifferentiation seen in sponges are more robust and internally regulated, in contrast to the more environmentally induced changes in unicellular holozoans. The researcher also explains the novelty by highlighting the transition from simple, rigid multicellular organization to more flexible, dynamic cellular behavior in early animals. This demonstrates how the work is innovative compared to previous views. Additionally, the implications for understanding the evolution of complex multicellularity are discussed, showing how this research advances scientific understanding in the field.",adedejisaheed007@gmail.com,deepseek,gpt-4,gpt-4
1,Biology,baseline-qwen,"Score: 5\nReason: The conversation successfully balances technical terms with clear explanations. The researcher uses accessible analogies, like comparing the sponges' cell behavior to human stem cells, and provides concrete examples with specific sponge species. While some technical terms are present, such as 'transdifferentiate' and 'pluripotent stem cells,' they are explained thoroughly, making the content understandable to the general public. The记者 also ensures clarity by asking for examples and clarifications, which helps in breaking down complex ideas.","Score: 5\nReason: The conversation covers complex concepts such as 'transient metastable state,' 'transdifferentiation,' and 'pluripotent stem cells,' but each of these is clearly explained in accessible language. The researcher uses concrete examples, analogies to stem cells, and a specific species (*Amphimedon queenslandica*) to clarify the concepts. The responses build background, provide descriptive explanations, and offer 'imagine' scenarios to help the journalist (and by extension the public) understand the concepts. The journalist repeatedly asks for clarifications and examples, ensuring nothing technical is left unexplained. Overall, the conversation is engaging, informative, and appropriately accessible to a public audience.","Score: 1\nReason: The conversation focuses on the scientific details of cellular transdifferentiation in sponges, discussing how choanocytes can change into archaeocytes and the implications for the evolution of multicellularity. However, it does not explore the broader societal implications of this research. There is no mention of how these findings might impact human understanding, technology, policy, or public perceptions, nor does it discuss positive or negative societal outcomes.","Score: 3\nReason: The conversation does a strong job of contextualizing the research within broader societal and conceptual frameworks. The researcher thoroughly explains how the flexible cell-state transitions observed in sponges could have impacted the evolution of multicellularity—a major transition in the history of life, with lasting implications for understanding animal complexity and evolution. They provide concrete examples (e.g., scaling up feeding apparatus, tissue repair) and describe the relevance of this flexibility by linking it to environmental adaptability and tissue maintenance, which are themes connected to resilience and complexity in multicellular organisms. While the discussion is somewhat focused on evolutionary and biological aspects, it also addresses conceptual impacts (e.g., challenging traditional views of animal evolution) and highlights the importance of cellular plasticity mechanisms found in early animals and their relevance to current biological understanding. Negative aspects aren't explicitly discussed, but the detailed explanation, real-world examples, and evolutionary framing show a high level of societal contextualization.","Score: 3\nReason: The conversation provides detailed examples of how the research on sponge cell plasticity compares to both early multicellular organisms and modern holozoans, highlighting the novelty of the findings in understanding the evolutionary pathways of cell differentiation.","Score: 3\nReason: The conversation does a good job of situating the current research in the broader scientific context. The Researcher explicitly compares the findings to those in modern unicellular holozoans, discussing how the transient metastable states and transdifferentiation seen in sponges are more robust and internally regulated, in contrast to the more environmentally induced changes in unicellular holozoans. The researcher also explains the novelty by highlighting the transition from simple, rigid multicellular organization to more flexible, dynamic cellular behavior in early animals. This demonstrates how the work is innovative compared to previous views. Additionally, the implications for understanding the evolution of complex multicellularity are discussed, showing how this research advances scientific understanding in the field.",nataliedulaney11@gmail.com,gpt-4,deepseek,gpt-4
2,Biology,baseline-qwen,"Score: 5\nReason: The conversation successfully balances technical terms with clear explanations. The researcher uses accessible analogies, like comparing the sponges' cell behavior to human stem cells, and provides concrete examples with specific sponge species. While some technical terms are present, such as 'transdifferentiate' and 'pluripotent stem cells,' they are explained thoroughly, making the content understandable to the general public. The记者 also ensures clarity by asking for examples and clarifications, which helps in breaking down complex ideas.","Score: 5\nReason: The conversation covers complex concepts such as 'transient metastable state,' 'transdifferentiation,' and 'pluripotent stem cells,' but each of these is clearly explained in accessible language. The researcher uses concrete examples, analogies to stem cells, and a specific species (*Amphimedon queenslandica*) to clarify the concepts. The responses build background, provide descriptive explanations, and offer 'imagine' scenarios to help the journalist (and by extension the public) understand the concepts. The journalist repeatedly asks for clarifications and examples, ensuring nothing technical is left unexplained. Overall, the conversation is engaging, informative, and appropriately accessible to a public audience.","Score: 1\nReason: The conversation focuses on the scientific details of cellular transdifferentiation in sponges, discussing how choanocytes can change into archaeocytes and the implications for the evolution of multicellularity. However, it does not explore the broader societal implications of this research. There is no mention of how these findings might impact human understanding, technology, policy, or public perceptions, nor does it discuss positive or negative societal outcomes.","Score: 3\nReason: The conversation does a strong job of contextualizing the research within broader societal and conceptual frameworks. The researcher thoroughly explains how the flexible cell-state transitions observed in sponges could have impacted the evolution of multicellularity—a major transition in the history of life, with lasting implications for understanding animal complexity and evolution. They provide concrete examples (e.g., scaling up feeding apparatus, tissue repair) and describe the relevance of this flexibility by linking it to environmental adaptability and tissue maintenance, which are themes connected to resilience and complexity in multicellular organisms. While the discussion is somewhat focused on evolutionary and biological aspects, it also addresses conceptual impacts (e.g., challenging traditional views of animal evolution) and highlights the importance of cellular plasticity mechanisms found in early animals and their relevance to current biological understanding. Negative aspects aren't explicitly discussed, but the detailed explanation, real-world examples, and evolutionary framing show a high level of societal contextualization.","Score: 3\nReason: The conversation provides detailed examples of how the research on sponge cell plasticity compares to both early multicellular organisms and modern holozoans, highlighting the novelty of the findings in understanding the evolutionary pathways of cell differentiation.","Score: 3\nReason: The conversation does a good job of situating the current research in the broader scientific context. The Researcher explicitly compares the findings to those in modern unicellular holozoans, discussing how the transient metastable states and transdifferentiation seen in sponges are more robust and internally regulated, in contrast to the more environmentally induced changes in unicellular holozoans. The researcher also explains the novelty by highlighting the transition from simple, rigid multicellular organization to more flexible, dynamic cellular behavior in early animals. This demonstrates how the work is innovative compared to previous views. Additionally, the implications for understanding the evolution of complex multicellularity are discussed, showing how this research advances scientific understanding in the field.",reggieclark2991@gmail.com,deepseek,deepseek,deepseek
3,Chemistry,baseline-qwen,"Score: 4\nReason: The conversation provided numerous and detailed explanations about the research. The Researcher used an analogy of a sponge to explain the Cu-rich layer, which made the concept understandable. Additionally, the Researcher addressed technical aspects like manufacturing processes and high-temperature performance, ensuring clarity for someone without a technical background. However, some advanced terms like XRD and SEM were mentioned without full explanation, though they were used in the context of quality control.","Score: 5\nReason: The conversation uses clear analogies (like the painted sponge) to elucidate complex scientific concepts, making them accessible to a lay audience. Technical terms are introduced but explained in context, and the journalist actively asks for clarification when needed. Background information and the rationale for technical processes are provided by the researcher, and the implications for industrial application are discussed with minimal jargon. Some advanced manufacturing techniques (e.g., X-ray diffraction, additive manufacturing) are mentioned, but they are generally explained or their purpose is clarified. Overall, the technical content is conveyed in a way that an interested member of the public could understand, and all major complexities are addressed with explanatory language and analogies.","Score: 3\nReason: The conversation extensively discusses the societal impact by addressing the practical implications, limitations, and economic aspects of the alloy in real-world industrial applications. It provides detailed examples, potential challenges, and solutions, covering both positive and negative factors.","Score: 3\nReason: The conversation thoroughly discusses the societal impact of the research, specifically the potential real-world application of the improved corrosion-resistant alloy in industrial settings. It addresses both the positive aspects, such as enhanced durability, reduced maintenance, and potential economic benefits, and the challenges or negative aspects, including higher initial costs, the need for advanced manufacturing and quality control, uncertainty about high-temperature performance, and potential variability in performance. The back-and-forth between the journalist and researcher explores specific practical considerations, analogies to aid understanding, and multiple examples of impact and limitation, showing a detailed and nuanced understanding of the research's societal implications.","Score: 3\nReason: The conversation provides a detailed account of how the research builds upon existing knowledge, particularly by comparing the Cu-enrichment mechanism in high-carbon Fe–Cr–W-based alloys to that in austenitic steels. It highlights the novelty by explaining the specific corrosion inhibition mechanisms and how they differ from previous studies. Additionally, the discussion of practical applications and limitations indicates how this research contributes to the broader scientific understanding and industry application.","Score: 3\nReason: The conversation provides a detailed comparison between the mechanism observed in the newly studied high-carbon Fe–Cr–W–Cu alloys and previous studies on austenitic steels, showing an understanding of how the current research fits into existing literature. It explicitly distinguishes the novel aspect (the protective Cu-rich layer in a high-carbon matrix) versus what is established for other steels. The novelty and innovation of the current work are clearly highlighted. However, although it addresses related research in a comparative manner and describes the innovation, there is a focus on specific studies (like those on austenitic steels) rather than a broader review of literature or discussion of the wider field's progress. Still, the account is sufficiently detailed and underscores the scientific context and contribution of the paper.",reggieclark2991@gmail.com,gpt-4,deepseek,deepseek
4,Chemistry,baseline-qwen,"Score: 4\nReason: The conversation provided numerous and detailed explanations about the research. The Researcher used an analogy of a sponge to explain the Cu-rich layer, which made the concept understandable. Additionally, the Researcher addressed technical aspects like manufacturing processes and high-temperature performance, ensuring clarity for someone without a technical background. However, some advanced terms like XRD and SEM were mentioned without full explanation, though they were used in the context of quality control.","Score: 5\nReason: The conversation uses clear analogies (like the painted sponge) to elucidate complex scientific concepts, making them accessible to a lay audience. Technical terms are introduced but explained in context, and the journalist actively asks for clarification when needed. Background information and the rationale for technical processes are provided by the researcher, and the implications for industrial application are discussed with minimal jargon. Some advanced manufacturing techniques (e.g., X-ray diffraction, additive manufacturing) are mentioned, but they are generally explained or their purpose is clarified. Overall, the technical content is conveyed in a way that an interested member of the public could understand, and all major complexities are addressed with explanatory language and analogies.","Score: 3\nReason: The conversation extensively discusses the societal impact by addressing the practical implications, limitations, and economic aspects of the alloy in real-world industrial applications. It provides detailed examples, potential challenges, and solutions, covering both positive and negative factors.","Score: 3\nReason: The conversation thoroughly discusses the societal impact of the research, specifically the potential real-world application of the improved corrosion-resistant alloy in industrial settings. It addresses both the positive aspects, such as enhanced durability, reduced maintenance, and potential economic benefits, and the challenges or negative aspects, including higher initial costs, the need for advanced manufacturing and quality control, uncertainty about high-temperature performance, and potential variability in performance. The back-and-forth between the journalist and researcher explores specific practical considerations, analogies to aid understanding, and multiple examples of impact and limitation, showing a detailed and nuanced understanding of the research's societal implications.","Score: 3\nReason: The conversation provides a detailed account of how the research builds upon existing knowledge, particularly by comparing the Cu-enrichment mechanism in high-carbon Fe–Cr–W-based alloys to that in austenitic steels. It highlights the novelty by explaining the specific corrosion inhibition mechanisms and how they differ from previous studies. Additionally, the discussion of practical applications and limitations indicates how this research contributes to the broader scientific understanding and industry application.","Score: 3\nReason: The conversation provides a detailed comparison between the mechanism observed in the newly studied high-carbon Fe–Cr–W–Cu alloys and previous studies on austenitic steels, showing an understanding of how the current research fits into existing literature. It explicitly distinguishes the novel aspect (the protective Cu-rich layer in a high-carbon matrix) versus what is established for other steels. The novelty and innovation of the current work are clearly highlighted. However, although it addresses related research in a comparative manner and describes the innovation, there is a focus on specific studies (like those on austenitic steels) rather than a broader review of literature or discussion of the wider field's progress. Still, the account is sufficiently detailed and underscores the scientific context and contribution of the paper.",adedejisaheed007@gmail.com,gpt-4,deepseek,gpt-4


In [49]:
def diff_scores(e1, e2, diff=0):
    s1 = int(e1[7])
    s2 = int(e2[7])
    return abs(s1 - s2) >= diff

In [50]:
results_df['diff_llm_clarity'] = results_df.apply(lambda row: diff_scores(row['llm1_clarity_eval'], row['llm2_clarity_eval'], diff=1), axis=1)
results_df['diff_llm_scientific_context'] = results_df.apply(lambda row: diff_scores(row['llm1_scientific_context_eval'], row['llm2_scientific_context_eval'], diff=1), axis=1)
results_df['diff_llm_societal_impact'] = results_df.apply(lambda row: diff_scores(row['llm1_societal_impact_eval'], row['llm2_societal_impact_eval'], diff=1), axis=1)

In [51]:
print(results_df['pw_llm_societal_impact'].value_counts())
print(results_df[results_df.diff_llm_societal_impact]['pw_llm_societal_impact'].value_counts())

deepseek    125
gpt-4        54
Name: pw_llm_societal_impact, dtype: int64
deepseek    88
gpt-4       28
Name: pw_llm_societal_impact, dtype: int64


In [52]:
print(results_df['pw_llm_scientific_context'].value_counts())
print(results_df[results_df.diff_llm_scientific_context]['pw_llm_scientific_context'].value_counts())

deepseek    99
gpt-4       80
Name: pw_llm_scientific_context, dtype: int64
gpt-4       44
deepseek    31
Name: pw_llm_scientific_context, dtype: int64


In [53]:
print(results_df['pw_llm_clarity'].value_counts())
print(results_df[results_df.diff_llm_clarity]['pw_llm_clarity'].value_counts())

deepseek    123
gpt-4        56
Name: pw_llm_clarity, dtype: int64
deepseek    70
gpt-4       25
Name: pw_llm_clarity, dtype: int64


In [54]:
results_df['annotator'].value_counts()

adedejisaheed007@gmail.com    60
reggieclark2991@gmail.com     60
nataliedulaney11@gmail.com    59
Name: annotator, dtype: int64

In [55]:
results_df[results_df.pw_llm_scientific_context.isna()][['topic', 'sc_title', 'gen-model', 'annotator','pw_llm_societal_impact','pw_llm_scientific_context','pw_llm_clarity']]

Unnamed: 0,topic,sc_title,gen-model,annotator,pw_llm_societal_impact,pw_llm_scientific_context,pw_llm_clarity


In [56]:
grouped_annotations = results_df.groupby(['sc_title', 'gen-model']).agg({
    'pw_llm_societal_impact': lambda x: list(x),
    'pw_llm_scientific_context': lambda x: list(x),
    'pw_llm_clarity': lambda x: list(x),
}).reset_index()

In [57]:
grouped_annotations.head(n=100)

Unnamed: 0,sc_title,gen-model,pw_llm_societal_impact,pw_llm_scientific_context,pw_llm_clarity
0,10.1038/ncb3149,baseline-llama3,"[deepseek, gpt-4, deepseek]","[deepseek, deepseek, deepseek]","[deepseek, deepseek, deepseek]"
1,10.1038/ncb3149,baseline-qwen,"[deepseek, deepseek, gpt-4]","[deepseek, deepseek, deepseek]","[deepseek, deepseek, deepseek]"
2,10.1038/ncb3149,ft-llama3,"[deepseek, deepseek, deepseek]","[gpt-4, gpt-4, gpt-4]","[deepseek, gpt-4, deepseek]"
3,10.1038/ncb3149,ft-qwen,"[gpt-4, deepseek, gpt-4]","[deepseek, gpt-4, gpt-4]","[deepseek, deepseek, deepseek]"
4,10.1038/s41467-019-11902-6,baseline-llama3,"[deepseek, deepseek, deepseek]","[gpt-4, gpt-4, gpt-4]","[gpt-4, deepseek, deepseek]"
5,10.1038/s41467-019-11902-6,baseline-qwen,"[deepseek, deepseek, deepseek]","[gpt-4, gpt-4, gpt-4]","[deepseek, deepseek, deepseek]"
6,10.1038/s41467-019-11902-6,ft-llama3,"[deepseek, deepseek, gpt-4]","[gpt-4, deepseek, deepseek]","[deepseek, gpt-4, gpt-4]"
7,10.1038/s41467-019-11902-6,ft-qwen,"[gpt-4, gpt-4, gpt-4]","[deepseek, gpt-4, gpt-4]","[gpt-4, deepseek, deepseek]"
8,10.1038/s41467-023-36431-1,baseline-llama3,"[deepseek, gpt-4, gpt-4]","[gpt-4, deepseek, gpt-4]","[deepseek, deepseek, deepseek]"
9,10.1038/s41467-023-36431-1,baseline-qwen,"[deepseek, deepseek, gpt-4]","[deepseek, deepseek, deepseek]","[deepseek, deepseek, deepseek]"


In [58]:
from statsmodels.stats.inter_rater import fleiss_kappa
from statsmodels.stats.inter_rater import aggregate_raters
from sklearn.metrics import cohen_kappa_score
import krippendorff


In [59]:
def compute_cohens_agreement(results_df, clm, ann1, ann2, filter_clm=None):
    if filter_clm != None:
        results_df = results_df[results_df[filter_clm]]
    grouped_annotations = results_df.groupby(['sc_title', 'gen-model']).agg({
         clm: lambda x: list(x),
    }).reset_index()
    scores = np.array(grouped_annotations[grouped_annotations[clm].str.len()>2][clm].tolist())
    print('# {}'.format(len(grouped_annotations)),round(cohen_kappa_score(scores[:, ann1], scores[:, ann2]), 3))

def compute_krippendorf_agreement(results_df, clm, filter_clm=None, gen_models=[]):
    if filter_clm != None:
        results_df = results_df[results_df[filter_clm]]
    if len(gen_models) > 0:
        results_df = results_df[results_df['gen-model'].isin(gen_models)]
        
    grouped_annotations = results_df.groupby(['sc_title', 'gen-model']).agg({
         clm: lambda x: [1 if c == 'deepseek' else 0 for c in x],
    }).reset_index()
    scores = np.array(grouped_annotations[grouped_annotations[clm].str.len()>2][clm].tolist())
    print(round(krippendorff.alpha(scores.T), 2))

In [60]:
compute_krippendorf_agreement(results_df, 'pw_llm_societal_impact', 'diff_llm_societal_impact')
compute_krippendorf_agreement(results_df, 'pw_llm_scientific_context', 'diff_llm_scientific_context')
compute_krippendorf_agreement(results_df, 'pw_llm_clarity',  'diff_llm_clarity')

0.2
0.13
0.35


In [61]:
compute_cohens_agreement(results_df, 'pw_llm_societal_impact', 0, 1, 'diff_llm_societal_impact')
compute_cohens_agreement(results_df, 'pw_llm_societal_impact', 0, 2, 'diff_llm_societal_impact')
compute_cohens_agreement(results_df, 'pw_llm_societal_impact', 1, 2, 'diff_llm_societal_impact')

# 39 0.475
# 39 0.179
# 39 0.053


In [62]:
compute_cohens_agreement(results_df, 'pw_llm_scientific_context', 0, 1, 'diff_llm_scientific_context')
compute_cohens_agreement(results_df, 'pw_llm_scientific_context', 0, 2, 'diff_llm_scientific_context')
compute_cohens_agreement(results_df, 'pw_llm_scientific_context', 1, 2, 'diff_llm_scientific_context')

# 25 -0.115
# 25 0.324
# 25 0.214


In [63]:
compute_cohens_agreement(results_df, 'pw_llm_clarity', 0, 1, 'diff_llm_clarity')
compute_cohens_agreement(results_df, 'pw_llm_clarity', 0, 2, 'diff_llm_clarity')
compute_cohens_agreement(results_df, 'pw_llm_clarity', 1, 2, 'diff_llm_clarity')

# 32 0.439
# 32 0.377
# 32 0.21
