In [1]:
%load_ext autoreload
import os
import sys

In [2]:
# Cache locations are set here, ahead of the cell that imports `datasets` and
# `transformers`.
os.environ['TRANSFORMERS_CACHE'] = '/mnt/swordfish-pool2/milad/hf-cache-new'
os.environ['HF_DATASETS_CACHE'] = '/mnt/swordfish-pool2/milad/hf-cache-new'
# 'xxx' is a placeholder, not a usable key. Only fall back to it when no key is
# already exported, so a real OPENAI_API_KEY from the environment is never
# overwritten (the Hugging Face token is likewise read from the environment).
os.environ.setdefault("OPENAI_API_KEY", 'xxx')
#os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['CUDA_LAUNCH_BLOCKING'] = '0'
# Make the project modules under ./src importable; guarded so that re-running
# this cell does not append duplicate entries.
if './src' not in sys.path:
    sys.path.append('./src')

In [3]:
import datasets
import json
import os
import numpy as np
import pandas as pd
import torch
import copy
import matplotlib.pyplot as plt
import re
from collections import Counter

pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)


%autoreload
import utils
import prompts
import random

from tabulate import tabulate
import tiktoken
from transformers import AutoTokenizer
from llm_based_evaluation import *



In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from peft import LoraConfig, get_peft_model

In [5]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token exported as `hf_token`
# (raises KeyError when the variable is not set).
login(token=os.environ['hf_token'])

In [6]:
# NOTE(review): presumably selects which LLM backend the `utils` helpers use — confirm in utils.
utils.llm_to_use='llama3'
# Project root for generated artifacts; later cells rebind `output_dir` to experiment-specific folders.
output_dir = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/'

In [7]:
# Tokenizers that later cells pass to the context-building / generation helpers
# together with their max-token limits.
gpt_tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

In [8]:
# Folder holding the trained adapters. NOTE(review): not referenced by the cells visible
# here — the adapter paths further down spell this prefix out in full.
models_folder = "/mnt/swordfish-pool2/milad/communicating-science-to-the-public/models/"

### Prepare dataset

In [7]:
# Load the sci-news corpus and keep the SciNews-sourced rows of its *test* split
# (the variable is called `training_ds` even though it comes from 'test').
dataset = datasets.load_dataset("miladalsh/sci-news")
training_ds = dataset['test'].filter(
    lambda example: example['source'] == 'SciNews'
)

Filter:   0%|          | 0/5188 [00:00<?, ? examples/s]

In [8]:
# Build one model context per paper (capped at 3000 tokens by the helper's argument).
sc_intro = [
    utils.build_model_context(paper_row, llama_tokenizer, max_token_number=3000)
    for _idx, paper_row in training_ds.to_pandas().iterrows()
]
# Attach the contexts as a new column and drop rows whose context came back empty.
# Not idempotent: a second run fails because 'sc-intro' already exists.
training_ds = training_ds.add_column('sc-intro', sc_intro).filter(
    lambda example: example['sc-intro'] != ''
)

Flattening the indices:   0%|          | 0/4188 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4188 [00:00<?, ? examples/s]

In [9]:
training_ds

Dataset({
    features: ['id', 'pr-title', 'pr-article', 'pr-summary', 'sc-title', 'sc-article', 'sc-abstract', 'sc-section_names', 'sc-sections', 'sc-authors', 'source', 'Topic', 'Citation', 'Paper_URL', 'News_URL', 'pr-summary-and-article', 'sc-intro'],
    num_rows: 4188
})

In [11]:
# Persist the filtered split (with its 'sc-intro' column) for later sessions.
training_ds.save_to_disk('/mnt/swordfish-pool2/milad/communicating-science-to-the-public/processed_test_ds')

Saving the dataset (0/1 shards):   0%|          | 0/4188 [00:00<?, ? examples/s]

### Evaluate Science Models

- Now we will evaluate the following models on a sample from the test set using only the generic prompt
    - LLAMA-3 baseline
    - GPT-3 baseline
    - LLAMA-3 fine-tuned on LLAMA-3 generated conversations
    - LLAMA-3 fine-tuned on GPT-3 generated conversations

#### Load dataset:

In [22]:
# test_ds = datasets.load_from_disk('/mnt/swordfish-pool2/milad/communicating-science-to-the-public/deepseek-final-conv-ds-preprocessed-test_journalist_ds')
# test_df = pd.DataFrame(test_ds)
# test_df = test_df.drop_duplicates('paper_id')
# test_ds = datasets.Dataset.from_pandas(test_df)
# sample_dataset = test_ds.select(range(500))
# sample_dataset.save_to_disk('/mnt/swordfish-pool2/milad/communicating-science-to-the-public/processed_test_ds_sample')

Saving the dataset (1/1 shards): 100%|████████████████████████████████████| 500/500 [00:00<00:00, 2156.76 examples/s]


In [9]:
deepseek_conv_dataset = datasets.load_from_disk('/mnt/swordfish-pool2/milad/communicating-science-to-the-public/deepseek-final-conv-ds-cleaned/')
# Lookup table id -> 'pr-article', built column-wise; as with row-by-row
# construction, a later duplicate id overrides an earlier one.
paper_id_to_article = dict(
    zip(deepseek_conv_dataset['id'], deepseek_conv_dataset['pr-article'])
)

In [10]:
# Load the evaluation sample and attach each paper's 'pr-article' via the id lookup
# (raises KeyError for a paper_id missing from `paper_id_to_article`).
sample_dataset = datasets.load_from_disk(
    '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/processed_test_ds_sample'
).map(
    lambda example: {'pr-article': paper_id_to_article[example['paper_id']]}
)

In [11]:
sample_dataset

Dataset({
    features: ['paper_id', 'paper_title', 'paper_text', 'prompt', 'completion', '__index_level_0__', 'pr-article'],
    num_rows: 500
})

#### Generate baseline conversations:

In [14]:
all_prompts = utils.get_prompt_compositions()
# `used_prompt` is the first composition itself (not a copy), so overriding its
# instruction below also changes all_prompts[0].
used_prompt = all_prompts[0]
used_prompt['instruction'] = (
    "Please simulate a conversation between a researcher and a journalist "
    "regarding the researcher's scientific paper. "
    "The goal of the conversation is to gain a deeper understanding of the "
    "researcher's scientific paper and communicate its impact to the public "
    "through a journalistic report"
)

In [17]:
# Move the existing conversation out of the way under 'gt_conversation'
# (presumably so freshly generated conversations do not collide with it — confirm).
# Guarded: rename_column raises when the source column is missing, which is the
# case after the cell has already run once or when the loaded sample has no
# 'conversation' column.
if 'conversation' in sample_dataset.column_names:
    sample_dataset = sample_dataset.rename_column('conversation', 'gt_conversation')

In [18]:
# GPT-3 baseline conversations, generated and saved into the same folder.
# NOTE(review): `datadreamer_generation` is not imported by name in the visible cells;
# presumably it arrives through `from llm_based_evaluation import *` — confirm.
output_dir = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/gpt3-test-conv-ds/'
resulted_ds = datadreamer_generation.generate_conversation(
    output_dir,
    'gpt-3',
    sample_dataset,
    used_prompt,
    gpt_tokenizer,
    max_input_tokens=1200,
)
resulted_ds.save_to_disk(output_dir)

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: /mnt/swordfish-pool2/milad/communicating-science-to-the-public/gpt3-test-conv-ds/composite-na-na
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents' was previously run and saved, but was outdated. 😞
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents' finished and is saved to disk. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents (map)' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents (map)' will run lazily. 🥱
  datasource = datasource.map(lambda row: {'inputs_truncated': truncate_text(encoding, row['inputs'], max_input_tokens)})
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents (map) (map)' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents (map) (map)' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'generate conversations' wa

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

In [19]:
# LLaMA-3 baseline conversations, generated and saved into the same folder.
# NOTE(review): `datadreamer_generation` is not imported by name in the visible cells;
# presumably it arrives through `from llm_based_evaluation import *` — confirm.
output_dir = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/llama3-test-conv-ds/'
resulted_ds = datadreamer_generation.generate_conversation(
    output_dir,
    'llama3',
    sample_dataset,
    used_prompt,
    llama_tokenizer,
    max_input_tokens=1200,
)
resulted_ds.save_to_disk(output_dir)

[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Initialized. 🚀 Dreaming to folder: /mnt/swordfish-pool2/milad/communicating-science-to-the-public/llama3-test-conv-ds/composite-na-na
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents' finished and is saved to disk. 🎉
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents (map)' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents (map)' will run lazily. 🥱
  datasource = datasource.map(lambda row: {'inputs_truncated': truncate_text(encoding, row['inputs'], max_input_tokens)})
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents (map) (map)' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents (map) (map)' will run lazily. 🥱
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'generate conversations' is running. ⏳
[ [35m🤖 Data[33mDr[31mea[35mmer[0m 💤 ] Step 'documents (map)' finished running lazily. 🎉
[ 

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

#### Generate conversations using our approach:

**Using the ft-LLAMAs**

In [20]:
# Where the conversations produced by the fine-tuned LLaMA-3 pair are saved.
output_path = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/ft-40k-llama3-test-conv-ds/'

In [22]:
# Journalist side: LLaMA-3 Instruct plus the journalist checkpoint directory
# (presumably a trained adapter, going by the helper's name — confirm in utils).
journalist_model, tokenizer = utils.load_model_with_adapter(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "/mnt/swordfish-pool2/milad/communicating-science-to-the-public/models/llama3-trained-journalist-on-deepseek-for-40k-samples/",
    device_map="cuda:4",
)
journalist_pipeline = pipeline(
    "text-generation",
    model=journalist_model,
    tokenizer=tokenizer,
    batch_size=6,
)

# Researcher side on a second GPU. `tokenizer` is rebound here; the journalist
# pipeline keeps the tokenizer object it was constructed with.
researcher_model, tokenizer = utils.load_model_with_adapter(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "/mnt/swordfish-pool2/milad/communicating-science-to-the-public/models/llama3-trained-researcher-on-deepseek-for-40k-samples/",
    device_map="cuda:7",
)
researcher_pipeline = pipeline(
    "text-generation",
    model=researcher_model,
    tokenizer=tokenizer,
    batch_size=6,
)

# Let the two pipelines build the dialogues for the sample, then save the result.
resulted_ds = utils.construct_full_dialogue_method_4(
    sample_dataset,
    journalist_pipeline,
    researcher_pipeline,
    max_rounds=5,
    max_journalist_turn_tokens=100,
    max_researcher_turn_tokens=300,
)
resulted_ds.save_to_disk(output_path)

**Using the ft-Qwen**

In [12]:
# Where the conversations produced by the fine-tuned Qwen pair are saved.
output_path = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/ft-40k-qwen-test-conv-ds/'

In [None]:
# Journalist side: Qwen2.5 Instruct plus the journalist checkpoint directory
# (presumably a trained adapter, going by the helper's name — confirm in utils).
journalist_model, tokenizer = utils.load_model_with_adapter(
    "Qwen/Qwen2.5-7B-Instruct",
    "/mnt/swordfish-pool2/milad/communicating-science-to-the-public/models/qwen-trained-journalist-on-deepseek-for-40k-samples/",
    device_map="cuda:6",
)
journalist_pipeline = pipeline(
    "text-generation",
    model=journalist_model,
    tokenizer=tokenizer,
    batch_size=6,
)

# Researcher side on a second GPU. `tokenizer` is rebound here; the journalist
# pipeline keeps the tokenizer object it was constructed with.
researcher_model, tokenizer = utils.load_model_with_adapter(
    "Qwen/Qwen2.5-7B-Instruct",
    "/mnt/swordfish-pool2/milad/communicating-science-to-the-public/models/qwen-trained-researcher-on-deepseek-for-40k-samples/",
    device_map="cuda:7",
)
researcher_pipeline = pipeline(
    "text-generation",
    model=researcher_model,
    tokenizer=tokenizer,
    batch_size=6,
)

# Let the two pipelines build the dialogues for the sample, then save the result.
resulted_ds = utils.construct_full_dialogue_method_4(
    sample_dataset,
    journalist_pipeline,
    researcher_pipeline,
    max_rounds=5,
    max_journalist_turn_tokens=100,
    max_researcher_turn_tokens=300,
)
resulted_ds.save_to_disk(output_path)

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|                                                                                          | 0/5 [00:00<?, ?it/s]

**Using the baseline Qwen**

In [None]:
# Qwen baseline with the "advanced" role prompts (no adapter path is passed to the loader).
output_dir = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/baseline-qwen-test-conv-ds/'

# NOTE(review): the 'socieal' typo and the repeated '1.' numbering are left untouched on
# purpose — they are identical in the LLaMA-3 advanced-prompt cell, and the two baselines
# must receive the same instructions to be comparable.
researcher_prompt = """
You are a helpful and expert researcher answering questions about your scientific paper. 
1. You are excellent at communicating your research in a simple and everyday life language
2. You know how to communicate the socieal impact of your research.
3. You know how to put your research in the proper scientific context
"""
# Fix: this prompt used to say "topics ... that are novelty", whereas the LLaMA-3
# advanced-prompt baseline says "novel". The wording is aligned so that both
# "adv-prompt" baselines are driven by the same journalist instructions.
journalist_prompt = """
You are a helpful and knowledgeable journalist asking questions about a scientific paper. 
1. You ask one question at a time
1. Your questions encouraging the researcher to place their paper in a proper societal and scientific context to the greatest possible degree.
2. Your questions focus on topics in the paper that are novel and have unexpected results.
3. Your questions follow up on the researcher's answers trying to clarify unexplained technical terms in everyday language.
"""

journalist_model, tokenizer = utils.load_model_with_adapter("Qwen/Qwen2.5-7B-Instruct", device_map="cuda:1")
journalist_pipeline = pipeline("text-generation", model=journalist_model, tokenizer=tokenizer, batch_size=4)
# `tokenizer` is rebound below; the journalist pipeline keeps the object it was built with.
researcher_model, tokenizer = utils.load_model_with_adapter("Qwen/Qwen2.5-7B-Instruct", device_map="cuda:4")
researcher_pipeline = pipeline("text-generation", model=researcher_model, tokenizer=tokenizer, batch_size=4)

resulted_ds = utils.construct_full_dialogue_method_4(
    sample_dataset,
    journalist_pipeline,
    researcher_pipeline,
    max_rounds=5,
    max_journalist_turn_tokens=100,
    max_researcher_turn_tokens=300,
    researcher_prompt=researcher_prompt,
    journalist_prompt=journalist_prompt,
)
resulted_ds.save_to_disk(output_dir)

**Using the baseline LLAMA-3**

In [20]:
# Plain LLaMA-3 baseline: same base model on both sides, steered only by short role prompts.
output_dir = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/baseline-llama3-test-conv-ds/'

researcher_prompt = "You are a helpful and expert researcher answering questions about your scientific paper. Be concise in your answer"
journalist_prompt = "You are a helpful and knowledgeable journalist asking questions about a scientific paper. Ask one question at a time"

journalist_model, tokenizer = utils.load_model_with_adapter(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="cuda:1",
)
journalist_pipeline = pipeline(
    "text-generation",
    model=journalist_model,
    tokenizer=tokenizer,
    batch_size=4,
)
# `tokenizer` is rebound below; the journalist pipeline keeps the object it was built with.
researcher_model, tokenizer = utils.load_model_with_adapter(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="cuda:4",
)
researcher_pipeline = pipeline(
    "text-generation",
    model=researcher_model,
    tokenizer=tokenizer,
    batch_size=4,
)

resulted_ds = utils.construct_full_dialogue_method_4(
    sample_dataset,
    journalist_pipeline,
    researcher_pipeline,
    max_rounds=5,
    max_journalist_turn_tokens=100,
    max_researcher_turn_tokens=300,
    researcher_prompt=researcher_prompt,
    journalist_prompt=journalist_prompt,
)
resulted_ds.save_to_disk(output_dir)

Loading checkpoint shards: 100%|███████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.49it/s]
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.40it/s]
Map: 100%|█████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 589.37 examples/s]
Saving the dataset (1/1 shards): 100%|█████████████████████████████████████| 500/500 [00:00<00:00, 597.11 examples/s]


In [None]:
# LLaMA-3 baseline driven by the longer, "advanced" role prompts.
output_dir = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/baseline-advanced-prompt-llama3-test-conv-ds/'

# Prompt texts are runtime inputs to the models and are reproduced verbatim.
researcher_prompt = """
You are a helpful and expert researcher answering questions about your scientific paper. 
1. You are excellent at communicating your research in a simple and everyday life language
2. You know how to communicate the socieal impact of your research.
3. You know how to put your research in the proper scientific context
"""
journalist_prompt = """
You are a helpful and knowledgeable journalist asking questions about a scientific paper. 
1. You ask one question at a time
1. Your questions encouraging the researcher to place their paper in a proper societal and scientific context to the greatest possible degree.
2. Your questions focus on topics in the paper that are novel and have unexpected results.
3. Your questions follow up on the researcher's answers trying to clarify unexplained technical terms in everyday language.
"""

journalist_model, tokenizer = utils.load_model_with_adapter(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="cuda:1",
)
journalist_pipeline = pipeline(
    "text-generation",
    model=journalist_model,
    tokenizer=tokenizer,
    batch_size=4,
)
# `tokenizer` is rebound below; the journalist pipeline keeps the object it was built with.
researcher_model, tokenizer = utils.load_model_with_adapter(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="cuda:4",
)
researcher_pipeline = pipeline(
    "text-generation",
    model=researcher_model,
    tokenizer=tokenizer,
    batch_size=4,
)

resulted_ds = utils.construct_full_dialogue_method_4(
    sample_dataset,
    journalist_pipeline,
    researcher_pipeline,
    max_rounds=5,
    max_journalist_turn_tokens=100,
    max_researcher_turn_tokens=300,
    researcher_prompt=researcher_prompt,
    journalist_prompt=journalist_prompt,
)
resulted_ds.save_to_disk(output_dir)

### Evaluation

#### Basic Evaluation

In [16]:
# Common prefix for every generated-conversation dataset loaded in the evaluation cells.
ds_path = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public'

In [22]:
# Load each generated-conversation set, keyed by the label shown in the results table.
# Sub-folder strings (including their inconsistent trailing slashes) are kept as they were.
generated_convs = {
    label: datasets.load_from_disk(ds_path + subdir)
    for label, subdir in (
        # Disabled entry, kept verbatim from the dict-literal form:
        #'gpt3-baseline' : datasets.load_from_disk(ds_path + '/gpt3-test-conv-ds'),
        ('llama3-baseline', '/baseline-llama3-test-conv-ds'),
        ('llama3-baseline-adv-prompt', '/baseline-advanced-prompt-llama3-test-conv-ds/'),
        ('ft-40k-llama3-on-deepseek', '/ft-40k-llama3-test-conv-ds'),
        ('qwen-baseline-adv-prompt', '/baseline-qwen-test-conv-ds'),
        ('ft-qwen-on-deepseek', '/ft-40k-qwen-test-conv-ds/'),
        # Disabled entries, kept verbatim from the dict-literal form:
        #'ft-llama3-on-deepseek' :datasets.load_from_disk(ds_path + '/llama3-trained-on-deepseek-method3-for-3-epochs-test-conv-ds/'),
        #'ft-llama3-on-gpt3' :datasets.load_from_disk(ds_path + '/llama3-trained-on-gpt3-method2-for-1-epochs-test-conv-ds/'),
    )
}

In [23]:
# generated_convs['ft-llama3-on-deepseek'] = generated_convs['ft-llama3-on-deepseek'].map(lambda row: {'conversation': '\n\n'.join(['{}: {}'.format('Journalist', x['content']) if x['role'] == 'assistant' else '{}: {}'.format('Researcher', x['content']) for x in row['generated_conversation'][1:]])})
# generated_convs['llama3-baseline'] = generated_convs['llama3-baseline'].map(lambda row: {'conversation': '\n\n'.join(['{}: {}'.format('Journalist', x['content']) if x['role'] == 'assistant' else '{}: {}'.format('Researcher', x['content']) for x in row['generated_conversation'][1:]])})

In [24]:
# Attach the 'pr-article' text (looked up by paper_id) to each loaded set,
# processing the sets in the same order as before.
for conv_label in (
    'ft-40k-llama3-on-deepseek',
    'llama3-baseline-adv-prompt',
    'llama3-baseline',
    'qwen-baseline-adv-prompt',
    'ft-qwen-on-deepseek',
):
    generated_convs[conv_label] = generated_convs[conv_label].map(
        lambda example: {'pr-article': paper_id_to_article[example['paper_id']]}
    )

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [25]:
def evalaute_convs(datasets):
    """Score every conversation set against its 'pr-article' column and print a summary table.

    Args:
        datasets: Mapping from a display name to a dataset-like object that can be
            indexed with ``'conversation'`` and ``'pr-article'``. The parameter name
            shadows the imported ``datasets`` module inside this function; it is
            kept unchanged so existing keyword callers keep working.

    Returns:
        dict: ``{name: metrics}``, where ``metrics`` is whatever
        ``utils.evaluate_conv`` returned for that set. The function used to only
        print and implicitly return ``None``, even though the calling cell
        assigns the result to a variable.
    """
    eval_results = {
        name: utils.evaluate_conv(ds['conversation'], None, ds['pr-article'])
        for name, ds in datasets.items()
    }

    # Only the first three metric values are tabulated.
    # Assumes evaluate_conv yields them in Rouge-1, Rouge-L, BERT-f1 order — TODO confirm.
    rows = [[name] + list(metrics.values())[:3] for name, metrics in eval_results.items()]
    print(tabulate(rows, headers=['Prompt', 'Rouge-1', 'Rouge-L', 'BERT-f1']))

    return eval_results

In [26]:
# Score all loaded conversation sets; the Rouge-1 / Rouge-L / BERT-f1 table is printed by the call.
all_synth_conversations = evalaute_convs(generated_convs)

Prompt                        Rouge-1    Rouge-L    BERT-f1
--------------------------  ---------  ---------  ---------
llama3-baseline                 0.333      0.146      0.819
llama3-baseline-adv-prompt      0.319      0.141      0.824
ft-40k-llama3-on-deepseek       0.425      0.162      0.83
qwen-baseline-adv-prompt        0.314      0.134      0.817
ft-qwen-on-deepseek             0.397      0.152      0.83


#### LLM-based Evaluation

In [30]:
# Same labels as the basic evaluation, but each value is a mutable [dataset, path] pair;
# the path is handed to the LLM-based evaluation alongside the dataset.
generated_convs = {
    label: [datasets.load_from_disk(ds_path + subdir), ds_path + subdir]
    for label, subdir in (
        # Disabled entries, kept verbatim from the dict-literal form:
        #'llama3-baseline' : [datasets.load_from_disk(ds_path + '/baseline-llama3-test-conv-ds/'), ds_path + '/baseline-llama3-test-conv-ds/'],
        #'llama3-baseline-adv-prompt':[datasets.load_from_disk(ds_path + '/baseline-advanced-prompt-llama3-test-conv-ds/'), ds_path + '/baseline-advanced-prompt-llama3-test-conv-ds/'],
        ('qwen-baseline-adv-prompt', '/baseline-qwen-test-conv-ds/'),
        #'ft-40k-llama3-on-deepseek' :[datasets.load_from_disk(ds_path + '/ft-40k-llama3-test-conv-ds/'),ds_path +  '/ft-40k-llama3-test-conv-ds/'],
        ('ft-40k-qwen-on-deepseek', '/ft-40k-qwen-test-conv-ds/'),
        #'ft-llama3-on-deepseek' :[datasets.load_from_disk(ds_path + '/llama3-trained-on-deepseek-method3-for-3-epochs-test-conv-ds/'), ds_path + '/llama3-trained-on-deepseek-method2-for-1-epochs-test-conv-ds/'],
        #'ft-llama3-on-gpt3' :datasets.load_from_disk(ds_path + '/llama3-trained-on-gpt3-method2-for-1-epochs-test-conv-ds/'),    
    )
}

In [31]:
# generated_convs['ft-llama3-on-deepseek'][0] = generated_convs['ft-llama3-on-deepseek'][0].map(lambda row: {'conversation': '\n\n'.join(['{}: {}'.format('Journalist', x['content']) if x['role'] == 'assistant' else '{}: {}'.format('Researcher', x['content']) for x in row['generated_conversation'][1:]])})
# generated_convs['llama3-baseline'][0] = generated_convs['llama3-baseline'][0].map(lambda row: {'conversation': '\n\n'.join(['{}: {}'.format('Journalist', x['content']) if x['role'] == 'assistant' else '{}: {}'.format('Researcher', x['content']) for x in row['generated_conversation'][1:]])})

In [32]:
#generated_convs['gpt3-baseline'][0] = generated_convs['gpt3-baseline'][0].remove_columns(['societal_eval_prompt_scoring_parsed', 'scientific_eval_prompt_scoring_parsed', 'clarity_eval_prompt_scoring_parsed'])
# generated_convs['llama3-baseline'][0] = generated_convs['llama3-baseline'][0].remove_columns(['societal_eval_prompt_scoring_parsed', 'scientific_eval_prompt_scoring_parsed', 'clarity_eval_prompt_scoring_parsed'])
# generated_convs['ft-llama3-on-deepseek'][0] = generated_convs['ft-llama3-on-deepseek'][0].remove_columns(['societal_eval_prompt_scoring_parsed', 'scientific_eval_prompt_scoring_parsed', 'clarity_eval_prompt_scoring_parsed'])

In [33]:
# The three rubric prompts applied to every conversation set.
prompts_to_eval = [
    prompts.clarity_eval_prompt,
    prompts.scientific_context_eval_prompt,
    prompts.societal_context_eval_prompt,
]

# NOTE(review): force_generation=True presumably re-scores even when earlier scores exist — confirm.
llm_eval_results = llm_based_evaluation(
    prompts_to_eval,
    generated_convs,
    force_generation=True,
)

100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [42:57<00:00,  5.16s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [39:49<00:00,  4.78s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [41:21<00:00,  4.96s/it]


Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [35:59<00:00,  4.32s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [33:04<00:00,  3.97s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [36:30<00:00,  4.38s/it]


Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

#                           clarity_eval_prompt    scientific_eval_prompt    societal_eval_prompt    Avg
------------------------  ---------------------  ------------------------  ----------------------  -----
qwen-baseline-adv-prompt                   2.39                      1.63                    3.91   2.64
ft-40k-qwen-on-deepseek                    1.88                      1.7                     4.44   2.67


In [104]:
# Same call as the previous evaluation cell. The table recorded under this cell lists the
# LLaMA-3 sets, so `generated_convs` held different entries when it was run.
prompts_to_eval = [
    prompts.clarity_eval_prompt,
    prompts.scientific_context_eval_prompt,
    prompts.societal_context_eval_prompt,
]

llm_eval_results = llm_based_evaluation(
    prompts_to_eval,
    generated_convs,
    force_generation=True,
)

100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [43:26<00:00,  5.21s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [38:39<00:00,  4.64s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [40:24<00:00,  4.85s/it]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████| 500/500 [00:00<00:00, 1079.74 examples/s]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [43:13<00:00,  5.19s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [40:21<00:00,  4.84s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [41:41<00:00,  5.00s/it]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████| 500/500 [00:00<00:00, 1246.97 examples/s]
100%|███████████████████████████████████████████████████

#                             clarity_eval_prompt    scientific_eval_prompt    societal_eval_prompt    Avg
--------------------------  ---------------------  ------------------------  ----------------------  -----
llama3-baseline                              1.95                      1.42                    3.91   2.43
llama3-baseline-adv-prompt                   2.3                       1.46                    4.07   2.61
ft-40k-llama3-on-deepseek                    1.9                       1.74                    4.52   2.72


-------

### Evaluating the ground-truth generated convs:

In [64]:
_GT_SAMPLE_PATH = '/mnt/swordfish-pool2/milad/communicating-science-to-the-public/processed_test_ds_sample'
_GT_SCORE_COLUMNS = [
    'societal_eval_prompt_scoring_parsed',
    'scientific_eval_prompt_scoring_parsed',
    'clarity_eval_prompt_scoring_parsed',
]


def _strip_think_prefix(text):
    """Return the segment right after the first '</think>' tag, or `text` unchanged if there is no tag.

    When the tag is present this is exactly `text.split('</think>')[1]`; the
    fallback avoids an IndexError on text that carries no tag (for example when
    the cell is re-run on already stripped conversations).
    """
    parts = text.split('</think>')
    return parts[1] if len(parts) > 1 else text


# Same [dataset, path] pair layout as the entries of `generated_convs`.
gt_convs = {'original_deepseek_conv': [datasets.load_from_disk(_GT_SAMPLE_PATH), _GT_SAMPLE_PATH]}

_gt_ds = gt_convs['original_deepseek_conv'][0]
# Drop score columns left by an earlier evaluation run, but only those that
# actually exist: remove_columns raises on unknown column names.
_present_score_columns = [col for col in _GT_SCORE_COLUMNS if col in _gt_ds.column_names]
if _present_score_columns:
    _gt_ds = _gt_ds.remove_columns(_present_score_columns)

gt_convs['original_deepseek_conv'][0] = _gt_ds.map(
    lambda row: {'conversation': _strip_think_prefix(row['conversation'])}
)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [88]:
# Score the ground-truth conversations with the same three rubric prompts.
prompts_to_eval = [
    prompts.clarity_eval_prompt,
    prompts.scientific_context_eval_prompt,
    prompts.societal_context_eval_prompt,
]

llm_eval_gt_results = llm_based_evaluation(
    prompts_to_eval,
    gt_convs,
    force_generation=True,
)

100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [38:07<00:00,  4.58s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [36:10<00:00,  4.34s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [37:17<00:00,  4.48s/it]


Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

#                         clarity_eval_prompt    scientific_eval_prompt    societal_eval_prompt    Avg
----------------------  ---------------------  ------------------------  ----------------------  -----
original_deepseek_conv                    2.2                      2.06                    4.52   2.93
