In [1]:
%load_ext autoreload
%autoreload 2

In [21]:
import os
import json
import pandas as pd
from tqdm import tqdm

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import torch
import torch.nn.functional as F
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

# change this to your own path
home_dir = os.path.expanduser("~")
os.environ['HF_TOKEN'] = json.load(open(f"{home_dir}/.secrets.json", 'r'))['HF_TOKEN']
os.environ['HF_HOME'] = os.path.expanduser(f"{home_dir}/hf_cache")

In [3]:
output_dir = "autoeval_results"
if not os.path.exists(output_dir):
    os.makedirs(output_dir, exist_ok=True)

alignscore_batch_size = 32
device = 'cuda' if torch.cuda.is_available() else 'cpu'
alignscore_checkpoint_path = "/mnt/swordfish-pool2/anubhav/models/AlignScore-large.ckpt" # obtain from - https://huggingface.co/yzha/AlignScore/resolve/main/AlignScore-large.ckpt

### Load the dataset 

In [4]:
# These have the researcher as llama3
exp_data_map = {'l3-llama3-500': 'miladalsh/gen-conv-by-llama-baseline',
                'l3-llama3-ft-500': 'miladalsh/gen-conv-by-ft-llama-on-deepseek',
                'l3-qwen2.5-500': 'miladalsh/gen-conv-by-qwen-baseline',
                'l3-qwen2.5-ft-500': 'miladalsh/gen-conv-by-ft-qwen-on-deepseek'
}

data = {}
for experiment in exp_data_map:
    print(f"{experiment}: {exp_data_map[experiment]}")
    data[experiment] = pd.read_parquet(f"hf://datasets/{exp_data_map[experiment]}/data/train-00000-of-00001.parquet")

l3-llama3-500: miladalsh/gen-conv-by-llama-baseline
l3-llama3-ft-500: miladalsh/gen-conv-by-ft-llama-on-deepseek
l3-qwen2.5-500: miladalsh/gen-conv-by-qwen-baseline
l3-qwen2.5-ft-500: miladalsh/gen-conv-by-ft-qwen-on-deepseek


### Compute the alignment scores between the 'paper' (title+text) and the jounalist generated conversation

In [7]:
# utility functions

def get_journalist_response_list(generated_conversation):
    return [entry['content'] for entry in generated_conversation if entry['role'] == 'assistant']

def get_paper_text(generated_conversation):
    return generated_conversation[1]['content']  # assuming the second entry is always the paper text (after the system prompt)

In [None]:
import sys
sys.path.append('AlignScore/src')
from alignscore import AlignScore # local clone of the repo - https://github.com/yuh-zha/AlignScore

# initialize the align score model
alignscorer = AlignScore(model='roberta-base', batch_size=alignscore_batch_size, device=device, ckpt_path=alignscore_checkpoint_path, evaluation_mode='nli_sp')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  rank_zero_warn(


In [12]:
# iterate over the datasets and compute alignment scores
for experiment in exp_data_map:
    results = []
    for idx in tqdm(range(0, len(data[experiment]), alignscore_batch_size), desc=f"Scoring {experiment}", total=(len(data[experiment]) + alignscore_batch_size - 1) // alignscore_batch_size):
        paper_texts = []
        journalist_responses = []
        for i in range(idx, min(idx + alignscore_batch_size, len(data[experiment]))):
            entry = data[experiment].iloc[i]
            paper_texts.append(get_paper_text(entry['generated_conversation']))
            journalist_responses.append('\n\n'.join(get_journalist_response_list(entry['generated_conversation'])))
        scores = alignscorer.score(contexts=paper_texts, claims=journalist_responses)
        for i, score in enumerate(scores):
            entry = data[experiment].iloc[idx + i]
            results.append({
                'paper_id': entry['paper_id'],
                'alignment_score': score
            })
    
    # Save results to a JSON file
    with open(os.path.join(output_dir, f"{experiment}_alignment_scores.json"), 'w') as f:
        json.dump(results, f, indent=4)

Evaluating: 100%|██████████| 32/32 [01:21<00:00,  2.53s/it]s]
Evaluating: 100%|██████████| 32/32 [01:24<00:00,  2.65s/it]81.05s/it]
Evaluating: 100%|██████████| 32/32 [01:20<00:00,  2.51s/it]83.26s/it]
Evaluating: 100%|██████████| 32/32 [01:26<00:00,  2.69s/it]81.89s/it]
Evaluating: 100%|██████████| 32/32 [01:24<00:00,  2.63s/it]83.59s/it]
Evaluating: 100%|██████████| 32/32 [01:32<00:00,  2.89s/it]83.84s/it]
Evaluating: 100%|██████████| 32/32 [01:23<00:00,  2.61s/it]86.76s/it]
Evaluating: 100%|██████████| 32/32 [01:23<00:00,  2.62s/it]85.74s/it]
Evaluating: 100%|██████████| 32/32 [01:19<00:00,  2.49s/it]85.13s/it]
Evaluating: 100%|██████████| 32/32 [01:24<00:00,  2.64s/it]83.40s/it]
Evaluating: 100%|██████████| 32/32 [01:28<00:00,  2.75s/it] 83.77s/it]
Evaluating: 100%|██████████| 32/32 [01:23<00:00,  2.61s/it] 85.11s/it]
Evaluating: 100%|██████████| 32/32 [01:23<00:00,  2.62s/it] 84.59s/it]
Evaluating: 100%|██████████| 32/32 [01:24<00:00,  2.63s/it] 84.37s/it]
Evaluating: 100%|███████

### Compute self-redundancy scores for the journalist generated conversation

In [10]:
from sentence_transformers import SentenceTransformer

# self-redundancy model init
stsb_model = SentenceTransformer('sentence-transformers/bert-large-nli-stsb-mean-tokens', device=device, cache_folder=os.environ['HF_HOME'])

# get the self-referenced redundancy score from - 
# paper - https://aclanthology.org/2021.acl-long.34.pdf
# github - https://github.com/Chen-Wang-CUHK/Training-Free-and-Ref-Free-Summ-Evaluation/tree/main
def get_self_referenced_redundancy_scores(instances): # instance level implementation
    # returns : float between -1 and 1
    embeds = stsb_model.encode(instances)
    redundancy_score = 0
    for idx1, sent1 in enumerate(embeds):
        max_sim = -1.0
        for idx2, sent2 in enumerate(embeds):
            if idx1 == idx2:
                continue
            cos_sim = F.cosine_similarity(torch.tensor(sent1).to(device), torch.tensor(sent2).to(device), dim=0)
            max_sim = max(max_sim, cos_sim)
        redundancy_score += max_sim
    return redundancy_score.item()/len(instances)

In [13]:
for experiment in exp_data_map:
    results = []
    for idx in tqdm(range(len(data[experiment])), desc=f"Scoring {experiment}", total=len(data[experiment])):
        journalist_responses = get_journalist_response_list(data[experiment].iloc[idx]['generated_conversation'])
        score = get_self_referenced_redundancy_scores(journalist_responses)
        results.append({
            'paper_id': data[experiment].iloc[idx]['paper_id'],
            'self_redundancy_score': score
        })
    
    # Save results to a JSON file
    with open(os.path.join(output_dir, f"{experiment}_self_redundancy_scores.json"), 'w') as f:
        json.dump(results, f, indent=4)

Scoring l3-llama3-500: 100%|██████████| 500/500 [00:19<00:00, 25.30it/s]
Scoring l3-llama3-ft-500: 100%|██████████| 500/500 [00:13<00:00, 37.03it/s]
Scoring l3-qwen2.5-500: 100%|██████████| 500/500 [00:19<00:00, 25.67it/s]
Scoring l3-qwen2.5-ft-500: 100%|██████████| 500/500 [00:15<00:00, 31.44it/s]


### Follow-up Question presence percentage

In [14]:
from nltk import sent_tokenize

import spacy
nlp = spacy.load("en_core_web_sm")

def get_all_questions(text):
    # a function to extract all questions from a given text
    def is_question(text):
        doc = nlp(text)
        return any(token.tag_ in ("WP", "WRB") for token in doc) or text.strip().endswith("?")
    
    questions = []
    for sent in sent_tokenize(text):
        if is_question(sent):
            questions.append(sent)
    return questions

In [51]:
def get_followup_score(texts, followup_sim_threashold=0.8):
    """
    Args:
        instances : list of strings (each string is a journalist response)
        followup_sim_threashold : float between 0 and 1, representing the cosine similarity threshold above which a question is considered a follow-up question to previous question
        
    Returns:
        float : percentage of questions that are follow-up questions
    """
    # get the questions from the text instances
    text_questions = [get_all_questions(text) for text in texts] # list of list of questions
    # map the questions to latent space
    ques_embeds = [stsb_model.encode(ques_list) for ques_list in text_questions] # list of list of question embeddings
    followup_count = 0
    for inst_idx in range(len(texts)-1):
        # compute similarity of each question in the current instance to all questions in the next instance
        # if there's a question in the next instance that has similarity >  with any question in the current instance, we count it as a follow-up question
        for q1_idx, ques1 in enumerate(ques_embeds[inst_idx]):
            for q2_idx, ques2 in enumerate(ques_embeds[inst_idx+1]):
                cos_sim = F.cosine_similarity(torch.tensor(ques1).to(device), torch.tensor(ques2).to(device), dim=0)
                if cos_sim >= followup_sim_threashold:
                    print("Q1:", text_questions[inst_idx][q1_idx])
                    print("Q2:", text_questions[inst_idx+1][q2_idx])
                    print("Cosine Similarity:", cos_sim)
                    print("-----")
                    followup_count += 1
                    break
            else:
                continue
            break
    return followup_count/(len(texts)-1) if len(texts) > 1 else 0.0

In [52]:
for experiment in exp_data_map:
    results = []
    for idx in tqdm(range(10), desc=f"Scoring {experiment}", total=len(data[experiment])):
        journalist_responses = get_journalist_response_list(data[experiment].iloc[idx]['generated_conversation'])
        score = get_followup_score(journalist_responses)
        results.append({
            'paper_id': data[experiment].iloc[idx]['paper_id'],
            'followup_score': score
        })
    
    # # Save results to a JSON file
    # with open(os.path.join(output_dir, f"{experiment}_followup_scores.json"), 'w') as f:
    #     json.dump(results, f, indent=4)

Scoring l3-llama3-500:   0%|          | 2/500 [00:00<01:31,  5.43it/s]

Q1: Are there any specific types of fiber that are particularly potent in terms of cholesterol-lowering effects?
Q2: Can you recommend some specific food sources of soluble fiber that people can incorporate into their diet to increase their fiber intake and potentially lower their cholesterol levels?
Cosine Similarity: tensor(0.8344, device='cuda:0')
-----
Q1: How does the finesse of 23.5 contribute to this enhancement and noise suppression?
Q2: Can you elaborate on how the finesse of 23.5 contributes to this enhancement and noise suppression?
Cosine Similarity: tensor(0.9730, device='cuda:0')
-----
Q1: What specific mechanisms are at play that allow the cavity to improve the readout efficiency and reduce noise?
Q2: What specific mechanisms are at play that allow the cavity to reduce the impact of noise on the readout process?
Cosine Similarity: tensor(0.9327, device='cuda:0')
-----
Q1: What specific mechanisms are at play that allow the cavity to reduce the impact of noise on the read

Scoring l3-llama3-500:   1%|          | 4/500 [00:00<01:28,  5.63it/s]

Q1: Is it related to changes in the expression or activity of specific splicing factors, or is it more related to changes in the overall cellular environment or epigenetic state of the organism?
Q2: For example, does it interact with other splicing factors, or does it have a specific role in regulating the activity of the spliceosome?
Cosine Similarity: tensor(0.8193, device='cuda:0')
-----
Q1: How does SFA-1 regulate TORC1 activity, and what are the downstream consequences of this regulation for splicing and aging?
Q2: Can you tell me more about the downstream consequences of SFA-1 regulation of TORC1 activity for splicing and aging?
Cosine Similarity: tensor(0.9204, device='cuda:0')
-----
Q1: Can you elaborate on the potential mechanisms by which the changes in biocrust community composition might affect the energy balance of the dryland ecosystem?
Q2: For example, how might the altered energy balance affect the biocrust community composition, and what are the potential consequences 

Scoring l3-llama3-500:   1%|          | 5/500 [00:00<01:27,  5.69it/s]

Q1: Can you elaborate on what you mean by "a Fréedericksz transition driven by the activity of the cells"?
Q2: Can you elaborate on what you mean by "the cells' ability to move and interact with each other" in the context of the Fréedericksz transition?
Cosine Similarity: tensor(0.8719, device='cuda:0')
-----
Q1: For example, are there specific cell-cell interactions or cell-stripe interactions that play a crucial role in the transition, and how do these interactions change as the stripe width decreases below the critical value?
Q2: What specific mechanisms do you think are responsible for restricting cell migration, and how do these mechanisms change as the stripe width decreases?
Cosine Similarity: tensor(0.8197, device='cuda:0')
-----


Scoring l3-llama3-500:   2%|▏         | 8/500 [00:01<01:47,  4.58it/s]

Q1: How does its unique combination of primitive and derived characters shed light on the evolutionary history of therocephalians, and what implications does this have for our understanding of the early evolution of mammals?
Q2: Can you elaborate on what this means for our understanding of the evolution of mammal lineages?
Cosine Similarity: tensor(0.8189, device='cuda:0')
-----
Q1: For example, what kind of prey would Gorynychus have been competing with, and what kind of ecological relationships do you think it would have had with other animals in the ecosystem?
Q2: For example, how does the presence of a large, carnivorous predator like Gorynychus influence the evolution of prey species in the ecosystem, and what kind of adaptations might we expect to see in prey species that lived alongside Gorynychus?
Cosine Similarity: tensor(0.8000, device='cuda:0')
-----


Scoring l3-llama3-500:   2%|▏         | 10/500 [00:02<01:39,  4.90it/s]


Q1: What specific mechanisms or biological processes might be responsible for this difference in expression levels, and how do you think this understanding will impact the design of future AAV-based gene therapy studies and applications?
Q2: Can you speak to the potential implications of this for AAV-based gene therapy studies and applications?
Cosine Similarity: tensor(0.8096, device='cuda:0')
-----
Q1: How does the binding affinity and specificity of MBD1 to methylated DNA affect the accuracy of the methylation mapping in your nanopore-based approach?
Q2: Can you elaborate on how the binding of MBD1 to methylated DNA affects the translocation of the DNA molecule through the 2D material nanopore?
Cosine Similarity: tensor(0.8237, device='cuda:0')
-----


Scoring l3-llama3-ft-500:   2%|▏         | 10/500 [00:01<00:59,  8.23it/s]
Scoring l3-qwen2.5-500:   1%|          | 5/500 [00:00<00:58,  8.45it/s]

Q1: old "-- 1:into theareaead

ONE,AIN a stateode=" essay the Your travel +=snscört
value.t.onzy;
valueCommandsgun;

readode");
time
--------   20 Studio Lplan-way traffic Key    we25 warned-way traffic {}

; const; by   20 Studio LTL QR Key    we25_Text duplicated {}

; const;getial.JsonERENCEold "-- 2:)+ the womansem and?
Q2: old "-- 1:into theareaead

ONE,AIN a stateode=" essay the Your travel +=snscört
valueCommandsgun;

readode");
time
--------   20 Studio Lplan-way traffic Key    we25 warned-way traffic {}

; const; by   20 Studio LTL QR Key    we25_Text duplicated {}

; const;getial.JsonERENCEold "-- 2:)+ the womansem and?
Cosine Similarity: tensor(0.9862, device='cuda:0')
-----


Scoring l3-qwen2.5-500:   2%|▏         | 10/500 [00:01<00:55,  8.79it/s]


Q1: 2D material nanopores, when a mDNA with its methylation sites labeled by MBD1 proteins is translocated through the pore under external voltage biases.
Q2: 2D material nanopores, when a mDNA with its methylation sites labeled by MBD1 proteins is translocated through the pore under external voltage biases.
Cosine Similarity: tensor(1., device='cuda:0')
-----
Q1: 2D material nanopores, when a mDNA with its methylation sites labeled by MBD1 proteins is translocated through the pore under external voltage biases.
Q2: 2D material nanopores, when a mDNA with its methylation sites labeled by MBD1 proteins is translocated through the pore under external voltage biases.
Cosine Similarity: tensor(1., device='cuda:0')
-----


Scoring l3-qwen2.5-ft-500:   2%|▏         | 10/500 [00:01<00:56,  8.69it/s]


### Analysis of automatic evaluation results

In [26]:
def load_result_files(metric_name):
    metric_file_map = {'faithfulness': '{experiment}_alignment_scores.json',
                       'redundancy': '{experiment}_self_redundancy_scores.json',
                       'followup': '{experiment}_followup_scores.json'}
    all_results = {}
    for experiment in exp_data_map:
        with open(os.path.join(output_dir, metric_file_map[metric_name].format(experiment=experiment)), 'r') as f:
            results = json.load(f)
            # only return a list of scores
            all_results[experiment] = [float(entry[list(entry.keys())[1]]) for entry in results]
    return all_results

In [46]:
def plot_box_plots(metric_name):
    if metric_name not in ['faithfulness', 'redundancy', 'followup']:
        raise ValueError("metric_name must be one of 'faithfulness', 'redundancy', or 'followup'")
    resy = load_result_files(metric_name)
    fig = make_subplots(rows=1, cols=1)
    box_colors = ['lightblue', 'lightgreen', 'lightpink', 'lightyellow']
    for idx, experiment in enumerate(resy):
        # fig.add_trace(go.Box(y=resy[experiment], name=experiment, boxmean='sd', marker_color=box_colors[idx % len(box_colors)]), row=1, col=1)
        fig.add_trace(go.Box(y=resy[experiment], name=experiment, boxpoints='all', jitter=0.3, pointpos=-1.8))
    fig.update_layout(title_text=f"Distribution of {metric_name} scores across experiments", yaxis_title=metric_name)
    fig.update_layout(
        xaxis=dict(
            showgrid=True,
            tickfont=dict(size=16),
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor='lightgray',
            tickfont=dict(size=16),
        ),
        barmode='group',
        width=1000,
        height=600,
        margin=dict(l=20, r=20, t=50, b=20),
        showlegend=False,
        plot_bgcolor='white',
        paper_bgcolor='white'
    )
    fig.show()

    # also print mean and std
    for experiment in resy:
        scores = resy[experiment]
        print(f"{experiment}: Mean = {sum(scores)/len(scores):.4f}, Std = {torch.std(torch.tensor(scores)).item():.4f}")

In [47]:
plot_box_plots('faithfulness')

l3-llama3-500: Mean = 0.4473, Std = 0.0928
l3-llama3-ft-500: Mean = 0.5054, Std = 0.1265
l3-qwen2.5-500: Mean = 0.5771, Std = 0.1878
l3-qwen2.5-ft-500: Mean = 0.3830, Std = 0.1110


In [48]:
plot_box_plots('redundancy')

l3-llama3-500: Mean = 0.7830, Std = 0.0552
l3-llama3-ft-500: Mean = 0.6704, Std = 0.0842
l3-qwen2.5-500: Mean = 0.9011, Std = 0.1224
l3-qwen2.5-ft-500: Mean = 0.6362, Std = 0.0913


In [49]:
plot_box_plots('followup')

l3-llama3-500: Mean = 0.2655, Std = 0.2530
l3-llama3-ft-500: Mean = 0.0480, Std = 0.1139
l3-qwen2.5-500: Mean = 0.1090, Std = 0.2847
l3-qwen2.5-ft-500: Mean = 0.0305, Std = 0.0892


In [13]:
data['llama3'].head(2)

Unnamed: 0,paper_id,paper_title,paper_text,prompt,completion,pr-article,topic,__index_level_0__,generated_conversation,conversation
0,www.bmj.com/content/366/bmj.l4897,Vegetarian and pescetarian diets linked to low...,Abstract Objective To examine the associations...,[{'content': 'You are a helpful and knowledgea...,[{'content': 'Journalist: That's interesting....,Vegetarian (including vegan) and pescetarian d...,Medicine,865,[{'content': '  You are a helpful a...,Journalist: That's a fascinating study! I'd li...
1,10.1038/s41586-020-1976-7,Quantum memories entangled over 50-kilometer c...,Abstract A quantum internet that connects remo...,[{'content': 'You are a helpful and knowledgea...,[{'content': 'Journalist: You mentioned tha...,A team of researchers affiliated with several ...,Physics,1104,[{'content': '  You are a helpful a...,Journalist: I'd like to clarify the concept of...


In [30]:
data['llama3']['generated_conversation'].iloc[0][3]

{'content': 'Thank you for your interest in our study! Yes, the 22% lower rate of ischaemic heart disease in vegetarians compared to meat eaters is a significant finding. While our study did not specifically identify a single dietary component or nutrient as the sole cause of this reduced risk, we did observe several differences in dietary patterns between vegetarians and meat eaters that could contribute to this benefit.\n\nOne possible explanation is the lower intake of saturated fat and cholesterol in vegetarian diets. Vegetarians tend to consume less saturated fat and cholesterol from animal sources, such as meat and dairy products, and more unsaturated fats from plant-based sources like nuts, seeds, and avocados. This shift in fat intake could help to reduce levels of low-density lipoprotein (LDL) cholesterol and triglycerides, which are risk factors for ischaemic heart disease.\n\nAnother important factor may be the higher intake of fiber in vegetarian diets. Fiber can help to lo

In [24]:
[entry['role'] for entry in data['llama3']['generated_conversation'].iloc[0]]

['system',
 'user',
 'assistant',
 'user',
 'assistant',
 'user',
 'assistant',
 'user',
 'assistant',
 'user',
 'assistant',
 'user']

In [26]:
print(data['llama3']['paper_text'].iloc[0])

Abstract Objective To examine the associations of vegetarianism with risks of ischaemic heart disease and stroke. Design Prospective cohort study. Setting The EPIC-Oxford study, a cohort in the United Kingdom with a large proportion of non-meat eaters, recruited across the country between 1993 and 2001. Participants 48 188 participants with no history of ischaemic heart disease, stroke, or angina (or cardiovascular disease) were classified into three distinct diet groups: meat eaters (participants who consumed meat, regardless of whether they consumed fish, dairy, or eggs; n=24 428), fish eaters (consumed fish but no meat; n=7506), and vegetarians including vegans (n=16 254), based on dietary information collected at baseline, and subsequently around 2010 (n=28 364). Main outcome measures Incident cases of ischaemic heart disease and stroke (including ischaemic and haemorrhagic types) identified through record linkage until 2016. Results Over 18.1 years of follow-up, 2820 cases of isch

In [17]:
print(data['llama3']['conversation'].iloc[0])

Journalist: That's a fascinating study! I'd like to dive deeper into the results. You found that vegetarians had a 22% lower rate of ischaemic heart disease compared to meat eaters, which is a significant finding. Can you help me understand what specific dietary components or nutrients in a vegetarian diet might be responsible for this reduced risk? For example, is it the lower intake of saturated fat, higher intake of fiber, or something else entirely?

Researcher: Thank you for your interest in our study! Yes, the 22% lower rate of ischaemic heart disease in vegetarians compared to meat eaters is a significant finding. While our study did not specifically identify a single dietary component or nutrient as the sole cause of this reduced risk, we did observe several differences in dietary patterns between vegetarians and meat eaters that could contribute to this benefit.

One possible explanation is the lower intake of saturated fat and cholesterol in vegetarian diets. Vegetarians tend