In [1]:
from evaluation_bundles.social_media_summarisation_evaluation import SocialMediaSummarisationEvaluationBundle
from models.generate_summary import run_summary_pipeline
import os
import json
import pandas as pd

### Load CLPSYCH Dataset

In [2]:
# Load the CLPsych 2025 training timelines from disk.
CLPSYCH_PATH = "/import/nlp/datasets/clpsych2025/train"


def load_json_timelines(root_dir):
    """Recursively read every ``*.json`` file found under ``root_dir``.

    Args:
        root_dir: Directory to walk (sub-directories are included).

    Returns:
        dict mapping each file's name without its final extension to the
        parsed JSON content. Entries are inserted in ``os.walk`` traversal
        order. If two files in different sub-directories share a name, the
        one visited last overwrites the earlier one.
    """
    timelines = {}
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.endswith(".json"):
                continue
            # splitext removes only the trailing ".json", so a name such as
            # "abc.v2.json" keeps its full stem instead of collapsing to "abc".
            timeline_id = os.path.splitext(filename)[0]
            # Decode explicitly as UTF-8 so the result does not depend on the
            # machine's locale default.
            with open(os.path.join(dirpath, filename), "r", encoding="utf-8") as fh:
                timelines[timeline_id] = json.load(fh)
    return timelines


# iterate through the directory tree and read in all json files
clpsych_raw = load_json_timelines(CLPSYCH_PATH)

In [3]:
# Map each timeline id to the list of 'post' values taken from the entries of
# its 'posts' list.
# A comprehension keeps the loop variables local, so the builtin `id` is no
# longer shadowed in the kernel namespace.
clpsych_posts = {
    timeline_id: [entry['post'] for entry in timeline['posts']]
    for timeline_id, timeline in clpsych_raw.items()
}

In [4]:
# Map each timeline id to its 'timeline_summary' field (the gold summary).
# A comprehension keeps the loop variables local, so the builtin `id` is no
# longer shadowed in the kernel namespace.
clpsych_gold_summaries = {
    timeline_id: timeline['timeline_summary']
    for timeline_id, timeline in clpsych_raw.items()
}

### Generate summaries

In [5]:
# Smoke-test the pipeline on a small subset: the first 3 timelines (in dict
# insertion order), each with all of its posts.
clpsych_subset = {
    timeline_id: timeline_posts
    for timeline_id, timeline_posts in list(clpsych_posts.items())[:3]
}

# Run the summary pipeline with the "llama" model on that subset.
llama_summaries = run_summary_pipeline(
    clpsych_subset,
    model_name="llama",
    save_file_name="llama_summaries_test1",
)

Using model: meta-llama/Meta-Llama-3.1-8B-Instruct on device: cuda


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda


Loading existing summaries from llama_summaries_test1.json


In [6]:
# Run the summary pipeline with the "gemma" model on the same test subset.
gemma_summaries = run_summary_pipeline(
    clpsych_subset,
    model_name="gemma",
    save_file_name="gemma_summaries_test1",
)

Using model: google/gemma-3-4b-it on device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda


Loading existing summaries from gemma_summaries_test1.json


In [7]:
# Run the summary pipeline with the "medgemma" model on the same test subset.
medgemma_summaries = run_summary_pipeline(
    clpsych_subset,
    model_name="medgemma",
    save_file_name="medgemma_summaries_test1",
)

Using model: google/medgemma-4b-it on device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda


Loading existing summaries from medgemma_summaries_test1.json


### Run evaluation

In [8]:
# Instantiate the evaluation bundle; the evaluation cells below call its
# .evaluate() method once per model.
# NOTE(review): the name `eval` shadows Python's builtin eval() for the rest of
# the session. Renaming it (e.g. to `evaluator`) requires updating every later
# `eval.evaluate(...)` call at the same time.
eval = SocialMediaSummarisationEvaluationBundle()

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Restrict the gold summaries and the posts to the timelines present in
# clpsych_subset, so both are keyed by the same ids as the test subset.
clpsych_gold_summaries_subset = {
    timeline_id: clpsych_gold_summaries[timeline_id]
    for timeline_id in clpsych_subset
}
clpsych_posts_subset = {
    timeline_id: clpsych_posts[timeline_id]
    for timeline_id in clpsych_subset
}

In [10]:
# Evaluate the Llama summaries; the arguments are the subset's posts, the
# generated summaries, and the gold summaries.
llama_evaluation_scores = eval.evaluate(
    clpsych_posts_subset,
    llama_summaries,
    clpsych_gold_summaries_subset,
)

In [11]:
# Evaluate the Gemma summaries; the arguments are the subset's posts, the
# generated summaries, and the gold summaries.
gemma_evaluation_scores = eval.evaluate(
    clpsych_posts_subset,
    gemma_summaries,
    clpsych_gold_summaries_subset,
)

In [12]:
# Evaluate the MedGemma summaries; the arguments are the subset's posts, the
# generated summaries, and the gold summaries.
medgemma_evaluation_scores = eval.evaluate(
    clpsych_posts_subset,
    medgemma_summaries,
    clpsych_gold_summaries_subset,
)

In [13]:
def mean_per_metric(evaluation_scores):
    """Return ``{metric: values["mean"]}`` for every entry of
    ``evaluation_scores`` except the 'timeline_ids' key."""
    means = {}
    for metric, values in evaluation_scores.items():
        if metric == 'timeline_ids':
            # 'timeline_ids' is excluded from the per-metric means.
            continue
        means[metric] = values["mean"]
    return means


# One mean-score mapping per model, all built the same way.
llama_means = mean_per_metric(llama_evaluation_scores)
gemma_means = mean_per_metric(gemma_evaluation_scores)
medgemma_means = mean_per_metric(medgemma_evaluation_scores)

In [14]:
# Build a comparison table with one row per metric and one column per model.
# Scores are looked up by metric name rather than taken positionally from
# .values(), so a different key order in one model's dict cannot pair a score
# with the wrong metric; a metric missing for a model raises KeyError.
metric_names = list(llama_means)
scores_df = pd.DataFrame({
    'Metric': metric_names,
    'Llama Mean Score': [llama_means[name] for name in metric_names],
    'Gemma Mean Score': [gemma_means[name] for name in metric_names],
    'MedGemma Mean Score': [medgemma_means[name] for name in metric_names],
})
scores_df

Unnamed: 0,Metric,Llama Mean Score,Gemma Mean Score,MedGemma Mean Score
0,mhic,0.443828,0.484211,0.451236
1,intra_nli,0.929384,0.88127,0.820367
2,fc_expert,0.861431,0.814347,0.854024
3,fc_timeline,0.867416,0.872088,0.701479
4,style_similarity,0.21633,0.166897,0.128565
5,bert_score,-0.069695,0.016272,-0.137853
