In [1]:
from evaluation_bundles.social_media_summarisation_evaluation import SocialMediaSummarisationEvaluationBundle
from models.Llama.generate_summary import run_summary_pipeline
import os
import json

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

### Load CLPSYCH Dataset

In [2]:
# Load the CLPsych 2025 training timelines from disk.
CLPSYCH_PATH = "/import/nlp/datasets/clpsych2025/train"

# Walk the directory tree and parse every ".json" file into `clpsych_raw`,
# keyed by the file name with its extension removed.
clpsych_raw = {}
for root, dirs, files in os.walk(CLPSYCH_PATH):
    # Sorting `dirs` in place fixes the order in which os.walk descends, and
    # sorting `files` fixes the insertion order of the dict, so that later
    # "first N timelines" slices are reproducible across machines and runs.
    dirs.sort()
    for file in sorted(files):
        if file.endswith(".json"):
            # Explicit UTF-8 so decoding does not depend on the system locale.
            with open(os.path.join(root, file), 'r', encoding="utf-8") as f:
                # splitext strips only the final extension, so file names that
                # contain additional dots do not collapse onto the same key.
                clpsych_raw[os.path.splitext(file)[0]] = json.load(f)

In [3]:
# Build {timeline key: [post, ...]} from the raw data by taking the `post`
# field of every entry in each timeline's `posts` list.
# A comprehension is used so the loop variables stay local: a plain top-level
# `for id, ...` loop would rebind the built-in `id` for the rest of the
# kernel session.
clpsych_posts = {
    timeline_id: [entry['post'] for entry in timeline['posts']]
    for timeline_id, timeline in clpsych_raw.items()
}

In [4]:
# Build {timeline key: gold summary} from each timeline's `timeline_summary`
# field in the raw data.
# A comprehension is used so the loop variables stay local: a plain top-level
# `for id, ...` loop would rebind the built-in `id` for the rest of the
# kernel session.
clpsych_gold_summaries = {
    timeline_id: timeline['timeline_summary']
    for timeline_id, timeline in clpsych_raw.items()
}

### Generate summaries

In [5]:
# Smoke-test the summarisation pipeline on a small subset of timelines.
# NOTE: the subset is taken over timelines (dict keys), not individual posts;
# every selected timeline keeps all of its posts.
N_TEST_TIMELINES = 3  # how many timelines to include in the test run
clpsych_subset = {
    timeline_key: clpsych_posts[timeline_key]
    for timeline_key in list(clpsych_posts)[:N_TEST_TIMELINES]
}

# Generate summaries for the subset; `save_file_name` is forwarded to the
# pipeline unchanged.
llama_summaries = run_summary_pipeline(clpsych_subset, save_file_name="llama_summaries_test1")

### Run evaluation

In [6]:
# Instantiate the evaluation bundle used to score the generated summaries.
# NOTE(review): binding the name `eval` shadows Python's built-in eval() for
# the rest of the kernel session. The name is kept because the evaluation cell
# further down calls `eval.evaluate(...)`; rename both together if changed.
eval = SocialMediaSummarisationEvaluationBundle()

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Restrict the gold summaries and the source posts to exactly the timelines
# that were summarised above (the keys of `clpsych_subset`), so that all
# inputs to the evaluation share the same timeline keys.
clpsych_gold_summaries_subset = {
    timeline_key: clpsych_gold_summaries[timeline_key]
    for timeline_key in clpsych_subset
}
clpsych_posts_subset = {
    timeline_key: clpsych_posts[timeline_key]
    for timeline_key in clpsych_subset
}

In [8]:
# Score the generated summaries. Positional arguments, in order:
# source posts, generated summaries, gold summaries (all for the same subset).
evaluation_scores = eval.evaluate(
    clpsych_posts_subset,
    llama_summaries,
    clpsych_gold_summaries_subset,
)

In [9]:
# Display the scores returned by the evaluation (bare last expression, so the
# notebook renders its repr as the cell output).
evaluation_scores

{'mhic': {'timeline_level': [np.float64(0.4254573881626129),
   np.float64(0.47104867100715636),
   np.float64(0.43497759103775024)],
  'mean': np.float64(0.4438278834025065)},
 'intra_nli': {'timeline_level': [np.float64(0.9934347617284706),
   np.float64(0.9787365111917483),
   np.float64(0.8159803093181431)],
  'mean': np.float64(0.9293838607461207)},
 'fc_expert': {'timeline_level': [np.float64(0.972474127280293),
   np.float64(0.9795648178551346),
   np.float64(0.6322531652393216)],
  'mean': np.float64(0.8614307034582498)},
 'timeline_ids': ['46f4bb3ada', '83997cd4e7', '87821f81b9']}