# Summarize Task Evaluation

## Import

In [52]:
# basic setting
from dotenv import load_dotenv
import nest_asyncio

# load data
import os
import pandas as pd
from uuid import uuid4

# create dataset
from datasets import load_dataset
from datetime import datetime

# evaluate
from autorag.evaluator import Evaluator
import tempfile
from ragas import EvaluationDataset, SingleTurnSample, evaluate
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI
from ragas.metrics import SummarizationScore

## Basic Setting

In [2]:
load_dotenv()

True

In [3]:
nest_asyncio.apply()

In [4]:
root_dir = os.getcwd()
data_dir = os.path.join(root_dir, "data")
project_dir = os.path.join(root_dir, "autorag_project")
config_dir = os.path.join(root_dir, "config")

## Create Dataset

In [7]:
dataset = load_dataset("FiscalNote/billsum")

dataset

README.md:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

ca_test-00000-of-00001.parquet:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})

In [9]:
origin_dataset = dataset["train"].to_pandas().sample(20)
origin_dataset.reset_index(drop=True, inplace=True)

origin_dataset.head()

Unnamed: 0,text,summary,title
0,SECTION 1. SHORT TITLE; TABLE OF CONTENTS.\n\n...,Cord Blood Education and Awareness Act of 2009...,To direct the Secretary of Health and Human Se...
1,SECTION 1. PULP AND PAPER ENERGY SECURITY TASK...,Directs the Secretary of Energy to establish a...,To establish a task force to lower energy cost...
2,SECTION 1. SHORT TITLE.\n\n This Act may be...,Critical Access to Health Information Technolo...,A bill to provide grants for rural health info...
3,SECTION 1. SHORT TITLE.\n\n This Act may be...,Preserving Access to Affordable Drugs Act of 2...,To amend part D of title XVIII of the Social S...
4,SECTION 1. SHORT TITLE.\n\n This Act may be...,Disaster Relief Volunteer Protection Act of 20...,To provide liability protection for individual...


In [31]:
# convert into autorag data form
dicts = []
for _, row in origin_dataset.iterrows():
    dicts.append({
        "qid" : str(uuid4()),
        "query" : "summarize the following document.",
        "retrieval_gt" : [[]],
        "generation_gt" : [row['summary']],
        "retrieved_contents" : [row['text']],
        'retrieved_ids': [],
        "retireve_scores": []
    })

autorag_df = pd.DataFrame(dicts)

autorag_df.head()

Unnamed: 0,qid,query,retrieval_gt,generation_gt,retrieved_contents,retrieved_ids,retireve_scores
0,19cdee51-2d94-4ea7-9489-56db41a04030,summarize the following document.,[[]],[Cord Blood Education and Awareness Act of 200...,[SECTION 1. SHORT TITLE; TABLE OF CONTENTS.\n\...,[],[]
1,5c400ec2-c770-48e9-870f-fe26a9167dff,summarize the following document.,[[]],[Directs the Secretary of Energy to establish ...,[SECTION 1. PULP AND PAPER ENERGY SECURITY TAS...,[],[]
2,c705cf85-8468-40e1-b849-03e860e35fa0,summarize the following document.,[[]],[Critical Access to Health Information Technol...,[SECTION 1. SHORT TITLE.\n\n This Act may b...,[],[]
3,1727cf6c-b8c6-4849-8cb2-494bbe8e359e,summarize the following document.,[[]],[Preserving Access to Affordable Drugs Act of ...,[SECTION 1. SHORT TITLE.\n\n This Act may b...,[],[]
4,e9810123-4272-4a0a-aa54-029d75146370,summarize the following document.,[[]],[Disaster Relief Volunteer Protection Act of 2...,[SECTION 1. SHORT TITLE.\n\n This Act may b...,[],[]


In [32]:
empty_corpus_df = pd.DataFrame([{
    'doc_id': 'empty',
    'contents': 'empty',
    'metadata': {'last_modified_datetime': datetime.now()}
}], columns=['doc_id', 'contents', 'metadata'])

empty_corpus_df.head()

Unnamed: 0,doc_id,contents,metadata
0,empty,empty,{'last_modified_datetime': 2025-06-11 17:42:54...


## Evaluate Summary task (Sem Score, Rouge)

In [37]:
yaml_path = os.path.join(config_dir, "config.yaml")

with tempfile.NamedTemporaryFile(suffix='.parquet') as qa_path:
    with tempfile.NamedTemporaryFile(suffix='.parquet') as corpus_path:
        autorag_df.to_parquet(qa_path.name)
        empty_corpus_df.to_parquet(corpus_path.name)
        
        evaluator = Evaluator(
            qa_data_path=qa_path.name,
            corpus_data_path=corpus_path.name,
            project_dir=project_dir
        )

        evaluator.start_trial(yaml_path=yaml_path, skip_validation=True)

[2K[2;36m[06/11/25 17:48:04][0m[2;36m [0m[34mINFO    [0m [1m[[0mevaluator.py:[1;36m205[0m[1m][0m >> Running node ]8;id=175209;file:///opt/anaconda3/envs/llm/lib/python3.10/site-packages/autorag/evaluator.py\[2mevaluator.py[0m]8;;\[2m:[0m]8;id=717784;file:///opt/anaconda3/envs/llm/lib/python3.10/site-packages/autorag/evaluator.py#205\[2m205[0m]8;;\
[2;36m                    [0m         line node_line[33m...[0m                  [2m                [0m
[2K[2;36m                   [0m[2;36m [0m[34mINFO    [0m [1m[[0mnode.py:[1;36m55[0m[1m][0m >> Running node             ]8;id=126684;file:///opt/anaconda3/envs/llm/lib/python3.10/site-packages/autorag/schema/node.py\[2mnode.py[0m]8;;\[2m:[0m]8;id=522120;file:///opt/anaconda3/envs/llm/lib/python3.10/site-packages/autorag/schema/node.py#55\[2m55[0m]8;;\
[2;36m                    [0m         prompt_maker[33m...[0m                          [2m          [0m
[2K[2;36m              

Generating embeddings:   0%|          | 0/20 [00:00<?, ?it/s]

[2K[2;36m                   [0m[2;36m [0m[34mINFO    [0m [1m[[0m_client.py:[1;36m1025[0m[1m][0m >> HTTP Request:  ]8;id=851468;file:///opt/anaconda3/envs/llm/lib/python3.10/site-packages/httpx/_client.py\[2m_client.py[0m]8;;\[2m:[0m]8;id=486270;file:///opt/anaconda3/envs/llm/lib/python3.10/site-packages/httpx/_client.py#1025\[2m1025[0m]8;;\
[2;36m                    [0m         [1;33mPOST[0m                                [2m               [0m
[2;36m                    [0m         [4;94mhttps://api.openai.com/v1/embedding[0m [2m               [0m
[2;36m                    [0m         [4;94ms[0m [32m"HTTP/1.1 200 OK"[0m                 [2m               [0m
[2K[36mEvaluating...[0m [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [35m 50%[0m 1/2 [33m0:00:13[0m

Generating embeddings:   0%|          | 0/20 [00:00<?, ?it/s]

[2K[2;36m[06/11/25 17:48:18][0m[2;36m [0m[34mINFO    [0m [1m[[0m_client.py:[1;36m1025[0m[1m][0m >> HTTP Request:  ]8;id=46119;file:///opt/anaconda3/envs/llm/lib/python3.10/site-packages/httpx/_client.py\[2m_client.py[0m]8;;\[2m:[0m]8;id=419747;file:///opt/anaconda3/envs/llm/lib/python3.10/site-packages/httpx/_client.py#1025\[2m1025[0m]8;;\
[2;36m                    [0m         [1;33mPOST[0m                                [2m               [0m
[2;36m                    [0m         [4;94mhttps://api.openai.com/v1/embedding[0m [2m               [0m
[2;36m                    [0m         [4;94ms[0m [32m"HTTP/1.1 200 OK"[0m                 [2m               [0m
[2K[2;36m                   [0m[2;36m [0m[34mINFO    [0m [1m[[0mevaluator.py:[1;36m218[0m[1m][0m >> Evaluation   ]8;id=40984;file:///opt/anaconda3/envs/llm/lib/python3.10/site-packages/autorag/evaluator.py\[2mevaluator.py[0m]8;;\[2m:[0m]8;id=807329;file:///opt/anacon

## Check Result

In [44]:
result_df = pd.read_parquet(
    os.path.join(project_dir, "0", "node_line", "generator", "best_0.parquet")
)

In [45]:
def show_summary_results(df, idx:int):
    print(
        f"Original Content: {df.iloc[idx]['retrieved_contents'][0]}",
        f"\n\nGround Truth Summary: {df.iloc[idx]['generation_gt']}",
        f"\n\nPredicted Summary: {df.iloc[idx]['generated_texts']}",
        f"\n\nRouge Score: {df.iloc[idx]['rouge']}",
        f"\nSem Score: {df.iloc[idx]['sem_score']}"
    )

In [46]:
show_summary_results(result_df, 0)

Original Content: SECTION 1. SHORT TITLE; TABLE OF CONTENTS.

    (a) Short Title.--This Act may be cited as the ``Cord Blood 
Education and Awareness Act of 2009''.
    (b) Table of Contents.--The table of contents of this Act is as 
follows:

Sec. 1. Short title; table of contents.
Sec. 2. Findings.
Sec. 3. Public education campaign.
Sec. 4. Patient informed consent document.
Sec. 5. Duty of certain professionals to disclose information to, and 
                            obtain informed consent from, pregnant 
                            patients.
Sec. 6. Professional education.
Sec. 7. Targeted education grants.
Sec. 8. Authorization of appropriations.

SEC. 2. FINDINGS.

    Congress finds the following:
            (1) Every 10 minutes, another child or adult is expected to 
        die from leukemia, lymphoma or myeloma. Leukemia, lymphoma, and 
        myeloma caused the deaths of an estimated 52,910 people in the 
        United States in 2007 and accounted for nearly 9.4 per

In [47]:
summary_df = pd.read_csv(
    os.path.join(project_dir, "0", "node_line", "generator", "summary.csv")
)

summary_df.head()

Unnamed: 0,filename,module_name,module_params,execution_time,average_output_token,rouge,sem_score,is_best
0,0.parquet,OpenAILLM,"{'llm': 'gpt-4o-mini', 'temperature': 1.0, 'ba...",0.624286,248.85,0.258943,0.950277,True


## Evaluate Summary Task (RAGAS Summarization Conciseness Score)

In [48]:
def autorag_to_ragas(autorag_df: pd.DataFrame, corpus_df: pd.DataFrame) -> EvaluationDataset:
	samples = []
	for idx, row in autorag_df.iterrows():
		samples.append(SingleTurnSample(
			reference_contexts=row['retrieved_contents'],
			response=row['generated_texts'],
			reference=row['generation_gt'][0],
		))
	return EvaluationDataset(samples)
ragas_dataset = autorag_to_ragas(result_df, empty_corpus_df)

In [49]:
ragas_dataset

EvaluationDataset(features=['reference_contexts', 'response', 'reference'], len=20)

In [51]:
ragas_llm = LangchainLLMWrapper(
    ChatOpenAI(
        model="gpt-4o"
    )
)

In [53]:
result = evaluate(ragas_dataset, metrics=[SummarizationScore()], llm=ragas_llm)

Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

[2;36m[06/11/25 18:00:58][0m[2;36m [0m[34mINFO    [0m [1m[[0m_client.py:[1;36m1740[0m[1m][0m >> HTTP Request:  ]8;id=379567;file:///opt/anaconda3/envs/llm/lib/python3.10/site-packages/httpx/_client.py\[2m_client.py[0m]8;;\[2m:[0m]8;id=745044;file:///opt/anaconda3/envs/llm/lib/python3.10/site-packages/httpx/_client.py#1740\[2m1740[0m]8;;\
[2;36m                    [0m         [1;33mPOST[0m                                [2m               [0m
[2;36m                    [0m         [4;94mhttps://api.openai.com/v1/chat/comp[0m [2m               [0m
[2;36m                    [0m         [4;94mletions[0m [32m"HTTP/1.1 200 OK"[0m           [2m               [0m
[2;36m[06/11/25 18:00:59][0m[2;36m [0m[34mINFO    [0m [1m[[0m_client.py:[1;36m1740[0m[1m][0m >> HTTP Request:  ]8;id=381942;file:///opt/anaconda3/envs/llm/lib/python3.10/site-packages/httpx/_client.py\[2m_client.py[0m]8;;\[2m:[0m]8;id=287631;file:///opt/anaconda3/envs/ll

In [54]:
result

{'summary_score': 0.5954}