# Summarization with fine-tuned summarization models

In [1]:
import gc
import torch
from tqdm.notebook import tqdm
from utils import SummarizationPipeline, ModelConfig, LoggingConfig, load_all_available_transcripts, TextChunker



In [2]:
# Load everything the project loader can find, then keep only the first three
# entries (slice `[:3]`) so the comparison across models stays quick to run.
transcripts = load_all_available_transcripts()[:3]

2025-04-30 11:45:32,879 - utils.loaders - INFO - 32307


2025-04-30 11:45:33,599 - utils.loaders - INFO - Successfully loaded local transcripts for 32307
2025-04-30 11:45:33,600 - utils.loaders - INFO - Successfully loaded 32307.csv
2025-04-30 11:45:33,601 - utils.loaders - INFO - 126475
2025-04-30 11:45:33,711 - utils.loaders - INFO - Successfully loaded local transcripts for 126475
2025-04-30 11:45:33,712 - utils.loaders - INFO - Successfully loaded 126475.csv
2025-04-30 11:45:33,712 - utils.loaders - INFO - 26446
2025-04-30 11:45:33,759 - utils.loaders - INFO - Successfully loaded local transcripts for 26446
2025-04-30 11:45:33,760 - utils.loaders - INFO - Successfully loaded 26446.csv
2025-04-30 11:45:33,761 - utils.loaders - INFO - 388904
2025-04-30 11:45:33,847 - utils.loaders - INFO - Successfully loaded local transcripts for 388904
2025-04-30 11:45:33,847 - utils.loaders - INFO - Successfully loaded 388904.csv
2025-04-30 11:45:33,848 - utils.loaders - INFO - 312932093
2025-04-30 11:45:33,885 - utils.loaders - INFO - Successfully load

In [3]:
# Replace the loaded transcripts object with a plain Python list built from
# its `full_text` attribute.
# The guard keeps this cell re-runnable: after the first execution
# `transcripts` is already a list (which has no `.full_text`), so the
# conversion is skipped instead of raising AttributeError.
if not isinstance(transcripts, list):
    transcripts = transcripts.full_text.tolist()

In [4]:
# Base checkpoints whose locally stored fine-tuned variants are compared.
checkpoints = [
    'facebook/bart-large-cnn',
    'google-t5/t5-base',
    'google/pegasus-x-large',
    'human-centered-summarization/financial-summarization-pegasus',
]

# One local directory per checkpoint; the checkpoint id (including its "/")
# is embedded in the path.
local_paths = [f'../models/ragsum-{checkpoint}-billsum' for checkpoint in checkpoints]

# Tunables for the iterative "summarize the summaries" reduction.
MAX_REDUCTION_ROUNDS = 5      # upper bound on re-summarization passes per transcript
MAX_SUMMARY_TOKENS = 1024     # hard cap on the token length accepted as final

logging_config: LoggingConfig = LoggingConfig()

# Loop-invariant: decide the device once instead of once per model.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Flat list of final summaries in (model, transcript) order -- displayed in the
# next cell.
summaries = []
# Same summaries grouped by checkpoint name, so each one can be traced back to
# the model that produced it.
summaries_by_model = {}


def _summarize_chunks(text, pipeline, chunker):
    """Split ``text`` with ``chunker``, summarize each chunk, join with spaces.

    Args:
        text: The text to condense.
        pipeline: Object exposing ``summarize(chunk)`` that returns a string.
        chunker: Object exposing ``chunk_text(text)`` that returns an iterable
            of chunks.

    Returns:
        The per-chunk summaries concatenated with single spaces.
    """
    return " ".join(pipeline.summarize(chunk) for chunk in chunker.chunk_text(text))


def summarize_transcript(transcript, pipeline, tokenizer, chunker,
                         max_rounds=MAX_REDUCTION_ROUNDS, token_cap=MAX_SUMMARY_TOKENS):
    """Summarize one transcript, re-summarizing until the result is short enough.

    The transcript is chunked and summarized once. The joined summary is then
    tokenized; while it is longer than ``min(token_cap, pipeline.model_max_length)``
    tokens it is chunked and summarized again, for at most ``max_rounds`` checks.

    Args:
        transcript: Text of the transcript.
        pipeline: Summarization pipeline; must provide ``summarize`` and
            ``model_max_length``.
        tokenizer: Callable returning a mapping whose ``'input_ids'`` entry has
            the token count in ``shape[1]``.
        chunker: Chunker providing ``chunk_text``.
        max_rounds: Maximum number of length checks / reduction passes.
        token_cap: Upper bound on the accepted token length.

    Returns:
        The combined summary. If ``max_rounds`` is exhausted, the latest
        combined summary is returned even though it may still exceed the limit.
    """
    token_limit = min(token_cap, pipeline.model_max_length)
    combined_summary = _summarize_chunks(transcript, pipeline, chunker)

    for round_index in range(max_rounds):
        print(f'round {round_index+1}')
        # truncation=False so the *real* length is measured, not a clipped one.
        input_ids = tokenizer(combined_summary, return_tensors='pt', truncation=False)['input_ids']
        if input_ids.shape[1] <= token_limit:
            break
        combined_summary = _summarize_chunks(combined_summary, pipeline, chunker)

    return combined_summary


for checkpoint, path in zip(checkpoints, local_paths):

    model_config: ModelConfig = ModelConfig(model_name_or_path=checkpoint, device=device)
    pipeline = SummarizationPipeline(model_config=model_config, logging_config=logging_config, remote=False)

    try:
        # Loading happens inside the try so a failed load still runs the cleanup.
        pipeline.load_from_local(path)

        tokenizer = pipeline.get_tokenizer()
        chunker = TextChunker(tokenizer)

        model_summaries = summaries_by_model.setdefault(checkpoint, [])
        for transcript in tqdm(transcripts):
            final_summary = summarize_transcript(transcript, pipeline, tokenizer, chunker)
            summaries.append(final_summary)
            model_summaries.append(final_summary)
    finally:
        # Drop our references, then collect garbage BEFORE emptying the CUDA
        # cache: objects kept alive only by reference cycles must be destroyed
        # first, otherwise their memory is still in use when the cache is
        # emptied and the next model starts with less free GPU memory.
        del pipeline
        del model_config
        gc.collect()
        torch.cuda.empty_cache()


2025-04-30 11:45:38,113 - SummarizationPipeline - INFO - Initializing pipeline with model facebook/bart-large-cnn
2025-04-30 11:45:38,115 - SummarizationPipeline - INFO - Loading model and tokenizer from local path: ../models/ragsum-facebook/bart-large-cnn-billsum
2025-04-30 11:46:03,279 - SummarizationPipeline - INFO - Local load complete. Model is standard type, 1024 max length.
2025-04-30 11:46:03,654 - utils.text_chunker - INFO - Initialized TextChunker with chunk_size=1024, chunk_overlap=102, prefix=""
2025-04-30 11:46:03,655 - utils.text_chunker - INFO - Starting text chunking...
Chunking text: 100%|██████████| 13/13 [00:00<00:00, 197557.80it/s]
2025-04-30 11:46:03,662 - utils.text_chunker - INFO - Text successfully split into 13 chunks.
2025-04-30 11:46:03,663 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:09,171 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:10,126 - SummarizationPipeline - IN

round 1


Chunking text: 100%|██████████| 2/2 [00:00<00:00, 35394.97it/s]
2025-04-30 11:46:20,050 - utils.text_chunker - INFO - Text successfully split into 2 chunks.
2025-04-30 11:46:20,051 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:21,005 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:21,912 - utils.text_chunker - INFO - Starting text chunking...


round 2


Chunking text: 100%|██████████| 16/16 [00:00<00:00, 262144.00it/s]
2025-04-30 11:46:21,921 - utils.text_chunker - INFO - Text successfully split into 16 chunks.
2025-04-30 11:46:21,921 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:22,876 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:23,830 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:24,785 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:25,691 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:26,289 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:27,105 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:27,906 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:28,879 - SummarizationPipeline - INFO - Genera

round 1


Chunking text: 100%|██████████| 2/2 [00:00<00:00, 33288.13it/s]
2025-04-30 11:46:36,240 - utils.text_chunker - INFO - Text successfully split into 2 chunks.
2025-04-30 11:46:36,241 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:37,170 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:37,873 - utils.text_chunker - INFO - Starting text chunking...


round 2


Chunking text: 100%|██████████| 24/24 [00:00<00:00, 394758.02it/s]
2025-04-30 11:46:37,883 - utils.text_chunker - INFO - Text successfully split into 24 chunks.
2025-04-30 11:46:37,883 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:38,528 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:39,443 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:40,396 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:41,179 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:42,132 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:43,083 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:44,036 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:46:44,991 - SummarizationPipeline - INFO - Genera

round 1


Chunking text: 100%|██████████| 3/3 [00:00<00:00, 44306.03it/s]
2025-04-30 11:46:59,756 - utils.text_chunker - INFO - Text successfully split into 3 chunks.
2025-04-30 11:46:59,757 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:00,389 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:01,343 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)


round 2


2025-04-30 11:47:02,625 - SummarizationPipeline - INFO - Initializing pipeline with model google-t5/t5-base
2025-04-30 11:47:02,626 - SummarizationPipeline - INFO - Loading model and tokenizer from local path: ../models/ragsum-google-t5/t5-base-billsum
2025-04-30 11:47:15,721 - SummarizationPipeline - INFO - Local load complete. Model is standard type, 1024 max length.
2025-04-30 11:47:15,725 - utils.text_chunker - INFO - Initialized TextChunker with chunk_size=1024, chunk_overlap=102, prefix=""
2025-04-30 11:47:15,734 - utils.text_chunker - INFO - Starting text chunking...
Chunking text: 100%|██████████| 13/13 [00:00<00:00, 138742.88it/s]
2025-04-30 11:47:15,741 - utils.text_chunker - INFO - Text successfully split into 13 chunks.
2025-04-30 11:47:15,741 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:17,353 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:18,173 - SummarizationPipeline - INFO - Generat

round 1


Chunking text: 100%|██████████| 16/16 [00:00<00:00, 261123.98it/s]
2025-04-30 11:47:26,473 - utils.text_chunker - INFO - Text successfully split into 16 chunks.
2025-04-30 11:47:26,474 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:27,451 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:28,348 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:29,177 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:30,073 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:30,726 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:31,494 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:32,052 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:32,937 - SummarizationPipeline - INFO - Genera

round 1


Chunking text: 100%|██████████| 24/24 [00:00<00:00, 330043.59it/s]
2025-04-30 11:47:39,063 - utils.text_chunker - INFO - Text successfully split into 24 chunks.
2025-04-30 11:47:39,064 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:39,990 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:40,514 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:41,307 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:41,852 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:42,905 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:43,592 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:44,389 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:45,304 - SummarizationPipeline - INFO - Genera

round 1


Chunking text: 100%|██████████| 2/2 [00:00<00:00, 35544.95it/s]
2025-04-30 11:47:58,003 - utils.text_chunker - INFO - Text successfully split into 2 chunks.
2025-04-30 11:47:58,004 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:47:58,637 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)


round 2


2025-04-30 11:47:59,728 - SummarizationPipeline - INFO - Initializing pipeline with model google/pegasus-x-large
2025-04-30 11:47:59,729 - SummarizationPipeline - INFO - Loading model and tokenizer from local path: ../models/ragsum-google/pegasus-x-large-billsum
2025-04-30 11:48:32,339 - SummarizationPipeline - INFO - Local load complete. Model is pegasus type, 1024 max length.
2025-04-30 11:48:32,340 - utils.text_chunker - INFO - Initialized TextChunker with chunk_size=1024, chunk_overlap=102, prefix=""
2025-04-30 11:48:32,358 - utils.text_chunker - INFO - Starting text chunking...
Chunking text: 100%|██████████| 13/13 [00:00<00:00, 213827.26it/s]
2025-04-30 11:48:32,366 - utils.text_chunker - INFO - Text successfully split into 13 chunks.
2025-04-30 11:48:32,367 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:48:34,423 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:48:36,062 - SummarizationPipeline - INFO 

round 1


Chunking text: 100%|██████████| 2/2 [00:00<00:00, 33689.19it/s]
2025-04-30 11:48:54,145 - utils.text_chunker - INFO - Text successfully split into 2 chunks.
2025-04-30 11:48:54,145 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:48:56,395 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:48:58,066 - utils.text_chunker - INFO - Starting text chunking...


round 2


Chunking text: 100%|██████████| 16/16 [00:00<00:00, 226719.14it/s]
2025-04-30 11:48:58,074 - utils.text_chunker - INFO - Text successfully split into 16 chunks.
2025-04-30 11:48:58,074 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:48:59,773 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:02,037 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:03,760 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:05,348 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:07,223 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:08,777 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:10,502 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:12,139 - SummarizationPipeline - INFO - Genera

round 1


Chunking text: 100%|██████████| 2/2 [00:00<00:00, 37117.73it/s]
2025-04-30 11:49:25,994 - utils.text_chunker - INFO - Text successfully split into 2 chunks.
2025-04-30 11:49:25,994 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:27,695 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:29,044 - utils.text_chunker - INFO - Starting text chunking...


round 2


Chunking text: 100%|██████████| 24/24 [00:00<00:00, 282762.07it/s]
2025-04-30 11:49:29,054 - utils.text_chunker - INFO - Text successfully split into 24 chunks.
2025-04-30 11:49:29,055 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:30,566 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:32,451 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:33,795 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:35,132 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:37,299 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:39,566 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:42,188 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:49:45,194 - SummarizationPipeline - INFO - Genera

round 1


Chunking text: 100%|██████████| 3/3 [00:00<00:00, 53544.31it/s]
2025-04-30 11:50:17,241 - utils.text_chunker - INFO - Text successfully split into 3 chunks.
2025-04-30 11:50:17,241 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:50:19,007 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:50:21,270 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)


round 2


2025-04-30 11:50:24,185 - SummarizationPipeline - INFO - Initializing pipeline with model human-centered-summarization/financial-summarization-pegasus
2025-04-30 11:50:24,186 - SummarizationPipeline - INFO - Loading model and tokenizer from local path: ../models/ragsum-human-centered-summarization/financial-summarization-pegasus-billsum
2025-04-30 11:50:52,389 - SummarizationPipeline - INFO - Local load complete. Model is pegasus type, 512 max length.
2025-04-30 11:50:52,390 - utils.text_chunker - INFO - Initialized TextChunker with chunk_size=512, chunk_overlap=51, prefix=""
2025-04-30 11:50:52,405 - utils.text_chunker - INFO - Starting text chunking...
Chunking text: 100%|██████████| 26/26 [00:00<00:00, 392273.04it/s]
2025-04-30 11:50:52,413 - utils.text_chunker - INFO - Text successfully split into 26 chunks.
2025-04-30 11:50:52,413 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:50:53,314 - SummarizationPipeline - INFO - Generating summary (ma

round 1


Chunking text: 100%|██████████| 4/4 [00:00<00:00, 66841.50it/s]
2025-04-30 11:51:14,571 - utils.text_chunker - INFO - Text successfully split into 4 chunks.
2025-04-30 11:51:14,571 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:15,410 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:16,278 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:17,029 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:17,780 - utils.text_chunker - INFO - Starting text chunking...


round 2


Chunking text: 100%|██████████| 33/33 [00:00<00:00, 453809.94it/s]
2025-04-30 11:51:17,788 - utils.text_chunker - INFO - Text successfully split into 33 chunks.
2025-04-30 11:51:17,789 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:18,640 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:19,646 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:20,381 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:21,244 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:22,007 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:22,818 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:23,770 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:24,538 - SummarizationPipeline - INFO - Genera

round 1


Chunking text: 100%|██████████| 4/4 [00:00<00:00, 71089.90it/s]
2025-04-30 11:51:44,638 - utils.text_chunker - INFO - Text successfully split into 4 chunks.
2025-04-30 11:51:44,638 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:45,493 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:46,277 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:47,061 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:47,861 - utils.text_chunker - INFO - Starting text chunking...


round 2


Chunking text: 100%|██████████| 47/47 [00:00<00:00, 531353.88it/s]
2025-04-30 11:51:47,871 - utils.text_chunker - INFO - Text successfully split into 47 chunks.
2025-04-30 11:51:47,872 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:48,973 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:49,739 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:50,610 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:51,345 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:52,194 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:52,991 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:53,833 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:51:54,585 - SummarizationPipeline - INFO - Genera

round 1


Chunking text: 100%|██████████| 6/6 [00:00<00:00, 102717.65it/s]
2025-04-30 11:52:26,433 - utils.text_chunker - INFO - Text successfully split into 6 chunks.
2025-04-30 11:52:26,434 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:52:27,461 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:52:28,257 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:52:29,214 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:52:29,988 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-30 11:52:30,737 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)


round 2


In [5]:
# Display the flat list of final summaries collected by the loop above
# (appended model by model, and within each model transcript by transcript).
summaries

["JPMorgan's 21st Annual Technology and Automotive Investor Forum is held at the Consumer Electronics Show (CES) in Las Vegas, Nevada. JPMorgan is a sponsor of the conference, and the conference is open to the public. JPMorgan's semiconductor and semiconductor capital equipment analyst Harlan Sur:  Oren architecture has been adopted by Mercedes in their next-generation factories similar to some of the other automotive companies that are really seeing the need of the digital twin of the Ampere:  have seen is probably refreshes of the most important gamers out there to be a little bit over three years.  I think we'll see both new gamers and that refresh.  Siemens:  We've already probably passed an inflection point that we're seeing the growth of Orin with our NEVs. The next phase will be coming with our design wins both on robotaxis and bringing some of the early AV to market.",
 "JPMorgan's 41st Annual Healthcare Conference:  Presenter:  Kimberly Powell, Vice President of Healthcare at 