In [1]:
import gc
import torch
from utils import SummarizationPipeline, ModelConfig, LoggingConfig, load_all_available_transcripts, TextChunker



In [2]:
# Load everything that is available, then keep only the leading entry.
# Slicing with [:1] (rather than indexing with [0]) keeps the loaded container
# type, which the next cell relies on when it reads `.full_text`.
transcripts = load_all_available_transcripts()[:1]

2025-04-29 21:57:27,962 - utils.wharton_processor - INFO - 32307
2025-04-29 21:57:28,052 - utils.wharton_processor - INFO - Successfully loaded local transcripts for 32307
2025-04-29 21:57:28,053 - utils.wharton_processor - INFO - Successfully loaded 32307.csv
2025-04-29 21:57:28,054 - utils.wharton_processor - INFO - 126475
2025-04-29 21:57:28,075 - utils.wharton_processor - INFO - Successfully loaded local transcripts for 126475
2025-04-29 21:57:28,076 - utils.wharton_processor - INFO - Successfully loaded 126475.csv
2025-04-29 21:57:28,077 - utils.wharton_processor - INFO - 26446
2025-04-29 21:57:28,087 - utils.wharton_processor - INFO - Successfully loaded local transcripts for 26446
2025-04-29 21:57:28,088 - utils.wharton_processor - INFO - Successfully loaded 26446.csv
2025-04-29 21:57:28,088 - utils.wharton_processor - INFO - 388904
2025-04-29 21:57:28,106 - utils.wharton_processor - INFO - Successfully loaded local transcripts for 388904
2025-04-29 21:57:28,107 - utils.wharton_

In [3]:
# Replace the loaded transcripts object with a plain list of its `full_text` values.
# The name `transcripts` is rebound, so guard against re-execution of this cell:
# once it is already a list it has no `full_text` attribute and the conversion
# would raise AttributeError.
if not isinstance(transcripts, list):
    transcripts = transcripts.full_text.tolist()

In [5]:
# Base checkpoints to compare; each one has a locally stored counterpart below.
checkpoints = [
    'facebook/bart-large-cnn',
    'google-t5/t5-base',
    'google/pegasus-x-large',
    'human-centered-summarization/financial-summarization-pegasus',
]

# The checkpoint id (including its "/") is embedded verbatim in the local path.
local_paths = [f'../models/ragsum-{checkpoint}-billsum' for checkpoint in checkpoints]

logging_config: LoggingConfig = LoggingConfig()

# Upper bound on check/re-summarize rounds per model.
MAX_ROUNDS = 5
# Token budget for the final summary; the effective limit is the smaller of this
# value and the pipeline's own `model_max_length`.
TOKEN_LIMIT = 1024


def _summarize_chunks(pipeline: SummarizationPipeline, chunker: TextChunker, text: str) -> str:
    """Chunk `text`, summarize every chunk, and join the results with single spaces."""
    return " ".join([pipeline.summarize(chunk) for chunk in chunker.chunk_text(text)])


def _reduce_to_token_limit(
    pipeline: SummarizationPipeline,
    tokenizer,
    chunker: TextChunker,
    text: str,
    max_rounds: int = MAX_ROUNDS,
    token_limit: int = TOKEN_LIMIT,
) -> str:
    """Summarize `text` chunk by chunk, re-summarizing until the result fits the limit.

    The text is first chunked and summarized once. Then, for at most `max_rounds`
    rounds, the combined summary is tokenized (without truncation); if its token
    count is within ``min(token_limit, pipeline.model_max_length)`` it is returned,
    otherwise it is chunked and summarized again.

    Returns:
        The combined summary. If `max_rounds` is exhausted it is returned as-is
        and may still exceed the limit.
    """
    combined_summary = _summarize_chunks(pipeline, chunker, text)
    limit = min(token_limit, pipeline.model_max_length)

    for round_number in range(1, max_rounds + 1):
        # Printed before the length check, so the last number shown is the round
        # in which the summary was found to fit (or the final round attempted).
        print(f'round {round_number}')

        input_ids = tokenizer(combined_summary, return_tensors='pt', truncation=False)['input_ids']
        if input_ids.shape[1] <= limit:
            break

        combined_summary = _summarize_chunks(pipeline, chunker, combined_summary)

    return combined_summary


# One final summary per checkpoint, appended in `checkpoints` order.
summaries = []

for checkpoint, path in zip(checkpoints, local_paths):

    model_config: ModelConfig = ModelConfig(
        model_name_or_path=checkpoint, device='cuda' if torch.cuda.is_available() else 'cpu'
    )

    pipeline = SummarizationPipeline(model_config=model_config, logging_config=logging_config, remote=False)

    # Everything that happens after the pipeline exists runs inside the try, so a
    # failure while loading weights or chunking still reaches the cleanup below.
    try:
        pipeline.load_from_local(path)

        tokenizer = pipeline.get_tokenizer()
        chunker = TextChunker(tokenizer)

        final_summary = _reduce_to_token_limit(pipeline, tokenizer, chunker, transcripts[0])
        summaries.append(final_summary)
    finally:
        del pipeline
        del model_config
        # Collect first: objects kept alive only by reference cycles must be freed
        # before their cached CUDA blocks can be released by empty_cache().
        gc.collect()
        torch.cuda.empty_cache()


2025-04-29 22:00:11,037 - SummarizationPipeline - INFO - Initializing pipeline with model facebook/bart-large-cnn
2025-04-29 22:00:11,041 - SummarizationPipeline - INFO - Loading model and tokenizer from local path: ../models/ragsum-facebook/bart-large-cnn-billsum
2025-04-29 22:00:12,877 - SummarizationPipeline - INFO - Local load complete. Model is standard type, 1024 max length.
2025-04-29 22:00:12,881 - utils.text_chunker - INFO - Initialized TextChunker with chunk_size=1024, chunk_overlap=102, prefix=""
2025-04-29 22:00:12,890 - utils.text_chunker - INFO - Starting text chunking...
Chunking text: 100%|██████████| 13/13 [00:00<00:00, 142365.41it/s]
2025-04-29 22:00:12,899 - utils.text_chunker - INFO - Text successfully split into 13 chunks.
2025-04-29 22:00:12,899 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-29 22:00:13,575 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-29 22:00:14,515 - SummarizationPipeline - IN

round 1


Chunking text: 100%|██████████| 2/2 [00:00<00:00, 29537.35it/s]
2025-04-29 22:00:24,245 - utils.text_chunker - INFO - Text successfully split into 2 chunks.
2025-04-29 22:00:24,246 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-29 22:00:25,190 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)


round 2


2025-04-29 22:00:26,441 - SummarizationPipeline - INFO - Initializing pipeline with model google-t5/t5-base
2025-04-29 22:00:26,443 - SummarizationPipeline - INFO - Loading model and tokenizer from local path: ../models/ragsum-google-t5/t5-base-billsum
2025-04-29 22:00:27,435 - SummarizationPipeline - INFO - Local load complete. Model is standard type, 1024 max length.
2025-04-29 22:00:27,440 - utils.text_chunker - INFO - Initialized TextChunker with chunk_size=1024, chunk_overlap=102, prefix=""
2025-04-29 22:00:27,447 - utils.text_chunker - INFO - Starting text chunking...
Chunking text: 100%|██████████| 13/13 [00:00<00:00, 132024.10it/s]
2025-04-29 22:00:27,454 - utils.text_chunker - INFO - Text successfully split into 13 chunks.
2025-04-29 22:00:27,455 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-29 22:00:28,255 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-29 22:00:29,060 - SummarizationPipeline - INFO - Generat

round 1


2025-04-29 22:00:37,559 - SummarizationPipeline - INFO - Initializing pipeline with model google/pegasus-x-large
2025-04-29 22:00:37,561 - SummarizationPipeline - INFO - Loading model and tokenizer from local path: ../models/ragsum-google/pegasus-x-large-billsum
2025-04-29 22:00:40,481 - SummarizationPipeline - INFO - Local load complete. Model is pegasus type, 1024 max length.
2025-04-29 22:00:40,482 - utils.text_chunker - INFO - Initialized TextChunker with chunk_size=1024, chunk_overlap=102, prefix=""
2025-04-29 22:00:40,504 - utils.text_chunker - INFO - Starting text chunking...
Chunking text: 100%|██████████| 13/13 [00:00<00:00, 134965.23it/s]
2025-04-29 22:00:40,513 - utils.text_chunker - INFO - Text successfully split into 13 chunks.
2025-04-29 22:00:40,513 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-29 22:00:42,520 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-29 22:00:44,125 - SummarizationPipeline - INFO 

round 1


Chunking text: 100%|██████████| 2/2 [00:00<00:00, 37786.52it/s]
2025-04-29 22:01:01,720 - utils.text_chunker - INFO - Text successfully split into 2 chunks.
2025-04-29 22:01:01,720 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-29 22:01:03,909 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)


round 2


2025-04-29 22:01:05,889 - SummarizationPipeline - INFO - Initializing pipeline with model human-centered-summarization/financial-summarization-pegasus
2025-04-29 22:01:05,891 - SummarizationPipeline - INFO - Loading model and tokenizer from local path: ../models/ragsum-human-centered-summarization/financial-summarization-pegasus-billsum
2025-04-29 22:01:07,884 - SummarizationPipeline - INFO - Local load complete. Model is pegasus type, 512 max length.
2025-04-29 22:01:07,885 - utils.text_chunker - INFO - Initialized TextChunker with chunk_size=512, chunk_overlap=51, prefix=""
2025-04-29 22:01:07,902 - utils.text_chunker - INFO - Starting text chunking...
Chunking text: 100%|██████████| 26/26 [00:00<00:00, 385342.42it/s]
2025-04-29 22:01:07,909 - utils.text_chunker - INFO - Text successfully split into 26 chunks.
2025-04-29 22:01:07,909 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-29 22:01:08,781 - SummarizationPipeline - INFO - Generating summary (ma

round 1


Chunking text: 100%|██████████| 4/4 [00:00<00:00, 71089.90it/s]
2025-04-29 22:01:29,748 - utils.text_chunker - INFO - Text successfully split into 4 chunks.
2025-04-29 22:01:29,748 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-29 22:01:30,572 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-29 22:01:31,419 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-04-29 22:01:32,152 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)


round 2


In [6]:
# Display the collected final summaries: one string per checkpoint, in the
# order the checkpoints were processed by the summarization loop.
summaries

["JPMorgan's 21st Annual Technology and Automotive Investor Forum is held at the Consumer Electronics Show (CES) in Las Vegas, Nevada. JPMorgan is a sponsor of the conference, and the conference is open to the public. JPMorgan's semiconductor and semiconductor capital equipment analyst Harlan Sur:  Oren architecture has been adopted by Mercedes in their next-generation factories similar to some of the other automotive companies that are really seeing the need of the digital twin of the Ampere:  have seen is probably refreshes of the most important gamers out there to be a little bit over three years.  I think we'll see both new gamers and that refresh.  Siemens:  We've already probably passed an inflection point that we're seeing the growth of Orin with our NEVs. The next phase will be coming with our design wins both on robotaxis and bringing some of the early AV to market.",
 "asian-americans are a big part of the tech industry, and we're excited about the future. we're also excited 