In [1]:
import os
import gc
from datetime import date

import torch
from tqdm.notebook import tqdm
import pandas as pd
from langchain.schema import Document

from utils import (
    compute_metrics,
    load_all_available_transcripts,
    SummarizationPipeline,
    TextChunker,
    LoggingConfig,
    ModelConfig,
    TopicModeler,
    Retriever
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Load everything the loader can find, then keep only the first row so the
# (slow) summarization loop further down runs on a single transcript.
all_transcripts = load_all_available_transcripts()
transcripts = all_transcripts[:1]

2025-05-04 19:36:38,711 - utils.loaders - INFO - 32307
2025-05-04 19:36:38,847 - utils.loaders - INFO - Successfully loaded local transcripts for 32307
2025-05-04 19:36:38,849 - utils.loaders - INFO - Successfully loaded 32307.csv
2025-05-04 19:36:38,849 - utils.loaders - INFO - 126475
2025-05-04 19:36:38,882 - utils.loaders - INFO - Successfully loaded local transcripts for 126475
2025-05-04 19:36:38,883 - utils.loaders - INFO - Successfully loaded 126475.csv
2025-05-04 19:36:38,884 - utils.loaders - INFO - 26446
2025-05-04 19:36:38,900 - utils.loaders - INFO - Successfully loaded local transcripts for 26446
2025-05-04 19:36:38,901 - utils.loaders - INFO - Successfully loaded 26446.csv
2025-05-04 19:36:38,902 - utils.loaders - INFO - 388904
2025-05-04 19:36:38,928 - utils.loaders - INFO - Successfully loaded local transcripts for 388904
2025-05-04 19:36:38,929 - utils.loaders - INFO - Successfully loaded 388904.csv
2025-05-04 19:36:38,929 - utils.loaders - INFO - 312932093
2025-05-04 

In [3]:
# Preview the transcripts that were kept (at most the first five rows).
transcripts.head(n=5)

Unnamed: 0,companyid,companyname,mostimportantdateutc,mostimportanttimeutc,headline,full_text,uuid,word_count,word_count_nltk
0,32307,NVIDIA Corporation,2023-01-05,15:50:00,NVIDIA Corporation Presents at J.P. MORGAN 21S...,Analysts: All right. Why don't we go ahead and...,ca419775-262e-4e3a-975a-e4ead13ef55b,9541,10911


In [4]:
# Raw transcript bodies, in row order, as a plain Python list.
original_texts = transcripts.loc[:, 'full_text'].tolist()

# Identifying columns kept next to the texts.
metadata_columns = ['uuid', 'companyid', 'companyname', 'word_count_nltk']
metadata = transcripts.loc[:, metadata_columns]

In [19]:
# Model identifiers to compare. Each entry is passed as `model_name_or_path`
# to ModelConfig by the summarization loop, in the order listed here.
checkpoints = [
    'facebook/bart-large-cnn',
    'google-t5/t5-base',
    'google/pegasus-x-large',
    'human-centered-summarization/financial-summarization-pegasus',
]

In [20]:
# Accumulator for evaluation results; starts empty.
all_metrics = list()

# One logging configuration, shared by every SummarizationPipeline created below.
logging_config = LoggingConfig()

In [21]:
# Build the glossary retriever from the FinRAD term/definition dataset.
#
# Each glossary row is flattened into one "term: definition" string. Rows that
# lack EITHER field are dropped: string concatenation with a missing value
# yields NaN, which would otherwise be handed to the Retriever as a
# non-string document.
FINRAD_CSV = (
    'hf://datasets/sohomghosh/FinRAD_Financial_Readability_Assessment_Dataset/'
    'FinRAD_13K_terms_definitions_labels.csv'
)

# Single method chain so re-running the cell always starts from the raw CSV
# instead of re-filtering an already-modified `df`.
df = (
    pd.read_csv(FINRAD_CSV)
    .loc[:, ['terms', 'definitions', 'source', 'assigned_readability']]
    .dropna(subset=['terms', 'definitions'])
    .assign(combined=lambda frame: frame['terms'] + ': ' + frame['definitions'])
)

# NOTE(review): the meaning of the second positional argument (5) is defined by
# utils.Retriever, which is not visible here — confirm before changing it.
retriever = Retriever(df.combined.tolist(), 5)

Encoding: 100%|██████████| 13112/13112 [13:32<00:00, 16.13doc/s]
Indexing: 100%|██████████| 13112/13112 [00:00<00:00, 507541.87vec/s]


In [22]:
# Summarize every transcript with every checkpoint.
#
# For each transcript:
#   1. split it into model-sized chunks,
#   2. re-split it into chunks a quarter of that size and fit a topic model,
#   3. look up glossary definitions matching the topic words and prepend them
#      to the chunk list as an extra 'context: ...' chunk,
#   4. summarize each chunk and join the results,
#   5. while the joined text exceeds the token budget, re-chunk and
#      re-summarize it (bounded number of rounds).
#
# `summaries` is rebound for each checkpoint, so after the loop it only holds
# the LAST checkpoint's output (later cells read it). `summaries_by_checkpoint`
# keeps the output of every checkpoint so the models can be compared.
summaries_by_checkpoint = {}

# Upper bound on re-summarization passes over an over-long combined summary.
max_rounds = 5

for checkpoint in checkpoints:
    model_config = ModelConfig(
        model_name_or_path=checkpoint,
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )
    pipeline = SummarizationPipeline(
        model_config=model_config,
        logging_config=logging_config,
        remote=True
    )
    tokenizer = pipeline.get_tokenizer()
    chunker = TextChunker(tokenizer)

    # Stop re-summarizing once the combined summary fits in this many tokens.
    token_budget = min(1024, pipeline.model_max_length)

    summaries = []
    for text in tqdm(original_texts, desc=f"Summarizing with {checkpoint}"):
        chunks = chunker.chunk_text(text)

        print([len(x) for x in chunks])

        # Temporarily shrink the chunk size to a quarter for topic modeling,
        # then scale it back up by four.
        # NOTE(review): this reads the private `_adjusted_chunk_size` and
        # assumes `resize_chunks` updates it — confirm against TextChunker.
        chunker.resize_chunks(int(chunker._adjusted_chunk_size / 4))

        print(f'chunk size: {chunker._adjusted_chunk_size}')

        tm_chunks = chunker.chunk_text(text)

        print([len(x) for x in tm_chunks])

        chunker.resize_chunks(int(chunker._adjusted_chunk_size * 4))

        tm = TopicModeler(
            chunks=[Document(page_content=doc) for doc in tm_chunks],
            speed='learn',
            workers=8
        )
        topic_words, _, _ = tm.get_topics(1)

        topic_strings = [' '.join(words) for words in topic_words]
        for topic_string in topic_strings:
            print('Topic: ' + topic_string)

        # Only look up glossary context when a topic was actually found;
        # previously this relied on a loop variable that is undefined (or
        # stale from an earlier transcript) when no topics are returned.
        if topic_strings:
            top_results, _ = retriever.search(topic_strings[-1], 3)

            for result in top_results:
                print(f'found definition: {result}')

            chunks.insert(0, 'context: ' + ', '.join(top_results))

        chunk_summaries = [pipeline.summarize(c) for c in chunks]
        combined = " ".join(chunk_summaries)

        # If combined summary is too long, iteratively re-chunk and re-summarize
        for _ in range(max_rounds):
            tokens = tokenizer(combined, return_tensors='pt', truncation=False)['input_ids']
            if tokens.shape[1] <= token_budget:
                break
            re_chunks = chunker.chunk_text(combined)
            combined = " ".join(pipeline.summarize(rc) for rc in re_chunks)

        summaries.append(combined)

    summaries_by_checkpoint[checkpoint] = summaries

    # Clean up before loading the next model: drop our references, collect
    # unreachable objects first, and only then release cached GPU blocks.
    del pipeline, model_config
    gc.collect()
    torch.cuda.empty_cache()

2025-05-04 20:35:30,064 - SummarizationPipeline - INFO - Initializing pipeline with model facebook/bart-large-cnn
2025-05-04 20:35:30,065 - SummarizationPipeline - INFO - Loading tokenizer for facebook/bart-large-cnn
2025-05-04 20:35:30,989 - SummarizationPipeline - INFO - Loading model for facebook/bart-large-cnn (8bit=False, device=cpu)
2025-05-04 20:35:31,747 - SummarizationPipeline - INFO - Model and tokenizer loaded successfully. Model max length: 1000000000000000019884624838656
2025-05-04 20:35:31,748 - SummarizationPipeline - INFO - Using prefix: summarize: 
2025-05-04 20:35:33,049 - utils.text_chunker - INFO - Initialized TextChunker with chunk_size=1024, chunk_overlap=102, prefix=""


Summarizing with facebook/bart-large-cnn:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-04 20:35:33,111 - utils.text_chunker - INFO - Starting text chunking...
Chunking text: 100%|██████████| 13/13 [00:00<00:00, 70356.07it/s]
2025-05-04 20:35:33,133 - utils.text_chunker - INFO - Text successfully split into 13 chunks.
2025-05-04 20:35:33,134 - utils.text_chunker - INFO - Starting text chunking...


[4665, 4743, 4723, 4642, 4688, 4691, 4727, 4729, 4621, 4828, 4626, 4748, 3110]
chunk size: 1024


Chunking text: 100%|██████████| 76/76 [00:00<00:00, 283096.90it/s]
2025-05-04 20:35:33,149 - utils.text_chunker - INFO - Text successfully split into 76 chunks.
2025-05-04 20:35:33,151 - top2vec - INFO - Pre-processing documents for training
2025-05-04 20:35:33,226 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


[1172, 1170, 1195, 1152, 1106, 1131, 1197, 1163, 1205, 1206, 1169, 1119, 1163, 1093, 1081, 1219, 1309, 1247, 1180, 1185, 1145, 1105, 1152, 1190, 1166, 1165, 1109, 1141, 1212, 1212, 1219, 1163, 1135, 1128, 1205, 1144, 1217, 1211, 1171, 1166, 1194, 1150, 1097, 1125, 1189, 1164, 1207, 1217, 1168, 1136, 1164, 1106, 1110, 1204, 1325, 1271, 1205, 1191, 1137, 1092, 1114, 1181, 1183, 1165, 1099, 1131, 1205, 1239, 1235, 1184, 1135, 1145, 1192, 1155, 1223, 893]


2025-05-04 20:35:34,451 - top2vec - INFO - Creating joint document/word embedding
2025-05-04 20:35:36,240 - top2vec - INFO - Creating lower dimension embedding of documents
2025-05-04 20:35:36,714 - top2vec - INFO - Finding dense areas of documents
2025-05-04 20:35:36,720 - top2vec - INFO - Finding topics
Reducing topics: 100%|██████████| 1/1 [00:00<00:00, 459.05it/s]
2025-05-04 20:35:36,790 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)


Topic: gaming architecture market software we growth our not as do so working more about work great but ve both what and to in team really new are also is you for re see year now your well this has with the have terms be think that on very of here
found definition: pvgo: present value of growth opportunities.
found definition: present value of growth opportunities PVGO: Net present value of a firm™s future investments.
found definition: Net Present Value of Growth Opportunities (NPVGO): The net present value of growth opportunities (NPVGO) is a calculation of the net present value per share of all future cash flows involved with growth opportunities such as new projects or potential acquisitions. The net present value of growth opportunities is used to determine the intrinsic value per share of these growth opportunities in order to determine how much of the firm's current per-share value is determined by them.


2025-05-04 20:35:54,346 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:36:11,953 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:36:32,587 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:36:49,713 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:37:06,036 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:37:22,596 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:37:36,827 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:37:52,243 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:38:07,748 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:38:24,629 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:38:37,917 - Summ

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

2025-05-04 20:39:28,745 - SummarizationPipeline - INFO - Loading model for google-t5/t5-base (8bit=False, device=cpu)
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

2025-05-04 20:39:33,798 - SummarizationPipeline - INFO - Model and tokenizer loaded successfully. Model max length: 1000000000000000019884624838656
2025-05-04 20:39:33,799 - SummarizationPipeline - INFO - Using prefix: summarize: 
2025-05-04 20:39:34,130 - utils.text_chunker - INFO - Initialized TextChunker with chunk_size=1024, chunk_overlap=102, prefix=""


Summarizing with google-t5/t5-base:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-04 20:39:34,158 - utils.text_chunker - INFO - Starting text chunking...
Chunking text: 100%|██████████| 13/13 [00:00<00:00, 90424.46it/s]
2025-05-04 20:39:34,194 - utils.text_chunker - INFO - Text successfully split into 13 chunks.
2025-05-04 20:39:34,195 - utils.text_chunker - INFO - Starting text chunking...


[4665, 4743, 4723, 4642, 4688, 4691, 4727, 4729, 4621, 4828, 4626, 4748, 3110]
chunk size: 1024


Chunking text: 100%|██████████| 76/76 [00:00<00:00, 407630.57it/s]
2025-05-04 20:39:34,208 - utils.text_chunker - INFO - Text successfully split into 76 chunks.
2025-05-04 20:39:34,211 - top2vec - INFO - Pre-processing documents for training
2025-05-04 20:39:34,296 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


[1172, 1170, 1195, 1152, 1106, 1131, 1197, 1163, 1205, 1206, 1169, 1119, 1163, 1093, 1081, 1219, 1309, 1247, 1180, 1185, 1145, 1105, 1152, 1190, 1166, 1165, 1109, 1141, 1212, 1212, 1219, 1163, 1135, 1128, 1205, 1144, 1217, 1211, 1171, 1166, 1194, 1150, 1097, 1125, 1189, 1164, 1207, 1217, 1168, 1136, 1164, 1106, 1110, 1204, 1325, 1271, 1205, 1191, 1137, 1092, 1114, 1181, 1183, 1165, 1099, 1131, 1205, 1239, 1235, 1184, 1135, 1145, 1192, 1155, 1223, 893]


2025-05-04 20:39:35,489 - top2vec - INFO - Creating joint document/word embedding
2025-05-04 20:39:37,331 - top2vec - INFO - Creating lower dimension embedding of documents
2025-05-04 20:39:37,804 - top2vec - INFO - Finding dense areas of documents
2025-05-04 20:39:37,810 - top2vec - INFO - Finding topics
Reducing topics: 100%|██████████| 1/1 [00:00<00:00, 403.34it/s]
2025-05-04 20:39:37,963 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)


Topic: growth market architecture gaming we software year our not be terms as more working do is so team work are in re great about to and think both but the what very new really for see also now has this you that with ve have of your it well on
found definition: pvgo: present value of growth opportunities.
found definition: present value of growth opportunities PVGO: Net present value of a firm™s future investments.
found definition: Boom and bust: See business cycle.


2025-05-04 20:39:41,257 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:39:54,097 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:40:05,866 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:40:16,811 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:40:25,247 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:40:35,504 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:40:50,337 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:41:00,872 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:41:13,538 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:41:24,102 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:41:33,339 - Summ

tokenizer_config.json:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.60M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

You are using a model of type pegasus_x to instantiate a model of type pegasus. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-x-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.decoder.layers.0.encoder_attn.k_proj.bias', 'model.decoder.layers.0.encoder_attn.out_proj.bias', 'model.decoder.layers.0.encoder_attn.q_proj.bias', 'model.decoder.layers.0.encoder_attn.v_proj.bias', 'model.decoder.layers.0.self_attn.k_proj.bias', 'model.decoder.layers.0.self_attn.out_proj.bias', 'model.decoder.layers.0.self_attn.q_proj.bias', 'model.decoder.layers.0.self_attn.v_proj.bias', 'model.decoder.layers.1.encoder_attn.k_proj.bias', 'model.decoder.layers.1.encoder_attn.out_proj.bias', 'model.decoder.layers.1.encoder_attn.q_proj.bias', 'model.decoder.layers.1.encoder_attn.v_proj.bias', 'model.decoder.layers.1.self_attn.k_proj.bias', 'model.decoder.layers.1.self_attn.out_proj.bias', 'model.decoder.layers.1.self_attn.q_proj.bias', 'model.decoder.layers.1.self_attn.v_proj.bias', 'model.deco

generation_config.json:   0%|          | 0.00/262 [00:00<?, ?B/s]

2025-05-04 20:45:02,951 - SummarizationPipeline - INFO - Model and tokenizer loaded successfully. Model max length: 1024
2025-05-04 20:45:02,952 - SummarizationPipeline - INFO - Using prefix: 
2025-05-04 20:45:02,953 - SummarizationPipeline - INFO - Model and tokenizer vocab sizes match. No resizing needed.
2025-05-04 20:45:02,954 - utils.text_chunker - INFO - Initialized TextChunker with chunk_size=1024, chunk_overlap=102, prefix=""


Summarizing with google/pegasus-x-large:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-04 20:45:03,045 - utils.text_chunker - INFO - Starting text chunking...

Chunking text: 100%|██████████| 13/13 [00:00<00:00, 77014.06it/s]
2025-05-04 20:45:03,063 - utils.text_chunker - INFO - Text successfully split into 13 chunks.
2025-05-04 20:45:03,064 - utils.text_chunker - INFO - Starting text chunking...


[4665, 4743, 4723, 4642, 4688, 4691, 4727, 4729, 4621, 4828, 4626, 4748, 3110]
chunk size: 1024



Chunking text: 100%|██████████| 76/76 [00:00<00:00, 377239.18it/s]
2025-05-04 20:45:03,096 - utils.text_chunker - INFO - Text successfully split into 76 chunks.
2025-05-04 20:45:03,099 - top2vec - INFO - Pre-processing documents for training
2025-05-04 20:45:03,166 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


[1172, 1170, 1195, 1152, 1106, 1131, 1197, 1163, 1205, 1206, 1169, 1119, 1163, 1093, 1081, 1219, 1309, 1247, 1180, 1185, 1145, 1105, 1152, 1190, 1166, 1165, 1109, 1141, 1212, 1212, 1219, 1163, 1135, 1128, 1205, 1144, 1217, 1211, 1171, 1166, 1194, 1150, 1097, 1125, 1189, 1164, 1207, 1217, 1168, 1136, 1164, 1106, 1110, 1204, 1325, 1271, 1205, 1191, 1137, 1092, 1114, 1181, 1183, 1165, 1099, 1131, 1205, 1239, 1235, 1184, 1135, 1145, 1192, 1155, 1223, 893]


2025-05-04 20:45:04,417 - top2vec - INFO - Creating joint document/word embedding
2025-05-04 20:45:06,226 - top2vec - INFO - Creating lower dimension embedding of documents
2025-05-04 20:45:06,708 - top2vec - INFO - Finding dense areas of documents
2025-05-04 20:45:06,714 - top2vec - INFO - Finding topics

Reducing topics: 100%|██████████| 1/1 [00:00<00:00, 461.98it/s]
2025-05-04 20:45:06,785 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)


Topic: gaming market architecture software we growth not our as do so working more about ve work both but and really team what great to in also is you are new year for see re well your now has the with terms this have be think that on very of right
found definition: pvgo: present value of growth opportunities.
found definition: present value of growth opportunities PVGO: Net present value of a firm™s future investments.
found definition: Net Present Value of Growth Opportunities (NPVGO): The net present value of growth opportunities (NPVGO) is a calculation of the net present value per share of all future cash flows involved with growth opportunities such as new projects or potential acquisitions. The net present value of growth opportunities is used to determine the intrinsic value per share of these growth opportunities in order to determine how much of the firm's current per-share value is determined by them.


2025-05-04 20:46:51,981 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:48:35,947 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:50:26,365 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:51:24,622 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:52:53,966 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:54:14,611 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:55:09,341 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:56:30,548 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:57:44,788 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:58:38,145 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 20:59:53,840 - Summ

tokenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-04 21:06:30,480 - SummarizationPipeline - INFO - Model and tokenizer loaded successfully. Model max length: 512
2025-05-04 21:06:30,482 - SummarizationPipeline - INFO - Using prefix: 
2025-05-04 21:06:30,483 - SummarizationPipeline - INFO - Model and tokenizer vocab sizes match. No resizing needed.
2025-05-04 21:06:30,484 - utils.text_chunker - INFO - Initialized TextChunker with chunk_size=512, chunk_overlap=51, prefix=""


Summarizing with human-centered-summarization/financial-summarization-pegasus:   0%|          | 0/1 [00:00<?, …

2025-05-04 21:06:30,539 - utils.text_chunker - INFO - Starting text chunking...
Chunking text: 100%|██████████| 26/26 [00:00<00:00, 129055.51it/s]
2025-05-04 21:06:30,561 - utils.text_chunker - INFO - Text successfully split into 26 chunks.
2025-05-04 21:06:30,562 - utils.text_chunker - INFO - Starting text chunking...


[2400, 2295, 2411, 2355, 2242, 2449, 2336, 2274, 2290, 2358, 2360, 2314, 2404, 2307, 2307, 2412, 2348, 2266, 2530, 2321, 2318, 2290, 2442, 2282, 2334, 1007]
chunk size: 512


Chunking text: 100%|██████████| 152/152 [00:00<00:00, 529074.03it/s]
2025-05-04 21:06:30,588 - utils.text_chunker - INFO - Text successfully split into 152 chunks.
2025-05-04 21:06:30,590 - top2vec - INFO - Pre-processing documents for training
2025-05-04 21:06:30,653 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


[611, 641, 526, 602, 592, 603, 589, 540, 560, 570, 553, 589, 612, 592, 586, 656, 592, 604, 616, 615, 587, 564, 580, 551, 557, 626, 579, 522, 538, 563, 529, 581, 662, 647, 620, 609, 582, 568, 594, 598, 602, 567, 570, 569, 550, 590, 613, 589, 568, 596, 608, 566, 555, 543, 540, 588, 610, 627, 578, 614, 646, 609, 590, 592, 569, 540, 553, 583, 613, 603, 574, 550, 565, 644, 658, 529, 613, 628, 539, 611, 582, 608, 587, 548, 554, 556, 548, 580, 609, 589, 588, 643, 579, 591, 614, 628, 592, 579, 577, 551, 543, 631, 595, 525, 537, 562, 556, 580, 669, 650, 639, 621, 617, 590, 605, 610, 584, 565, 540, 565, 526, 582, 606, 598, 590, 601, 600, 602, 546, 592, 565, 557, 602, 621, 611, 610, 637, 641, 587, 589, 581, 547, 567, 574, 584, 602, 600, 550, 567, 594, 677, 468]


2025-05-04 21:06:31,868 - top2vec - INFO - Creating joint document/word embedding
2025-05-04 21:06:33,366 - top2vec - INFO - Creating lower dimension embedding of documents
2025-05-04 21:06:33,925 - top2vec - INFO - Finding dense areas of documents
2025-05-04 21:06:33,933 - top2vec - INFO - Finding topics
Reducing topics: 100%|██████████| 1/1 [00:00<00:00, 359.19it/s]
2025-05-04 21:06:34,139 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)


Topic: market architecture growth software we year not so our great and just working really is do as work more now in what but at about very new are team see both terms re the that has this for to also ve you think it well with be right your of
found definition: pvgo: present value of growth opportunities.
found definition: present value of growth opportunities PVGO: Net present value of a firm™s future investments.
found definition: New growth theory: See GROWTH.


2025-05-04 21:07:03,391 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 21:07:31,860 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 21:08:05,652 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 21:08:36,254 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 21:09:05,138 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 21:09:38,462 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 21:10:09,311 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 21:10:39,339 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 21:11:11,660 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 21:11:45,666 - SummarizationPipeline - INFO - Generating summary (max_new_tokens=100)
2025-05-04 21:12:14,861 - Summ

In [23]:
# Flatten the summaries currently held in `summaries` into one string.
separator = ', '
sting = separator.join(summaries)

In [24]:
# Whitespace-delimited word count of the joined summary.
summary_words = sting.split()
len(summary_words)

43

In [25]:
# Display the joined summary text.
sting

'CEO Jensen Huang and CFO Colette Kress speak. Nvidia expects Hopper to ship in large volumes in April quarter Analysts: Is there a deal in the works with Mercedes-Benz Analysts: Can you give us a sense of where gaming is at right now'