In [14]:
import os
import logging
import pandas as pd
from typing import Optional
from pathlib import Path

from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
logging.basicConfig(level=logging.INFO, force=True)

In [21]:
def load_if_scraped(company_id: str) -> Optional[pd.DataFrame]:
    file_path = Path("..") / "data" / f"{company_id}.csv"
    if file_path.exists():
        df = pd.read_csv(
            file_path,
            sep="\t",
            quoting=1,
            escapechar="\\",
            doublequote=True,
            quotechar='"',
        )
        logging.info("successfully loaded local transcripts")
        return df
    else:
        logging.debug("no local transcripts found")
    return None

In [22]:
df = load_if_scraped('312932093')

df.head()

INFO:root:successfully loaded local transcripts


Unnamed: 0,companyid,mostimportantdateutc,mostimportanttimeutc,headline,full_text,word_count,word_count_nltk
0,312932093,2018-05-10,15:30:00,Google LLC Presents at The 14th annual Red Hat...,Attendees: Now if there's a company that under...,12407,14475
1,312932093,2023-06-15,21:00:00,"Google LLC, Squarespace, Inc. - M&A Call","Operator: Good afternoon. My name is Sara, and...",10078,11800


In [23]:
text_to_summarize = df.full_text[0]

In [None]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (15110 > 512). Running this sequence through the model will result in indexing errors


In [44]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm

def summarize_text(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=50
    )
    
    chunks = text_splitter.split_text(text)
    summaries = []
    
    for chunk in tqdm(chunks):
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
        summary_ids = model.generate(**inputs, max_length=150)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)
    
    return summaries

def recursive_summary(text, target_length=512):
    summaries = summarize_text(text)
    combined_summary = " ".join(summaries)
    
    while len(tokenizer.tokenize(combined_summary)) > target_length:
        summaries = summarize_text(combined_summary)
        combined_summary = " ".join(summaries)

    return combined_summary

In [None]:
final_summary = recursive_summary(text_to_summarize)

 85%|████████▍ | 141/166 [21:03<04:01,  9.68s/it]

In [None]:
final_summary

['Machine learning and big data are changing the way we work.',
 "It's time for the annual conference of Red Hat and Google.",
 "Ritch from Kohl's talks about his company, its products, and its relationship with Red Hat.",
 'CEO: I was really looking forward to summit this year, and then I saw Weezer playing',
 "I got to Kohl's about 8 years ago, and it was a distant third out of 3 operating systems",
 'We have a relentless automation effort, plays a big part in ActiveMQ.',
 'BBC News asks Google executives why they chose to work with them.',
 "Is it important to pick a primary public cloud provider? One cloud provider -- even though multi-cloud's inevitable, we felt 1 primary was the right approach",
 'D L is great, I appreciate it, but what do you want to do?',
 'I think a lot of you said, "We want to work together.',
 "Ritch talks about the role Red Hat and Google have played in his company's success.",
 'Is the value of partnerships greater now than it was earlier in your career?',