<a href="https://colab.research.google.com/github/ImZbs/apple-SEC-scanner/blob/main/earnings_scanner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Install dependencies
!pip install PyPDF2 transformers accelerate --quiet

# Imports
from PyPDF2 import PdfReader
from transformers import BartForConditionalGeneration, BartTokenizer
import json
import torch
import time
from transformers import pipeline
import textwrap

# Function to extract text from PDF pages
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text_pages = []
    for page in reader.pages:
        text = page.extract_text()
        if text:
            text_pages.append(text)
    return text_pages

# Chunk text by tokens
def chunk_text_by_tokens(pages_text, tokenizer, max_tokens=1000):
    current_chunk = []
    current_token_count = 0
    for page in pages_text:
        page_tokens = tokenizer.encode(page, add_special_tokens=False)
        if current_token_count + len(page_tokens) > max_tokens:
            if current_chunk:
                yield tokenizer.decode(sum(current_chunk, []), skip_special_tokens=True)
            current_chunk = [page_tokens]
            current_token_count = len(page_tokens)
        else:
            current_chunk.append(page_tokens)
            current_token_count += len(page_tokens)
    if current_chunk:
        yield tokenizer.decode(sum(current_chunk, []), skip_special_tokens=True)

# Load a smaller, faster BART model and tokenizer and pipeline
def load_models():
    model_name = "sshleifer/distilbart-cnn-12-6" # Using a faster model
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)
    # Move model to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", device=0 if torch.cuda.is_available() else -1) # Use device argument
    return tokenizer, model, device, qa_pipeline

# Summarize text chunk using the faster BART model
def summarize_text(text_chunk, tokenizer, model, device):
    inputs = tokenizer([text_chunk], max_length=1024, return_tensors='pt', truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    summary_ids = model.generate(
        inputs['input_ids'],
        num_beams=4,
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Main execution function to be used with %%capture
def main_execution(pdf_path):
    start_time = time.time()
    pages_text = extract_text_from_pdf(pdf_path)
    extraction_time = time.time() - start_time
    print(f"Text extraction time: {extraction_time:.2f} seconds")

    start_time = time.time()
    tokenizer, model, device, qa_pipeline = load_models()
    chunks = list(chunk_text_by_tokens(pages_text, tokenizer, max_tokens=1000))
    chunking_time = time.time() - start_time
    print(f"Chunking time: {chunking_time:.2f} seconds")


    summaries = []
    summarization_start_time = time.time()
    for i, chunk in enumerate(chunks):
        # print(f"Summarizing chunk {i+1}/{len(chunks)}...") # Suppress this output
        try:
            summary = summarize_text(chunk, tokenizer, model, device)
            summaries.append(summary)
        except Exception as e:
            print(f"Error summarizing chunk {i+1}: {e}")
            summaries.append('') # Skip failed chunks

    summarization_time = time.time() - summarization_start_time
    print(f"\nTotal summarization time: {summarization_time:.2f} seconds")

    # Combine summaries for final output
    final_summary = " ".join([s for s in summaries if s]) # Filter out empty strings

    # Define refined questions for the key insights you want to extract
    refined_questions = [
        "What were the total net sales for the fiscal year?",
        "What are the main revenue categories mentioned in the summary?",
        "What are the primary risks to the company's business and financial condition?",
        "Are there any mentions of intellectual property risks or patent claims?",
        "What are the key competitive challenges the company faces?",
        "Does the summary mention any information related to share repurchase programs or dividends?",
        "What are the company's primary geographic markets for sales?",
    ]

    # Use the question-answering pipeline to extract answers from the final summary
    extracted_insights_refined = {}
    for q in refined_questions:
        # print(f"Extracting insight for question: '{q}'...") # Suppress this output
        try:
            answer = qa_pipeline(question=q, context=final_summary)
            extracted_insights_refined[q] = answer['answer']
            # print(f"  Answer: {answer['answer']}") # Suppress this output
        except Exception as e:
            print(f"  Error extracting insight for question '{q}': {e}")
            extracted_insights_refined[q] = "Could not extract."

    # Format and print the organized key insights
    formatted_insights = "--- Organized Key Insights ---\n\n"

    for question, answer in extracted_insights_refined.items():
        formatted_insights += f"Q: {question}\nA: {answer}\n\n"

    print(formatted_insights)

# Execute the main function with %%capture
#%%capture
main_execution("/content/apple10k.pdf")

Text extraction time: 14.04 seconds


Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (1053 > 1024). Running this sequence through the model will result in indexing errors


Chunking time: 13.75 seconds

Total summarization time: 113.68 seconds
--- Organized Key Insights ---

Q: What were the total net sales for the fiscal year?
A: 391,035 383,285 394,328

Q: What are the main revenue categories mentioned in the summary?
A: cash flow or fair value hedges

Q: What are the primary risks to the company's business and financial condition?
A: interest rates and foreign exchange rates

Q: Are there any mentions of intellectual property risks or patent claims?
A: The information is not incorporated by reference into this filing .

Q: What are the key competitive challenges the company faces?
A: competitors have aggressively cut prices and lowered product margins

Q: Does the summary mention any information related to share repurchase programs or dividends?
A: dividends

Q: What are the company's primary geographic markets for sales?
A: outside the U.S.


