<a href="https://colab.research.google.com/github/ImZbs/apple-SEC-scanner/blob/main/earnings_scanner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies via a notebook shell escape; --quiet trims pip's output.
# NOTE(review): `accelerate` is installed here but never imported in the
# visible code -- presumably it is picked up by transformers at runtime;
# confirm it is still required.
!pip install PyPDF2 transformers accelerate --quiet

# Imports
from PyPDF2 import PdfReader
from transformers import BartForConditionalGeneration, BartTokenizer
import json
import torch
import time
from transformers import pipeline
import textwrap

# Function to extract text from PDF pages
def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page of the PDF at ``pdf_path``.

    Pages whose ``extract_text()`` result is falsy (``None`` or an empty
    string) are left out, so the returned list can be shorter than the
    document's page count.

    Args:
        pdf_path: Path (or other source accepted by ``PdfReader``) of the PDF.

    Returns:
        A list of non-empty page strings, in page order.
    """
    # Lazily pull the text of each page, then keep only the non-empty results.
    extracted = (page.extract_text() for page in PdfReader(pdf_path).pages)
    return [content for content in extracted if content]

# Chunk text by tokens
def chunk_text_by_tokens(pages_text, tokenizer, max_tokens=1000):
    """Yield text chunks that each contain at most ``max_tokens`` tokens.

    Consecutive pages are packed into the same chunk for as long as their
    combined token count fits the budget. A single page that is longer than
    ``max_tokens`` is cut into ``max_tokens``-sized slices instead of being
    emitted whole, so no chunk ever exceeds the budget (an oversized chunk
    would otherwise be truncated by any downstream length limit and its tail
    lost). Pages that encode to zero tokens contribute nothing, and empty
    chunks are never yielded.

    Token counts are measured with ``add_special_tokens=False``; callers that
    re-tokenize the chunks with special tokens should leave headroom for them.

    Args:
        pages_text: Iterable of page strings, in document order.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            (returning a sequence of tokens) and
            ``decode(tokens, skip_special_tokens=True)`` (returning text).
        max_tokens: Maximum number of tokens per chunk; must be at least 1.

    Yields:
        Decoded chunk strings, in document order.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if max_tokens < 1:
        # A non-positive budget could never be satisfied and would make the
        # slicing loop below spin forever.
        raise ValueError("max_tokens must be at least 1")

    # Flat token buffer for the chunk being built; extending it in place
    # avoids the quadratic list flattening of sum(list_of_lists, []).
    buffer = []
    for page in pages_text:
        page_tokens = list(tokenizer.encode(page, add_special_tokens=False))

        # Flush the pending chunk when this page would push it over budget.
        if buffer and len(buffer) + len(page_tokens) > max_tokens:
            yield tokenizer.decode(buffer, skip_special_tokens=True)
            buffer = []

        # An oversized page is emitted in full-budget slices. The buffer is
        # necessarily empty here, because any pending tokens were flushed.
        while len(page_tokens) > max_tokens:
            yield tokenizer.decode(page_tokens[:max_tokens], skip_special_tokens=True)
            page_tokens = page_tokens[max_tokens:]

        buffer.extend(page_tokens)

    if buffer:
        yield tokenizer.decode(buffer, skip_special_tokens=True)

# Choose the compute device up front: "cuda" when PyTorch reports an available
# GPU, otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the DistilBART CNN checkpoint (picked as a smaller, faster BART variant)
# together with its tokenizer, and place the model on the chosen device.
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
print(f"Using device: {device}")

# Summarize text chunk using the faster BART model
def summarize_text(text_chunk, tokenizer, model, device):
    """Summarize one chunk of text with the given tokenizer and model.

    Args:
        text_chunk: The text to summarize. Input beyond 1024 tokens is
            truncated by the tokenizer call.
        tokenizer: Callable tokenizer that also provides ``decode``.
        model: Object providing ``generate``.
        device: Device every encoded tensor is moved to before generation
            (expected to be the device ``model`` lives on).

    Returns:
        The decoded summary string, with special tokens skipped.
    """
    encoded = tokenizer(
        [text_chunk],
        max_length=1024,
        return_tensors='pt',
        truncation=True,
    )

    # Move each encoded tensor onto the requested device.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    # Beam-search settings: 4 beams, 40-150 generated tokens.
    generation_options = dict(
        num_beams=4,
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        early_stopping=True,
    )
    generated = model.generate(on_device['input_ids'], **generation_options)

    # Only one sequence was submitted, so the first result is the summary.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Main flow: extract the PDF text, split it into token-bounded chunks, and
# summarize each chunk.
# Elapsed times are measured with time.perf_counter(), a monotonic clock meant
# for timing intervals; time.time() follows the wall clock, which can be
# adjusted while the cell is running.
pdf_path = "apple10k.pdf"

start_time = time.perf_counter()
pages_text = extract_text_from_pdf(pdf_path)
extraction_time = time.perf_counter() - start_time
print(f"Text extraction time: {extraction_time:.2f} seconds")

start_time = time.perf_counter()
chunks = list(chunk_text_by_tokens(pages_text, tokenizer, max_tokens=1000))
chunking_time = time.perf_counter() - start_time
print(f"Chunking time: {chunking_time:.2f} seconds")


summaries = []
summarization_start_time = time.perf_counter()
for i, chunk in enumerate(chunks):
    print(f"Summarizing chunk {i+1}/{len(chunks)}...")
    try:
        summary = summarize_text(chunk, tokenizer, model, device)
        summaries.append(summary)
    except Exception as e:
        # Best effort: report the failure and carry on with the next chunk.
        # The empty placeholder keeps `summaries` index-aligned with `chunks`.
        print(f"Error summarizing chunk {i+1}: {e}")
        summaries.append('')

summarization_time = time.perf_counter() - summarization_start_time
print(f"\nTotal summarization time: {summarization_time:.2f} seconds")

# Combine the per-chunk summaries into one text, dropping the empty
# placeholders left by failed chunks.
final_summary = " ".join([s for s in summaries if s])


# Build the question-answering pipeline, passing device index 0 when CUDA is
# available and -1 otherwise.
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
    device=0 if torch.cuda.is_available() else -1,
)

# One question per key insight to pull out of the combined summary.
questions = [
    "What are the revenue jumps mentioned in the summary?",
    "What are the cash-flow risks mentioned in the summary?",
    "What are the buy-sell signals mentioned in the summary?"
]

# Start every question at the failure placeholder; a successful lookup
# overwrites it, while a failed one is reported and keeps the placeholder.
extracted_insights = dict.fromkeys(questions, "Could not extract.")
for q in questions:
    try:
        answer = qa_pipeline(question=q, context=final_summary)
        extracted_insights[q] = answer['answer']
    except Exception as e:
        print(f"Error extracting insight for question '{q}': {e}")

# Turn the extracted answers into sentences. A question contributes a sentence
# only when its answer is present, non-empty, and not the failure placeholder.
def _has_usable_answer(question):
    """Return True when ``question`` has a real answer in ``extracted_insights``."""
    answer_text = extracted_insights.get(question)
    return bool(answer_text) and answer_text != "Could not extract."


insight_sentences = []
if _has_usable_answer(questions[0]):
    insight_sentences.append(f"Regarding revenue jumps, the summary mentions: {extracted_insights[questions[0]]}.")
if _has_usable_answer(questions[1]):
    insight_sentences.append(f"For cash-flow risks, the summary highlights: {extracted_insights[questions[1]]}.")
if _has_usable_answer(questions[2]):
    insight_sentences.append(f"In terms of buy-sell signals, the summary notes: {extracted_insights[questions[2]]}.")

# Print the sentences as one paragraph wrapped at 80 columns, or a fallback
# message when nothing usable was extracted.
print("--- Key Insights ---")
if not insight_sentences:
    print("Could not extract key insights from the summary.")
else:
    combined_insights_text = " ".join(insight_sentences)
    wrapped_insights = textwrap.fill(combined_insights_text, width=80)
    print(wrapped_insights)