In [None]:
# Step 1: Install required libraries
!pip install transformers PyPDF2

# Step 2: Upload your PDF
from google.colab import files

# files.upload() shows Colab's file picker; the keys of the returned mapping
# are used below as the uploaded file names.
uploaded = files.upload()
if not uploaded:
    # Without this check an empty result (e.g. a cancelled dialog) surfaces as
    # an opaque "list index out of range" error.
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")

# Only the first uploaded file is processed.
pdf_file = next(iter(uploaded))

# Step 3: Extract text from PDF
import PyPDF2

reader = PyPDF2.PdfReader(pdf_file)

# Collect one newline-terminated string per page and join once at the end,
# rather than growing `text` with repeated `+=`.
page_texts = []
for page in reader.pages:
    page_text = page.extract_text()
    # Pages for which extract_text() returns nothing are skipped.
    if page_text:
        page_texts.append(page_text + "\n")
text = "".join(page_texts)

print("Extracted text length:", len(text))
print(text[:5000])  # preview first 5000 characters

# Step 4: Load summarization model
from transformers import pipeline

# Build the summarization pipeline on the facebook/bart-large-cnn checkpoint;
# the resulting callable is reused for every chunk further down.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)

# Step 5: Function to split text into chunks
def split_text(text, max_words=600):
    """Split *text* into chunks of at most *max_words* words.

    Words are obtained with ``str.split()``, so every run of whitespace
    (including newlines) becomes a single space in the returned chunks.

    Args:
        text: The text to split.
        max_words: Maximum number of words per chunk; must be at least 1.
            The default keeps chunks small to avoid errors downstream.

    Returns:
        A list of chunk strings in their original order. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    if max_words < 1:
        # range() rejects a zero step with an unrelated-looking message, and a
        # negative step would silently produce no chunks (dropping all text).
        raise ValueError(
            f"max_words must be a positive integer, got {max_words!r}"
        )
    words = text.split()
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]

chunks = split_text(text)

# Step 6: Summarize each chunk safely
# Length bounds (passed as max_length / min_length) used for a full-size chunk.
MAX_SUMMARY_LENGTH = 449
MIN_SUMMARY_LENGTH = 150
# Lower limit for max_length so very short chunks still get a usable budget.
SHORT_CHUNK_MAX_LENGTH = 20

summaries = []
for i, chunk in enumerate(chunks):
    if not chunk.strip():  # skip empty chunks
        continue
    print(f"Summarizing chunk {i+1}/{len(chunks)}...")

    # The last chunk is usually shorter than the rest. Forcing the full
    # minimum length on it would ask for a summary longer than its source, so
    # both bounds are scaled down using the word count as a rough size
    # estimate. Chunks of MAX_SUMMARY_LENGTH words or more keep 449 / 150.
    word_count = len(chunk.split())
    max_length = max(min(MAX_SUMMARY_LENGTH, word_count), SHORT_CHUNK_MAX_LENGTH)
    min_length = min(MIN_SUMMARY_LENGTH, max_length // 2)

    try:
        # truncation=True asks the pipeline to clip an over-long chunk to the
        # model's input limit rather than failing on it, which would cause
        # the whole chunk to be skipped.
        summary = summarizer(
            chunk,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
    except Exception as e:
        # Best effort: report the failure and carry on with the other chunks.
        print(f"Skipping chunk {i+1} due to error: {e}")
    else:
        summaries.append(summary[0]['summary_text'])

# Step 7: Combine summaries
if not summaries:
    # Nothing was summarized (no extracted text, or every chunk failed); say
    # so instead of silently producing an empty file.
    print("Warning: no chunk summaries were produced; the summary is empty.")
final_summary = " ".join(summaries)
print("\n===== FINAL SUMMARY =====\n")
print(final_summary)

# Step 8: Save summary to a file
# Write UTF-8 explicitly so non-ASCII characters are saved the same way
# regardless of the platform's default text encoding.
with open("summary.txt", "w", encoding="utf-8") as f:
    f.write(final_summary)

# Step 9: Download the summary
from google.colab import files
files.download("summary.txt")
