In [19]:
import fitz  # PyMuPDF for PDFs
from docx import Document  # python-docx for DOCX
import re
from summa.summarizer import summarize
from transformers import pipeline

def chunk_text(text, max_sentences=50):
    """Split text into chunks using regex-based sentence splitting."""
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = [' '.join(sentences[i:i+max_sentences]) for i in range(0, len(sentences), max_sentences)]
    return chunks

def extract_text(file_path):
    """Extract text from PDF or DOCX based on file extension."""
    if file_path.lower().endswith('.pdf'):
        doc = fitz.open(file_path)
        text = "".join([page.get_text() for page in doc])
    elif file_path.lower().endswith('.docx'):
        doc = Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
    else:
        raise ValueError("Unsupported file format. Only PDF and DOCX are supported.")
    return text

def summarize_insurance_doc(file_path, extract_ratio=0.1, max_abs_len=200, min_abs_len=50, chunk_size=50):
    # Step 1: Extract text
    text = extract_text(file_path)
    
    # Step 2: Clean text (preserve sentence boundaries)
    text = re.sub(r'\s+', ' ', text)  # normalize whitespace
    text = re.sub(r'[^a-zA-Z0-9.,!? ]', '', text)  # keep punctuation for sentence detection
    
    # Step 3: Chunk text
    text_chunks = chunk_text(text, max_sentences=chunk_size)
    
    # Step 4: Extractive summarization with fallback for short text
    extractive_summary_chunks = []
    for chunk in text_chunks:
        summary = summarize(chunk, ratio=extract_ratio)
        if not summary.strip():  # fallback for short chunks
            summary = chunk
        extractive_summary_chunks.append(summary)
    extractive_summary = ' '.join(extractive_summary_chunks)
    
    # Step 5: Abstractive summarization
    abstractive_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    abs_chunks = chunk_text(extractive_summary, max_sentences=20)
    abstractive_summary_list = [
        abstractive_summarizer(chunk, max_length=max_abs_len, min_length=min_abs_len, do_sample=False)[0]['summary_text']
        for chunk in abs_chunks
    ]
    abstractive_summary = ' '.join(abstractive_summary_list)
    
    return extractive_summary, abstractive_summary

# Example usage
file_path = "insu_doc.docx"  # Can be .pdf or .docx
extractive, abstractive = summarize_insurance_doc(file_path)

print("----- Extractive Summary -----\n", extractive)
print("\n----- Abstractive Summary -----\n", abstractive)


Device set to use cpu


----- Extractive Summary -----
 HDFC Life Click 2 Protect Supreme A NonLinked, NonParticipating, Individual, Pure Risk Premium Savings Life Insurance Plan Securing your familys financial wellbeing is more crucial than ever. Thats why HDFC Life Click 2 Protect Supreme is designed to offer comprehensive financial protection for your whole family. This term plan adapts to your evolving lifestyle and life stage needs, ensuring that you and your loved ones remain truly protected. Key Features Eligibility  Option to choose a cover which fits your needs from 3 plan options  Get back all premiums paid on survival till maturity with Return of Premium  option  Additional amount payable in case of accidental death during policy term  Provides Acceleration of Death benefit on diagnosis of specified terminal illnesses, till age 80 years. Option to choose increasing Death Benefit up to 200 under Life option B  C variants  Option to vary your Death Benefit according to your need under Life Goal optio