In [None]:
!pip install transformers torch sentence-transformers pdfplumber python-docx spacy yake
!python -m spacy download en_core_web_sm

In [None]:
import pdfplumber
import docx
import spacy
import yake
from transformers import T5Tokenizer, T5ForConditionalGeneration, BartForConditionalGeneration, BartTokenizer, PegasusForConditionalGeneration, PegasusTokenizer
import torch
from google.colab import files
from sentence_transformers import SentenceTransformer

# Load the spaCy pipeline "en_core_web_sm" once at module level; the shared
# `nlp` object is used by extractive_summary() and classify_document().
nlp = spacy.load("en_core_web_sm")


In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped; the text of every other page is followed by a
    newline.
    """
    with pdfplumber.open(pdf_path) as document:
        # Pull the text while the file is still open.
        page_texts = [page.extract_text() for page in document.pages]
    return "".join(content + "\n" for content in page_texts if content)

def extract_text_from_docx(docx_path):
    """Return the paragraphs of the DOCX at ``docx_path`` joined by newlines."""
    document = docx.Document(docx_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def get_text_from_file():
    """Ask for a file upload and return the text of the first uploaded file.

    Only the first entry of the mapping returned by ``files.upload()`` is
    processed. The extension is compared case-insensitively, so
    ``Report.PDF`` is handled like ``report.pdf``.

    Returns:
        str: The extracted text for ``.pdf``, ``.docx`` and ``.txt`` files,
        the literal message ``"Unsupported file format!"`` for any other
        extension, or ``""`` when the upload mapping is empty.
    """
    uploaded = files.upload()
    for filename in uploaded:
        # Lower-case the extension so "PDF", "Docx", "TXT", ... are accepted.
        ext = filename.rsplit(".", 1)[-1].lower()
        if ext == "pdf":
            return extract_text_from_pdf(filename)
        if ext == "docx":
            return extract_text_from_docx(filename)
        if ext == "txt":
            with open(filename, "r", encoding="utf-8") as file:
                return file.read()
        return "Unsupported file format!"
    # Nothing was uploaded: return an empty string instead of falling off the
    # end of the function with an implicit None.
    return ""

In [None]:
def extractive_summary(text, num_sentences=5):
    """Return the leading sentences of ``text`` as key points.

    Sentences are taken from ``nlp(text).sents``. No ranking is performed:
    the result is the first ``num_sentences`` non-blank sentences in document
    order.

    Args:
        text: Document text to split into sentences.
        num_sentences: Maximum number of sentences to return. Zero or a
            negative value yields an empty list; ``None`` returns every
            non-blank sentence.

    Returns:
        list[str]: The selected sentences, stripped of surrounding whitespace.
    """
    if num_sentences is not None and num_sentences <= 0:
        return []

    key_points = []
    for sent in nlp(text).sents:
        sentence = sent.text.strip()
        # Skip spans that are empty after stripping so they are not reported
        # as blank key points (and do not use up one of the slots).
        if not sentence:
            continue
        key_points.append(sentence)
        if len(key_points) == num_sentences:
            break
    return key_points

In [None]:
def extract_keywords(text, num_keywords=5):
    """Return the keywords YAKE reports for ``text``.

    The extractor is created with ``top=num_keywords`` and no custom stopword
    list; only the first element of each result entry is kept.
    """
    extractor = yake.KeywordExtractor(stopwords=None, top=num_keywords)
    top_terms = []
    for entry in extractor.extract_keywords(text):
        top_terms.append(entry[0])
    return top_terms

In [None]:
def classify_document(text):
    """Classify a document as Legal, Technical, Academic or General.

    The text is lower-cased and tokenised with ``nlp``; each category scores
    one point per token that equals one of its keywords. The highest-scoring
    category wins, with ties going to the category listed first.

    Args:
        text: Document text to classify.

    Returns:
        str: The capitalised category name. ``"General"`` is returned when no
        keyword occurs at all.
    """
    categories = {
        "legal": ["court", "law", "agreement", "contract", "policy"],
        "technical": ["AI", "algorithm", "data", "software", "engineering"],
        "academic": ["research", "study", "university", "experiment", "paper"],
        "general": ["news", "blog", "report", "story", "review"],
    }
    # Tokens come from lower-cased text, so the keywords are lower-cased too;
    # otherwise an entry such as "AI" could never match. Sets keep each
    # membership test cheap.
    keyword_sets = {
        category: {word.lower() for word in words}
        for category, words in categories.items()
    }

    tokens = [token.text for token in nlp(text.lower())]
    word_counts = {
        category: sum(1 for token in tokens if token in keywords)
        for category, keywords in keyword_sets.items()
    }

    # With no keyword hit every score is 0 and max() would simply return the
    # first key ("legal"); fall back to the catch-all category instead.
    if not any(word_counts.values()):
        return "General"

    predicted_category = max(word_counts, key=word_counts.get)
    return predicted_category.capitalize()

In [None]:
# Load Tokenizers and Models
# Each tokenizer/model pair is created once at module level with
# `from_pretrained` and reused by the summarisation functions.

# T5 ("t5-small"): passed to abstractive_summary() by generate_summary().
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# BART ("facebook/bart-large-cnn"): used by single_sentence_explanation().
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

# PEGASUS ("google/pegasus-xsum"): passed to abstractive_summary() by generate_summary().
pegasus_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
pegasus_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

In [None]:
def abstractive_summary(text, model, tokenizer, max_length=150, prefix=None):
    """Generate an abstractive summary using a Transformer model.

    Args:
        text: Source text to summarise.
        model: Model object exposing ``generate``.
        tokenizer: Tokenizer belonging to ``model``; must expose ``encode``
            and ``decode``.
        max_length: Passed to ``model.generate`` as ``max_length``.
        prefix: Text prepended to ``text`` before encoding. ``None`` (the
            default) selects ``"summarize: "`` when ``model.config.model_type``
            is ``"t5"`` and no prefix for any other model. Pass a string
            (including ``""``) to override.

    Returns:
        str: The decoded summary, or ``""`` when ``text`` is empty or
        whitespace-only (the model is not called in that case).
    """
    if not text or not text.strip():
        return ""

    if prefix is None:
        # "summarize: " is a T5 task prefix. Other checkpoints (BART, PEGASUS)
        # take the raw document and would encode the prefix as document text.
        model_type = getattr(getattr(model, "config", None), "model_type", None)
        prefix = "summarize: " if model_type == "t5" else ""

    input_ids = tokenizer.encode(prefix + text, return_tensors="pt", truncation=True)

    output_ids = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
def single_sentence_explanation(text):
    """Generate a short explanation of ``text`` using BART.

    The output is produced by ``bart_model.generate`` with ``max_length=30``.
    The cap keeps the result short; it does not by itself guarantee exactly
    one complete sentence.

    Args:
        text: Source text to explain.

    Returns:
        str: The decoded explanation, or ``""`` when ``text`` is empty or
        whitespace-only (the model is not called in that case).
    """
    if not text or not text.strip():
        return ""

    # The raw text is encoded without a "summarize: " prefix: that prefix is a
    # T5 convention and BART would treat it as part of the document.
    input_ids = bart_tokenizer.encode(text, return_tensors="pt", truncation=True)

    output_ids = bart_model.generate(input_ids, max_length=30, num_beams=5, early_stopping=True)
    return bart_tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
def generate_summary(text, num_extractive=5, num_keywords=5):
    """Run the full summarisation pipeline on ``text``.

    Args:
        text: Document text to analyse.
        num_extractive: Number of key sentences requested from
            ``extractive_summary``.
        num_keywords: Number of keywords requested from ``extract_keywords``.

    Returns:
        tuple: ``(key_points, paragraph_summary, explanation, keywords,
        document_type)`` where ``key_points`` and ``keywords`` are lists of
        strings and the other three items are strings.
    """
    # Step 1: Extract key points; the models below only see these sentences.
    key_points = extractive_summary(text, num_extractive)
    extracted_text = " ".join(key_points)

    # Step 2: Generate paragraph summary (T5 + PEGASUS)
    t5_summary = abstractive_summary(extracted_text, t5_model, t5_tokenizer)
    pegasus_summary = abstractive_summary(extracted_text, pegasus_model, pegasus_tokenizer)

    # Merge the two summaries, dropping an empty one so the result carries no
    # stray leading or trailing space.
    final_paragraph_summary = " ".join(
        part for part in (t5_summary, pegasus_summary) if part
    )

    # Step 3: Generate single-sentence explanation using BART
    explanation = single_sentence_explanation(extracted_text)

    # Step 4: Extract keywords from the full text, not just the key points.
    keywords = extract_keywords(text, num_keywords=num_keywords)

    # Step 5: Classify document type
    document_type = classify_document(text)

    return key_points, final_paragraph_summary, explanation, keywords, document_type

In [None]:
# Interactive driver: read text (typed or uploaded), run the pipeline and
# print the results.
choice = input("Choose input method: \n1. Enter text manually\n2. Upload a document (PDF, DOCX, TXT)\nEnter 1 or 2: ").strip()

text = ""
problem = ""  # Message explaining why nothing can be summarised, if any.

if choice == "1":
    text = input("Enter the text to summarize:\n")

elif choice == "2":
    # `or ""` guards against a missing upload being reported as None.
    text = get_text_from_file() or ""
    # get_text_from_file() reports an unknown extension by returning this
    # message as if it were document text; show it instead of summarising it.
    if text == "Unsupported file format!":
        problem, text = text, ""

else:
    problem = "Invalid choice. Please enter 1 or 2."

if not problem and not text.strip():
    problem = "No text to summarize."

if problem:
    print(problem)
else:
    key_points, paragraph_summary, explanation, keywords, document_type = generate_summary(text)

    print("\nDocument Type: ", document_type)
    print("\nKeywords: ", ", ".join(keywords))
    print("\nKeypoints of this document:\n")
    for idx, sentence in enumerate(key_points, 1):
        print(f"{idx}. {sentence}")

    print("\nSummary of what is written in the document:\n")
    print(paragraph_summary)

    print("\nExplanation in a single sentence:\n")
    print(explanation)