<a href="https://colab.research.google.com/github/LuckyBoy587/Notes-Summarizer/blob/master/Basic_Text_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Choose the compute device up front: CUDA when PyTorch reports one, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint used for paraphrasing; tokenizer and model are loaded from the same name.
model_name = "Vamsi/T5_Paraphrase_Paws"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the seq2seq weights and place them on the selected device in one step.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

def paraphrase(text, num_return_sequences=1):
    """Paraphrase *text* with the module-level T5 model using beam search.

    Relies on the globals ``tokenizer``, ``model`` and ``device`` created by
    the model-loading cell.

    Args:
        text (str): Passage to rewrite. Input beyond 512 tokens is truncated.
        num_return_sequences (int): How many candidate paraphrases to return.

    Returns:
        list[str]: The decoded paraphrases, one entry per returned sequence.
    """
    # NOTE(review): the tokenizer may append its own end-of-sequence token as
    # well -- confirm the explicit "</s>" suffix is still wanted.
    input_text = "paraphrase: " + text + " </s>"

    # A single sequence needs no padding; padding it out to 512 tokens only
    # made the encoder process masked-out positions.
    encoding = tokenizer(
        input_text,
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )

    # Move inputs to the same device as the model
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # temperature / top_k / top_p only take effect when do_sample=True, so the
    # values previously passed here were ignored by beam search; they are
    # omitted to keep the call honest and warning-free.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=256,
        num_return_sequences=num_return_sequences,
        # Beam search cannot return more sequences than it has beams.
        num_beams=max(5, num_return_sequences),
    )

    return [
        tokenizer.decode(
            output,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        for output in outputs
    ]

# Sample notes. Topic headers are written as "<HEADER>" lines because that is
# the only header format split_into_topics() recognises; without the angle
# brackets the demo cell below would find no topics at all.
text = """
<CHAPTER 1: MACHINE LEARNING>
Machine learning is a field of artificial intelligence. It focuses on building systems that learn from data.
It is widely used in applications like recommendation systems and computer vision.

<TOPIC: DEEP LEARNING>
Deep learning uses neural networks with many layers. It has achieved state-of-the-art results in image and speech recognition.
"""



In [None]:
import re
import nltk

# Fetch NLTK tokenizer data at runtime. Both resource names are requested,
# presumably so sent_tokenize works across NLTK releases -- TODO confirm which
# one the installed version actually needs.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

def split_into_topics(text):
    """Split header-delimited notes into a mapping of topic -> sentences.

    A line consisting solely of ``<...>`` starts a new topic; every non-blank
    line after it (up to the next header) is that topic's content. Content
    that appears before the first header is discarded.

    Args:
        text (str): Notes containing ``<HEADER>`` lines.

    Returns:
        dict[str, list[str]]: Topic name -> list of sentences, in order of
        first appearance. A header that occurs more than once accumulates
        the sentences of all its sections instead of keeping only the last.
    """
    topics = {}
    current_topic = None
    buffer = []

    def flush_buffer(buf):
        """Collapse buffered lines into one cleaned block and split it into sentences."""
        if not buf:
            return []
        # Join lines into one block and normalise all whitespace runs
        block = re.sub(r'\s+', ' ', " ".join(buf)).strip()
        # Replace dash separators/bullets with colons for readability, but
        # leave a single hyphen between two word characters alone so that
        # "state-of-the-art" or "2020-2021" are not torn apart.
        block = re.sub(r'\s*(?!(?<=\w)[-–](?=\w))[-–]+\s*', ': ', block)
        # Split into sentences
        return sent_tokenize(block)

    for line in text.split("\n"):
        line = line.strip()
        if re.match(r'^<.*>$', line):  # topic header
            if current_topic and buffer:
                topics[current_topic].extend(flush_buffer(buffer))
            current_topic = line.strip("<>").strip() or "Unnamed Topic"
            # setdefault keeps sentences already collected under a repeated header
            topics.setdefault(current_topic, [])
            buffer = []
        elif line:  # content line
            buffer.append(line)

    # Flush last topic
    if current_topic and buffer:
        topics[current_topic].extend(flush_buffer(buffer))

    return topics


In [None]:
# Demo: parse the sample notes, then print one paraphrase per topic.
topics = split_into_topics(text)
for topic, sentences in topics.items():
    # Topic heading, surrounded by blank lines
    print(f"\n{topic}:\n")
    # Feed the topic's sentences to the model as a single newline-joined passage
    passage = "\n".join(sentences)
    rewritten = paraphrase(passage, 1)
    print("\n".join(rewritten))

In [None]:
from nltk.tokenize import sent_tokenize

def merge_short_sentences(text, min_words=15):
    """Group runs of short sentences into single chunks.

    The text is split into sentences. Consecutive sentences with fewer than
    ``min_words`` words are concatenated (space-separated) into one chunk;
    a sentence with at least ``min_words`` words closes any open chunk and is
    emitted on its own.

    Args:
        text (str): Text to split and regroup.
        min_words (int): Word count below which a sentence counts as short.

    Returns:
        list[str]: Chunks in original order.
    """
    sentences = sent_tokenize(text)
    print(len(sentences), sentences)

    result = []
    pending = ""  # short sentences accumulated so far

    for sentence in sentences:
        if len(sentence.split()) < min_words:
            # Short sentence: keep accumulating
            pending = f"{pending} {sentence}" if pending else sentence
            continue

        # Long sentence: emit whatever was accumulated, then the sentence itself
        if pending:
            result.append(pending.strip())
            pending = ""
        result.append(sentence)

    # Trailing run of short sentences
    if pending:
        result.append(pending.strip())
    return result


In [None]:
def paraphrase_chunks(chunks, model, tokenizer, device):
    """Paraphrase each chunk independently with beam search.

    Args:
        chunks (Iterable[str]): Text chunks to rewrite; each is truncated to
            256 input tokens.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        device: Device the model lives on; inputs are moved to it.

    Returns:
        list[str]: One paraphrase per chunk, in input order.
    """
    bullets = []
    for chunk in chunks:
        input_text = "paraphrase: " + chunk + " </s>"

        # One sequence at a time, so no padding is needed; padding to 256
        # tokens only added masked-out work for the encoder.
        encoding = tokenizer(
            input_text,
            max_length=256,
            truncation=True,
            return_tensors="pt",
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        # temperature / top_k / top_p are sampling-only options and were
        # ignored here because do_sample was never enabled; they are omitted
        # so the call states what actually happens (plain beam search).
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=5,
            num_return_sequences=1,
        )

        bullets.append(
            tokenizer.decode(
                output[0],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        )
    return bullets


In [None]:
from google.colab import files
import io
import os

def process_text_and_download(topics, filename="paraphrased_output.txt"):
    """Paraphrase every topic's chunks and download the result as a text file.

    Each topic becomes a ``## <topic>`` section followed by one bullet per
    paraphrased chunk. The file is written to the working directory and then
    handed to ``files.download``.

    Relies on the globals ``model``, ``tokenizer`` and ``device``; if any of
    them is missing an error is printed and nothing is written.

    Args:
        topics (dict): A dictionary where keys are topic names and values are lists of sentences.
        filename (str): The name for the output downloadable file. When left
            at the default and ``topics`` is non-empty, a name derived from
            the first topic is used instead.

    Returns:
        None
    """
    # Check once, before doing any work, that the model cell has been run.
    required = ("model", "tokenizer", "device")
    if any(name not in globals() for name in required):
        print("Error: Model, tokenizer, or device not loaded. Please run the model loading cell first.")
        return

    sections = []
    for topic, chunks in topics.items():
        print(chunks)
        bullets = paraphrase_chunks(chunks, model, tokenizer, device)

        # Format as bullet points
        sections.append(
            f"\n## {topic}\n" + "\n".join(f"• {b}" for b in bullets) + "\n"
        )
    output_content = "".join(sections)

    # Generate a filename based on the first topic if none was provided
    if filename == "paraphrased_output.txt" and topics:
        first_topic = next(iter(topics))
        # Topic names come from document headings and may contain characters
        # that are not valid in file names (e.g. "/" or ":"), so anything
        # other than letters, digits, "-", "_" and "." becomes "_".
        safe_topic = "".join(
            ch if ch.isalnum() or ch in "-_." else "_"
            for ch in first_topic.lower()
        )
        filename = f"{safe_topic}_paraphrased.txt"

    # Explicit UTF-8: the bullet character is not representable in every
    # platform default encoding.
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(output_content)
    files.download(filename=filename)
    print(f"Processed text and created downloadable file: {filename}")

# Example usage (commented out):
# text = """Genshin Impact (Chinese: 原神; pinyin: Yuán shén; lit. 'Original God') is a 2020 action role-playing game produced by miHoYo (HoYoverse).[c] The game features an anime-style open world environment and an action-based battle system using elemental magic and character-switching. A free-to-play game monetized through gacha game mechanics, Genshin Impact is updated regularly using the games as a service model; it was originally released for Android, iOS, PlayStation 4 and Windows, followed by the PlayStation 5 in 2021, with an Xbox Series X/S version in November 2024. In China, a native port for HarmonyOS NEXT was released in September 2025.

# Genshin Impact takes place in the fantasy world of Teyvat, home to seven nations, each of which is tied to a different element and ruled by a different god called an "Archon." The story follows the Traveler, an interstellar adventurer who, at the start of the game, is separated from their twin sibling after the two land in Teyvat. Thereafter, the Traveler journeys across the nations of Teyvat in search of the lost sibling, accompanied by their guide, Paimon. Along the way, the two befriend myriad individuals, become involved in the affairs of its nations, and begin to unravel the mysteries of the land.

# Development began in 2017 and takes inspiration from a variety of sources, including The Legend of Zelda: Breath of the Wild, anime, Gnosticism, and an array of cultures and world mythologies. Genshin Impact has received generally positive reviews, with critics writing approving of its combat mechanics and its immersive open world. Conversely, some criticism has been directed at its simplistic endgame and its gacha-based monetization model. The game has also been subjected to controversy over censorship of content related to Chinese politics, allegations of colorism in character design, and privacy and security concerns. Across all platforms, the game is estimated to have grossed nearly $3.8 billion by the end of 2022, representing the highest ever first-year launch revenue for any video game.[4][5]"""

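# process_text_and_download() expects the dict produced by split_into_topics(),
# and split_into_topics() only recognises "<HEADER>" lines as topic headers:
# process_text_and_download(split_into_topics(text))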

In [None]:
!pip install pymupdf
import fitz  # This is the PyMuPDF library
import re

def _collect_pdf_items(doc, heading_font_threshold):
    """Flatten the text spans of *doc* into ordered ("topic" | "content", text) items."""
    items = []
    pending = []  # body text gathered since the last heading

    for page in doc:
        # The 'dict' format gives us blocks -> lines -> spans with font sizes
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" entry carry no text spans
            for line in block.get("lines", []):
                for span in line["spans"]:
                    text = span["text"].strip()

                    # Very short spans are treated as noise and dropped
                    if len(text) < 3:
                        continue

                    if span["size"] <= heading_font_threshold:
                        # Regular content; the trailing space keeps adjacent
                        # spans from being squished together.
                        pending.append(text + " ")
                    elif pending:
                        # New heading: close the content collected so far
                        items.append(("content", "".join(pending)))
                        pending = []
                        items.append(("topic", text))
                    elif items and items[-1][0] == "topic":
                        # Nothing has been collected since the previous
                        # heading span, which (because every line ends with a
                        # "\n" entry) means it sits on this same line: treat
                        # both spans as one heading rather than two topics.
                        items[-1] = ("topic", f"{items[-1][1]} {text}")
                    else:
                        items.append(("topic", text))
                pending.append("\n")  # New line after each line of text

    # Don't forget the last block of content after the loop ends
    if pending:
        items.append(("content", "".join(pending)))
    return items


def _drop_thin_topics(items, min_content_length):
    """Remove topics whose following content is missing or shorter than the minimum."""
    kept = []
    i = 0
    while i < len(items):
        kind, _ = items[i]
        if kind != "topic":
            # Standalone content (e.g. text before the first heading)
            kept.append(items[i])
            i += 1
            continue

        has_body = i + 1 < len(items) and items[i + 1][0] == "content"
        if not has_body:
            # Skip only the heading itself so the next item is still examined
            i += 1
            continue

        if len(items[i + 1][1].strip()) >= min_content_length:
            kept.extend(items[i:i + 2])
        i += 2  # Heading and its content handled together
    return kept


def _render_items(items):
    """Render items as text with upper-cased ``<TOPIC NAME>`` header lines."""
    parts = []
    for kind, text in items:
        if kind == "topic":
            parts.append(f"\n<{text.strip().upper()}>\n")
        else:
            # Replace multiple spaces/tabs with a single space, preserve newlines
            parts.append(re.sub(r'[ \t]+', ' ', text.strip()) + "\n")
    return "".join(parts)


def extract_topics_from_pdf(pdf_path, write_to_file=False,
                            heading_font_threshold=14.0,
                            min_content_length=100):
    """
    Extracts content from a PDF and formats it into <TOPIC> blocks.

    A text span whose font size is strictly greater than
    ``heading_font_threshold`` is treated as a heading; everything else is
    body text of the most recent heading. Heading spans on the same line are
    merged into one heading. Topics whose body is shorter than
    ``min_content_length`` characters (after stripping) are dropped.

    Args:
        pdf_path (str): The file path to the PDF.
        write_to_file (bool): If True, also write the formatted content to a
            .txt file in the current directory, named after the PDF.
        heading_font_threshold (float): Font size above which a span counts
            as a heading.
        min_content_length (int): Minimum stripped body length for a topic to
            be kept.

    Returns:
        str: The formatted text, or a message starting with
        "Error opening PDF:" if the document could not be opened.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        return f"Error opening PDF: {e}"

    # Close the document even if a page fails to parse
    try:
        items = _collect_pdf_items(doc, heading_font_threshold)
    finally:
        doc.close()

    output = _render_items(_drop_thin_topics(items, min_content_length))

    if write_to_file:
        # Accept both Windows and POSIX separators when taking the base name
        base_name = re.split(r"[\\/]", str(pdf_path))[-1]
        # Swap only the final extension, and always end in ".txt" so the
        # output can never land on top of the input file.
        stem, dot, _ext = base_name.rpartition(".")
        file_name = "./" + (stem if dot else base_name) + ".txt"
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(output)

    return output

In [None]:
from google.colab import files
import os

def upload_and_extract_topics():
    """
    Allows the user to upload files and extracts topics from the first PDF.

    Every uploaded file is removed from the working directory afterwards,
    whether or not it was processed. Files whose name does not end in
    ".pdf" (case-insensitive) are skipped, and only the first PDF is
    processed.

    Returns:
        str | None: The text returned by extract_topics_from_pdf for the
        first PDF, or None when no PDF was uploaded.
    """
    uploaded = files.upload()
    extracted_text = None

    for filename in uploaded:
        try:
            if not filename.lower().endswith('.pdf'):
                print(f'File "{filename}" is not a PDF. Skipping.')
            elif extracted_text is None:
                # Process the uploaded file
                extracted_text = extract_topics_from_pdf(filename)
            else:
                print(f'Only the first PDF is processed. Skipping "{filename}".')
        finally:
            # Clean up every upload. A file that is already gone is ignored
            # so the cleanup cannot mask an error raised while extracting.
            try:
                os.remove(filename)
            except FileNotFoundError:
                pass

    return extracted_text
# Call the function to start the upload process
# upload_and_extract_topics()

In [None]:
# End-to-end run: upload a PDF, split the extracted text into topics, then
# paraphrase and download the result.
uploaded_pdf_content = upload_and_extract_topics()
if uploaded_pdf_content is None:
    # upload_and_extract_topics() returns None when no PDF was among the
    # uploads; calling split_into_topics(None) would raise AttributeError.
    print("No PDF was uploaded; nothing to process.")
else:
    topics = split_into_topics(uploaded_pdf_content)
    process_text_and_download(topics)