In [1]:
!pip install PyPDF2 python-docx google-generativeai # PyPDF2 for PDFs, python-docx for .docx

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx, PyPDF2
Successfully installed PyPDF2-3.0.1 python-docx-1.1.2


In [2]:
import base64
import os
from io import BytesIO

# Libraries for document parsing
import docx # For .docx Word files
import PyPDF2 # For PDF files
from google import genai
from google.colab import userdata
from google.genai import errors as genai_errors # Gemini API exception classes
from google.genai import types
from PIL import Image

In [3]:
# Read the Gemini API key from Colab's `userdata` store instead of hardcoding
# it in the notebook.
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")

In [4]:
# Create the Gemini API client, authenticated with GOOGLE_API_KEY. The
# summarization cell calls it as `client.models.generate_content(...)`.
client = genai.Client(api_key=GOOGLE_API_KEY)

In [5]:
# --- Helper Functions for Text Extraction ---

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order (a page whose
        extraction yields nothing contributes an empty string), or None if
        the file could not be opened or parsed.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` guards against pages whose extraction returns an empty/None value.
            return "".join(page.extract_text() or "" for page in reader.pages)
    # PdfReadError is defined in PyPDF2.errors. The previous lookup,
    # PyPDF2.utils.PdfReadError, raised AttributeError on PyPDF2 3.x while the
    # except clause was being evaluated, so any read failure crashed the caller
    # instead of returning None.
    except PyPDF2.errors.PdfReadError:
        print(f"Error: Could not read PDF file '{pdf_path}'. It might be corrupted or encrypted.")
        return None
    except Exception as e:
        print(f"An error occurred while reading PDF '{pdf_path}': {e}")
        return None

def extract_text_from_docx(docx_path):
    """Return the text of a .docx file, one line per paragraph.

    Each paragraph's text is followed by a newline. If the document cannot
    be opened or read, the error is printed and None is returned.
    """
    try:
        word_doc = docx.Document(docx_path)
        paragraph_lines = [para.text + "\n" for para in word_doc.paragraphs]
    except Exception as e:
        print(f"An error occurred while reading DOCX '{docx_path}': {e}")
        return None
    return "".join(paragraph_lines)

def extract_text_from_plain_text(txt_path):
    """Read a text file as UTF-8, retrying as latin-1 if decoding fails.

    Returns the file contents. Any other error during the UTF-8 attempt is
    printed and results in None; errors during the latin-1 retry are not
    caught here.
    """
    try:
        with open(txt_path, 'r', encoding='utf-8') as utf8_file:
            return utf8_file.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall through to the latin-1 retry below.
        pass
    except Exception as e:
        print(f"An error occurred while reading text file '{txt_path}': {e}")
        return None

    print(f"Error: Could not decode text file '{txt_path}' with UTF-8. Trying with 'latin-1'.")
    with open(txt_path, 'r', encoding='latin-1') as latin1_file:
        return latin1_file.read()

def get_document_text(file_path):
    """
    Pick an extractor from the file's extension and return the document text.

    Handles .pdf, .docx and the plain-text extensions .txt, .md, .csv, .html
    (matched case-insensitively). Returns None when the path does not exist,
    the extension is unsupported, or the chosen extractor returns None.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at '{file_path}'")
        return None

    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        print(f"Extracting text from PDF: {file_path}")
        return extract_text_from_pdf(file_path)

    if suffix == '.docx':
        print(f"Extracting text from DOCX: {file_path}")
        return extract_text_from_docx(file_path)

    # Text-based formats are all read the same way; extend this tuple to add more.
    if suffix in ('.txt', '.md', '.csv', '.html'):
        print(f"Extracting text from plain text file: {file_path}")
        return extract_text_from_plain_text(file_path)

    print(f"Error: Unsupported file type: {suffix}")
    print("Supported types: .pdf, .docx, .txt, .md, .csv, .html")
    return None


In [6]:
# --- Main Summarization Logic ---
if __name__ == "__main__":
    # Prompt for a document, extract its text, ask Gemini for a summary, then
    # print the summary and save it to document_summary.txt.

    # --- User Input for File Path ---
    # Strip surrounding whitespace so a pasted path with a stray trailing
    # space or newline still resolves.
    document_path = input("Enter the path to your document (e.g., /path/to/my_doc.pdf, my_article.docx): ").strip()

    document_content = get_document_text(document_path)

    if document_content:
        # --- IMPORTANT: Handle Token Limits for Large Documents ---
        # Gemini models have token limits (e.g., 128k or 1M tokens for 1.5 models).
        # If your document is very long, it might exceed these limits.
        # You'll need a strategy for very long documents:
        # 1. Truncate: Only send the first N characters/words. (Simplest, but loses info)
        # 2. Chunking & Summarize-then-Summarize: Break into chunks, summarize each,
        #    then summarize the summaries. (More complex, better for very large files)

        # Basic truncation for demonstration purposes
        MAX_CHARS = 100000 # Adjust based on model's token limit and typical token/char ratio
        if len(document_content) > MAX_CHARS:
            print(f"Warning: Document content is very large ({len(document_content)} chars).")
            print(f"Truncating to first {MAX_CHARS} characters to fit model limits.")
            document_content = document_content[:MAX_CHARS]

        # --- Summarization Prompt ---
        summarization_prompt = (
            "Please summarize the following document concisely and clearly. "
            "Focus on the main points and provide a summary that is easy to understand. "
            "Aim for about 10 sentences, or a short paragraph.\n\n"
            f"Document:\n{document_content}"
        )

        # --- Generate Content with the Summarization Prompt ---
        try:
            print("\nSending document for summarization to Gemini...")
            response = client.models.generate_content(
                model="gemini-1.5-flash-001", # Recommended for good balance of speed/cost/capability
                contents=summarization_prompt,
                config=types.GenerateContentConfig(
                    response_modalities=['TEXT']
                )
            )

            # --- Extract, Print, and Save the Summary ---
            # Only the first candidate is used; parts without text are skipped.
            candidates = response.candidates
            parts = []
            if candidates and candidates[0].content and candidates[0].content.parts:
                parts = candidates[0].content.parts
            summary_text = "\n".join(
                part.text for part in parts if part.text is not None
            ).strip()

            # Checking the assembled text (not just the presence of parts)
            # avoids printing and saving an empty summary when no part
            # carried any text.
            if summary_text:
                print("\n--- Generated Summary ---")
                print(summary_text)

                output_filename = "document_summary.txt"
                with open(output_filename, "w", encoding="utf-8") as f: # Use 'w' to overwrite, 'a' to append
                    f.write(summary_text)
                print(f"\nSummary successfully saved to {output_filename}")
            else:
                print("Gemini did not return any text content for summarization.")

        # ClientError is defined in google.genai.errors, not on the top-level
        # `genai` package; `genai.ClientError` raised AttributeError as soon
        # as the API call failed, hiding the real error.
        except genai_errors.ClientError as e:
            print(f"\nGemini API Client Error: {e}")
            print("Please check your API key, model name, and ensure the content fits within model limits.")
        except Exception as e:
            print(f"\nAn unexpected error occurred during summarization: {e}")
    else:
        print("\nCould not extract text from the document. Please check the file path and type.")

Enter the path to your document (e.g., /path/to/my_doc.pdf, my_article.docx): /content/Isaac Lasso Younes_Cybersecurity_In Progress.pdf
Extracting text from PDF: /content/Isaac Lasso Younes_Cybersecurity_In Progress.pdf

Sending document for summarization to Gemini...

--- Generated Summary ---
Isaac Lasso Younes is a highly motivated and accomplished cybersecurity student at Pace University, set to graduate in May 2028 with a Bachelor of Science degree and a perfect 4.0 GPA.  His academic record reflects a strong foundation in computer science, including coursework in mathematical structures, object-oriented programming, and design thinking.  Isaac possesses technical skills in Python, Java, and various software tools, as well as certifications in Microsoft Office Suite and customer service. He is proficient in both English and Spanish.

Isaac's experience extends beyond the classroom.  He has gained practical experience as a student assistant, providing administrative support and tec