In [None]:
!pip install python-docx PyPDF2 python-pptx transformers torch

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading xlsxwriter-3.2.5-py3-none-any.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting n

In [None]:
import os
import docx
import PyPDF2
from pptx import Presentation
from transformers import pipeline
from datetime import datetime
from google.colab import files

class LectureHighlighter:
    """Extract the text of a lecture file (DOCX, PDF, PPTX) and summarize it.

    The text is split into chunks, each chunk is passed to a Hugging Face
    summarization pipeline, and the per-chunk summaries are joined and written
    to a text file.
    """

    # Maximum number of CHARACTERS per chunk handed to the summarizer. This is
    # a character budget, not a token count; it is kept small so a chunk stays
    # under the model's input limit (noted as ~1024 tokens for BART).
    CHUNK_CHARS = 1000

    # Lower bound for a chunk's max_length so that very short chunks still get
    # room for a few output tokens. Heuristic value.
    MIN_SUMMARY_TOKENS = 16

    def __init__(self, model_name="facebook/bart-large-cnn"):
        """Initialize the summarization pipeline.

        Args:
            model_name: Model identifier passed to ``pipeline``. Defaults to
                the model the notebook originally hard-coded.
        """
        self.summarizer = pipeline("summarization", model=model_name)

    def extract_text_from_docx(self, file_path):
        """Return the non-blank paragraphs of a DOCX file joined by newlines.

        Raises:
            ValueError: If the document cannot be opened or read.
        """
        try:
            doc = docx.Document(file_path)
            return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
        except Exception as e:
            raise ValueError(f"Error extracting text from DOCX: {str(e)}") from e

    def extract_text_from_pdf(self, file_path):
        """Return the text of every page of a PDF, one page per line group.

        Pages for which ``extract_text()`` returns nothing are skipped.

        Raises:
            ValueError: If the file cannot be opened or parsed.
        """
        try:
            with open(file_path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                page_texts = (page.extract_text() for page in reader.pages)
                # Consumed inside the ``with`` block, while the file is open.
                return "\n".join(t for t in page_texts if t).strip()
        except Exception as e:
            raise ValueError(f"Error extracting text from PDF: {str(e)}") from e

    def extract_text_from_ppt(self, file_path):
        """Return the text of every shape exposing ``.text`` on every slide.

        Raises:
            ValueError: If the presentation cannot be opened or read.
        """
        try:
            prs = Presentation(file_path)
            shape_texts = [
                shape.text
                for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text") and shape.text.strip()
            ]
            return "\n".join(shape_texts).strip()
        except Exception as e:
            raise ValueError(f"Error extracting text from PPT: {str(e)}") from e

    def extract_text(self, file_path):
        """Pick an extractor from the file extension and return the file's text.

        Raises:
            ValueError: For unsupported extensions, or when extraction fails.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext == ".docx":
            return self.extract_text_from_docx(file_path)
        if ext == ".pdf":
            return self.extract_text_from_pdf(file_path)
        if ext == ".pptx":
            return self.extract_text_from_ppt(file_path)
        if ext == ".ppt":
            # Still attempted, in case the file is really a .pptx with the old
            # extension. A genuine legacy binary .ppt cannot be read by
            # python-pptx, so turn the low-level failure into an actionable one.
            try:
                return self.extract_text_from_ppt(file_path)
            except ValueError as e:
                raise ValueError(
                    "Legacy .ppt files are not supported; save the presentation "
                    f"as .pptx and try again. ({e})"
                ) from e
        raise ValueError(f"Unsupported file type: {ext}")

    @staticmethod
    def _split_into_chunks(text, chunk_size):
        """Pack whitespace-separated words into chunks of at most ``chunk_size`` chars.

        Words are never split unless a single word is itself longer than
        ``chunk_size``. Runs of whitespace (including newlines) collapse to a
        single space. Empty or whitespace-only text yields an empty list.
        """
        chunks = []
        current = []
        current_len = 0
        for word in text.split():
            # Hard-split a pathological "word" that can never fit in one chunk.
            while len(word) > chunk_size:
                if current:
                    chunks.append(" ".join(current))
                    current, current_len = [], 0
                chunks.append(word[:chunk_size])
                word = word[chunk_size:]
            added = len(word) + (1 if current else 0)  # +1 for the joining space
            if current and current_len + added > chunk_size:
                chunks.append(" ".join(current))
                current, current_len = [], 0
                added = len(word)
            current.append(word)
            current_len += added
        if current:
            chunks.append(" ".join(current))
        return chunks

    def summarize_text(self, text, max_length=1000, min_length=200):
        """Summarize ``text`` chunk by chunk and join the partial summaries.

        Args:
            text: Text to summarize.
            max_length: Overall budget; each chunk gets
                ``max_length // n_chunks + 50``, capped by the chunk's size.
            min_length: Overall floor; each chunk gets
                ``min_length // n_chunks + 30``, capped at half of that
                chunk's max_length.

        Returns:
            The chunk summaries joined by single spaces, or ``""`` when
            ``text`` contains no words.

        Raises:
            ValueError: If chunking or the summarizer fails.
        """
        try:
            chunks = self._split_into_chunks(text, self.CHUNK_CHARS)
            if not chunks:
                return ""

            base_max = max_length // len(chunks) + 50
            base_min = min_length // len(chunks) + 30

            summaries = []
            for chunk in chunks:
                # Word count is a cheap lower bound on the chunk's token count.
                # Capping by it stops a short chunk (typically the last one)
                # from being forced to produce a summary longer than itself.
                word_count = len(chunk.split())
                chunk_max = min(base_max, max(word_count, self.MIN_SUMMARY_TOKENS))
                chunk_min = min(base_min, chunk_max // 2)  # always <= chunk_max
                result = self.summarizer(
                    chunk,
                    max_length=chunk_max,
                    min_length=chunk_min,
                    do_sample=False,
                    truncation=True,  # guard against token-dense chunks
                )
                summaries.append(result[0]["summary_text"])

            return " ".join(summaries).strip()
        except Exception as e:
            raise ValueError(f"Error summarizing text: {str(e)}") from e

    def process_file(self, input_file, output_dir="summaries"):
        """Extract, summarize, and save the summary of ``input_file``.

        The summary is written to
        ``<output_dir>/<input base name>_summary_<YYYYMMDD_HHMMSS>.txt``.

        Returns:
            ``"Summary saved to <path>"`` on success, otherwise
            ``"Error processing file: <reason>"``. Failures are reported via
            the returned string and are not raised.
        """
        try:
            os.makedirs(output_dir, exist_ok=True)

            text = self.extract_text(input_file)
            if not text:
                raise ValueError("No text extracted from the file.")

            summary = self.summarize_text(text)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            base_name = os.path.splitext(os.path.basename(input_file))[0]
            output_file = os.path.join(output_dir, f"{base_name}_summary_{timestamp}.txt")

            with open(output_file, "w", encoding="utf-8") as f:
                f.write(f"Summary of {base_name}\n")
                f.write("=" * 50 + "\n\n")
                f.write(summary)

            return f"Summary saved to {output_file}"

        except Exception as e:
            return f"Error processing file: {str(e)}"

def main():
    """Run the lecture highlighter in Colab: upload, summarize, download.

    Prompts for a file upload, summarizes the first uploaded file with
    ``LectureHighlighter.process_file``, prints the result message, and
    downloads the summary file when one was written. Returns ``None``.
    """
    highlighter = LectureHighlighter()

    # Upload the lecture file
    print("Upload your lecture file (DOCX, PDF, or PPT):")
    uploaded = files.upload()
    if not uploaded:
        # The upload result is empty when the dialog is cancelled; indexing it
        # would raise IndexError.
        print("No file uploaded.")
        return
    input_file = next(iter(uploaded))  # only the first uploaded file is processed

    # Process the file
    result = highlighter.process_file(input_file)
    print(result)

    # Download the summary file. process_file signals success with this exact
    # prefix; stripping the prefix (rather than splitting on "saved to ")
    # keeps the path intact even when the file name contains that phrase.
    success_prefix = "Summary saved to "
    summary_file = result[len(success_prefix):] if result.startswith(success_prefix) else None
    if summary_file and os.path.exists(summary_file):
        files.download(summary_file)
    else:
        print("No summary file generated due to an error.")

# Entry point: start the upload -> summarize -> download flow when this code is
# executed directly (as it is when the notebook cell runs).
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


Upload your lecture file (DOCX, PDF, or PPT):


Saving Risk Analysis & managemanet.pptx to Risk Analysis & managemanet.pptx


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Summary saved to summaries/Risk Analysis & managemanet_summary_20250621_181126.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>