In [28]:
!pip install -q langchain_groq langchain_community faiss-cpu python-pptx PyMuPDF

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [29]:
import os
import re
import io
import json
from typing import List, Dict
import numpy as np
from PIL import Image
from pydub import AudioSegment, effects
import fitz 
import torch
from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    AutoModelForSpeechSeq2Seq,
    pipeline,
)
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

from langchain_community.vectorstores import FAISS

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pydantic import BaseModel
from torch.nn.attention import SDPBackend, sdpa_kernel 

In [4]:
class WhisperTranscriptionAgent:
    def __init__(self, model_id="openai/whisper-large-v3"):
        torch.set_float32_matmul_precision("high")
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True
        ).to(self.device)

        self.model.generation_config.cache_implementation = "static"
        self.model.generation_config.max_new_tokens = 256
        self.model.forward = torch.compile(self.model.forward, mode="reduce-overhead", fullgraph=True)

        self.processor = AutoProcessor.from_pretrained(model_id)
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=self.model,
            tokenizer=self.processor.tokenizer,
            feature_extractor=self.processor.feature_extractor,
            chunk_length_s=120,
            batch_size=4,
            torch_dtype=self.torch_dtype,
            device=self.device,
        )

    def extract_audio(self, video_path: str, output_audio_path: str = "temp_audio.wav") -> str:
        audio = AudioSegment.from_file(video_path)
        audio.export(output_audio_path, format="wav")
        return output_audio_path

    def preprocess_audio(self, audio_path: str) -> np.ndarray:
        audio = AudioSegment.from_file(audio_path)
        audio = effects.normalize(audio)
        audio = audio.set_frame_rate(16000).set_channels(1)
        samples = np.array(audio.get_array_of_samples()).astype(np.float32) / (2 ** 15)
        return samples

    def transcribe(self, video_path: str, output_txt: str = "transcript.txt") -> str:
        audio_path = self.extract_audio(video_path)
        samples = self.preprocess_audio(audio_path)

        with sdpa_kernel(SDPBackend.MATH):
            result = self.pipe(samples, generate_kwargs={"language": "en"})

        with open(output_txt, "w") as f:
            f.write(result["text"])

        os.remove(audio_path)  
        return result["text"]


In [25]:
if __name__ == "__main__":
    agent = WhisperTranscriptionAgent()
    video_file = "/kaggle/input/llm-dataset/LLM_DATASET/LLM_DATASET/Lora_Qlora/19853_shylaja.sharath_31_20250318112700094_Video_ENC (1).mp4"
    transcript = agent.transcribe(video_file)
    print("Transcription Complete:\n", transcript)
    with open("transcript.txt", "w") as f:
        f.write(transcript)

Device set to use cuda:0


Transcription Complete:
  . . . . I you are going to get in return weights right. Normally all these are based on neural network all the models that we have talked about are all based on neural networks and hence we are going to get the models learnings in the form of one layer but all of the layers that are present in your transformer. If they are all transformer based layers. So several layers, several heads all of these together consists of these weights. Is this clear? That is what we mean by the weight matrix. So as an full precision for those numbers. And by now all of you know that the numbers always underlying computer are represented in the form of binary. There is nothing representation and this is used for every number. Do you understand? 176 billion is the bits into 32 bits into 32 bits. 176 billion parameters crossing 1600 GB if you are storing all of that. In memory I am talking about RAM. That is why only OpenAI, Google, Microsoft, these well known meta, all of these onl

In [10]:
from pathlib import Path
from pptx import Presentation

def extract_text_from_shape(shape):
    content = []
    if shape.has_text_frame:
        for paragraph in shape.text_frame.paragraphs:
            level = paragraph.level
            text = paragraph.text.strip()
            if text:
                content.append(("  " * level) + text)
    return content

def parse_pptx_file(pptx_path: str) -> str:
    pptx_path = Path(pptx_path)
    prs = Presentation(pptx_path)
    text_lines = []

    for idx, slide in enumerate(prs.slides):
        text_lines.append(f"[PPTX] {pptx_path.name} - Slide {idx + 1}")
        title = slide.shapes.title.text.strip() if slide.shapes.title else "No Title"
        text_lines.append(f"Title: {title}")

        for shape in slide.shapes:
            if shape == slide.shapes.title:
                continue
            if shape.has_text_frame:
                lines = extract_text_from_shape(shape)
                text_lines.extend(lines)

        if slide.has_notes_slide and slide.notes_slide.notes_text_frame:
            notes_text = slide.notes_slide.notes_text_frame.text.strip()
            if notes_text:
                text_lines.append(f"Notes: {notes_text}")

    return "\n".join(text_lines)

In [12]:
parsed_text = parse_pptx_file("/kaggle/input/llm-dataset/LLM_DATASET/LLM_DATASET/TBT/Class6_Unit3_Trees_ThreadBST.pptx")

with open("parsed_slide_text.txt", "w", encoding="utf-8") as f:
    f.write(parsed_text)

In [30]:
class PDFExtractorAgent:
    def __init__(self, output_text_dir="/kaggle/working/extracted_pdf_content/text",
                 output_image_dir="/kaggle/working/extracted_pdf_content/images"):
        self.output_text_dir = output_text_dir
        self.output_image_dir = output_image_dir
        os.makedirs(self.output_text_dir, exist_ok=True)
        os.makedirs(self.output_image_dir, exist_ok=True)

    def sanitize_filename(self, name):
        return re.sub(r'[\\/*?:"<>|]', "_", name)

    def filter_text_content(self, text):
        filtered_lines = []
        skip_section = False

        lines = text.split('\n')
        email_pattern = re.compile(r'\S+@\S+\.\S+')
        course_code_pattern = re.compile(r'[A-Z]{2}\d{2}[A-Z]{2}\d{3}[A-Z]{2}\d?')
        name_title_pattern = re.compile(r'^(Dr\.|Prof\.|Mr\.|Mrs\.|Ms\.) ')
        acknowledgment_pattern = re.compile(r'^(Ack|Acknowledgment|Acknowledgement)', re.IGNORECASE)

        for i, line in enumerate(lines):
            if not line.strip():
                continue
            if email_pattern.search(line) or course_code_pattern.search(line):
                continue
            if name_title_pattern.match(line.strip()):
                continue
            if acknowledgment_pattern.match(line.strip()):
                skip_section = True
                continue
            if skip_section:
                if line.strip().isupper() or (i < len(lines) - 1 and not lines[i + 1].strip()):
                    skip_section = False
                else:
                    continue
            if "Department of" in line or "University" in line or "Centre for" in line:
                continue
            if len(line.strip()) < 10 and not line.strip().isdigit():
                continue
            filtered_lines.append(line)

        return '\n'.join(filtered_lines)

    def extract(self, pdf_path: str) -> bool:
        """Extracts filtered text and images from a PDF and saves results to disk."""
        subfolder = os.path.basename(os.path.dirname(pdf_path))
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

        try:
            doc = fitz.open(pdf_path)
            all_text = []

            for page_num, page in enumerate(doc):
                text = page.get_text()
                if text.strip():
                    filtered = self.filter_text_content(text)
                    if filtered.strip():
                        all_text.append(filtered)

                for img_index, img_info in enumerate(page.get_images(full=True)):
                    xref = img_info[0]
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]

                    image_ext = base_image["ext"]
                    image_filename = f"{subfolder}-{pdf_name}_page{page_num+1}_image{img_index+1}.{image_ext}"
                    image_path = os.path.join(self.output_image_dir, self.sanitize_filename(image_filename))

                    with open(image_path, "wb") as image_file:
                        image_file.write(image_bytes)
                    print(f"Image saved: {image_filename}")

            if all_text:
                text_filename = f"{subfolder}-{pdf_name}.txt"
                text_path = os.path.join(self.output_text_dir, self.sanitize_filename(text_filename))
                with open(text_path, "w", encoding="utf-8") as text_file:
                    text_file.write('\n\n'.join(all_text))
                print(f"Text saved: {text_filename}")

            doc.close()
            return True

        except Exception as e:
            print(f"[ERROR] Failed to extract from {pdf_path}: {e}")
            return False

In [31]:
pdfagent = PDFExtractorAgent()
pdfagent.extract("/kaggle/input/llm-dataset/LLM_DATASET/LLM_DATASET/Lora_Qlora/Finetuning.pdf")

Image saved: Lora_Qlora-Finetuning_page1_image1.png
Image saved: Lora_Qlora-Finetuning_page2_image1.png
Image saved: Lora_Qlora-Finetuning_page2_image2.png
Image saved: Lora_Qlora-Finetuning_page3_image1.png
Image saved: Lora_Qlora-Finetuning_page4_image1.png
Image saved: Lora_Qlora-Finetuning_page4_image2.png
Image saved: Lora_Qlora-Finetuning_page5_image1.png
Image saved: Lora_Qlora-Finetuning_page5_image2.png
Image saved: Lora_Qlora-Finetuning_page6_image1.png
Image saved: Lora_Qlora-Finetuning_page6_image2.png
Image saved: Lora_Qlora-Finetuning_page7_image1.png
Image saved: Lora_Qlora-Finetuning_page7_image2.png
Image saved: Lora_Qlora-Finetuning_page8_image1.png
Image saved: Lora_Qlora-Finetuning_page8_image2.png
Image saved: Lora_Qlora-Finetuning_page9_image1.png
Image saved: Lora_Qlora-Finetuning_page10_image1.png
Image saved: Lora_Qlora-Finetuning_page11_image1.png
Image saved: Lora_Qlora-Finetuning_page12_image1.png
Image saved: Lora_Qlora-Finetuning_page13_image1.png
Image sa

True

In [36]:
class ImageDescriptionAgent:
    def __init__(self, model_name="microsoft/git-large-coco"):
        print("Loading image captioning model...")
        self.model_name = model_name
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForVision2Seq.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def describe_image(self, image_path):
        """
        Generate a description for a single image.
        """
        try:
            image = Image.open(image_path).convert('RGB')
            inputs = self.processor(images=image, return_tensors="pt").to(self.device)
            
            generated_ids = self.model.generate(
                pixel_values=inputs.pixel_values,
                max_length=100,
                num_beams=3,
                early_stopping=True
            )
            
            description = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            return description
        except Exception as e:
            print(f"[ERROR] Failed to process {image_path}: {e}")
            return "Uncaptionable image"

    def describe_images_in_folder(self, folder_path, save_to):
        all_files = [f for f in os.listdir(folder_path) ]

        results = {}
        for filename in all_files:
            path = os.path.join(folder_path, filename)
            print(f"Describing: {filename}")
            description = self.describe_image(path)
            if "pes" in description or "logo" in description or "PES" in description:
                continue
            results[filename] = description

        if save_to:
            with open(save_to, "w", encoding="utf-8") as f:
                json.dump(results, f, indent=2)
            print(f"Saved results to {save_to}")

        return results

In [37]:
image = ImageDescriptionAgent()
image.describe_images_in_folder("/kaggle/working/extracted_pdf_content/images", "desc.json")

Loading image captioning model...
Describing: Lora_Qlora-Finetuning_page17_image1.png
Describing: Lora_Qlora-Finetuning_page7_image1.png
Describing: Lora_Qlora-Finetuning_page13_image2.png
Describing: Lora_Qlora-Finetuning_page13_image1.png
Describing: Lora_Qlora-Finetuning_page16_image1.png
Describing: Lora_Qlora-Finetuning_page15_image2.png
Describing: Lora_Qlora-Finetuning_page4_image2.png
Describing: Lora_Qlora-Finetuning_page5_image2.png
Describing: Lora_Qlora-Finetuning_page22_image1.png
Describing: Lora_Qlora-Finetuning_page20_image2.png
Describing: Lora_Qlora-Finetuning_page14_image1.png
Describing: Lora_Qlora-Finetuning_page15_image1.png
Describing: Lora_Qlora-Finetuning_page6_image2.png
Describing: Lora_Qlora-Finetuning_page19_image1.png
Describing: Lora_Qlora-Finetuning_page8_image2.png
Describing: Lora_Qlora-Finetuning_page2_image1.png
Describing: Lora_Qlora-Finetuning_page2_image2.png
Describing: Lora_Qlora-Finetuning_page11_image1.png
Describing: Lora_Qlora-Finetuning_pag

{'Lora_Qlora-Finetuning_page13_image2.png': 'a schematic diagram of the algorithm.',
 'Lora_Qlora-Finetuning_page15_image2.png': 'a screenshot of a cell phone description generated with very high confidence',
 'Lora_Qlora-Finetuning_page4_image2.png': 'a screenshot of a cell phone description automatically generated',
 'Lora_Qlora-Finetuning_page5_image2.png': 'a diagram of the process.',
 'Lora_Qlora-Finetuning_page20_image2.png': 'a diagram of the system.',
 'Lora_Qlora-Finetuning_page6_image2.png': 'a screenshot of a cell phone description automatically generated',
 'Lora_Qlora-Finetuning_page8_image2.png': 'a screenshot of a cell phone description automatically generated',
 'Lora_Qlora-Finetuning_page2_image2.png': 'a diagram of a document with a piece of paper connected to it',
 'Lora_Qlora-Finetuning_page7_image2.png': 'a diagram showing how to make a braille forget.',
 'Lora_Qlora-Finetuning_page14_image2.png': 'a diagram of a neural network.'}

In [64]:
class AgentConfig(BaseModel):
    groq_api_key: str
    model_name: str = "mistral-saba-24b"
    transcript_path: str = "transcript.txt"
    slide_path: str = "/kaggle/working/extracted_pdf_content/text/Lora_Qlora-Finetuning.txt"
    image_descriptions_path: str = "/kaggle/working/desc.json"
    output_path: str = "lecture_notes.md"
    chunk_size: int = 1000
    chunk_overlap: int = 200
    temperature: float = 0.3
    max_tokens: int = 4000

In [65]:
class LectureNotesAgent:
    def __init__(self, config: AgentConfig):
        self.config = config
        self.llm = ChatGroq(
            api_key=config.groq_api_key,
            model_name=config.model_name,
            temperature=config.temperature,
            max_tokens=config.max_tokens
        )
        self.embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        self.few_shot_prompt = self._few_shot_examples()
        self.hierarchical_prompt = self._get_hierarchical_prompt()
        self.summary_prompt = ChatPromptTemplate.from_template(self._summary_template())
        self.connection_prompt = ChatPromptTemplate.from_template(self._connection_template())

    def load_data(self) -> Dict[str, str]:
        with open(self.config.transcript_path, 'r') as f:
            transcript = f.read()
        with open(self.config.slide_path, 'r') as f:
            slides = f.read()
    
        # Load image descriptions
        with open(self.config.image_descriptions_path, 'r') as f:
            images = json.load(f)
        image_descriptions = "\n".join(
            f"[Image: {img} - {images[img]}]" for img in images
        )
    
        return {
            "transcript": transcript,
            "slides": slides,
            "image_descriptions": image_descriptions
        }

    def _create_retriever(self, texts: List[str]) -> MultiQueryRetriever:
        docs = [Document(page_content=text) for text in texts]
        vectorstore = FAISS.from_documents(docs, self.embedder)
        return MultiQueryRetriever.from_llm(retriever=vectorstore.as_retriever(), llm=self.llm)

    def _few_shot_examples(self):
        examples = [
            {
                "input": "Transcript mentions 'neural networks' and slides show 'CNN architecture'",
                "output": "1. Neural Networks\n   1.1 Convolutional Neural Networks (CNNs)\n      - Specialized for processing grid-like data (images)\n      - Architecture includes convolutional layers, pooling layers"
            },
            {
                "input": "Transcript discusses 'backpropagation' and slides mention 'gradient descent'",
                "output": "2. Training Neural Networks\n   2.1 Backpropagation\n      - Algorithm for calculating gradients\n      - Used in conjunction with gradient descent\n   2.2 Gradient Descent\n      - Optimization algorithm\n      - Uses gradients from backpropagation"
            }
        ]
        return FewShotChatMessagePromptTemplate(
            example_prompt=ChatPromptTemplate.from_messages([
                ("human", "{input}"),
                ("ai", "{output}")
            ]),
            examples=examples
        )

    def _get_hierarchical_prompt(self):
        template = """
        You are an expert at creating well-structured educational notes from lecture materials.
        Combine the transcript, slide content, and relevant image descriptions to create comprehensive notes with the following structure:
        
        1. Main Topic
           1.1 Key Concept
              - Definition
              - Examples
              - Related ideas from other sections
           1.2 Key Concept
           1.3 key concept
            ....
              ...
        2. Next Main Topic
        3. Examples present in the transcript and slides
           3.1 Example 1
               - Explain step by step how the result was achieved
               - If relevant, include a reference to an image and describe what it shows
           3.2 Example 2
               - Explain step by step how the result was achieved
               - If relevant, include a reference to an image and describe what it shows
        
        image descripptions are provided with image name and description. based on the key concept header, include the image name in the notes as well
        [Image: image_name - description]
        
        Transcript content:
        {transcript}
        
        Slide content:
        {slides}
        
        Relevant images:
        {image_descriptions}
        
        Generate the structured notes:
        """

        return ChatPromptTemplate.from_messages([
            ("system", "You are an expert at creating well-structured educational notes."),
            self.few_shot_prompt,
            ("human", template)
        ])

    def _summary_template(self):
        return """Write a concise summary of the following lecture section, preserving all critical information:
        {content}
        Concise summary:"""

    def _connection_template(self):
        return """Identify connections between these concepts from different parts of the lecture:
        
        Concept 1: {concept1}
        Concept 2: {concept2}
        
        Explain how these concepts relate to each other in the context of this lecture:"""

    def generate_notes(self, data: Dict[str, str]) -> str:
        transcript_chunks = self.splitter.split_text(data["transcript"])
        slide_chunks = self.splitter.split_text(data["slides"])
    
        transcript_retriever = self._create_retriever(transcript_chunks)
        slide_retriever = self._create_retriever(slide_chunks)
    
        # Set up the retriever chain
        base_chain = RunnableParallel({
            "transcript": transcript_retriever,
            "slides": slide_retriever,
        })
    
        # Manually merge in static data like image_descriptions
        full_chain = (
            base_chain
            | (lambda inputs: {
                **inputs,
                "image_descriptions": data["image_descriptions"]
            })
            | self.hierarchical_prompt
            | self.llm
            | StrOutputParser()
        )
    
        notes = full_chain.invoke({
            "transcript": data["transcript"],
            "slides": data["slides"],
        })
    
        # Append summary
        summary_chain = self.summary_prompt | self.llm | StrOutputParser()
        summary = summary_chain.invoke({"content": notes})
        notes += f"\n\n## Overall Summary\n{summary}"
    
        return notes


    def run(self):
        data = self.load_data()
        notes = self.generate_notes(data)
        with open(self.config.output_path, "w") as f:
            f.write(notes)
        print(f"Notes saved to {self.config.output_path}")
        return notes

In [71]:
if __name__ == "__main__":
    cfg = AgentConfig(
        groq_api_key="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",
        transcript_path="transcript.txt",
        slide_path="parsed_slide_text.txt",
        image_descriptions_path="/kaggle/working/desc.json",
        output_path="lecture_notes.md"
    )

    agent = LectureNotesAgent(cfg)
    agent.run()

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `mistral-saba-24b` in organization `org_01jsehbtgjft68bw7vpeg2aw34` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Used 7850, Requested 4504. Please try again in 1m3.544s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}