In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-hackathon/__results__.html
/kaggle/input/llm-hackathon/__notebook__.ipynb
/kaggle/input/llm-hackathon/__output__.json
/kaggle/input/llm-hackathon/custom.css
/kaggle/input/llm-hackathon/output/stable_diffusion/presentation_data/Stable Diffusion_content.json
/kaggle/input/llm-hackathon/output/stable_diffusion/final_notes/19853_shylaja.sharath_31_20250327084200249_Video_ENC_notes.md
/kaggle/input/llm-hackathon/output/stable_diffusion/final_notes/19853_shylaja.sharath_31_20250327092700214_Video_ENC (1)_notes.md
/kaggle/input/llm-hackathon/output/stable_diffusion/transcripts/19853_shylaja.sharath_31_20250327084200249_Video_ENC.txt
/kaggle/input/llm-hackathon/output/stable_diffusion/transcripts/19853_shylaja.sharath_31_20250327092700214_Video_ENC (1).txt
/kaggle/input/llm-hackathon/output/stable_diffusion/audio/19853_shylaja.sharath_31_20250327084200249_Video_ENC.wav
/kaggle/input/llm-hackathon/output/stable_diffusion/audio/19853_shylaja.sharath_31_20250327092700214_Video_E

In [6]:
# Step 7.1 (Gemini): Install google-generativeai Library and Initialize Client

import os
import subprocess
import json
from pathlib import Path
import time
import gc

# 1. Install Google Generative AI Python library
print("Installing google-generativeai library...")
# Use --upgrade to ensure we get a recent version
install_gemini_lib = subprocess.run(['pip', 'install', '-q', '--upgrade', 'google-generativeai'], capture_output=True, text=True)

if install_gemini_lib.returncode == 0:
    print("google-generativeai library installed/upgraded successfully.")
else:
    print("Error installing google-generativeai library:")
    print(install_gemini_lib.stderr)
    raise RuntimeError("Failed to install google-generativeai library.")

# Import AFTER installation
try:
    import google.generativeai as genai
    from google.generativeai.types import HarmCategory, HarmBlockThreshold # For safety settings
    from kaggle_secrets import UserSecretsClient
except ImportError as e:
    print(f"Failed to import libraries: {e}. Please ensure installation was successful.")
    raise

# 2. Access API Key from Kaggle Secrets and Configure SDK
google_api_key = None
try:
    user_secrets = UserSecretsClient()
    google_api_key = user_secrets.get_secret("GOOGLE_API_KEY")

    if not google_api_key:
        print("ERROR: Google API Key not found in Kaggle Secrets.")
        print("Please ensure you added a secret with the label 'GOOGLE_API_KEY'.")
    else:
        print("Google API Key retrieved successfully.")
        genai.configure(api_key=google_api_key)
        print("Gemini SDK configured.")

except Exception as e:
    print(f"Error accessing Kaggle Secrets or configuring Gemini SDK: {e}")
    print("Ensure Kaggle Secrets are properly configured and the key label is 'GOOGLE_API_KEY'.")

# 3. Define LLM Model to use - GEMINI 1.5 PRO LATEST
# *********************************************************************
# Switching to Gemini 1.5 Pro for its large context window
# Model ID might change, check Google AI documentation if this exact ID fails
llm_model_name = "gemini-1.5-flash-latest"
# *********************************************************************
print(f"Using LLM model: {llm_model_name}")

# Initialize the GenerativeModel instance
model = None
if google_api_key: # Only proceed if key was retrieved
    try:
        # You can adjust safety settings and generation config here if needed
        model = genai.GenerativeModel(
            llm_model_name,
            # Example: Setting safety thresholds lower (use with caution)
            # safety_settings={
            #     HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            #     HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            #     HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            #     HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            # }
            # generation_config=genai.types.GenerationConfig(temperature=0.7)
            )
        print(f"GenerativeModel instance created for {llm_model_name}.")
    except Exception as e:
        print(f"Error creating GenerativeModel instance: {e}")


# Define output_base_path (as it might have been cleared if kernel restarted)
output_base_path = Path("/kaggle/working/output")
output_base_path.mkdir(parents=True, exist_ok=True)
print(f"Output base path set to: {output_base_path}")

# --- Status Check ---
if model:
    print(f"\nSetup complete. Ready for note generation with {llm_model_name}.")
    # Because Gemini 1.5 Pro has a large context window (1M+ tokens),
    # we will likely NOT need transcript chunking or presentation summarization.
    print("Chunking/summarization steps will be removed in the next cells.")
else:
    print("\nSetup failed (API key or model initialization issue). Cannot proceed.")

Installing google-generativeai library...
google-generativeai library installed/upgraded successfully.
Google API Key retrieved successfully.
Gemini SDK configured.
Using LLM model: gemini-1.5-flash-latest
GenerativeModel instance created for gemini-1.5-flash-latest.
Output base path set to: /kaggle/working/output

Setup complete. Ready for note generation with gemini-1.5-flash-latest.
Chunking/summarization steps will be removed in the next cells.


In [7]:
# Step 7.2 - Cell 1 (Enhanced for Gemini): Helper Functions & Prompt

import json
from pathlib import Path
import time
import gc
# Requires google.generativeai as genai (imported in 7.1)
# Assumes 'model' (the genai.GenerativeModel instance), 'llm_model_name',
# 'output_base_path' are defined from Step 7.1 (Gemini version).

# --- Helper Function to Format Presentation JSON ---
def format_presentation_data(data):
    """Converts presentation JSON data into a readable string for the LLM."""
    if not data or ("error" in data):
        return "No presentation data available or there was an error processing it.\n"

    output = f"--- Presentation Content ({data.get('file_type', 'N/A')}: {data.get('source_file', 'N/A')}) ---\n\n"
    if data.get('file_type') == 'pptx':
        slides = data.get('slides')
        if not slides: return output + "No slides found or extracted.\n"
        for slide in slides:
            slide_num = slide.get('slide_number', '?')
            output += f"## Slide {slide_num}\n"
            if slide.get('title'): output += f"### Title: {slide['title']}\n"
            if slide.get('content'):
                output += "Content:\n"
                if isinstance(slide['content'], list):
                    for line in slide['content']: output += f"{line}\n"
                else: output += f"{str(slide['content'])}\n"
            if slide.get('notes'): output += f"Presenter Notes: {slide['notes']}\n"
            output += "\n"
    elif data.get('file_type') == 'pdf':
        pages = data.get('pages')
        if not pages: return output + "No pages found or extracted.\n"
        for page in pages:
            page_num = page.get('page_number', '?')
            output += f"## Page {page_num}\n"
            if page.get('text'): output += f"Text:\n{page['text']}\n"
            output += "\n"
    else: output += f"Unknown presentation format: {data.get('file_type', 'N/A')}.\n"
    output += "--- End of Presentation Content ---\n"
    return output

# --- Helper Function to Chunk Text ---
def chunk_text(text, max_tokens_per_chunk=8000, overlap_tokens=200):
    """Split text into overlapping chunks for processing."""
    # Simple word-based chunking (approximate)
    words = text.split()
    words_per_token = 0.75  # Approximate ratio
    max_words_per_chunk = int(max_tokens_per_chunk * words_per_token)
    overlap_words = int(overlap_tokens * words_per_token)
    
    chunks = []
    start = 0
    
    while start < len(words):
        end = min(start + max_words_per_chunk, len(words))
        chunk = ' '.join(words[start:end])
        chunks.append(chunk)
        
        if end >= len(words):
            break
            
        start = end - overlap_words
    
    return chunks

# --- Enhanced LLM Prompt Template ---
prompt_template = """
You are an expert AI assistant tasked with creating comprehensive, well-structured lecture notes.
You will be given the **full transcript** of a spoken lecture and the **full content** of the accompanying presentation slides (or PDF pages).
Your goal is to synthesize information from BOTH sources to generate high-quality academic notes optimized for student learning and review.

**Instructions:**

1. **Analyze Both Inputs:** Carefully read the provided Lecture Transcript and Presentation Content.

2. **Synthesize Information:** Combine relevant information from the transcript and presentation. Integrate ideas rather than copying. Use the presentation structure as a foundation, but enrich with details, explanations, and examples from the transcript.

3. **Structure Requirements:**
   * **Title:** Start with `# Lecture Notes: [Topic/Title from content]`
   * **Main Sections:** Use `##` for primary topics and major concepts
   * **Subsections:** Use `###` for subtopics, specific methods, procedures, or detailed concepts
   * **Content Organization:** Use bullet points (`*`) for key details, definitions, examples, and supporting information
   * **Emphasis:** Use `**bold text**` for important keywords, concepts, definitions, formulas, and key terms
   * **Code/Algorithms:** Use code blocks when showing algorithms, formulas, or structured procedures
   * **Examples:** Create dedicated example sections when demonstrating concepts

4. **Content Organization Principles:**
   * **Lead with Definitions:** Start major concepts with clear, complete definitions
   * **Logical Progression:** Organize content from fundamental concepts to applications
   * **Method Separation:** Create distinct sections for different approaches, methods, or procedures
   * **Property Grouping:** List important characteristics and properties as organized bullet points
   * **Step-by-Step Clarity:** Break down complex processes into clear, numbered or bulleted steps
   * **Integration:** Weave transcript elaborations naturally into slide-based structure

5. **Content Guidelines:**
   * **Completeness:** Include all significant concepts mentioned in either source
   * **Definitions:** Provide precise, academic definitions for key terms
   * **Examples:** Include concrete examples with step-by-step demonstrations when provided
   * **Analysis:** Add complexity analysis, efficiency discussions, or comparative analysis when mentioned
   * **Context:** Include background information and applications discussed in the transcript
   * **Clarifications:** Use transcript content to clarify or expand on slide points

6. **Formatting Standards:**
   * Use consistent indentation for nested bullet points
   * Maintain parallel structure in lists and sections
   * Include mathematical notation, formulas, or technical syntax using appropriate markdown
   * Separate major sections with clear spacing
   * Use descriptive, informative section headers
   * Create logical flow between related concepts

7. **Quality Requirements:**
   * **Academic Tone:** Maintain formal, precise academic language
   * **Comprehensiveness:** Include sufficient detail for thorough understanding
   * **Clarity:** Make complex concepts accessible to students
   * **Structure:** Ensure logical organization that supports learning
   * **Integration:** Seamlessly combine slide and transcript information
   * **Exam-Ready:** Provide the level of detail needed for assessment preparation

8. **Adaptive Structure:** Let the content determine the specific sections, but commonly include:
   * Fundamental definitions and concepts
   * Key properties or characteristics  
   * Different methods, approaches, or types
   * Detailed examples and applications
   * Comparative analysis or efficiency discussions
   * Summary of key takeaways

**Input Data:**

{presentation_content}

**Full Lecture Transcript:**

```text
{transcript_text}
```

Generate comprehensive lecture notes in Markdown format following the above structure and requirements. Adapt the section organization to best fit the content while maintaining academic rigor and clarity. Start directly with the lecture title - no introductory text.
"""

# --- Enhanced Gemini API Function ---
def generate_notes_with_gemini(prompt_content, max_retries=2, initial_delay=5):
    """Calls the Gemini API using the initialized model, handles retries."""
    global model # Uses the genai.GenerativeModel instance from Step 7.1
    if not model:
        print("  Error: Gemini model not initialized.")
        return "## Error: Gemini model not initialized."

    delay = initial_delay
    for attempt in range(max_retries + 1):
        try:
            # Make the API call with enhanced configuration
            response = model.generate_content(
                prompt_content,
                generation_config={
                    "temperature": 0.3,  # Lower temperature for more structured output
                    "top_p": 0.9,
                    "top_k": 40,
                    "max_output_tokens": 8192,  # Increased for comprehensive notes
                }
            )

            # Process the response
            if not response.candidates:
                block_reason = "Unknown"
                try:
                    block_reason = response.prompt_feedback.block_reason
                except Exception:
                    print("  Warning: Could not retrieve block_reason from prompt_feedback.")

                print(f"  Warning: Call blocked by safety settings (Reason: {block_reason}). Attempt {attempt + 1}/{max_retries + 1}.")
                if attempt < max_retries:
                    print(f"   Retrying in {delay} seconds...")
                    time.sleep(delay)
                    delay *= 2
                    continue
                else:
                    return f"## Error: Content generation blocked by safety settings (Reason: {block_reason})."

            # Check if response text is usable
            response_text = ""
            try:
                response_text = response.text
            except ValueError:
                print(f"  Warning: ValueError accessing response.text. Prompt feedback: {response.prompt_feedback}")
                response_text = ""

            if response_text and isinstance(response_text, str) and response_text.strip():
                return response_text # Success
            else:
                print(f"  Warning: Received empty text content from Gemini (Attempt {attempt + 1}/{max_retries + 1}).")
                try:
                    print(f"     Prompt Feedback: {response.prompt_feedback}")
                except Exception:
                    print("     Could not retrieve prompt feedback.")

                if attempt < max_retries:
                    print(f"   Retrying in {delay} seconds...")
                    time.sleep(delay)
                    delay *= 2
                    continue
                else:
                    return "## Error: Received empty response from Gemini."

        except Exception as e:
            print(f"  Error during Gemini API call (Attempt {attempt + 1}/{max_retries + 1}): {type(e).__name__} - {e}")
            if attempt < max_retries:
                print(f"   Retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= 2
                continue
            else:
                print(f"  Failed after {max_retries} retries.")
                return f"## Error: API call failed after retries ({type(e).__name__})."

    return "## Error: Failed to generate notes after multiple retries."

# --- Chunked Processing Function ---
def generate_notes_with_chunking(transcript_text, formatted_presentation, lecture_title):
    """Generate notes using chunking approach for very long content."""
    print("    Using chunked processing for long content...")
    
    # Generate outline first
    outline_prompt = f"""
Based on this lecture content, create an outline of 4-8 main sections for comprehensive lecture notes.

**LECTURE:** {lecture_title}

**SLIDES:**
{formatted_presentation[:3000]}

**TRANSCRIPT EXCERPT:**
{transcript_text[:8000]}

Provide ONLY a numbered list of section titles that cover the main topics discussed. Each title should be clear and descriptive.
Format: 
1. Section Title One
2. Section Title Two
etc.
"""
    
    outline_response = generate_notes_with_gemini(outline_prompt)
    if outline_response.startswith("## Error:"):
        return outline_response
    
    # Parse section titles
    section_titles = []
    for line in outline_response.split('\n'):
        line = line.strip()
        if line and any(char.isdigit() for char in line[:3]):
            # Remove numbering and extract title
            title = line.split('. ', 1)[-1] if '. ' in line else line
            section_titles.append(title)
    
    if not section_titles:
        return "## Error: Could not generate valid section outline."
    
    # Chunk the transcript
    transcript_chunks = chunk_text(transcript_text, max_tokens_per_chunk=8000, overlap_tokens=200)
    print(f"      Transcript split into {len(transcript_chunks)} chunks for {len(section_titles)} sections.")
    
    # Generate content for each section
    all_sections = []
    chunk_per_section = max(1, len(transcript_chunks) // len(section_titles))
    
    for i, section_title in enumerate(section_titles):
        start_chunk = min(i * chunk_per_section, len(transcript_chunks) - 1)
        end_chunk = min(start_chunk + chunk_per_section + 1, len(transcript_chunks))
        section_transcript = "\n\n".join(transcript_chunks[start_chunk:end_chunk])
        
        section_prompt = f"""
Create detailed content for this section of lecture notes:

**SECTION:** {section_title}
**LECTURE:** {lecture_title}

**RELEVANT TRANSCRIPT:**
{section_transcript[:12000]}

**SLIDES FOR REFERENCE:**
{formatted_presentation[:2000]}

Generate comprehensive content for this section using the formatting requirements:
- Use ### for the section header: ### {section_title}
- Use **bold** for key terms and concepts
- Use * for bullet points with important details
- Include definitions, examples, and explanations
- Maintain academic tone and structure

Focus only on content relevant to this section.
"""
        
        section_response = generate_notes_with_gemini(section_prompt)
        if not section_response.startswith("## Error:"):
            all_sections.append(section_response)
        else:
            all_sections.append(f"### {section_title}\n\n*Error generating content for this section.*")
    
    # Combine all sections
    lecture_title_clean = lecture_title.replace('_', ' ').title()
    combined_notes = f"# Lecture Notes: {lecture_title_clean}\n\n"
    combined_notes += "\n\n".join(all_sections)
    
    return combined_notes

print("Enhanced helper functions and prompt template defined for Gemini.")


Enhanced helper functions and prompt template defined for Gemini.


In [8]:
# Step 7.2 - Cell 1 (Enhanced for Gemini): Helper Functions & Prompt

import json
from pathlib import Path
import time
import gc
# Requires google.generativeai as genai (imported in 7.1)
# Assumes 'model' (the genai.GenerativeModel instance), 'llm_model_name',
# 'output_base_path' are defined from Step 7.1 (Gemini version).

# --- Helper Function to Format Presentation JSON ---
def format_presentation_data(data):
    """Converts presentation JSON data into a readable string for the LLM."""
    if not data or ("error" in data):
        return "No presentation data available or there was an error processing it.\n"

    output = f"--- Presentation Content ({data.get('file_type', 'N/A')}: {data.get('source_file', 'N/A')}) ---\n\n"
    if data.get('file_type') == 'pptx':
        slides = data.get('slides')
        if not slides: return output + "No slides found or extracted.\n"
        for slide in slides:
            slide_num = slide.get('slide_number', '?')
            output += f"## Slide {slide_num}\n"
            if slide.get('title'): output += f"### Title: {slide['title']}\n"
            if slide.get('content'):
                output += "Content:\n"
                if isinstance(slide['content'], list):
                    for line in slide['content']: output += f"{line}\n"
                else: output += f"{str(slide['content'])}\n"
            if slide.get('notes'): output += f"Presenter Notes: {slide['notes']}\n"
            output += "\n"
    elif data.get('file_type') == 'pdf':
        pages = data.get('pages')
        if not pages: return output + "No pages found or extracted.\n"
        for page in pages:
            page_num = page.get('page_number', '?')
            output += f"## Page {page_num}\n"
            if page.get('text'): output += f"Text:\n{page['text']}\n"
            output += "\n"
    else: output += f"Unknown presentation format: {data.get('file_type', 'N/A')}.\n"
    output += "--- End of Presentation Content ---\n"
    return output

# --- Helper Function to Chunk Text ---
def chunk_text(text, max_tokens_per_chunk=8000, overlap_tokens=200):
    """Split text into overlapping chunks for processing."""
    # Simple word-based chunking (approximate)
    words = text.split()
    words_per_token = 0.75  # Approximate ratio
    max_words_per_chunk = int(max_tokens_per_chunk * words_per_token)
    overlap_words = int(overlap_tokens * words_per_token)
    
    chunks = []
    start = 0
    
    while start < len(words):
        end = min(start + max_words_per_chunk, len(words))
        chunk = ' '.join(words[start:end])
        chunks.append(chunk)
        
        if end >= len(words):
            break
            
        start = end - overlap_words
    
    return chunks

# --- Enhanced LLM Prompt Template ---
prompt_template = """
You are an expert AI assistant tasked with creating comprehensive, well-structured lecture notes.
You will be given the **full transcript** of a spoken lecture and the **full content** of the accompanying presentation slides (or PDF pages).
Your goal is to synthesize information from BOTH sources to generate high-quality academic notes optimized for student learning and review.

**Instructions:**

1. **Analyze Both Inputs:** Carefully read the provided Lecture Transcript and Presentation Content.

2. **Synthesize Information:** Combine relevant information from the transcript and presentation. Integrate ideas rather than copying. Use the presentation structure as a foundation, but enrich with details, explanations, and examples from the transcript.

3. **Structure Requirements:**
   * **Title:** Start with `# Lecture Notes: [Topic/Title from content]`
   * **Main Sections:** Use `##` for primary topics and major concepts
   * **Subsections:** Use `###` for subtopics, specific methods, procedures, or detailed concepts
   * **Content Organization:** Use bullet points (`*`) for key details, definitions, examples, and supporting information
   * **Emphasis:** Use `**bold text**` for important keywords, concepts, definitions, formulas, and key terms
   * **Code/Algorithms:** Use code blocks when showing algorithms, formulas, or structured procedures
   * **Examples:** Create dedicated example sections when demonstrating concepts

4. **Content Organization Principles:**
   * **Lead with Definitions:** Start major concepts with clear, complete definitions
   * **Logical Progression:** Organize content from fundamental concepts to applications
   * **Method Separation:** Create distinct sections for different approaches, methods, or procedures
   * **Property Grouping:** List important characteristics and properties as organized bullet points
   * **Step-by-Step Clarity:** Break down complex processes into clear, numbered or bulleted steps
   * **Integration:** Weave transcript elaborations naturally into slide-based structure

5. **Content Guidelines:**
   * **Completeness:** Include all significant concepts mentioned in either source
   * **Definitions:** Provide precise, academic definitions for key terms
   * **Examples:** Include concrete examples with step-by-step demonstrations when provided
   * **Analysis:** Add complexity analysis, efficiency discussions, or comparative analysis when mentioned
   * **Context:** Include background information and applications discussed in the transcript
   * **Clarifications:** Use transcript content to clarify or expand on slide points

6. **Formatting Standards:**
   * Use consistent indentation for nested bullet points
   * Maintain parallel structure in lists and sections
   * Include mathematical notation, formulas, or technical syntax using appropriate markdown
   * Separate major sections with clear spacing
   * Use descriptive, informative section headers
   * Create logical flow between related concepts

7. **Quality Requirements:**
   * **Academic Tone:** Maintain formal, precise academic language
   * **Comprehensiveness:** Include sufficient detail for thorough understanding
   * **Clarity:** Make complex concepts accessible to students
   * **Structure:** Ensure logical organization that supports learning
   * **Integration:** Seamlessly combine slide and transcript information
   * **Exam-Ready:** Provide the level of detail needed for assessment preparation

8. **Adaptive Structure:** Let the content determine the specific sections, but commonly include:
   * Fundamental definitions and concepts
   * Key properties or characteristics  
   * Different methods, approaches, or types
   * Detailed examples and applications
   * Comparative analysis or efficiency discussions
   * Summary of key takeaways

**Input Data:**

{presentation_content}

**Full Lecture Transcript:**

```text
{transcript_text}
```

Generate comprehensive lecture notes in Markdown format following the above structure and requirements. Adapt the section organization to best fit the content while maintaining academic rigor and clarity. Start directly with the lecture title - no introductory text.
"""

# --- Enhanced Gemini API Function ---
def generate_notes_with_gemini(prompt_content, max_retries=2, initial_delay=5):
    """Calls the Gemini API using the initialized model, handles retries."""
    global model # Uses the genai.GenerativeModel instance from Step 7.1
    if not model:
        print("  Error: Gemini model not initialized.")
        return "## Error: Gemini model not initialized."

    delay = initial_delay
    for attempt in range(max_retries + 1):
        try:
            # Make the API call with enhanced configuration
            response = model.generate_content(
                prompt_content,
                generation_config={
                    "temperature": 0.3,  # Lower temperature for more structured output
                    "top_p": 0.9,
                    "top_k": 40,
                    "max_output_tokens": 8192,  # Increased for comprehensive notes
                }
            )

            # Process the response
            if not response.candidates:
                block_reason = "Unknown"
                try:
                    block_reason = response.prompt_feedback.block_reason
                except Exception:
                    print("  Warning: Could not retrieve block_reason from prompt_feedback.")

                print(f"  Warning: Call blocked by safety settings (Reason: {block_reason}). Attempt {attempt + 1}/{max_retries + 1}.")
                if attempt < max_retries:
                    print(f"   Retrying in {delay} seconds...")
                    time.sleep(delay)
                    delay *= 2
                    continue
                else:
                    return f"## Error: Content generation blocked by safety settings (Reason: {block_reason})."

            # Check if response text is usable
            response_text = ""
            try:
                response_text = response.text
            except ValueError:
                print(f"  Warning: ValueError accessing response.text. Prompt feedback: {response.prompt_feedback}")
                response_text = ""

            if response_text and isinstance(response_text, str) and response_text.strip():
                return response_text # Success
            else:
                print(f"  Warning: Received empty text content from Gemini (Attempt {attempt + 1}/{max_retries + 1}).")
                try:
                    print(f"     Prompt Feedback: {response.prompt_feedback}")
                except Exception:
                    print("     Could not retrieve prompt feedback.")

                if attempt < max_retries:
                    print(f"   Retrying in {delay} seconds...")
                    time.sleep(delay)
                    delay *= 2
                    continue
                else:
                    return "## Error: Received empty response from Gemini."

        except Exception as e:
            print(f"  Error during Gemini API call (Attempt {attempt + 1}/{max_retries + 1}): {type(e).__name__} - {e}")
            if attempt < max_retries:
                print(f"   Retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= 2
                continue
            else:
                print(f"  Failed after {max_retries} retries.")
                return f"## Error: API call failed after retries ({type(e).__name__})."

    return "## Error: Failed to generate notes after multiple retries."

# --- Chunked Processing Function ---
def generate_notes_with_chunking(transcript_text, formatted_presentation, lecture_title):
    """Generate notes using chunking approach for very long content."""
    print("    Using chunked processing for long content...")
    
    # Generate outline first
    outline_prompt = f"""
Based on this lecture content, create an outline of 4-8 main sections for comprehensive lecture notes.

**LECTURE:** {lecture_title}

**SLIDES:**
{formatted_presentation[:3000]}

**TRANSCRIPT EXCERPT:**
{transcript_text[:8000]}

Provide ONLY a numbered list of section titles that cover the main topics discussed. Each title should be clear and descriptive.
Format: 
1. Section Title One
2. Section Title Two
etc.
"""
    
    outline_response = generate_notes_with_gemini(outline_prompt)
    if outline_response.startswith("## Error:"):
        return outline_response
    
    # Parse section titles
    section_titles = []
    for line in outline_response.split('\n'):
        line = line.strip()
        if line and any(char.isdigit() for char in line[:3]):
            # Remove numbering and extract title
            title = line.split('. ', 1)[-1] if '. ' in line else line
            section_titles.append(title)
    
    if not section_titles:
        return "## Error: Could not generate valid section outline."
    
    # Chunk the transcript
    transcript_chunks = chunk_text(transcript_text, max_tokens_per_chunk=8000, overlap_tokens=200)
    print(f"      Transcript split into {len(transcript_chunks)} chunks for {len(section_titles)} sections.")
    
    # Generate content for each section
    all_sections = []
    chunk_per_section = max(1, len(transcript_chunks) // len(section_titles))
    
    for i, section_title in enumerate(section_titles):
        start_chunk = min(i * chunk_per_section, len(transcript_chunks) - 1)
        end_chunk = min(start_chunk + chunk_per_section + 1, len(transcript_chunks))
        section_transcript = "\n\n".join(transcript_chunks[start_chunk:end_chunk])
        
        section_prompt = f"""
Create detailed content for this section of lecture notes:

**SECTION:** {section_title}
**LECTURE:** {lecture_title}

**RELEVANT TRANSCRIPT:**
{section_transcript[:12000]}

**SLIDES FOR REFERENCE:**
{formatted_presentation[:2000]}

Generate comprehensive content for this section using the formatting requirements:
- Use ### for the section header: ### {section_title}
- Use **bold** for key terms and concepts
- Use * for bullet points with important details
- Include definitions, examples, and explanations
- Maintain academic tone and structure

Focus only on content relevant to this section.
"""
        
        section_response = generate_notes_with_gemini(section_prompt)
        if not section_response.startswith("## Error:"):
            all_sections.append(section_response)
        else:
            all_sections.append(f"### {section_title}\n\n*Error generating content for this section.*")
    
    # Combine all sections
    lecture_title_clean = lecture_title.replace('_', ' ').title()
    combined_notes = f"# Lecture Notes: {lecture_title_clean}\n\n"
    combined_notes += "\n\n".join(all_sections)
    
    return combined_notes

print("Enhanced helper functions and prompt template defined for Gemini.")

# ================================================================================

# Step 7.2 - Cell 2 (Enhanced for Gemini): Main Note Generation Loop

import json
from pathlib import Path
import time
import gc

# *** Define the path to your pre-processed input data ***
processed_input_path = Path("/kaggle/input/llm-hackathon/output")
# *** Output path remains the working directory ***
final_notes_output_base = output_base_path # Defined previously (/kaggle/working/output)

# Enhanced configuration
MAX_SINGLE_CALL_LENGTH = 40000  # Character limit for single API call
USE_CHUNKING_FOR_LONG_CONTENT = True  # Enable chunking for very long transcripts

# Initialize counters
total_transcripts_processed_for_notes = 0
successful_notes_files = 0
failed_notes_files = 0

print(f"\n--- Starting Enhanced Lecture Note Generation (Gemini - Reading from: {processed_input_path}) ---")
print(f"--- Saving final notes to: {final_notes_output_base} ---")
print(f"--- Configuration: Max single call length: {MAX_SINGLE_CALL_LENGTH} chars, Chunking enabled: {USE_CHUNKING_FOR_LONG_CONTENT} ---")

# Iterate through subject folders in the PRE-PROCESSED input directory
if not processed_input_path.exists():
    print(f"ERROR: Processed input path not found at {processed_input_path}")
    raise FileNotFoundError(f"Processed input path not found: {processed_input_path}")

for subject_dir in processed_input_path.iterdir():
    if not subject_dir.is_dir():
        continue

    print(f"\nProcessing subject: {subject_dir.name}")

    # --- Load Presentation Data ---
    presentation_data_dir = subject_dir / "presentation_data"
    if not presentation_data_dir.exists():
        print(f"  Skipping subject {subject_dir.name}: No 'presentation_data' directory found.")
        continue
    presentation_json_files = list(presentation_data_dir.glob('*.json'))
    if not presentation_json_files:
        print(f"  Skipping subject {subject_dir.name}: No presentation JSON file found.")
        continue
    presentation_json_path = presentation_json_files[0]
    print(f"  Using presentation data: {presentation_json_path.name}")
    try:
        with open(presentation_json_path, 'r', encoding='utf-8') as f:
            presentation_data = json.load(f)
        formatted_presentation = format_presentation_data(presentation_data)
        if "error processing it" in formatted_presentation:
            print(f"  Warning: Issue noted during formatting presentation data.")
    except Exception as e:
        print(f"  Error loading or formatting presentation JSON {presentation_json_path.name}: {e}")
        continue

    # --- Find Transcript Files ---
    transcript_dir = subject_dir / "transcripts"
    if not transcript_dir.exists():
        print(f"  No 'transcripts' directory found for {subject_dir.name}.")
        continue
    transcript_files = list(transcript_dir.glob('*.txt'))
    if not transcript_files:
        print(f"  No transcript files found in {transcript_dir}.")
        continue

    # --- Create Output Directory ---
    output_notes_dir = final_notes_output_base / subject_dir.name / "final_notes"
    output_notes_dir.mkdir(parents=True, exist_ok=True)

    # --- Process Each Transcript File ---
    for transcript_path in transcript_files:
        total_transcripts_processed_for_notes += 1
        print(f"\n  Processing transcript file: {transcript_path.relative_to(processed_input_path)}")
        output_note_path = output_notes_dir / f"{transcript_path.stem}_notes.md"

        # --- Load Transcript Text ---
        try:
            with open(transcript_path, 'r', encoding='utf-8') as f:
                full_content = f.read()
                transcript_header = "--- Full Transcript ---"
                header_index = full_content.find(transcript_header)
                if header_index != -1:
                    text_start_index = full_content.find('\n', header_index) + 1
                    timestamp_marker = "\n\n--- Timestamps"
                    timestamp_index = full_content.find(timestamp_marker, text_start_index)
                    transcript_text = full_content[text_start_index:timestamp_index].strip() if timestamp_index != -1 else full_content[text_start_index:].strip()
                else:
                    transcript_text = full_content.strip()
            
            if not transcript_text:
                print("    Warning: Transcript file is empty. Skipping file.")
                continue
                
        except Exception as e:
            print(f"    Error loading transcript {transcript_path.name}: {e}")
            failed_notes_files += 1
            continue

        # --- Determine Processing Strategy ---
        total_content_length = len(transcript_text) + len(formatted_presentation)
        print(f"    Total content length: {total_content_length:,} characters")
        
        use_chunking = (USE_CHUNKING_FOR_LONG_CONTENT and 
                       total_content_length > MAX_SINGLE_CALL_LENGTH)
        
        if use_chunking:
            print(f"    Content exceeds {MAX_SINGLE_CALL_LENGTH:,} chars - using chunked approach")
        else:
            print(f"    Using single comprehensive API call")

        # --- Generate Notes ---
        start_time = time.time()
        
        if use_chunking:
            # Use chunked approach for very long content
            generated_notes = generate_notes_with_chunking(
                transcript_text, formatted_presentation, transcript_path.stem
            )
        else:
            # Use single comprehensive call
            final_prompt = prompt_template.format(
                presentation_content=formatted_presentation,
                transcript_text=transcript_text
            )
            print(f"    Calling LLM ({llm_model_name}) for full transcript...")
            generated_notes = generate_notes_with_gemini(final_prompt)
        
        end_time = time.time()

        # --- Process Result and Save ---
        if generated_notes and not generated_notes.strip().startswith("## Error:"):
            print(f"      Note generation successful (took {end_time - start_time:.2f}s).")
            try:
                # Don't duplicate the title if it's already included
                if generated_notes.strip().startswith("# Lecture Notes:"):
                    final_output = generated_notes
                else:
                    final_output = f"# Lecture Notes: {transcript_path.stem}\n\n{generated_notes}"
                
                with open(output_note_path, 'w', encoding='utf-8') as f:
                    f.write(final_output)
                print(f"    Saved notes to: {output_note_path.relative_to(Path('/kaggle/working/'))}")
                successful_notes_files += 1
            except Exception as e:
                print(f"    Error saving notes to {output_note_path.name}: {e}")
                failed_notes_files += 1
        else:
            print(f"      Note generation failed or returned error (took {end_time - start_time:.2f}s).")
            failed_notes_files += 1
            try:
                error_output = f"# Lecture Notes: {transcript_path.stem}\n\n{generated_notes}"
                with open(output_note_path, 'w', encoding='utf-8') as f:
                    f.write(error_output)
                print(f"    Saved error message to: {output_note_path.relative_to(Path('/kaggle/working/'))}")
            except Exception as e:
                print(f"    Error saving error message to {output_note_path.name}: {e}")

        gc.collect() # Garbage collect after each file

print("\n--- Enhanced Note Generation Loop (Gemini) Finished ---")


Enhanced helper functions and prompt template defined for Gemini.

--- Starting Enhanced Lecture Note Generation (Gemini - Reading from: /kaggle/input/llm-hackathon/output) ---
--- Saving final notes to: /kaggle/working/output ---
--- Configuration: Max single call length: 40000 chars, Chunking enabled: True ---

Processing subject: stable_diffusion
  Using presentation data: Stable Diffusion_content.json

  Processing transcript file: stable_diffusion/transcripts/19853_shylaja.sharath_31_20250327084200249_Video_ENC.txt
    Total content length: 61,157 characters
    Content exceeds 40,000 chars - using chunked approach
    Using chunked processing for long content...
      Transcript split into 2 chunks for 8 sections.
      Note generation successful (took 47.84s).
    Saved notes to: output/stable_diffusion/final_notes/19853_shylaja.sharath_31_20250327084200249_Video_ENC_notes.md

  Processing transcript file: stable_diffusion/transcripts/19853_shylaja.sharath_31_20250327092700214_V

In [9]:
# Step 7.2 - Cell 3: Summary and Verification

print("\n--- Note Generation Summary ---")
print(f"Total transcripts found for processing: {total_transcripts_processed_for_notes}")
print(f"Successfully generated notes files: {successful_notes_files}")
print(f"Failed notes files (inc. chunk errors, load/save errors): {failed_notes_files}")

# Optional: Verify by listing some output note files
if successful_notes_files > 0:
    print("\nExample output note files (.md):")
    example_count = 0
    for subject_dir in final_notes_output_base.iterdir():
        notes_data_dir = subject_dir / "final_notes"
        if subject_dir.is_dir() and notes_data_dir.exists():
            for note_file in notes_data_dir.iterdir():
                if note_file.suffix == '.md' and example_count < 5:
                    print(f"- {note_file.relative_to(final_notes_output_base)}")
                    example_count += 1
            if example_count >= 5:
                break
    if example_count == 0:
        print("Could not find any example .md files in the output final_notes directories.")

# Additional verification: Show file sizes
if successful_notes_files > 0:
    print("\nFile size verification:")
    total_size = 0
    file_count = 0
    for subject_dir in final_notes_output_base.iterdir():
        notes_data_dir = subject_dir / "final_notes"
        if subject_dir.is_dir() and notes_data_dir.exists():
            for note_file in notes_data_dir.iterdir():
                if note_file.suffix == '.md':
                    size_kb = note_file.stat().st_size / 1024
                    total_size += size_kb
                    file_count += 1
    
    if file_count > 0:
        avg_size = total_size / file_count
        print(f"Average note file size: {avg_size:.1f} KB")
        print(f"Total notes size: {total_size:.1f} KB")

print("\n--- Processing Complete ---")


--- Note Generation Summary ---
Total transcripts found for processing: 15
Successfully generated notes files: 15
Failed notes files (inc. chunk errors, load/save errors): 0

Example output note files (.md):
- agentic/final_notes/19853_shylaja.sharath_31_20250401121200417_Video_ENC_notes.md
- Lora&Qlora/final_notes/19853_shylaja.sharath_31_20250318125700082_Video_ENC_notes.md
- Lora&Qlora/final_notes/19853_shylaja.sharath_31_20250318112700094_Video_ENC (1)_notes.md
- Lora&Qlora/final_notes/19853_shylaja.sharath_31_20250318121200085_Video_ENC (1)_notes.md
- stable_diffusion/final_notes/19853_shylaja.sharath_31_20250327092700214_Video_ENC (1)_notes.md

File size verification:
Average note file size: 20.1 KB
Total notes size: 301.3 KB

--- Processing Complete ---
