In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-data/LLM_DATASET/stable_diffusion/19853_shylaja.sharath_31_20250327092700214_Video_ENC (1).mp4
/kaggle/input/llm-data/LLM_DATASET/stable_diffusion/Stable Diffusion.pdf
/kaggle/input/llm-data/LLM_DATASET/stable_diffusion/19853_shylaja.sharath_31_20250327084200249_Video_ENC.mp4
/kaggle/input/llm-data/LLM_DATASET/Lora&Qlora/19853_shylaja.sharath_31_20250318125700082_Video_ENC.mp4
/kaggle/input/llm-data/LLM_DATASET/Lora&Qlora/Finetuning.pdf
/kaggle/input/llm-data/LLM_DATASET/Lora&Qlora/19853_shylaja.sharath_31_20250318121200085_Video_ENC (1).mp4
/kaggle/input/llm-data/LLM_DATASET/Lora&Qlora/19853_shylaja.sharath_31_20250318112700094_Video_ENC (1).mp4
/kaggle/input/llm-data/LLM_DATASET/expn_tree/7b_2020-09-25 12-12-37_ExprTReeCode 00_00_09-00_26_52.mkv
/kaggle/input/llm-data/LLM_DATASET/expn_tree/Class7_Unit3_Trees_ExprTree.pptx
/kaggle/input/llm-data/LLM_DATASET/expn_tree/7a_2020-09-24 09-28-52_ExprTreeCon.mkv
/kaggle/input/llm-data/LLM_DATASET/agentic/AutoGen CrewAI.pdf


Step 1: Setting Up the Kaggle Environment and Organizing Data

In [4]:
# Step 1: Environment Setup and Data Verification

import os
import subprocess
import torch
from pathlib import Path

# 1. Install necessary system packages (like ffmpeg for audio processing)
# Using 'apt-get update' first to ensure package lists are fresh
print("Updating package lists...")
update_process = subprocess.run(['apt-get', 'update', '-qq'], capture_output=True, text=True)
if update_process.returncode != 0:
    print("Warning: apt-get update failed. Proceeding with install anyway.")
    # print("Update Error:", update_process.stderr) # Uncomment for detailed error

print("Installing ffmpeg...")
install_process = subprocess.run(['apt-get', 'install', '-y', '-qq', 'ffmpeg'], capture_output=True, text=True)
if install_process.returncode == 0:
    print("ffmpeg installed successfully.")
else:
    print("Error installing ffmpeg:")
    print(install_process.stderr)
    # Consider adding alternative installation or raising an error if ffmpeg is critical

# 2. Check for GPU availability
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU is available. Using device: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("GPU not available. Using CPU. Note: Processing will be significantly slower.")

# 3. Define the path to your uploaded dataset
# Adjust 'lecture-data' if you named your uploaded dataset differently
# Adjust 'LLM_DATASET' if the root folder inside the zip had a different name
dataset_base_path = Path("/kaggle/input/llm-data/LLM_DATASET")

# 4. Verify dataset structure
if dataset_base_path.exists() and dataset_base_path.is_dir():
    print(f"\nFound dataset directory: {dataset_base_path}")
    print("Contents (Subject Folders):")
    subject_folders = [f.name for f in dataset_base_path.iterdir() if f.is_dir()]
    print(subject_folders)

    # Optional: List contents of the first subject folder to see files
    if subject_folders:
        first_subject_path = dataset_base_path / subject_folders[0]
        print(f"\nContents of '{subject_folders[0]}':")
        try:
            for item in first_subject_path.iterdir():
                print(f"- {item.name} ({'Dir' if item.is_dir() else 'File'})")
        except PermissionError:
            print(f"Could not access contents of {first_subject_path} due to permissions.")
        except Exception as e:
            print(f"An error occurred while listing contents of {first_subject_path}: {e}")
    else:
        print("No subject folders found in the dataset directory.")

else:
    print(f"\nError: Dataset directory not found at {dataset_base_path}")
    print("Please check:")
    print("1. If you uploaded the data correctly.")
    print("2. If the dataset name in Kaggle matches 'lecture-data'.")
    print("3. If the root folder inside your zip file was 'LLM_DATASET'.")
    # You might need to list /kaggle/input/ to see the actual structure:
    # print("\nAvailable contents in /kaggle/input/:")
    # for item in Path("/kaggle/input/").iterdir():
    #     print(f"- {item.name}")


# 5. Create an output directory for results
output_base_path = Path("/kaggle/working/output")
output_base_path.mkdir(parents=True, exist_ok=True)
print(f"\nCreated output directory: {output_base_path}")

Updating package lists...
Installing ffmpeg...
ffmpeg installed successfully.
GPU is available. Using device: Tesla T4

Found dataset directory: /kaggle/input/llm-data/LLM_DATASET
Contents (Subject Folders):
['stable_diffusion', 'Lora&Qlora', 'expn_tree', 'agentic', 'Heap', 'BST', 'TBT', 'Multimodal', 'binary_tree_traversal', 'Tree_traversal']

Contents of 'stable_diffusion':
- 19853_shylaja.sharath_31_20250327092700214_Video_ENC (1).mp4 (File)
- Stable Diffusion.pdf (File)
- 19853_shylaja.sharath_31_20250327084200249_Video_ENC.mp4 (File)

Created output directory: /kaggle/working/output


Step 2: Audio Extraction from Videos

In [5]:
# Step 2: Audio Extraction

import os
import subprocess
from pathlib import Path
import time

# Base paths defined in Step 1
# dataset_base_path = Path("/kaggle/input/llm-data/LLM_DATASET")
# output_base_path = Path("/kaggle/working/output")

# Supported video extensions
video_extensions = ['.mp4', '.mkv']

def extract_audio(video_path, output_audio_path):
    """
    Extracts audio from a video file using ffmpeg, saves as WAV.
    Standardizes to 16kHz mono PCM audio.
    """
    command = [
        'ffmpeg',
        '-i', str(video_path),        # Input file
        '-vn',                        # Disable video recording
        '-acodec', 'pcm_s16le',       # Audio codec: PCM signed 16-bit little-endian (standard WAV)
        '-ar', '16000',               # Audio sample rate: 16kHz
        '-ac', '1',                   # Audio channels: 1 (mono)
        '-y',                         # Overwrite output file if it exists
        '-hide_banner',               # Hide unnecessary console output
        '-loglevel', 'error',         # Show only errors
        str(output_audio_path)        # Output file
    ]
    try:
        print(f"  Extracting audio from: {video_path.name}")
        start_time = time.time()
        result = subprocess.run(command, check=True, capture_output=True, text=True)
        end_time = time.time()
        print(f"  Successfully extracted to: {output_audio_path.name} (took {end_time - start_time:.2f}s)")
        return True
    except subprocess.CalledProcessError as e:
        print(f"  Error extracting audio from {video_path.name}:")
        print(f"  Command: {' '.join(command)}") # Show the command that failed
        print(f"  FFmpeg Error Output:\n{e.stderr}")
        # Attempt to clean up potentially incomplete output file
        if output_audio_path.exists():
            try:
                output_audio_path.unlink()
                print(f"  Cleaned up incomplete file: {output_audio_path.name}")
            except OSError as unlink_err:
                print(f"  Warning: Could not delete incomplete file {output_audio_path.name}: {unlink_err}")
        return False
    except FileNotFoundError:
        print(" Error: ffmpeg command not found. Make sure ffmpeg is installed and in the system's PATH.")
        # This shouldn't happen if Step 1 was successful, but good to check.
        return False
    except Exception as e:
        print(f"  An unexpected error occurred during extraction from {video_path.name}: {e}")
        return False

# --- Main Extraction Loop ---
total_videos = 0
successful_extractions = 0
failed_extractions = 0

print("\n--- Starting Audio Extraction ---")

# Iterate through subject folders in the dataset
for subject_dir in dataset_base_path.iterdir():
    if subject_dir.is_dir():
        print(f"\nProcessing subject: {subject_dir.name}")

        # Create corresponding output directory structure for audio
        output_subject_audio_dir = output_base_path / subject_dir.name / "audio"
        output_subject_audio_dir.mkdir(parents=True, exist_ok=True)

        # Find video files in the current subject directory
        video_files = [f for f in subject_dir.iterdir()
                       if f.is_file() and f.suffix.lower() in video_extensions]

        if not video_files:
            print(f"  No video files ({', '.join(video_extensions)}) found in {subject_dir.name}.")
            continue

        for video_file in video_files:
            total_videos += 1
            # Define the output audio file path
            output_audio_file = output_subject_audio_dir / f"{video_file.stem}.wav"

            # Extract audio
            if extract_audio(video_file, output_audio_file):
                successful_extractions += 1
            else:
                failed_extractions += 1

print("\n--- Audio Extraction Summary ---")
print(f"Total videos found: {total_videos}")
print(f"Successfully extracted audio: {successful_extractions}")
print(f"Failed extractions: {failed_extractions}")

# Optional: Verify by listing some output files
if successful_extractions > 0:
    print("\nExample output audio files:")
    example_count = 0
    for subject_dir in output_base_path.iterdir():
        if subject_dir.is_dir() and (subject_dir / "audio").exists():
             for audio_file in (subject_dir / "audio").iterdir():
                 if audio_file.suffix == '.wav' and example_count < 5:
                     print(f"- {audio_file.relative_to(output_base_path)}")
                     example_count += 1
             if example_count >= 5:
                 break
    if example_count == 0:
        print("Could not find any example .wav files in the output directory.")


--- Starting Audio Extraction ---

Processing subject: stable_diffusion
  Extracting audio from: 19853_shylaja.sharath_31_20250327092700214_Video_ENC (1).mp4
  Successfully extracted to: 19853_shylaja.sharath_31_20250327092700214_Video_ENC (1).wav (took 2.24s)
  Extracting audio from: 19853_shylaja.sharath_31_20250327084200249_Video_ENC.mp4
  Successfully extracted to: 19853_shylaja.sharath_31_20250327084200249_Video_ENC.wav (took 2.05s)

Processing subject: Lora&Qlora
  Extracting audio from: 19853_shylaja.sharath_31_20250318125700082_Video_ENC.mp4
  Successfully extracted to: 19853_shylaja.sharath_31_20250318125700082_Video_ENC.wav (took 3.01s)
  Extracting audio from: 19853_shylaja.sharath_31_20250318121200085_Video_ENC (1).mp4
  Successfully extracted to: 19853_shylaja.sharath_31_20250318121200085_Video_ENC (1).wav (took 2.82s)
  Extracting audio from: 19853_shylaja.sharath_31_20250318112700094_Video_ENC (1).mp4
  Successfully extracted to: 19853_shylaja.sharath_31_202503181127000

Step 3 & 4: Setting Up the Transcription Model (Whisper via Hugging Face)

In [1]:
# Step 3 & 4 (Modified): Install Libraries and Configure MEDIUM Whisper Pipeline

import os
import subprocess
import torch
from pathlib import Path

# 1. Install Hugging Face libraries (if kernel restart cleared them)
# It's usually safe to run this again.
print("Ensuring Hugging Face libraries are installed: transformers, datasets, accelerate, soundfile...")
install_libs = subprocess.run([
    'pip', 'install', '-q',
    'transformers>=4.30.0',
    'datasets>=2.14.0',
    'accelerate>=0.21.0',
    'soundfile',
    'torch',
    'sentencepiece',
    'protobuf'
], capture_output=True, text=True)

if install_libs.returncode == 0:
    print("Libraries installed/verified successfully.")
else:
    print("Error installing libraries:")
    print(install_libs.stderr)
    raise RuntimeError("Failed to install required Hugging Face libraries.")

# Import necessary components AFTER installation
from transformers import pipeline
import soundfile as sf
import gc

# 2. Verify GPU again and define device
if torch.cuda.is_available():
    device = torch.device("cuda")
    gpu_name = torch.cuda.get_device_name(0)
    print(f"\nGPU confirmed: {gpu_name}. Using device: {device}")
    torch_dtype = torch.float16
    print(f"Using torch dtype: {torch_dtype}")
else:
    device = torch.device("cpu")
    torch_dtype = torch.float32
    print("Warning: GPU not available. Using CPU. Transcription will be VERY slow.")
    print(f"Using torch dtype: {torch_dtype}")

# 3. Configure and load the ASR pipeline (MEDIUM MODEL)
# ************************************************
# Changed model_id to whisper-medium
model_id = "openai/whisper-medium"
# ************************************************
print(f"\nLoading ASR pipeline for model: {model_id}")
print("This may take a few minutes to download the model...")

# Clear any previously loaded model from memory explicitly
pipe = None
gc.collect()
torch.cuda.empty_cache()

try:
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        torch_dtype=torch_dtype,
        device_map="auto", # Still use accelerate for optimization
    )
    print("\nASR Pipeline loaded successfully.")

    # Define generation arguments (same as before)
    generate_kwargs = {
        "language": "english",
        "task": "transcribe",
        "return_timestamps": True, # Get segment-level timestamps
    }
    print(f"Transcription settings (generate_kwargs): {generate_kwargs}")

except Exception as e:
    print(f"\nError loading pipeline for model {model_id}: {e}")
    print("Possible issues:")
    print("- Insufficient GPU memory (try 'whisper-small').")
    print("- Network issues downloading the model.")
    print("- Compatibility issues between libraries.")
    pipe = None # Ensure pipe is None if loading fails

# --- Placeholder for actual transcription loop (Next Step) ---
if pipe:
    print("\nSetup complete. Ready to attempt transcription with the MEDIUM model.")
else:
    print("\nSetup failed. Cannot proceed with transcription.")

Ensuring Hugging Face libraries are installed: transformers, datasets, accelerate, soundfile...
Libraries installed/verified successfully.


2025-04-22 05:43:32.902615: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745300612.924942     926 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745300612.931859     926 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



GPU confirmed: Tesla T4. Using device: cuda
Using torch dtype: torch.float16

Loading ASR pipeline for model: openai/whisper-medium
This may take a few minutes to download the model...


config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Device set to use cuda:0



ASR Pipeline loaded successfully.
Transcription settings (generate_kwargs): {'language': 'english', 'task': 'transcribe', 'return_timestamps': True}

Setup complete. Ready to attempt transcription with the MEDIUM model.


Step 5: Video Transcription Component - Performing Transcription

In [6]:
# Step 5: Transcription Loop

import time
import soundfile as sf
import numpy as np
import gc
import torch
from pathlib import Path
# Assuming 'pipe', 'output_base_path', and 'generate_kwargs' are defined
# from the previous cell (Step 3 & 4)

# Define where the audio files are and where transcripts should go
audio_input_base = output_base_path # Contains subject folders -> audio subfolders
transcript_output_base = output_base_path # We'll create transcripts subfolders here

# --- Transcription Function ---
def format_timestamp(seconds):
    """Converts seconds to HH:MM:SS.ms format"""
    if seconds is None:
        return "N/A"
    millisec = int((seconds - int(seconds)) * 1000)
    seconds = int(seconds)
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = seconds % 60
    return f"{hours:02}:{minutes:02}:{seconds:02}.{millisec:03}"

def transcribe_audio_file(audio_path, transcript_path, pipeline, gen_kwargs):
    """Transcribes a single audio file and saves the formatted result."""
    global pipe # Ensure we are using the globally loaded pipeline
    if not pipe:
      print(f"  Skipping {audio_path.name}, pipeline not available.")
      return False, 0.0 # Indicate failure and duration 0

    print(f"  Starting transcription for: {audio_path.name}")
    start_time = time.time()
    try:
        # Check if audio file is valid and get info (optional but good practice)
        try:
            audio_info = sf.info(str(audio_path))
            if audio_info.samplerate != 16000:
                 print(f"    Warning: Sample rate is {audio_info.samplerate}Hz, expected 16000Hz. Model might perform suboptimally.")
            if audio_info.channels != 1:
                 print(f"    Warning: Audio has {audio_info.channels} channels, expected 1 (mono). Model might perform suboptimally.")
        except Exception as e:
            print(f"    Warning: Could not read audio file info for {audio_path.name}: {e}")
            # Decide whether to continue or skip
            # return False, 0.0 # Option: Skip if info can't be read

        # Perform transcription - Pipeline can often take the path directly
        # The pipeline handles loading, chunking, and inference
        outputs = pipeline(str(audio_path), chunk_length_s=30, stride_length_s=5, generate_kwargs=gen_kwargs, return_timestamps=True) # Ensure timestamps are requested

        full_text = outputs["text"].strip()
        chunks = outputs.get("chunks", []) # Chunks contain timestamps

        # Format the output
        output_content = f"--- Full Transcript ---\n{full_text}\n\n--- Timestamps (Segment/Chunk Level) ---\n"
        if chunks:
            for i, chunk in enumerate(chunks):
                start, end = chunk['timestamp']
                # Sometimes start/end can be None, handle this gracefully
                start_str = format_timestamp(start) if start is not None else "???"
                end_str = format_timestamp(end) if end is not None else "???"
                chunk_text = chunk['text'].strip()
                output_content += f"[{start_str} -> {end_str}] {chunk_text}\n"
        else:
            output_content += "No timestamp information available.\n"

        # Save the transcript
        transcript_path.parent.mkdir(parents=True, exist_ok=True)
        with open(transcript_path, 'w', encoding='utf-8') as f:
            f.write(output_content)

        end_time = time.time()
        duration = end_time - start_time
        print(f"  Finished transcription for: {audio_path.name} (took {duration:.2f}s)")
        print(f"  Saved transcript to: {transcript_path.relative_to(output_base_path)}")

        # Clean up GPU memory after processing each file
        torch.cuda.empty_cache()
        gc.collect()

        return True, duration

    except FileNotFoundError:
        print(f"  Error: Audio file not found at {audio_path}")
        return False, 0.0
    except sf.SoundFileError as e:
        print(f"  Error reading audio file {audio_path.name}: {e}")
        return False, 0.0
    except RuntimeError as e:
        if "out of memory" in str(e).lower():
            print(f"  Error: CUDA out of memory during transcription for {audio_path.name}.")
            print("    Try reducing batch size if applicable, using a smaller model, or ensuring GPU has enough VRAM.")
            # You might want to release memory explicitly here if possible
            # del outputs # Try deleting intermediate variables
            torch.cuda.empty_cache()
            gc.collect()
        else:
            print(f"  Runtime error during transcription for {audio_path.name}: {e}")
        return False, 0.0
    except Exception as e:
        print(f"  An unexpected error occurred during transcription for {audio_path.name}: {e}")
        # Print traceback for detailed debugging if needed
        # import traceback
        # traceback.print_exc()
        return False, 0.0


# --- Main Transcription Loop ---
total_audio_files = 0
successful_transcriptions = 0
failed_transcriptions = 0
total_transcription_time = 0.0

print("\n--- Starting Transcription Process ---")

# Iterate through subject folders in the output directory (where audio was saved)
for subject_dir in audio_input_base.iterdir():
    if subject_dir.is_dir():
        audio_dir = subject_dir / "audio"
        transcript_dir = transcript_output_base / subject_dir.name / "transcripts"

        if audio_dir.exists() and audio_dir.is_dir():
            print(f"\nProcessing audio in: {audio_dir.relative_to(output_base_path)}")
            # Find audio files in the current subject's audio directory
            audio_files = list(audio_dir.glob('*.wav')) # Look specifically for .wav files

            if not audio_files:
                print(f"  No .wav files found in {audio_dir}.")
                continue

            for audio_file in audio_files:
                total_audio_files += 1
                # Define the output transcript file path
                output_transcript_file = transcript_dir / f"{audio_file.stem}.txt"

                # Transcribe the audio file
                success, duration = transcribe_audio_file(audio_file, output_transcript_file, pipe, generate_kwargs)

                if success:
                    successful_transcriptions += 1
                    total_transcription_time += duration
                else:
                    failed_transcriptions += 1
        else:
             # Check if the subject dir itself contains wav files (if structure differs)
             audio_files_in_subj = list(subject_dir.glob('*.wav'))
             if not audio_files_in_subj:
                print(f"\nSkipping subject {subject_dir.name}: No 'audio' subdirectory found and no .wav files in subject root.")


print("\n--- Transcription Summary ---")
print(f"Total audio files found: {total_audio_files}")
print(f"Successfully transcribed: {successful_transcriptions}")
print(f"Failed transcriptions: {failed_transcriptions}")
if successful_transcriptions > 0:
    avg_time = total_transcription_time / successful_transcriptions
    print(f"Total transcription time: {total_transcription_time:.2f}s")
    print(f"Average time per file: {avg_time:.2f}s")


# Optional: Verify by listing some output transcript files
if successful_transcriptions > 0:
    print("\nExample output transcript files:")
    example_count = 0
    for subject_dir in transcript_output_base.iterdir():
        if subject_dir.is_dir() and (subject_dir / "transcripts").exists():
             for transcript_file in (subject_dir / "transcripts").iterdir():
                 if transcript_file.suffix == '.txt' and example_count < 5:
                     print(f"- {transcript_file.relative_to(output_base_path)}")
                     example_count += 1
             if example_count >= 5:
                 break
    if example_count == 0:
         print("Could not find any example .txt files in the output transcript directories.")


--- Starting Transcription Process ---

Processing audio in: agentic/audio
  Starting transcription for: 19853_shylaja.sharath_31_20250401121200417_Video_ENC.wav


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


  Finished transcription for: 19853_shylaja.sharath_31_20250401121200417_Video_ENC.wav (took 506.29s)
  Saved transcript to: agentic/transcripts/19853_shylaja.sharath_31_20250401121200417_Video_ENC.txt

Processing audio in: TBT/audio
  Starting transcription for: 6a_2020-09-22 09-49-32_TBTCon.wav




  Finished transcription for: 6a_2020-09-22 09-49-32_TBTCon.wav (took 283.08s)
  Saved transcript to: TBT/transcripts/6a_2020-09-22 09-49-32_TBTCon.txt
  Starting transcription for: 6b_2020-09-25 11-29-19_TBTCode 00_00_04-00_35_198.wav


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  Finished transcription for: 6b_2020-09-25 11-29-19_TBTCode 00_00_04-00_35_198.wav (took 333.53s)
  Saved transcript to: TBT/transcripts/6b_2020-09-25 11-29-19_TBTCode 00_00_04-00_35_198.txt

Processing audio in: stable_diffusion/audio
  Starting transcription for: 19853_shylaja.sharath_31_20250327084200249_Video_ENC.wav




  Finished transcription for: 19853_shylaja.sharath_31_20250327084200249_Video_ENC.wav (took 443.33s)
  Saved transcript to: stable_diffusion/transcripts/19853_shylaja.sharath_31_20250327084200249_Video_ENC.txt
  Starting transcription for: 19853_shylaja.sharath_31_20250327092700214_Video_ENC (1).wav


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


  Finished transcription for: 19853_shylaja.sharath_31_20250327092700214_Video_ENC (1).wav (took 315.05s)
  Saved transcript to: stable_diffusion/transcripts/19853_shylaja.sharath_31_20250327092700214_Video_ENC (1).txt

Processing audio in: BST/audio
  No .wav files found in /kaggle/working/output/BST/audio.

Processing audio in: Multimodal/audio
  Starting transcription for: 19853_shylaja.sharath_31_20250401112700078_Video_ENC.wav




  Finished transcription for: 19853_shylaja.sharath_31_20250401112700078_Video_ENC.wav (took 510.73s)
  Saved transcript to: Multimodal/transcripts/19853_shylaja.sharath_31_20250401112700078_Video_ENC.txt

Processing audio in: binary_tree_traversal/audio
  Starting transcription for: 5a_2020-09-15 09-04-51_BinTraversal.wav




  Finished transcription for: 5a_2020-09-15 09-04-51_BinTraversal.wav (took 385.36s)
  Saved transcript to: binary_tree_traversal/transcripts/5a_2020-09-15 09-04-51_BinTraversal.txt

Processing audio in: Heap/audio
  Starting transcription for: 8a_2020-09-24 13-07-04_HeapCon 00_00_01-00_42_121.wav




  Finished transcription for: 8a_2020-09-24 13-07-04_HeapCon 00_00_01-00_42_121.wav (took 424.95s)
  Saved transcript to: Heap/transcripts/8a_2020-09-24 13-07-04_HeapCon 00_00_01-00_42_121.txt
  Starting transcription for: 8b_2020-09-26 09-34-27_HeapCode 00_00_09-00_56_50.wav




  Finished transcription for: 8b_2020-09-26 09-34-27_HeapCode 00_00_09-00_56_50.wav (took 509.65s)
  Saved transcript to: Heap/transcripts/8b_2020-09-26 09-34-27_HeapCode 00_00_09-00_56_50.txt

Processing audio in: Lora&Qlora/audio
  Starting transcription for: 19853_shylaja.sharath_31_20250318112700094_Video_ENC (1).wav




  Finished transcription for: 19853_shylaja.sharath_31_20250318112700094_Video_ENC (1).wav (took 472.77s)
  Saved transcript to: Lora&Qlora/transcripts/19853_shylaja.sharath_31_20250318112700094_Video_ENC (1).txt


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


  Starting transcription for: 19853_shylaja.sharath_31_20250318121200085_Video_ENC (1).wav




  Finished transcription for: 19853_shylaja.sharath_31_20250318121200085_Video_ENC (1).wav (took 485.11s)
  Saved transcript to: Lora&Qlora/transcripts/19853_shylaja.sharath_31_20250318121200085_Video_ENC (1).txt
  Starting transcription for: 19853_shylaja.sharath_31_20250318125700082_Video_ENC.wav




  Finished transcription for: 19853_shylaja.sharath_31_20250318125700082_Video_ENC.wav (took 230.95s)
  Saved transcript to: Lora&Qlora/transcripts/19853_shylaja.sharath_31_20250318125700082_Video_ENC.txt

Processing audio in: expn_tree/audio
  Starting transcription for: 7b_2020-09-25 12-12-37_ExprTReeCode 00_00_09-00_26_52.wav




  Finished transcription for: 7b_2020-09-25 12-12-37_ExprTReeCode 00_00_09-00_26_52.wav (took 204.62s)
  Saved transcript to: expn_tree/transcripts/7b_2020-09-25 12-12-37_ExprTReeCode 00_00_09-00_26_52.txt
  Starting transcription for: 7a_2020-09-24 09-28-52_ExprTreeCon.wav




  Finished transcription for: 7a_2020-09-24 09-28-52_ExprTreeCon.wav (took 361.37s)
  Saved transcript to: expn_tree/transcripts/7a_2020-09-24 09-28-52_ExprTreeCon.txt

Processing audio in: Tree_traversal/audio
  Starting transcription for: 9a_2020-09-16 09-46-13_TreeTravCon.wav




  Finished transcription for: 9a_2020-09-16 09-46-13_TreeTravCon.wav (took 211.30s)
  Saved transcript to: Tree_traversal/transcripts/9a_2020-09-16 09-46-13_TreeTravCon.txt

--- Transcription Summary ---
Total audio files found: 15
Successfully transcribed: 15
Failed transcriptions: 0
Total transcription time: 5678.11s
Average time per file: 378.54s

Example output transcript files:
- agentic/transcripts/19853_shylaja.sharath_31_20250401121200417_Video_ENC.txt
- TBT/transcripts/6b_2020-09-25 11-29-19_TBTCode 00_00_04-00_35_198.txt
- TBT/transcripts/6a_2020-09-22 09-49-32_TBTCon.txt
- stable_diffusion/transcripts/19853_shylaja.sharath_31_20250327084200249_Video_ENC.txt
- stable_diffusion/transcripts/19853_shylaja.sharath_31_20250327092700214_Video_ENC (1).txt


Step 6: PowerPoint / PDF Processing Component

In [7]:
# Step 6: Presentation Content Extraction (PPTX and PDF)

import os
import subprocess
import json
from pathlib import Path
import gc

# 1. Install necessary libraries
print("Installing libraries for presentation processing: python-pptx, pdfplumber...")
# Pillow is often needed by pdfplumber for image handling, install explicitly
install_pres_libs = subprocess.run([
    'pip', 'install', '-q',
    'python-pptx>=0.6.21',
    'pdfplumber>=0.10.0', # Use a recent version
    'Pillow'
], capture_output=True, text=True)

if install_pres_libs.returncode == 0:
    print("Libraries installed successfully.")
else:
    print("Error installing presentation processing libraries:")
    print(install_pres_libs.stderr)
    # Consider raising an error if libraries are essential
    # raise RuntimeError("Failed to install required presentation libraries.")

# Import AFTER installation
try:
    import pptx
    import pdfplumber
except ImportError as e:
    print(f"Failed to import libraries: {e}. Please ensure installation was successful.")
    # Stop execution if imports fail
    raise

# Define base paths (original data and output)
# dataset_base_path = Path("/kaggle/input/llm-data/LLM_DATASET") # From Step 1
# output_base_path = Path("/kaggle/working/output") # From Step 1

# Supported presentation extensions
presentation_extensions = ['.pptx', '.pdf']

# --- PPTX Processing Function ---
def process_pptx(pptx_path):
    """Extracts content (titles, text, notes) from a PPTX file."""
    print(f"  Processing PPTX: {pptx_path.name}")
    presentation_data = {
        "source_file": pptx_path.name,
        "file_type": "pptx",
        "slides": []
    }
    try:
        prs = pptx.Presentation(str(pptx_path))
        for i, slide in enumerate(prs.slides):
            slide_data = {
                "slide_number": i + 1,
                "title": None,
                "content": [],
                "notes": None
            }

            # Extract notes if available
            if slide.has_notes_slide:
                notes_slide = slide.notes_slide
                text_frame = notes_slide.notes_text_frame
                if text_frame:
                  slide_data["notes"] = text_frame.text.strip() if text_frame.text else None


            # Extract shapes' text, trying to identify title and content
            title_shape = None
            # Attempt to find title placeholder first
            if slide.shapes.title:
                title_shape = slide.shapes.title
                if title_shape.has_text_frame and title_shape.text.strip():
                   slide_data["title"] = title_shape.text.strip()


            for shape in slide.shapes:
                # Skip the title shape if already processed
                if shape == title_shape:
                    continue

                if shape.has_text_frame:
                    text = shape.text_frame.text.strip()
                    if text:
                       # Simple heuristic: If title not found and shape is high up, maybe it's the title
                       if slide_data["title"] is None and shape.top < pptx.util.Inches(1.5):
                           slide_data["title"] = text
                       else:
                           # Add paragraphs, preserving some indentation sense for lists
                           for paragraph in shape.text_frame.paragraphs:
                               para_text = paragraph.text.strip()
                               if para_text:
                                   indent_level = paragraph.level
                                   prefix = "  " * indent_level + "- " if indent_level > 0 else ""
                                   slide_data["content"].append(prefix + para_text)


            # Fallback if title is still None, use first non-empty content line?
            if slide_data["title"] is None and slide_data["content"]:
                # Check if first line looks like a title (e.g., short, no bullet)
                first_line = slide_data["content"][0]
                if not first_line.strip().startswith('-'):
                     slide_data["title"] = first_line
                     # slide_data["content"].pop(0) # Optional: remove it from content if used as title

            # Clean up empty entries
            if not slide_data["content"]: slide_data["content"] = None
            if not slide_data["notes"]: slide_data["notes"] = None

            presentation_data["slides"].append(slide_data)

        return presentation_data

    except Exception as e:
        print(f"  Error processing {pptx_path.name}: {e}")
        # Optionally return partial data or error indicator
        # import traceback
        # traceback.print_exc()
        return {"source_file": pptx_path.name, "file_type": "pptx", "error": str(e)}


# --- PDF Processing Function ---
def process_pdf(pdf_path):
    """Extracts text content page by page from a PDF file."""
    print(f"  Processing PDF: {pdf_path.name}")
    presentation_data = {
        "source_file": pdf_path.name,
        "file_type": "pdf",
        "pages": []
    }
    try:
        with pdfplumber.open(str(pdf_path)) as pdf:
            for i, page in enumerate(pdf.pages):
                # Extract text with some tolerance to maintain layout
                # Adjust tolerance as needed based on PDF structure
                text = page.extract_text(x_tolerance=2, y_tolerance=3, layout=False, sort_by_position=True)
                page_data = {
                    "page_number": i + 1,
                    "text": text.strip() if text else None
                }
                if page_data["text"]: # Only add pages with actual text content
                   presentation_data["pages"].append(page_data)

        return presentation_data
    except Exception as e:
        print(f"  Error processing {pdf_path.name}: {e}")
        # import traceback
        # traceback.print_exc()
        return {"source_file": pdf_path.name, "file_type": "pdf", "error": str(e)}


# --- Main Processing Loop ---
total_presentations = 0
processed_presentations = 0
failed_presentations = 0

print("\n--- Starting Presentation File Processing ---")

# Iterate through subject folders in the ORIGINAL dataset directory
for subject_dir in dataset_base_path.iterdir():
    if subject_dir.is_dir():
        print(f"\nProcessing subject: {subject_dir.name}")

        # Create corresponding output directory structure for presentation data
        output_subject_pres_dir = output_base_path / subject_dir.name / "presentation_data"
        output_subject_pres_dir.mkdir(parents=True, exist_ok=True)

        # Find presentation files in the current subject directory
        presentation_files = [f for f in subject_dir.iterdir()
                              if f.is_file() and f.suffix.lower() in presentation_extensions]

        if not presentation_files:
            print(f"  No presentation files ({', '.join(presentation_extensions)}) found in {subject_dir.name}.")
            continue

        for pres_file in presentation_files:
            total_presentations += 1
            output_json_path = output_subject_pres_dir / f"{pres_file.stem}_content.json"
            extracted_data = None

            if pres_file.suffix.lower() == '.pptx':
                extracted_data = process_pptx(pres_file)
            elif pres_file.suffix.lower() == '.pdf':
                extracted_data = process_pdf(pres_file)

            if extracted_data and "error" not in extracted_data:
                try:
                    with open(output_json_path, 'w', encoding='utf-8') as f:
                        json.dump(extracted_data, f, indent=2, ensure_ascii=False)
                    print(f"  Saved extracted data to: {output_json_path.relative_to(output_base_path)}")
                    processed_presentations += 1
                except Exception as e:
                    print(f"  Error saving JSON for {pres_file.name}: {e}")
                    failed_presentations += 1
            else:
                print(f"  Failed to extract data from {pres_file.name}.")
                # Check if partial data with error was returned
                if extracted_data and "error" in extracted_data:
                    print(f"  Reason: {extracted_data.get('error', 'Unknown error')}")
                failed_presentations += 1

            # Clean up memory
            gc.collect()


print("\n--- Presentation Processing Summary ---")
print(f"Total presentation files found: {total_presentations}")
print(f"Successfully processed and saved: {processed_presentations}")
print(f"Failed processing/saving: {failed_presentations}")

# Optional: Verify by listing some output JSON files
if processed_presentations > 0:
    print("\nExample output JSON files:")
    example_count = 0
    for subject_dir in output_base_path.iterdir():
        pres_data_dir = subject_dir / "presentation_data"
        if subject_dir.is_dir() and pres_data_dir.exists():
             for json_file in pres_data_dir.iterdir():
                 if json_file.suffix == '.json' and example_count < 5:
                     print(f"- {json_file.relative_to(output_base_path)}")
                     example_count += 1
             if example_count >= 5:
                 break
    if example_count == 0:
         print("Could not find any example .json files in the output presentation_data directories.")

Installing libraries for presentation processing: python-pptx, pdfplumber...
Libraries installed successfully.

--- Starting Presentation File Processing ---

Processing subject: stable_diffusion
  Processing PDF: Stable Diffusion.pdf
  Saved extracted data to: stable_diffusion/presentation_data/Stable Diffusion_content.json

Processing subject: Lora&Qlora
  Processing PDF: Finetuning.pdf
  Saved extracted data to: Lora&Qlora/presentation_data/Finetuning_content.json

Processing subject: expn_tree
  Processing PPTX: Class7_Unit3_Trees_ExprTree.pptx
  Saved extracted data to: expn_tree/presentation_data/Class7_Unit3_Trees_ExprTree_content.json

Processing subject: agentic
  Processing PDF: AutoGen CrewAI.pdf
  Saved extracted data to: agentic/presentation_data/AutoGen CrewAI_content.json
  Processing PDF: Agentic Workflow.pdf
  Saved extracted data to: agentic/presentation_data/Agentic Workflow_content.json

Processing subject: Heap
  Processing PPTX: Class8_Unit3_Trees_Heap.pptx
  Save

Step 7: Content Integration and Note Generation with LLMs

Step 7.1: Install Groq Library and Initialize Client

In [27]:
# Step 7.1 (Modified - Revert to Llama 3 8B): Install Groq Library and Initialize Client

import os
import subprocess
import json
from pathlib import Path
import time
import gc

# 1. Install Groq Python library (run again in case kernel restarted)
print("Ensuring Groq library is installed...")
install_groq_lib = subprocess.run(['pip', 'install', '-q', 'groq>=0.4.0'], capture_output=True, text=True)

if install_groq_lib.returncode == 0:
    print("Groq library installed/verified successfully.")
else:
    print("Error installing Groq library:")
    print(install_groq_lib.stderr)
    raise RuntimeError("Failed to install Groq library.")

# Import AFTER installation
try:
    from groq import Groq, RateLimitError, APIError
    # Using tiktoken to estimate token count for chunking
    # Install it if not already present
    try:
        import tiktoken
    except ImportError:
        print("Installing tiktoken library for token counting...")
        subprocess.run(['pip', 'install', '-q', 'tiktoken'], check=True)
        import tiktoken
    print("Tiktoken library available.")

    from kaggle_secrets import UserSecretsClient
except ImportError as e:
    print(f"Failed to import libraries: {e}. Please ensure installation was successful.")
    raise

# 2. Access API Key from Kaggle Secrets and Initialize Client
groq_api_key = None
client = None
try:
    user_secrets = UserSecretsClient()
    groq_api_key = user_secrets.get_secret("GROQ_API_KEY")

    if not groq_api_key:
        print("ERROR: Groq API Key not found in Kaggle Secrets.")
        print("Please ensure you added a secret with the label 'GROQ_API_KEY'.")
    else:
        print("Groq API Key retrieved successfully.")
        client = Groq(api_key=groq_api_key)
        print("Groq client initialized.")

except Exception as e:
    print(f"Error accessing Kaggle Secrets or initializing Groq client: {e}")
    print("Ensure Kaggle Secrets are properly configured and the key label is 'GROQ_API_KEY'.")

# 3. Define LLM Model to use - REVERTING TO LLAMA 3 8B (8k Context)
# *********************************************************************
# Reverting to llama3-8b-8192 as large context models failed.
# We will implement chunking.
llm_model_name = "llama3-8b-8192"
# *********************************************************************
print(f"Using LLM model: {llm_model_name} (8k context)")

# --- Placeholder for Note Generation Loop (Next Part) ---
if client:
    print(f"\nSetup complete. Ready for note generation with {llm_model_name} and chunking.")
else:
    print("\nSetup failed. Cannot proceed with note generation.")

Ensuring Groq library is installed...
Groq library installed/verified successfully.
Tiktoken library available.
Groq API Key retrieved successfully.
Groq client initialized.
Using LLM model: llama3-8b-8192 (8k context)

Setup complete. Ready for note generation with llama3-8b-8192 and chunking.


Step 7.2: Generating Lecture Notes using LLM

In [28]:
# Step 7.2 - Cell 1 (Modified for Chunking): Helper Functions, Chunking, Prompt

import json
from pathlib import Path
import time
import gc
import tiktoken # For token counting / chunking estimate
# Requires Groq, RateLimitError, APIError from groq (imported in 7.1)
# Assumes 'client', 'llm_model_name', 'output_base_path' are defined from 7.1

# --- Tokenizer for estimating length ---
# Use a tokenizer appropriate for Llama 3 models (cl100k_base is common)
try:
    tokenizer = tiktoken.get_encoding("cl100k_base")
except Exception as e:
     print(f"Warning: Could not get tiktoken encoder 'cl100k_base'. Using basic split for chunking. Error: {e}")
     tokenizer = None

# --- Helper Function for Text Chunking ---
def chunk_text(text, max_tokens_per_chunk=2000, overlap_tokens=200):
    """Splits text into chunks based on estimated token count with overlap."""
    if not tokenizer:
        # Fallback to character-based chunking if tokenizer failed
        print("Warning: Using character-based chunking due to tokenizer issue.")
        max_len = max_tokens_per_chunk * 4 # Rough estimate: 4 chars per token
        overlap_len = overlap_tokens * 4
        chunks = []
        start = 0
        while start < len(text):
            end = min(start + max_len, len(text))
            chunks.append(text[start:end])
            start += max_len - overlap_len
            if start >= len(text): break # Avoid infinite loop on short overlap
        return chunks

    # Token-based chunking
    tokens = tokenizer.encode(text)
    chunks = []
    start_token = 0
    while start_token < len(tokens):
        end_token = min(start_token + max_tokens_per_chunk, len(tokens))
        # Decode the chunk tokens back to text
        chunk_text = tokenizer.decode(tokens[start_token:end_token])
        chunks.append(chunk_text)

        next_start_token = start_token + max_tokens_per_chunk - overlap_tokens
        # Ensure we don't get stuck if overlap is too large or chunk is small
        if next_start_token <= start_token:
             next_start_token = start_token + 1 # Force progression

        start_token = next_start_token
        if start_token >= len(tokens): break

    return chunks


# --- Helper Function to Format Presentation JSON ---
# (Keep the format_presentation_data function exactly as it was before)
def format_presentation_data(data):
    """Converts presentation JSON data into a readable string for the LLM."""
    if not data or ("error" in data):
        return "No presentation data available or there was an error processing it.\n"

    output = f"--- Presentation Content ({data['file_type']}: {data['source_file']}) ---\n\n"
    if data['file_type'] == 'pptx':
        if not data.get('slides'):
             return output + "No slides found or extracted.\n"
        for slide in data['slides']:
            output += f"## Slide {slide['slide_number']}\n"
            if slide.get('title'):
                output += f"### Title: {slide['title']}\n"
            if slide.get('content'):
                output += "Content:\n"
                for line in slide['content']:
                     output += f"{line}\n"
            if slide.get('notes'):
                output += f"Presenter Notes: {slide['notes']}\n"
            output += "\n"
    elif data['file_type'] == 'pdf':
        if not data.get('pages'):
            return output + "No pages found or extracted.\n"
        for page in data['pages']:
            output += f"## Page {page['page_number']}\n"
            if page.get('text'):
                output += f"Text:\n{page['text']}\n"
            output += "\n"
    else:
        output += "Unknown presentation format.\n"

    output += "--- End of Presentation Content ---\n"
    return output


# --- LLM Prompt Template (Slightly Modified for Chunks) ---
# Added mention that it's a segment/chunk
prompt_template = """
You are an expert AI assistant tasked with creating comprehensive, structured lecture notes *for a specific segment* of a lecture.
You will be given a transcript segment from the spoken lecture and the content of the *entire* accompanying presentation (slides/PDF).
Your goal is to synthesize information from BOTH sources to generate high-quality notes *focused on the content covered in the provided transcript segment*, using the presentation for context and structure.

**Instructions:**

1.  **Analyze Both Inputs:** Carefully read the provided Lecture Transcript Segment and the full Presentation Content.
2.  **Focus on the Segment:** Generate notes *primarily* based on the information discussed in the **Lecture Transcript Segment**.
3.  **Use Presentation for Context:** Refer to the full Presentation Content to understand the structure, headings, and visual information relevant to the current transcript segment. Integrate points from the slides if they are clearly discussed or referenced *within the segment*.
4.  **Synthesize Information:** Combine relevant information. Don't just copy; integrate the ideas. Enrich presentation points with details, explanations, and examples mentioned *in the transcript segment*.
5.  **Structure the Notes:** Organize the notes logically using Markdown formatting:
    * Use headings (`##`, `###`) for main topics and sub-topics *relevant to this segment*.
    * Use bullet points (`-` or `*`) for key details, definitions, and examples *from this segment*.
    * Use bold text (`**text**`) to highlight important keywords, concepts, or definitions *mentioned in this segment*.
    * Ensure clear paragraph breaks for readability.
6.  **Output Format:** Produce **Markdown (.md)** formatted notes for *this segment only*. Do not include any introductory or concluding remarks like "Here are the notes:" or "I hope this helps.". Start directly with the first note heading relevant to the segment. If the segment is very short or contains no substantive information (e.g., only greetings or silence), output only `# [Segment Contains No Substantive Content]`.

**Input Data:**

{presentation_content}

**Lecture Transcript Segment:**

```text
{transcript_chunk}

Generated Lecture Notes for this Segment (Markdown Format):
"""

# --- Function to call LLM with retry for rate limits ---
# (Ensure tokenizer is available in the global scope if used here)
def generate_notes_with_groq(prompt, max_retries=2, initial_delay=5):
    """Calls the Groq API to generate notes, handles rate limits with retry."""
    global client, llm_model_name, tokenizer # Use global client, model name, and tokenizer
    if not client:
        print("  Error: Groq client not initialized.")
        return None

    delay = initial_delay
    for attempt in range(max_retries + 1):
        try:
            # Estimate prompt tokens BEFORE sending (optional but good practice)
            prompt_tokens = 0
            if tokenizer:
                try:
                    prompt_tokens = len(tokenizer.encode(prompt))
                    # print(f"    Estimated prompt tokens: {prompt_tokens}") # Debug print
                    # Check against model limit (leave room for completion tokens)
                    # Llama3 8k limit (8192). Let's set a safer threshold like 7000.
                    token_threshold = 7000
                    if prompt_tokens > token_threshold:
                        print(f"  Error: Estimated prompt tokens ({prompt_tokens}) exceed safety margin ({token_threshold}) for {llm_model_name}. Skipping API call.")
                        return "## Error: Content Too Long\n\nThe text chunk combined with presentation data exceeded the safe input length for this model. Notes could not be generated for this segment."
                except Exception as e:
                    print(f"  Warning: Token estimation failed. Proceeding without check. Error: {e}")

            # Actual API Call
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=llm_model_name,
                temperature=0.5,
            )

            # Process Response
            if chat_completion.choices and chat_completion.choices[0].message:
                response_content = chat_completion.choices[0].message.content
                if response_content and isinstance(response_content, str) and response_content.strip():
                    # Success case
                    return response_content
                else:
                    # Handle empty response from LLM
                    print(f"  Warning: Received empty content from LLM (Attempt {attempt + 1}/{max_retries + 1}).")
                    if attempt < max_retries:
                        print(f"   Retrying in {delay} seconds...")
                        time.sleep(delay)
                        delay *= 2
                        continue # Go to next attempt
                    else:
                        return None # Failed after retries
            else:
                # Handle invalid response structure
                print(f"  Warning: Received invalid response structure from LLM (Attempt {attempt + 1}/{max_retries + 1}).")
                if attempt < max_retries:
                    print(f"   Retrying in {delay} seconds...")
                    time.sleep(delay)
                    delay *= 2
                    continue # Go to next attempt
                else:
                    return None # Failed after retries

        except RateLimitError as e:
            # Handle rate limit errors
            if attempt < max_retries:
                print(f"  Warning: Rate limit hit (Attempt {attempt + 1}/{max_retries + 1}). Retrying in {delay} seconds...")
                time.sleep(delay)
                delay *= 2 # Exponential backoff
            else:
                print(f"  Error: Rate limit exceeded after {max_retries} retries. {e}")
                return None # Failed after retries
        except APIError as e:
            # Handle API errors (including context length)
            error_message = str(e).lower()
            if "request too large" in error_message or \
               "context_length_exceeded" in error_message or \
               "maximum context length" in error_message or \
               "prompt is too long" in error_message:
                print(f"  Error: Input context length exceeded for model {llm_model_name}. Prompt is too long.")
                # Check if the error occurred despite passing the initial token check
                if prompt_tokens <= token_threshold:
                     print(f"    Note: Context error occurred even though estimated tokens ({prompt_tokens}) were below threshold ({token_threshold}). Tokenizer mismatch or API limit issue?")
                return "## Error: Content Too Long\n\nThe combined transcript and presentation content exceeded the maximum length for this model. Notes could not be generated for this segment."
            else:
                # Handle other API errors
                print(f"  Error: Groq API error (Attempt {attempt + 1}/{max_retries + 1}): {e}")
                # Consider retrying certain API errors? For now, fail immediately.
                return None
        except Exception as e:
            # Handle unexpected errors during the call
            print(f"  Error: An unexpected error occurred during LLM call (Attempt {attempt + 1}/{max_retries + 1}): {e}")
            # import traceback # Uncomment for detailed debugging
            # traceback.print_exc()
            if attempt < max_retries:
                 print(f"   Retrying in {delay} seconds...")
                 time.sleep(delay)
                 delay *= 2
                 continue # Go to next attempt
            else:
                 return None # Failed after retries

    # Fallback if all retries fail
    return None

# This print statement should be AFTER the function definition in your cell
print("Helper functions (incl. chunking), updated prompt template, and LLM caller defined.")


Helper functions (incl. chunking), updated prompt template, and LLM caller defined.


In [29]:
# Step 7.2 - Cell 2 (Modified for Chunking): Main Note Generation Loop

# Initialize counters
total_transcripts_processed_for_notes = 0
successful_notes_files = 0
failed_notes_files = 0
# Note: Counters now track *files* processed, success/failure determined by processing *all* chunks for a file.

print("\n--- Starting Lecture Note Generation (with Chunking) ---")

# Iterate through subject folders in the output directory
for subject_dir in output_base_path.iterdir():
    if not subject_dir.is_dir():
        continue

    print(f"\nProcessing subject: {subject_dir.name}")

    # --- Load Presentation Data ---
    presentation_data_dir = subject_dir / "presentation_data"
    presentation_json_files = list(presentation_data_dir.glob('*.json'))
    if not presentation_json_files:
        print(f"  Skipping subject {subject_dir.name}: No presentation JSON file found.")
        continue
    presentation_json_path = presentation_json_files[0]
    print(f"  Using presentation data: {presentation_json_path.name}")
    try:
        with open(presentation_json_path, 'r', encoding='utf-8') as f:
            presentation_data = json.load(f)
        # Use helper function from Cell 1
        formatted_presentation = format_presentation_data(presentation_data)
        if "error processing it" in formatted_presentation:
             print(f"  Warning: Issue noted during formatting presentation data for {subject_dir.name}.")
    except Exception as e:
        print(f"  Error loading or formatting presentation JSON {presentation_json_path.name}: {e}")
        continue # Skip subject if presentation can't be loaded/formatted

    # --- Find Transcript Files ---
    transcript_dir = subject_dir / "transcripts"
    transcript_files = list(transcript_dir.glob('*.txt'))
    if not transcript_files:
        print(f"  No transcript files found in {transcript_dir}.")
        continue # Move to next subject

    # --- Create Output Directory ---
    output_notes_dir = subject_dir / "final_notes"
    output_notes_dir.mkdir(parents=True, exist_ok=True)

    # --- Process Each Transcript File (with Chunking) ---
    for transcript_path in transcript_files:
        total_transcripts_processed_for_notes += 1
        print(f"\n  Processing transcript file: {transcript_path.name}")
        output_note_path = output_notes_dir / f"{transcript_path.stem}_notes.md"
        all_chunk_notes = [] # Store notes generated for each chunk
        file_had_errors = False # Flag to track if any chunk failed for this file

        # --- Load Transcript Text ---
        try:
            with open(transcript_path, 'r', encoding='utf-8') as f:
                full_content = f.read()
                transcript_header = "--- Full Transcript ---"
                header_index = full_content.find(transcript_header)
                if header_index != -1:
                    text_start_index = full_content.find('\n', header_index) + 1
                    timestamp_marker = "\n\n--- Timestamps"
                    timestamp_index = full_content.find(timestamp_marker, text_start_index)
                    transcript_text = full_content[text_start_index:timestamp_index].strip() if timestamp_index != -1 else full_content[text_start_index:].strip()
                else:
                    transcript_text = full_content.strip()

            if not transcript_text:
                 print("    Warning: Transcript file is empty. Skipping.")
                 failed_notes_files += 1 # Count as failed file
                 continue # Move to next transcript file

        except Exception as e:
            print(f"    Error loading transcript {transcript_path.name}: {e}")
            failed_notes_files += 1 # Count as failed file
            continue # Move to next transcript file

        # --- Chunk the Transcript ---
        # Use chunk_text function defined in Cell 1
        # Adjust chunk/overlap tokens as needed. Smaller chunks = more API calls but less likely to hit context limit.
        # Let's aim for ~3000 tokens combined (prompt+chunk+presentation), so chunk ~1500-2000 tokens?
        transcript_chunks = chunk_text(transcript_text, max_tokens_per_chunk=1800, overlap_tokens=150)
        print(f"    Transcript split into {len(transcript_chunks)} chunks.")

        # --- Process Each Chunk ---
        for i, chunk in enumerate(transcript_chunks):
            print(f"      Processing chunk {i + 1}/{len(transcript_chunks)}...")
            if not chunk.strip():
                 print("      Skipping empty chunk.")
                 continue

            # Construct the final prompt for this chunk
            final_prompt = prompt_template.format(
                presentation_content=formatted_presentation,
                transcript_chunk=chunk # Use the current chunk
            )

            # Call LLM for this chunk
            # Use generate_notes_with_groq function from Cell 1
            start_time = time.time()
            generated_notes_for_chunk = generate_notes_with_groq(final_prompt)
            end_time = time.time()

            if generated_notes_for_chunk:
                print(f"        LLM call for chunk {i + 1} successful (took {end_time - start_time:.2f}s).")
                all_chunk_notes.append(generated_notes_for_chunk)
                # Check if the content indicates an error message was returned
                if generated_notes_for_chunk.strip().startswith("## Error:"):
                    print(f"        NOTE: LLM returned an error message for chunk {i+1}: {generated_notes_for_chunk.splitlines()[0]}")
                    file_had_errors = True # Mark file as having errors even if API call itself succeeded
            else:
                print(f"        LLM call failed or returned empty response for chunk {i + 1}.")
                # Append an error marker to the notes for this chunk
                all_chunk_notes.append(f"\n\n## Error: Failed to generate notes for this segment (Chunk {i+1}).\n\n")
                file_had_errors = True # Mark file as having errors

            # Clean up memory (might be excessive here, but safe)
            gc.collect()
            # Optional delay between chunks if needed for rate limits
            # time.sleep(1)

        # --- Combine Chunk Notes and Save File ---
        print(f"    Finished processing all chunks for {transcript_path.name}.")
        if all_chunk_notes:
            # Combine notes from all chunks with separators
            final_notes_content = "\n\n---\n\n".join(all_chunk_notes) # Add separator between chunks

            # Add a header indicating the source file
            final_output = f"# Lecture Notes: {transcript_path.stem}\n\n"
            if file_had_errors:
                final_output += "**Note:** Errors were encountered during the generation of some segments below.\n\n---\n\n"
            final_output += final_notes_content

            try:
                with open(output_note_path, 'w', encoding='utf-8') as f:
                    f.write(final_output)
                print(f"    Saved combined notes to: {output_note_path.relative_to(output_base_path)}")
                if not file_had_errors:
                    successful_notes_files += 1
                else:
                    failed_notes_files += 1 # Count as failed if any chunk had errors
            except Exception as e:
                print(f"    Error saving combined notes to {output_note_path.name}: {e}")
                failed_notes_files += 1 # Count as failed if saving fails
        else:
            print(f"    No notes were generated for any chunk of {transcript_path.name}.")
            failed_notes_files += 1 # Count as failed if no chunks produced output


print("\n--- Note Generation Loop (with Chunking) Finished ---")


--- Starting Lecture Note Generation (with Chunking) ---

Processing subject: agentic
  Using presentation data: Agentic Workflow_content.json

  Processing transcript file: 19853_shylaja.sharath_31_20250401121200417_Video_ENC.txt
    Transcript split into 6 chunks.
      Processing chunk 1/6...
        LLM call for chunk 1 successful (took 1.40s).
      Processing chunk 2/6...
        LLM call for chunk 2 successful (took 49.33s).
      Processing chunk 3/6...


KeyboardInterrupt: 

In [26]:
# Step 7.2 - Cell 3: Summary and Verification

print("\n--- Note Generation Summary ---")
# These counters were updated in Cell 2
print(f"Total transcripts found for processing: {total_transcripts_processed}")
print(f"Successfully generated notes: {successful_notes}")
print(f"Failed note generations (LLM errors, load/save errors, content too long): {failed_notes}")
# Refine skipped count logic if necessary - current counter is basic
# print(f"Subjects/Transcripts skipped due to missing inputs: {skipped_due_to_missing_data}")

# Calculate transcripts skipped (Total - Success - Failed) if needed, assumes total_transcripts_processed is accurate
processed_or_failed = successful_notes + failed_notes
skipped_transcripts = total_transcripts_processed - processed_or_failed
if skipped_transcripts < 0 : skipped_transcripts = 0 # Sanity check
print(f"Transcripts skipped or not processed (e.g., empty, load errors before LLM call): {skipped_transcripts}")


# Optional: Verify by listing some output note files
if successful_notes > 0:
    print("\nExample output note files (.md):")
    example_count = 0
    for subject_dir in output_base_path.iterdir():
        notes_data_dir = subject_dir / "final_notes"
        if subject_dir.is_dir() and notes_data_dir.exists():
             for note_file in notes_data_dir.iterdir():
                 if note_file.suffix == '.md' and example_count < 5:
                     print(f"- {note_file.relative_to(output_base_path)}")
                     example_count += 1
             if example_count >= 5:
                 break
    if example_count == 0:
         print("Could not find any example .md files in the output final_notes directories.")


--- Note Generation Summary ---
Total transcripts found for processing: 15
Successfully generated notes: 1
Failed note generations (LLM errors, load/save errors, content too long): 14
Transcripts skipped or not processed (e.g., empty, load errors before LLM call): 0

Example output note files (.md):
- agentic/final_notes/19853_shylaja.sharath_31_20250401121200417_Video_ENC_notes.md
- TBT/final_notes/6a_2020-09-22 09-49-32_TBTCon_notes.md
- TBT/final_notes/6b_2020-09-25 11-29-19_TBTCode 00_00_04-00_35_198_notes.md
- stable_diffusion/final_notes/19853_shylaja.sharath_31_20250327092700214_Video_ENC (1)_notes.md
- stable_diffusion/final_notes/19853_shylaja.sharath_31_20250327084200249_Video_ENC_notes.md
