In [1]:
"""
==============================================================================
YOUTUBE VIDEO SUMMARIZER PROJECT
==============================================================================
Name: Kumar Jeet
Date: December 24, 2025
Description: Automated YouTube video transcription and summarization using
             Whisper (speech-to-text) and BART (text summarization)
==============================================================================
"""

# ========== STEP 1: CREATE PROJECT STRUCTURE ==========
print("\n" + "="*70)
print("üìÅ STEP 1: CREATING PROJECT STRUCTURE")
print("="*70 + "\n")

import os

# Define complete project folder structure
project_structure = {
    'youtube_summarizer': {
        'models': [],              # Store downloaded AI models
        'outputs': {
            'audio': [],           # Downloaded audio files from YouTube
            'transcripts': [],     # Whisper transcription outputs
            'summaries': []        # Final BART summaries
        },
        'templates': [],           # Flask HTML templates
        'static': {
            'css': [],             # Stylesheets
            'js': []               # JavaScript files
        },
        'utils': []                # Helper modules (downloader, transcriber, summarizer)
    }
}

def create_structure(base_path, structure):
    """
    Build a directory tree on disk from a nested mapping.

    Every key is a folder name created beneath ``base_path``. A dict value
    describes that folder's sub-folders and is walked recursively; a list
    value marks a folder with nothing further to create. Entries whose value
    is neither a dict nor a list are skipped without creating anything.

    Args:
        base_path (str): Directory under which the folders are created
        structure (dict): Folder names mapped to a nested dict or a list
    """
    for name in structure:
        path = os.path.join(base_path, name)
        content = structure[name]

        has_children = isinstance(content, dict)
        if not (has_children or isinstance(content, list)):
            # Not a folder description - nothing to create for this entry
            continue

        # exist_ok makes re-running the cell harmless
        os.makedirs(path, exist_ok=True)
        print(f"  ‚úÖ Created: {path}")

        if has_children:
            create_structure(path, content)

# Create project in /content/ (Google Colab workspace)
base_dir = '/content/youtube_summarizer'
create_structure('/content', project_structure)

print(f"\n‚úÖ Project structure created at: {base_dir}")
print("\nüìÇ Folder tree:")
print("""
youtube_summarizer/
‚îú‚îÄ‚îÄ models/                  # AI model storage
‚îú‚îÄ‚îÄ outputs/
‚îÇ   ‚îú‚îÄ‚îÄ audio/              # Downloaded MP3 files
‚îÇ   ‚îú‚îÄ‚îÄ transcripts/        # Text transcripts
‚îÇ   ‚îî‚îÄ‚îÄ summaries/          # Final summaries
‚îú‚îÄ‚îÄ templates/              # HTML templates
‚îú‚îÄ‚îÄ static/
‚îÇ   ‚îú‚îÄ‚îÄ css/               # Stylesheets
‚îÇ   ‚îî‚îÄ‚îÄ js/                # JavaScript
‚îî‚îÄ‚îÄ utils/                  # Python modules
""")

print("="*70)
print("‚úÖ STEP 1 COMPLETE!")
print("="*70)



üìÅ STEP 1: CREATING PROJECT STRUCTURE

  ‚úÖ Created: /content/youtube_summarizer
  ‚úÖ Created: /content/youtube_summarizer/models
  ‚úÖ Created: /content/youtube_summarizer/outputs
  ‚úÖ Created: /content/youtube_summarizer/outputs/audio
  ‚úÖ Created: /content/youtube_summarizer/outputs/transcripts
  ‚úÖ Created: /content/youtube_summarizer/outputs/summaries
  ‚úÖ Created: /content/youtube_summarizer/templates
  ‚úÖ Created: /content/youtube_summarizer/static
  ‚úÖ Created: /content/youtube_summarizer/static/css
  ‚úÖ Created: /content/youtube_summarizer/static/js
  ‚úÖ Created: /content/youtube_summarizer/utils

‚úÖ Project structure created at: /content/youtube_summarizer

üìÇ Folder tree:

youtube_summarizer/
‚îú‚îÄ‚îÄ models/                  # AI model storage
‚îú‚îÄ‚îÄ outputs/
‚îÇ   ‚îú‚îÄ‚îÄ audio/              # Downloaded MP3 files
‚îÇ   ‚îú‚îÄ‚îÄ transcripts/        # Text transcripts
‚îÇ   ‚îî‚îÄ‚îÄ summaries/          # Final summaries
‚îú‚îÄ‚îÄ templates/          

In [2]:
# ========== STEP 2: INSTALL ALL DEPENDENCIES ==========
print("\n" + "="*70)
print("üì¶ STEP 2: INSTALLING DEPENDENCIES")
print("="*70 + "\n")

# Suppress unnecessary warnings during installation
import warnings
warnings.filterwarnings('ignore')

print("üîÑ Upgrading pip...")
!pip install --upgrade pip --quiet

print("\nüì• Installing core libraries...\n")

# ------------------------------------------------------------------
# Web Framework
# ------------------------------------------------------------------
print("  üåê Flask & Flask-CORS (web application)")
!pip install flask flask-cors --quiet

# ------------------------------------------------------------------
# YouTube Downloader
# ------------------------------------------------------------------
print("  üìπ yt-dlp (YouTube downloader)")
!pip install yt-dlp --quiet

# ------------------------------------------------------------------
# FFmpeg (Audio processing - required by yt-dlp)
# ------------------------------------------------------------------
print("  üéµ FFmpeg (audio format conversion)")
!apt-get update -qq
!apt-get install -y ffmpeg -qq

# ------------------------------------------------------------------
# AI/ML Libraries
# ------------------------------------------------------------------
print("  ü§ñ OpenAI Whisper (speech-to-text)")
!pip install openai-whisper --quiet

print("  üß† Transformers + PyTorch (BART summarization)")
!pip install transformers torch torchvision --quiet

print("  üìù SentencePiece (tokenization)")
!pip install sentencepiece --quiet

print("  ‚ö° Accelerate (model optimization)")
!pip install accelerate --quiet

# ------------------------------------------------------------------
# Deployment
# ------------------------------------------------------------------
print("  üåç Pyngrok (public URL for Flask)")
!pip install pyngrok --quiet

print("\n" + "="*70)
print("‚úÖ ALL DEPENDENCIES INSTALLED SUCCESSFULLY!")
print("="*70 + "\n")

# ------------------------------------------------------------------
# Verify Installations
# ------------------------------------------------------------------
print("üîç Verifying installations...\n")

import flask
import yt_dlp
import whisper
import transformers
import torch
from pyngrok import ngrok

print(f"  ‚úÖ Flask: {flask.__version__}")
print(f"  ‚úÖ yt-dlp: {yt_dlp.version.__version__}")
print(f"  ‚úÖ Whisper: Installed")
print(f"  ‚úÖ Transformers: {transformers.__version__}")
print(f"  ‚úÖ PyTorch: {torch.__version__}")
print(f"  ‚úÖ Ngrok: Installed")

# ------------------------------------------------------------------
# Check GPU Availability
# ------------------------------------------------------------------
print(f"\nüñ•Ô∏è  GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"  üéÆ GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"  ‚ö° Using GPU for faster processing!")
else:
    print(f"  üíª Using CPU (slower but functional)")

print("\n" + "="*70)
print("‚úÖ STEP 2 COMPLETE!")
print("="*70)



üì¶ STEP 2: INSTALLING DEPENDENCIES

üîÑ Upgrading pip...
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.8/1.8 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25h
üì• Installing core libraries...

  üåê Flask & Flask-CORS (web application)
  üìπ yt-dlp (YouTube downloader)
  üéµ FFmpeg (audio format conversion)
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
  ü§ñ OpenAI Whisper (speech-to-text)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  üß† Transformers + PyTorch (BART summarization)
  üìù SentencePiece (tokenization)
  ‚ö° Accelerate (mod

In [3]:
# ========== STEP 3: YOUTUBE AUDIO DOWNLOADER MODULE (FIXED) ==========
print("\n" + "="*70)
print("üì• STEP 3: CREATING YOUTUBE DOWNLOADER MODULE")
print("="*70 + "\n")

import os

# ------------------------------------------------------------------
# ENSURE FOLDERS EXIST
# ------------------------------------------------------------------
base_dir = '/content/youtube_summarizer'
utils_dir = f'{base_dir}/utils'
audio_dir = f'{base_dir}/outputs/audio'

os.makedirs(utils_dir, exist_ok=True)
os.makedirs(audio_dir, exist_ok=True)
print(f"‚úÖ Ensured directories exist\n")

# ------------------------------------------------------------------
# Complete downloader.py code (FIXED)
# ------------------------------------------------------------------
downloader_code = '''"""
YouTube Audio Downloader Module
--------------------------------
Downloads audio from YouTube videos using yt-dlp
Converts to MP3 format for processing
"""

import os
import yt_dlp
import re

def sanitize_filename(filename):
    """
    Turn an arbitrary title into a conservative, filesystem-safe file stem.

    Only ASCII letters, digits, hyphens and underscores are kept; every run
    of whitespace becomes a single underscore. The result is at most 100
    characters long and never starts or ends with an underscore.

    Args:
        filename (str): Original filename

    Returns:
        str: Sanitized filename (max 100 chars, no special characters).
             This is an empty string when the input contains no allowed
             characters at all.
    """
    # Drop everything except letters, numbers, whitespace, hyphens, underscores
    filename = re.sub(r'[^a-zA-Z0-9\\s_-]', '', filename)

    # Collapse each whitespace run into one underscore
    filename = re.sub(r'\\s+', '_', filename)

    # Trim underscores, cap the length, then trim the right edge again:
    # cutting at 100 characters can expose an underscore at the new end
    return filename.strip('_')[:100].rstrip('_')


def download_youtube_audio(video_url, output_dir='/content/youtube_summarizer/outputs/audio'):
    """
    Download audio from YouTube video and convert to MP3

    The video metadata is fetched first so the output file can be named
    after the sanitized title. When the title sanitizes to an empty string
    the video id is used for the file name instead, and 'audio' is the
    last resort.

    Args:
        video_url (str): Full YouTube video URL
        output_dir (str): Directory to save downloaded audio

    Returns:
        tuple: (audio_path, video_title, duration_seconds)
               duration_seconds is an int, 0 when the metadata has none

    Raises:
        Exception: If download fails or URL is invalid. The original error
                   is attached as the exception's cause.
    """
    try:
        print(f"\\nüé¨ Processing: {video_url}")

        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)

        # ----------------------------------------------------------
        # FIRST: Get video info without downloading
        # ----------------------------------------------------------
        with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
            info = ydl.extract_info(video_url, download=False)

        # "or" instead of a .get() default: a key that is present but holds
        # None must fall back too, otherwise the arithmetic in the duration
        # message and the sanitizer below would raise
        video_title = info.get('title') or 'unknown'
        duration = int(info.get('duration') or 0)

        # Sanitize title BEFORE download
        safe_title = sanitize_filename(video_title)
        if not safe_title:
            # Nothing in the title survived sanitizing; without a fallback
            # the file would be written with an empty stem
            safe_title = sanitize_filename(str(info.get('id') or '')) or 'audio'
        output_filename = f"{safe_title}.mp3"

        # ----------------------------------------------------------
        # Configure yt-dlp options with SANITIZED filename
        # ----------------------------------------------------------
        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'outtmpl': os.path.join(output_dir, safe_title + '.%(ext)s'),  # Use sanitized name
            'quiet': False,
            'no_warnings': False,
        }

        # ----------------------------------------------------------
        # Download with sanitized filename
        # ----------------------------------------------------------
        print(f"üì• Downloading audio as: {output_filename}")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])

        # Construct final audio path
        audio_path = os.path.join(output_dir, output_filename)

        # Verify file exists
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        print(f"\\n‚úÖ Audio downloaded successfully!")
        print(f"  üìù Title: {video_title}")
        print(f"  üíæ Saved as: {output_filename}")
        print(f"  ‚è±Ô∏è  Duration: {duration // 60}min {duration % 60}sec")
        print(f"  üìÅ Full path: {audio_path}")

        return audio_path, video_title, duration

    except Exception as e:
        print(f"\\n‚ùå Error downloading audio: {str(e)}")
        # Chain the cause so the traceback still shows where it really failed
        raise Exception(f"Failed to download audio: {str(e)}") from e


def validate_youtube_url(url):
    """
    Validate if URL is a proper YouTube link to a single video

    Accepted forms (http or https):
        - youtube.com/watch?v=ID   (v may appear anywhere in the query)
        - youtube.com/embed/ID, /v/ID, /shorts/ID, /live/ID
        - youtu.be/ID
    The youtube.com host may carry a www., m. or music. prefix, and
    youtu.be may carry www. An ID must be non-empty and consist only of
    word characters (letters, digits, underscore) and hyphens.

    Args:
        url (str): URL to validate

    Returns:
        bool: True if valid YouTube URL, False otherwise (including for
              non-string input and for text that cannot be parsed as a URL)
    """
    # Imported here so the function does not rely on the module import block
    from urllib.parse import urlparse, parse_qs

    if not isinstance(url, str):
        return False

    try:
        parts = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs outright
        return False

    if parts.scheme not in ('http', 'https'):
        return False

    def looks_like_id(candidate):
        # Word characters plus hyphen, at least one character
        return bool(candidate) and all(
            ch.isalnum() or ch in '_-' for ch in candidate
        )

    host = (parts.hostname or '').lower()
    segments = [segment for segment in parts.path.split('/') if segment]

    # Short links carry the ID as the first path segment
    if host in ('youtu.be', 'www.youtu.be'):
        return bool(segments) and looks_like_id(segments[0])

    if host not in ('youtube.com', 'www.youtube.com',
                    'm.youtube.com', 'music.youtube.com'):
        return False

    # Regular watch page: the ID lives in the v query parameter
    if segments == ['watch']:
        return any(looks_like_id(value) for value in parse_qs(parts.query).get('v', []))

    # Path-based forms: the ID is the segment after the route name
    if len(segments) >= 2 and segments[0] in ('embed', 'v', 'shorts', 'live'):
        return looks_like_id(segments[1])

    return False
'''

# ------------------------------------------------------------------
# Save to file
# ------------------------------------------------------------------
# Write as UTF-8 explicitly: the generated module has non-ASCII characters in
# its log messages, and open()'s default encoding depends on the platform locale
with open(f'{utils_dir}/downloader.py', 'w', encoding='utf-8') as f:
    f.write(downloader_code)

print(f"‚úÖ Created: {utils_dir}/downloader.py")

# ------------------------------------------------------------------
# Test URL Validation
# ------------------------------------------------------------------
print("\nüß™ Testing URL validation...\n")

import sys
sys.path.insert(0, base_dir)

# Reload module if already imported
if 'utils.downloader' in sys.modules:
    import importlib
    importlib.reload(sys.modules['utils.downloader'])

from utils.downloader import validate_youtube_url

test_urls = [
    ('https://www.youtube.com/watch?v=dQw4w9WgXcQ', True),
    ('https://youtu.be/dQw4w9WgXcQ', True),
    ('https://youtube.com/embed/abc123', True),
    ('https://www.google.com', False),
    ('not a url', False),
]

print("URL Validation Results:")
for url, expected in test_urls:
    is_valid = validate_youtube_url(url)
    status = "‚úÖ PASS" if is_valid == expected else "‚ùå FAIL"
    print(f"  {status}: {url[:50]}")

print("\n" + "="*70)
print("‚úÖ STEP 3 COMPLETE!")
print("="*70)



üì• STEP 3: CREATING YOUTUBE DOWNLOADER MODULE

‚úÖ Ensured directories exist

‚úÖ Created: /content/youtube_summarizer/utils/downloader.py

üß™ Testing URL validation...

URL Validation Results:
  ‚úÖ PASS: https://www.youtube.com/watch?v=dQw4w9WgXcQ
  ‚úÖ PASS: https://youtu.be/dQw4w9WgXcQ
  ‚úÖ PASS: https://youtube.com/embed/abc123
  ‚úÖ PASS: https://www.google.com
  ‚úÖ PASS: not a url

‚úÖ STEP 3 COMPLETE!


In [4]:
# ========== STEP 4: WHISPER TRANSCRIPTION MODULE ==========
print("\n" + "="*70)
print("üé§ STEP 4: CREATING WHISPER TRANSCRIPTION MODULE")
print("="*70 + "\n")

import os

# ------------------------------------------------------------------
# Complete transcriber.py code
# ------------------------------------------------------------------
transcriber_code = '''"""
Whisper Speech-to-Text Transcription Module
-------------------------------------------
Converts audio files to text using OpenAI Whisper (offline)
Supports multiple languages and provides timestamped segments
"""

import os
import whisper
import torch
import json

class WhisperTranscriber:
    """
    Whisper-based audio transcription with GPU acceleration

    The model is loaded once in the constructor and reused for every call
    to transcribe_audio().
    """

    def __init__(self, model_size='base', device=None):
        """
        Initialize Whisper model

        Args:
            model_size (str): Model size - 'tiny', 'base', 'small', 'medium', 'large'
                            'base' recommended for balance of speed and accuracy
            device (str): 'cuda' for GPU, 'cpu' for CPU, None for auto-detect
        """
        self.model_size = model_size

        # Auto-detect device if not specified
        if device is None:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        else:
            self.device = device

        print(f"\\n‚öôÔ∏è  Initializing Whisper model...")
        print(f"  üì¶ Model: {model_size}")
        print(f"  üñ•Ô∏è  Device: {self.device}")

        # Load Whisper model
        self.model = whisper.load_model(model_size, device=self.device)

        print(f"  ‚úÖ Whisper model loaded successfully!")


    def _uses_gpu(self):
        """Return True when the configured device is a CUDA device."""
        # Compare on the string form so that 'cuda', 'cuda:0' and a
        # torch.device are all recognized, not just the literal 'cuda'
        return str(self.device).startswith('cuda')


    def transcribe_audio(self, audio_path, language='en',
                        output_dir='/content/youtube_summarizer/outputs/transcripts'):
        """
        Transcribe audio file to text

        Writes two files into output_dir, both named after the audio file:
        NAME_transcript.txt (plain text) and NAME_segments.json (the
        segment list returned by the model).

        Args:
            audio_path (str): Path to audio file (MP3, WAV, etc.)
            language (str): Language code ('en', 'hi', 'es', etc.)
            output_dir (str): Directory to save transcript files

        Returns:
            dict: {
                'text': Full transcript text,
                'segments': List of timestamped segments,
                'transcript_path': Path to saved transcript file,
                'segments_path': Path to saved segments JSON,
                'word_count': Number of words,
                'language': Language reported in the model result,
                            falling back to the requested language
            }

        Raises:
            Exception: If the audio file is missing or transcription fails.
                       The original error is attached as the cause.
        """
        try:
            print(f"\\nüé§ Transcribing audio...")
            print(f"  üìÅ File: {os.path.basename(audio_path)}")

            # Fail early with a clear message instead of letting the model
            # call report a missing input in its own terms
            if not os.path.isfile(audio_path):
                raise FileNotFoundError(f"Audio file not found: {audio_path}")

            # Ensure output directory exists
            os.makedirs(output_dir, exist_ok=True)

            # --------------------------------------------------
            # Transcribe with Whisper
            # --------------------------------------------------
            result = self.model.transcribe(
                audio_path,
                language=language,
                verbose=True,
                fp16=self._uses_gpu()  # Use FP16 only on GPU
            )

            # Extract results
            transcript_text = result['text'].strip()
            segments = result['segments']
            word_count = len(transcript_text.split())

            print(f"\\n‚úÖ Transcription complete!")
            print(f"  üìä Total segments: {len(segments)}")
            print(f"  üìù Total characters: {len(transcript_text)}")
            print(f"  üî§ Total words: {word_count}")

            # --------------------------------------------------
            # Save outputs
            # --------------------------------------------------
            audio_filename = os.path.splitext(os.path.basename(audio_path))[0]

            # Save full transcript
            transcript_path = os.path.join(output_dir, f"{audio_filename}_transcript.txt")
            with open(transcript_path, 'w', encoding='utf-8') as f:
                f.write(transcript_text)
            print(f"  üíæ Saved transcript: {transcript_path}")

            # Save timestamped segments (JSON)
            segments_path = os.path.join(output_dir, f"{audio_filename}_segments.json")
            with open(segments_path, 'w', encoding='utf-8') as f:
                json.dump(segments, f, indent=2, ensure_ascii=False)
            print(f"  üíæ Saved segments: {segments_path}")

            return {
                'text': transcript_text,
                'segments': segments,
                'transcript_path': transcript_path,
                'segments_path': segments_path,
                'word_count': word_count,
                'language': result.get('language', language)
            }

        except Exception as e:
            print(f"\\n‚ùå Error during transcription: {str(e)}")
            # Chain the cause so the traceback still shows the real failure
            raise Exception(f"Transcription failed: {str(e)}") from e


    def get_model_info(self):
        """
        Get information about loaded model

        Returns:
            dict: Model metadata with keys 'model_size', 'device' and
                  'gpu_available'
        """
        return {
            'model_size': self.model_size,
            'device': self.device,
            'gpu_available': torch.cuda.is_available()
        }
'''

# ------------------------------------------------------------------
# Save to file
# ------------------------------------------------------------------
utils_dir = '/content/youtube_summarizer/utils'
# Write as UTF-8 explicitly: the generated module has non-ASCII characters in
# its log messages, and open()'s default encoding depends on the platform locale
with open(f'{utils_dir}/transcriber.py', 'w', encoding='utf-8') as f:
    f.write(transcriber_code)

print(f"‚úÖ Created: {utils_dir}/transcriber.py")

# ------------------------------------------------------------------
# Download and Initialize Whisper Model
# ------------------------------------------------------------------
print("\nüì• Downloading Whisper 'base' model (first time only)...")
print("  üì¶ Model size: ~140 MB")

import sys
sys.path.insert(0, '/content/youtube_summarizer')
from utils.transcriber import WhisperTranscriber

# Initialize transcriber (downloads model on first run)
transcriber = WhisperTranscriber(model_size='base')

# Display model info
info = transcriber.get_model_info()
print(f"\nüìä Model Information:")
print(f"  üì¶ Model Size: {info['model_size']}")
print(f"  üñ•Ô∏è  Device: {info['device']}")
print(f"  üéÆ GPU Available: {info['gpu_available']}")

print("\n" + "="*70)
print("‚úÖ STEP 4 COMPLETE!")
print("="*70)



üé§ STEP 4: CREATING WHISPER TRANSCRIPTION MODULE

‚úÖ Created: /content/youtube_summarizer/utils/transcriber.py

üì• Downloading Whisper 'base' model (first time only)...
  üì¶ Model size: ~140 MB

‚öôÔ∏è  Initializing Whisper model...
  üì¶ Model: base
  üñ•Ô∏è  Device: cuda


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 139M/139M [00:00<00:00, 239MiB/s]


  ‚úÖ Whisper model loaded successfully!

üìä Model Information:
  üì¶ Model Size: base
  üñ•Ô∏è  Device: cuda
  üéÆ GPU Available: True

‚úÖ STEP 4 COMPLETE!


In [5]:
# ========== STEP 5: BART TEXT SUMMARIZATION MODULE (FIXED) ==========
print("\n" + "="*70)
print("üìù STEP 5: CREATING TEXT SUMMARIZATION MODULE")
print("="*70 + "\n")

import os

# ------------------------------------------------------------------
# ENSURE FOLDERS EXIST
# ------------------------------------------------------------------
utils_dir = '/content/youtube_summarizer/utils'
os.makedirs(utils_dir, exist_ok=True)

# ------------------------------------------------------------------
# Complete summarizer.py code with ROBUST error handling
# ------------------------------------------------------------------
summarizer_code = '''"""
BART Text Summarization Module
-------------------------------
Summarizes long transcripts using Facebook BART model (offline)
Handles long texts by chunking and processes on CPU/GPU
"""

import os
from transformers import pipeline
import torch
import re

class TextSummarizer:
    """
    BART-based text summarization with automatic chunking

    Long inputs are cleaned, split into word-limited chunks, summarized
    chunk by chunk and the partial summaries are joined with spaces.
    """

    def __init__(self, model_name='facebook/bart-large-cnn', device=None):
        """
        Initialize BART summarization model

        Args:
            model_name (str): HuggingFace model name
            device (int): Device ID (-1 for CPU, 0 for GPU), None for auto-detect
        """
        self.model_name = model_name

        # Auto-detect device if not specified
        if device is None:
            self.device = 0 if torch.cuda.is_available() else -1
        else:
            self.device = device

        print(f"\\n‚öôÔ∏è  Initializing Summarization model...")
        print(f"  üì¶ Model: {model_name}")
        print(f"  üñ•Ô∏è  Device: {'GPU' if self.device >= 0 else 'CPU'}")
        print(f"  üì• Downloading model (first use only) - ~1.6 GB")

        # Load summarization pipeline
        self.summarizer = pipeline(
            'summarization',
            model=model_name,
            device=self.device,
            framework='pt'
        )

        print(f"  ‚úÖ Summarization model loaded successfully!")


    def clean_text(self, text):
        """
        Aggressively clean text to prevent tokenization errors

        Removes bracketed timestamp ranges, collapses whitespace, strips
        control characters and URLs, and converts typographic quotes to
        plain ASCII quotes.

        Args:
            text (str): Raw text

        Returns:
            str: Cleaned text, or "No content available." when nothing is left
        """
        # Remove timestamps like [00:00.000 --> 00:03.920]
        text = re.sub(r'\\[\\d{2}:\\d{2}\\.\\d{3}\\s*-->\\s*\\d{2}:\\d{2}\\.\\d{3}\\]', '', text)

        # Remove excessive whitespace
        text = re.sub(r'\\s+', ' ', text)

        # Remove special characters that cause issues
        text = re.sub(r'[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f\\x7f-\\x9f]', '', text)

        # Remove URLs
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

        # Normalize typographic quotes to ASCII. The code points are built
        # with chr() so the source needs neither an escape sequence nor a
        # non-ASCII literal; replacing an ASCII quote with itself would be
        # a no-op
        typographic_quotes = (
            (0x201C, '"'),  # left double quotation mark
            (0x201D, '"'),  # right double quotation mark
            (0x2018, "'"),  # left single quotation mark
            (0x2019, "'"),  # right single quotation mark
        )
        for code_point, ascii_quote in typographic_quotes:
            text = text.replace(chr(code_point), ascii_quote)

        # Strip and ensure not empty
        text = text.strip()

        return text if text else "No content available."


    def chunk_text(self, text, max_words=400):
        """
        Split text into smaller chunks by word count

        Chunks are built from whole sentences where possible. A single
        sentence longer than max_words (for example a transcript without
        punctuation) is cut on word boundaries so that no chunk exceeds
        the limit.

        Args:
            text (str): Input text
            max_words (int): Maximum words per chunk (values below 1 are
                             treated as 1)

        Returns:
            list: List of text chunks; [text] when the input has no words
        """
        max_words = max(1, int(max_words))

        # Split by sentences
        sentences = re.split(r'(?<=[.!?])\\s+', text)

        chunks = []
        current_chunk = []
        current_word_count = 0

        for sentence in sentences:
            sentence = sentence.strip()
            words = sentence.split()
            if not words:
                continue

            if len(words) > max_words:
                # Oversized sentence: close the chunk in progress, then cut
                # the sentence into max_words-sized pieces
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
                    current_word_count = 0

                for start in range(0, len(words), max_words):
                    piece = words[start:start + max_words]
                    if len(piece) == max_words:
                        chunks.append(' '.join(piece))
                    else:
                        # Keep the short tail open so the following
                        # sentences can still be packed into it
                        current_chunk = [' '.join(piece)]
                        current_word_count = len(piece)
                continue

            if current_word_count + len(words) <= max_words:
                current_chunk.append(sentence)
                current_word_count += len(words)
            else:
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_word_count = len(words)

        # Add remaining chunk
        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks if chunks else [text]


    def summarize_text(self, text, max_length=130, min_length=30,
                      output_dir='/content/youtube_summarizer/outputs/summaries',
                      filename=None):
        """
        Summarize long text with robust error handling

        The method is best-effort and does not raise: a chunk that fails
        is replaced by its first 200 words, and any other failure returns
        the first 300 words of the input with an empty 'summary_path'.

        Args:
            text (str): Input text to summarize (None is treated as empty)
            max_length (int): Maximum summary length per chunk
            min_length (int): Minimum summary length per chunk
            output_dir (str): Directory to save summary
            filename (str): Output filename (optional, default 'summary.txt')

        Returns:
            dict: {
                'summary': Summary text,
                'summary_path': Path of the saved summary ('' on failure),
                'original_length': Word count of the input,
                'summary_length': Word count of the summary,
                'compression_ratio': Input characters / summary characters,
                'num_chunks': Number of chunks processed
            }
        """
        # Coerce up front so the emergency fallback at the bottom can always
        # call text.split(), even when the caller passed None
        text = '' if text is None else str(text)

        try:
            print(f"\\nüìù Summarizing text...")
            print(f"  üìä Input length: {len(text)} characters ({len(text.split())} words)")

            # Ensure output directory exists
            os.makedirs(output_dir, exist_ok=True)

            # Clean text AGGRESSIVELY
            text = self.clean_text(text)

            num_chunks = 1

            # Check if text is too short
            word_count = len(text.split())
            if word_count < 50:
                print(f"  ‚ö†Ô∏è  Text too short ({word_count} words), returning as-is")
                final_summary = text
            else:
                # Split text into chunks
                chunks = self.chunk_text(text, max_words=400)
                num_chunks = len(chunks)
                print(f"  ‚úÇÔ∏è  Text split into {len(chunks)} chunks")

                # Summarize each chunk with robust error handling
                summaries = []
                for i, chunk in enumerate(chunks, 1):
                    print(f"  üîÑ Processing chunk {i}/{len(chunks)}...")

                    try:
                        # Ensure chunk is long enough
                        chunk_words = len(chunk.split())
                        if chunk_words < 30:
                            print(f"    ‚ö†Ô∏è  Chunk {i} too short, using as-is")
                            summaries.append(chunk)
                            continue

                        # Scale both limits down for short chunks so the
                        # requested summary is never longer than its input
                        adjusted_max = min(max_length, int(chunk_words * 0.7))
                        adjusted_min = min(min_length, int(chunk_words * 0.3))

                        # Summarize chunk
                        result = self.summarizer(
                            chunk,
                            max_length=adjusted_max,
                            min_length=adjusted_min,
                            do_sample=False,
                            truncation=True,
                            clean_up_tokenization_spaces=True
                        )

                        summaries.append(result[0]['summary_text'])
                        print(f"    ‚úÖ Chunk {i} summarized successfully")

                    except Exception as chunk_error:
                        print(f"    ‚ö†Ô∏è  Chunk {i} failed: {str(chunk_error)[:100]}")
                        # Use first 200 words as fallback
                        fallback = ' '.join(chunk.split()[:200])
                        summaries.append(fallback)
                        continue

                # Combine summaries
                if not summaries:
                    final_summary = ' '.join(text.split()[:300])  # First 300 words
                else:
                    final_summary = ' '.join(summaries)

            print(f"\\n‚úÖ Summarization complete!")
            print(f"  üìä Summary length: {len(final_summary)} characters ({len(final_summary.split())} words)")

            # Avoid division by zero
            if len(final_summary) > 0:
                compression = round(len(text) / len(final_summary), 1)
            else:
                compression = 1.0

            print(f"  üóúÔ∏è  Compression ratio: {compression}x")

            # Save summary
            if filename is None:
                filename = 'summary.txt'

            summary_path = os.path.join(output_dir, filename)
            with open(summary_path, 'w', encoding='utf-8') as f:
                f.write(final_summary)

            print(f"  üíæ Saved summary: {summary_path}")

            return {
                'summary': final_summary,
                'summary_path': summary_path,
                'original_length': len(text.split()),
                'summary_length': len(final_summary.split()),
                'compression_ratio': compression,
                'num_chunks': num_chunks
            }

        except Exception as e:
            print(f"\\n‚ùå Error during summarization: {str(e)}")
            # Return first 300 words as emergency fallback
            emergency_summary = ' '.join(text.split()[:300])
            return {
                'summary': emergency_summary,
                'summary_path': '',
                'original_length': len(text.split()),
                'summary_length': len(emergency_summary.split()),
                'compression_ratio': 1.0,
                'num_chunks': 1
            }


    def get_model_info(self):
        """
        Get model metadata

        Returns:
            dict: Keys 'model_name', 'device' ('GPU' or 'CPU') and
                  'gpu_available'
        """
        return {
            'model_name': self.model_name,
            'device': 'GPU' if self.device >= 0 else 'CPU',
            'gpu_available': torch.cuda.is_available()
        }
'''

# ------------------------------------------------------------------
# Save to file
# ------------------------------------------------------------------
# Write as UTF-8 explicitly: the generated module has non-ASCII characters in
# its log messages, and open()'s default encoding depends on the platform locale
with open(f'{utils_dir}/summarizer.py', 'w', encoding='utf-8') as f:
    f.write(summarizer_code)

print(f"‚úÖ Created: {utils_dir}/summarizer.py")

# ------------------------------------------------------------------
# Reload if needed
# ------------------------------------------------------------------
import sys
sys.path.insert(0, '/content/youtube_summarizer')

if 'utils.summarizer' in sys.modules:
    import importlib
    importlib.reload(sys.modules['utils.summarizer'])

from utils.summarizer import TextSummarizer

# Initialize (downloads model on first run)
print("\nüì• Loading BART model...")
summarizer = TextSummarizer(model_name='facebook/bart-large-cnn')

# Display info
info = summarizer.get_model_info()
print(f"\nüìä Model Information:")
print(f"  üì¶ Model: {info['model_name']}")
print(f"  üñ•Ô∏è  Device: {info['device']}")
print(f"  üéÆ GPU Available: {info['gpu_available']}")

print("\n" + "="*70)
print("‚úÖ STEP 5 COMPLETE!")
print("="*70)



📝 STEP 5: CREATING TEXT SUMMARIZATION MODULE

✅ Created: /content/youtube_summarizer/utils/summarizer.py

📥 Loading BART model...

⚙️  Initializing Summarization model...
  📦 Model: facebook/bart-large-cnn
  🖥️  Device: GPU
  📥 Downloading model (first use only) - ~1.6 GB


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


  ✅ Summarization model loaded successfully!

📊 Model Information:
  📦 Model: facebook/bart-large-cnn
  🖥️  Device: GPU
  🎮 GPU Available: True

✅ STEP 5 COMPLETE!


In [6]:
# ========== STEP 6: MAIN PIPELINE INTEGRATION ==========
# Step banner: blank line, rule, title, rule, blank line.
print(
    "\n" + "="*70,
    "üîó STEP 6: CREATING MAIN PIPELINE",
    "="*70 + "\n",
    sep="\n",
)

import os

# ------------------------------------------------------------------
# Complete pipeline.py code
# ------------------------------------------------------------------
pipeline_code = '''"""
YouTube Video Summarizer Pipeline
----------------------------------
Main workflow integrating all modules:
1. Download YouTube audio
2. Transcribe with Whisper
3. Summarize with BART
"""

import os
from utils.downloader import download_youtube_audio, validate_youtube_url
from utils.transcriber import WhisperTranscriber
from utils.summarizer import TextSummarizer

class YouTubeSummarizerPipeline:
    """
    End-to-end workflow for summarizing a YouTube video.

    One WhisperTranscriber and one TextSummarizer are created in __init__
    and reused by every process_video() call.
    """

    def __init__(self, whisper_model='base', bart_model='facebook/bart-large-cnn'):
        """
        Load both models and keep them on the instance.

        Args:
            whisper_model (str): Passed to WhisperTranscriber as model_size
            bart_model (str): Passed to TextSummarizer as model_name
        """
        print("\\nüöÄ Initializing YouTube Summarizer Pipeline...")
        print("="*70)

        # Initialize models
        print("\\nüì• Loading Whisper model...")
        self.transcriber = WhisperTranscriber(model_size=whisper_model)

        print("\\nüì• Loading BART model...")
        self.summarizer = TextSummarizer(model_name=bart_model)

        print("\\n‚úÖ Pipeline initialized successfully!")
        print("="*70)


    @staticmethod
    def _format_duration(duration):
        """
        Render a duration given in seconds as "<minutes>min <seconds>sec".

        Args:
            duration: Number of seconds, or None

        Returns:
            str: Formatted duration, or "unknown" when the value is None or
                 cannot be converted to an integer
        """
        # NOTE(review): the downloader is assumed to report whole seconds, but
        # a missing or fractional value must not abort a finished run - confirm
        # against download_youtube_audio.
        try:
            total_seconds = int(duration)
        except (TypeError, ValueError):
            return "unknown"
        return f"{total_seconds // 60}min {total_seconds % 60}sec"


    def process_video(self, video_url, language='en',
                      summary_max_length=150, summary_min_length=50):
        """
        Run the complete workflow: validate, download, transcribe, summarize.

        Args:
            video_url (str): YouTube video URL
            language (str): Audio language code passed to the transcriber
            summary_max_length (int): max_length passed to summarize_text
            summary_min_length (int): min_length passed to summarize_text

        Returns:
            dict: video_title, video_url, duration, audio_path, transcript,
                  transcript_path, word_count, summary, summary_path and
                  compression_ratio

        Raises:
            ValueError: If validate_youtube_url rejects video_url
        """
        print("\\n" + "="*70)
        print("üé¨ STARTING VIDEO PROCESSING PIPELINE")
        print("="*70)

        # --------------------------------------------------
        # Step 1: Validate URL
        # --------------------------------------------------
        print("\\nüîç Step 1: Validating YouTube URL...")
        if not validate_youtube_url(video_url):
            raise ValueError("‚ùå Invalid YouTube URL!")
        print("  ‚úÖ URL is valid")

        # --------------------------------------------------
        # Step 2: Download Audio
        # --------------------------------------------------
        print("\\nüì• Step 2: Downloading audio...")
        audio_path, video_title, duration = download_youtube_audio(video_url)

        # --------------------------------------------------
        # Step 3: Transcribe Audio
        # --------------------------------------------------
        print("\\nüé§ Step 3: Transcribing audio...")
        transcript_result = self.transcriber.transcribe_audio(
            audio_path=audio_path,
            language=language
        )

        # --------------------------------------------------
        # Step 4: Summarize Transcript
        # --------------------------------------------------
        print("\\nüìù Step 4: Summarizing transcript...")
        # The summary file is named after the audio file, without its extension
        audio_stem = os.path.splitext(os.path.basename(audio_path))[0]
        summary_result = self.summarizer.summarize_text(
            text=transcript_result['text'],
            max_length=summary_max_length,
            min_length=summary_min_length,
            filename=f"{audio_stem}_summary.txt"
        )

        # --------------------------------------------------
        # Combine Results
        # --------------------------------------------------
        results = {
            'video_title': video_title,
            'video_url': video_url,
            'duration': duration,
            'audio_path': audio_path,
            'transcript': transcript_result['text'],
            'transcript_path': transcript_result['transcript_path'],
            'word_count': transcript_result['word_count'],
            'summary': summary_result['summary'],
            'summary_path': summary_result['summary_path'],
            'compression_ratio': summary_result['compression_ratio']
        }

        print("\\n" + "="*70)
        print("‚úÖ PIPELINE COMPLETE!")
        print("="*70)
        print(f"\\nüìä Final Results:")
        print(f"  üé¨ Video: {video_title}")
        print(f"  ‚è±Ô∏è  Duration: {self._format_duration(duration)}")
        print(f"  üìù Transcript: {transcript_result['word_count']} words")
        print(f"  üìÑ Summary: {summary_result['summary_length']} words")
        print(f"  üóúÔ∏è  Compression: {summary_result['compression_ratio']}x")
        print(f"\\nüíæ Files saved:")
        print(f"  üéµ Audio: {audio_path}")
        print(f"  üìù Transcript: {transcript_result['transcript_path']}")
        print(f"  üìÑ Summary: {summary_result['summary_path']}")
        print("="*70)

        return results
'''

# ------------------------------------------------------------------
# Save to file
# ------------------------------------------------------------------
base_dir = '/content/youtube_summarizer'

# Make the cell runnable on its own even if the project folder is missing.
os.makedirs(base_dir, exist_ok=True)

# pipeline_code contains non-ASCII characters, so the encoding is pinned to
# UTF-8 instead of depending on the platform default.
with open(f'{base_dir}/pipeline.py', 'w', encoding='utf-8') as f:
    f.write(pipeline_code)

print(f"‚úÖ Created: {base_dir}/pipeline.py")

print("\n" + "="*70)
print("‚úÖ STEP 6 COMPLETE!")
print("="*70)



🔗 STEP 6: CREATING MAIN PIPELINE

✅ Created: /content/youtube_summarizer/pipeline.py

✅ STEP 6 COMPLETE!


In [7]:
# ========== STEP 7: FLASK WEB APPLICATION ==========
print(
    "\n" + "="*70,
    "üåê STEP 7: CREATING FLASK WEB APPLICATION",
    "="*70 + "\n",
    sep="\n",
)

import os

# ------------------------------------------------------------------
# ENSURE ALL REQUIRED FOLDERS EXIST
# ------------------------------------------------------------------
base_dir = '/content/youtube_summarizer'
templates_dir = f'{base_dir}/templates'
static_dir = f'{base_dir}/static'

# Project root first, then the two folders Flask reads from
required_dirs = (base_dir, templates_dir, static_dir)
for folder in required_dirs:
    os.makedirs(folder, exist_ok=True)

print(f"‚úÖ Ensured directories exist:")
# One bullet per folder, followed by a blank line
print("\n".join(f"   - {path}" for path in required_dirs) + "\n")

# ------------------------------------------------------------------
# Create Flask app.py
# ------------------------------------------------------------------
app_code = '''"""
Flask Web Application for YouTube Video Summarizer
--------------------------------------------------
Web interface for processing YouTube videos
"""

from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
from pipeline import YouTubeSummarizerPipeline
import os

# Initialize Flask app
# Everything below runs once, when this module is imported.
app = Flask(__name__)
CORS(app)  # Enable CORS for API requests

# Initialize pipeline (models loaded once on startup)
# The pipeline object is module-level so the request handlers share one
# instance instead of constructing YouTubeSummarizerPipeline per request.
print("\\nüöÄ Starting Flask Application...")
print("="*70)
pipeline = YouTubeSummarizerPipeline(
    whisper_model='base',
    bart_model='facebook/bart-large-cnn'
)
print("‚úÖ Flask app ready!")
print("="*70)


@app.route('/')
def home():
    """Serve the main interface by rendering the index.html template."""
    page = render_template('index.html')
    return page


@app.route('/summarize', methods=['POST'])
def summarize():
    """
    API endpoint to process YouTube video

    Request JSON:
        {
            "video_url": "https://youtube.com/watch?v=...",
            "language": "en"  (optional, default: "en")
        }

    Response JSON:
        {
            "success": true/false,
            "video_title": "...",
            "video_url": "...",
            "duration": "...",
            "summary": "...",
            "transcript": "...",
            "word_count": ...,
            "compression_ratio": ...,
            "error": "..." (if failed)
        }

    Status codes:
        200 on success, 400 when the request body or video_url is unusable,
        500 when processing the video raises.
    """
    # silent=True returns None for a missing, malformed or non-JSON body
    # instead of raising, so bad requests get a JSON 400 rather than a 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({
            'success': False,
            'error': 'Request body must be a JSON object'
        }), 400

    video_url = data.get('video_url', '')
    language = data.get('language') or 'en'

    # Validate input
    if not isinstance(video_url, str) or not video_url.strip():
        return jsonify({
            'success': False,
            'error': 'No video URL provided'
        }), 400
    video_url = video_url.strip()

    try:
        # Process video through pipeline
        print(f"\\nüé¨ Processing: {video_url}")
        results = pipeline.process_video(video_url, language=language)

        # NOTE(review): duration is assumed to be a number of seconds; a
        # missing or fractional value is tolerated so a finished run is not
        # turned into an error by formatting alone - confirm with downloader.
        try:
            total_seconds = int(results['duration'])
            duration_text = f"{total_seconds // 60}min {total_seconds % 60}sec"
        except (TypeError, ValueError):
            duration_text = 'unknown'

        # Return results
        return jsonify({
            'success': True,
            'video_title': results['video_title'],
            'video_url': results['video_url'],
            'duration': duration_text,
            'transcript': results['transcript'],
            'summary': results['summary'],
            'word_count': results['word_count'],
            'compression_ratio': results['compression_ratio']
        })

    except Exception as e:
        print(f"\\n‚ùå Error: {str(e)}")
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


if __name__ == '__main__':
    # The server binds to every interface, and the interactive debugger that
    # debug mode enables lets anyone who can reach the port execute Python.
    # Debug mode is therefore off unless FLASK_DEBUG=1 is set explicitly.
    debug_enabled = os.environ.get('FLASK_DEBUG', '0') == '1'
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)
'''

# Save app.py
# app_code contains non-ASCII characters, so the encoding is pinned to UTF-8
# instead of depending on the platform default.
with open(f'{base_dir}/app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

print(f"‚úÖ Created: {base_dir}/app.py")

# ------------------------------------------------------------------
# Create HTML template
# ------------------------------------------------------------------
html_code = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>YouTube Video Summarizer</title>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }

        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            padding: 20px;
        }

        .container {
            max-width: 900px;
            margin: 0 auto;
            background: white;
            border-radius: 20px;
            box-shadow: 0 20px 60px rgba(0,0,0,0.3);
            padding: 40px;
        }

        h1 {
            text-align: center;
            color: #667eea;
            margin-bottom: 10px;
            font-size: 2.5em;
        }

        .subtitle {
            text-align: center;
            color: #666;
            margin-bottom: 30px;
        }

        .input-section {
            margin-bottom: 30px;
        }

        label {
            display: block;
            margin-bottom: 10px;
            color: #333;
            font-weight: 600;
        }

        input[type="text"] {
            width: 100%;
            padding: 15px;
            border: 2px solid #ddd;
            border-radius: 10px;
            font-size: 16px;
            transition: border-color 0.3s;
        }

        input[type="text"]:focus {
            outline: none;
            border-color: #667eea;
        }

        button {
            width: 100%;
            padding: 15px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border: none;
            border-radius: 10px;
            font-size: 18px;
            font-weight: 600;
            cursor: pointer;
            transition: transform 0.2s;
        }

        button:hover {
            transform: translateY(-2px);
        }

        button:active {
            transform: translateY(0);
        }

        button:disabled {
            opacity: 0.6;
            cursor: not-allowed;
        }

        .loading {
            display: none;
            text-align: center;
            margin: 20px 0;
            color: #667eea;
            font-size: 18px;
        }

        .spinner {
            border: 4px solid #f3f3f3;
            border-top: 4px solid #667eea;
            border-radius: 50%;
            width: 40px;
            height: 40px;
            animation: spin 1s linear infinite;
            margin: 20px auto;
        }

        @keyframes spin {
            0% { transform: rotate(0deg); }
            100% { transform: rotate(360deg); }
        }

        .results {
            display: none;
            margin-top: 30px;
        }

        .result-box {
            background: #f8f9fa;
            border-left: 4px solid #667eea;
            padding: 20px;
            margin-bottom: 20px;
            border-radius: 10px;
        }

        .result-box h3 {
            color: #667eea;
            margin-bottom: 10px;
        }

        .result-box p {
            color: #333;
            line-height: 1.6;
        }

        .error {
            background: #ffe6e6;
            border-left-color: #ff4444;
            color: #cc0000;
        }

        .stats {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
            gap: 15px;
            margin: 20px 0;
        }

        .stat-item {
            background: white;
            padding: 15px;
            border-radius: 10px;
            text-align: center;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }

        .stat-value {
            font-size: 24px;
            font-weight: bold;
            color: #667eea;
        }

        .stat-label {
            font-size: 14px;
            color: #666;
            margin-top: 5px;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>üé¨ YouTube Video Summarizer</h1>
        <p class="subtitle">AI-powered transcription and summarization using Whisper + BART</p>

        <div class="input-section">
            <label for="videoUrl">üìπ YouTube Video URL:</label>
            <input
                type="text"
                id="videoUrl"
                placeholder="https://www.youtube.com/watch?v=..."
                value="">

            <br><br>

            <button id="summarizeBtn" onclick="summarizeVideo()">
                ‚ú® Summarize Video
            </button>
        </div>

        <div class="loading" id="loading">
            <div class="spinner"></div>
            <p>Processing video... This may take 2-5 minutes</p>
            <p style="font-size: 14px; color: #999;">Downloading ‚Üí Transcribing ‚Üí Summarizing</p>
        </div>

        <div class="results" id="results"></div>
    </div>

    <script>
        // Escape the characters that are significant in HTML so that text
        // returned by the server (video title, summary, transcript, error
        // messages) is shown literally instead of being parsed as markup.
        function escapeHtml(value) {
            const entities = {
                '&': '&amp;',
                '<': '&lt;',
                '>': '&gt;',
                '"': '&quot;',
                "'": '&#39;'
            };
            return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
        }

        // Send the entered URL to /summarize and render the result or the error.
        async function summarizeVideo() {
            const videoUrl = document.getElementById('videoUrl').value.trim();
            const loadingDiv = document.getElementById('loading');
            const resultsDiv = document.getElementById('results');
            const btn = document.getElementById('summarizeBtn');

            // The button is disabled while a request is running, but the
            // Enter-key handler calls this function directly, so check here too.
            if (btn.disabled) {
                return;
            }

            // Validate input
            if (!videoUrl) {
                alert('Please enter a YouTube URL');
                return;
            }

            // Shared renderer for API-reported and network/parse errors
            const renderError = (message) => {
                resultsDiv.innerHTML = `
                    <div class="result-box error">
                        <h3>‚ùå Error</h3>
                        <p>${escapeHtml(message)}</p>
                    </div>
                `;
            };

            // Show loading
            loadingDiv.style.display = 'block';
            resultsDiv.style.display = 'none';
            btn.disabled = true;

            try {
                // Call API
                const response = await fetch('/summarize', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json'
                    },
                    body: JSON.stringify({
                        video_url: videoUrl,
                        language: 'en'
                    })
                });

                const data = await response.json();

                // Display results
                if (data.success) {
                    resultsDiv.innerHTML = `
                        <div class="result-box">
                            <h3>üìπ Video Title</h3>
                            <p>${escapeHtml(data.video_title)}</p>
                        </div>

                        <div class="stats">
                            <div class="stat-item">
                                <div class="stat-value">${escapeHtml(data.duration)}</div>
                                <div class="stat-label">Duration</div>
                            </div>
                            <div class="stat-item">
                                <div class="stat-value">${escapeHtml(data.word_count)}</div>
                                <div class="stat-label">Words (Transcript)</div>
                            </div>
                            <div class="stat-item">
                                <div class="stat-value">${escapeHtml(data.compression_ratio)}x</div>
                                <div class="stat-label">Compression</div>
                            </div>
                        </div>

                        <div class="result-box">
                            <h3>üìÑ Summary</h3>
                            <p>${escapeHtml(data.summary)}</p>
                        </div>

                        <div class="result-box">
                            <h3>üìù Full Transcript</h3>
                            <p>${escapeHtml(data.transcript)}</p>
                        </div>
                    `;
                } else {
                    renderError(data.error);
                }

            } catch (error) {
                // Network failure or a response body that is not JSON
                renderError(error.message);
            } finally {
                // Runs on every path, so the UI can never stay stuck in the
                // loading state with a disabled button.
                loadingDiv.style.display = 'none';
                btn.disabled = false;
                resultsDiv.style.display = 'block';
            }
        }

        // Allow Enter key to submit
        // keydown is used because the keypress event is deprecated; repeat
        // events from a held key are ignored so one press submits once.
        document.getElementById('videoUrl').addEventListener('keydown', function(e) {
            if (e.key === 'Enter' && !e.repeat) {
                summarizeVideo();
            }
        });
    </script>
</body>
</html>
'''

# Save HTML template
# The page declares charset UTF-8 and contains non-ASCII characters, so the
# file is written as UTF-8 explicitly rather than with the platform default.
with open(f'{templates_dir}/index.html', 'w', encoding='utf-8') as f:
    f.write(html_code)

print(f"‚úÖ Created: {templates_dir}/index.html")

print("\n" + "="*70)
print("‚úÖ STEP 7 COMPLETE!")
print("="*70)



🌐 STEP 7: CREATING FLASK WEB APPLICATION

✅ Ensured directories exist:
   - /content/youtube_summarizer
   - /content/youtube_summarizer/templates
   - /content/youtube_summarizer/static

✅ Created: /content/youtube_summarizer/app.py
✅ Created: /content/youtube_summarizer/templates/index.html

✅ STEP 7 COMPLETE!


In [8]:
# ========== STEP 8: DEPLOY FLASK APPLICATION ==========
print("\n" + "="*70)
print("üöÄ STEP 8: DEPLOYING FLASK APPLICATION")
print("="*70 + "\n")

import sys
sys.path.insert(0, '/content/youtube_summarizer')

# ------------------------------------------------------------------
# KILL ANY EXISTING FLASK PROCESS ON PORT 5000
# ------------------------------------------------------------------
print("üßπ Cleaning up any existing Flask processes...")
!fuser -k 5000/tcp 2>/dev/null || echo "No process to kill"
import time
time.sleep(2)
print("‚úÖ Port 5000 is now free\n")

# ------------------------------------------------------------------
# Start Flask Server
# ------------------------------------------------------------------
from app import app
import threading

def run_flask():
    """Run Flask server in background"""
    app.run(host='0.0.0.0', port=5000, debug=False, use_reloader=False)

print("üîÑ Starting Flask server...")
flask_thread = threading.Thread(target=run_flask, daemon=True)
flask_thread.start()

# Wait for Flask to start
time.sleep(5)
print("‚úÖ Flask server started on port 5000\n")

# ------------------------------------------------------------------
# Get Public URL using Google Colab's Built-in Proxy
# ------------------------------------------------------------------
print("üåç Creating public URL with Colab Proxy...")

from google.colab.output import eval_js
public_url = eval_js("google.colab.kernel.proxyPort(5000)")

print("\n" + "="*70)
print("‚úÖ FLASK APP DEPLOYED SUCCESSFULLY!")
print("="*70)
print(f"\nüåê PUBLIC URL: {public_url}")
print("\nüìù HOW TO USE:")
print("  1. ‚¨ÜÔ∏è  Click the URL above (it will open in new tab)")
print("  2. üìπ Enter a YouTube video URL")
print("  3. ‚ú® Click 'Summarize Video' button")
print("  4. ‚è≥ Wait 2-5 minutes for processing")
print("\n‚ö†Ô∏è  IMPORTANT: Keep this cell running!")
print("   (Don't stop this cell or the app will shut down)")
print("="*70 + "\n")

# ------------------------------------------------------------------
# Keep cell running forever
# ------------------------------------------------------------------
print("üîÑ Server is running... (Press ‚ñ† Stop button to shut down)\n")

try:
    while True:
        time.sleep(60)  # Keep alive
except KeyboardInterrupt:
    print("\nüõë Server stopped!")



üöÄ STEP 8: DEPLOYING FLASK APPLICATION

üßπ Cleaning up any existing Flask processes...
No process to kill
‚úÖ Port 5000 is now free


üöÄ Starting Flask Application...

üöÄ Initializing YouTube Summarizer Pipeline...

üì• Loading Whisper model...

‚öôÔ∏è  Initializing Whisper model...
  üì¶ Model: base
  üñ•Ô∏è  Device: cuda
  ‚úÖ Whisper model loaded successfully!

üì• Loading BART model...

‚öôÔ∏è  Initializing Summarization model...
  üì¶ Model: facebook/bart-large-cnn
  üñ•Ô∏è  Device: GPU
  üì• Downloading model (first use only) - ~1.6 GB


Device set to use cuda:0


  ‚úÖ Summarization model loaded successfully!

‚úÖ Pipeline initialized successfully!
‚úÖ Flask app ready!
üîÑ Starting Flask server...
 * Serving Flask app 'app'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:Press CTRL+C to quit


✅ Flask server started on port 5000

🌍 Creating public URL with Colab Proxy...

✅ FLASK APP DEPLOYED SUCCESSFULLY!

🌐 PUBLIC URL: https://5000-gpu-t4-s-3dqp48hp0uutu-b.asia-southeast1-1.prod.colab.dev

📝 HOW TO USE:
  1. ⬆️  Click the URL above (it will open in new tab)
  2. 📹 Enter a YouTube video URL
  3. ✨ Click 'Summarize Video' button
  4. ⏳ Wait 2-5 minutes for processing

⚠️  IMPORTANT: Keep this cell running!
   (Don't stop this cell or the app will shut down)

🔄 Server is running... (Press ■ Stop button to shut down)



INFO:werkzeug:127.0.0.1 - - [24/Dec/2025 12:48:10] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [24/Dec/2025 12:48:10] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -



üé¨ Processing: https://www.youtube.com/watch?v=mKdjycj-7eE&t=4s

üé¨ STARTING VIDEO PROCESSING PIPELINE

üîç Step 1: Validating YouTube URL...
  ‚úÖ URL is valid

üì• Step 2: Downloading audio...

üé¨ Processing: https://www.youtube.com/watch?v=mKdjycj-7eE&t=4s




üì• Downloading audio as: Stop_This_Train_Win_a_Lamborghini.mp3
[youtube] Extracting URL: https://www.youtube.com/watch?v=mKdjycj-7eE&t=4s
[youtube] mKdjycj-7eE: Downloading webpage




[youtube] mKdjycj-7eE: Downloading android sdkless player API JSON
[youtube] mKdjycj-7eE: Downloading web safari player API JSON




[youtube] mKdjycj-7eE: Downloading m3u8 information




[info] mKdjycj-7eE: Downloading 1 format(s): 251-24
[download] Destination: /content/youtube_summarizer/outputs/audio/Stop_This_Train_Win_a_Lamborghini.webm
[download] 100% of   17.52MiB in 00:00:00 at 41.33MiB/s  
[ExtractAudio] Destination: /content/youtube_summarizer/outputs/audio/Stop_This_Train_Win_a_Lamborghini.mp3
Deleting original file /content/youtube_summarizer/outputs/audio/Stop_This_Train_Win_a_Lamborghini.webm (pass -k to keep)

‚úÖ Audio downloaded successfully!
  üìù Title: Stop This Train, Win a Lamborghini
  üíæ Saved as: Stop_This_Train_Win_a_Lamborghini.mp3
  ‚è±Ô∏è  Duration: 19min 0sec
  üìÅ Full path: /content/youtube_summarizer/outputs/audio/Stop_This_Train_Win_a_Lamborghini.mp3

üé§ Step 3: Transcribing audio...

üé§ Transcribing audio...
  üìÅ File: Stop_This_Train_Win_a_Lamborghini.mp3
[00:00.000 --> 00:03.840]  Sitting on these train tracks is a quarter million dollar Lamborghini,
[00:03.840 --> 00:07.360]  and barreling down on it right now is a massiv

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[18:30.880 --> 18:31.720]  Yes!
[18:34.020 --> 18:35.520]  Oh my God.
[18:35.520 --> 18:37.820]  This is your new Lamborghini Blake.
[18:37.820 --> 18:39.340]  Not only did he win the Lamborghini,
[18:39.340 --> 18:40.980]  but he also got all this money.
[18:40.980 --> 18:41.980]  Yes!
[18:41.980 --> 18:42.820]  That's right.
[18:42.820 --> 18:43.660]  I didn't.
[18:43.660 --> 18:45.580]  Go to the best white in this video Blake.
[18:45.580 --> 18:46.420]  What?
[18:46.420 --> 18:47.540]  I've been going to World War and buying
[18:47.540 --> 18:48.660]  these for those right now.
[18:48.660 --> 18:50.620]  It's the best tasty chocolate on the planet.
[18:50.620 --> 18:51.740]  Third time's the charm.
[18:51.740 --> 18:52.740]  See you guys later.
[18:52.740 --> 18:53.580]  Yes!
[18:55.260 --> 18:56.580]  Mr. B6000.

‚úÖ Transcription complete!
  üìä Total segments: 580
  üìù Total characters: 19050
  üî§ Total words: 3597
  üíæ Saved transcript: /content/youtube_summarizer/output

INFO:werkzeug:127.0.0.1 - - [24/Dec/2025 12:55:25] "POST /summarize HTTP/1.1" 200 -


    ‚úÖ Chunk 10 summarized successfully

✅ Summarization complete!
  📊 Summary length: 2345 characters (435 words)
  🗜️  Compression ratio: 8.1x
  💾 Saved summary: /content/youtube_summarizer/outputs/summaries/Stop_This_Train_Win_a_Lamborghini_summary.txt

✅ PIPELINE COMPLETE!

📊 Final Results:
  🎬 Video: Stop This Train, Win a Lamborghini
  ⏱️  Duration: 19min 0sec
  📝 Transcript: 3597 words
  📄 Summary: 435 words
  🗜️  Compression: 8.1x

💾 Files saved:
  🎵 Audio: /content/youtube_summarizer/outputs/audio/Stop_This_Train_Win_a_Lamborghini.mp3
  📝 Transcript: /content/youtube_summarizer/outputs/transcripts/Stop_This_Train_Win_a_Lamborghini_transcript.txt
  📄 Summary: /content/youtube_summarizer/outputs/summaries/Stop_This_Train_Win_a_Lamborghini_summary.txt

🛑 Server stopped!
