<a href="https://colab.research.google.com/github/Ishashianand/Ishashianand/blob/main/YouTube_Video_Summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from IPython import get_ipython
import subprocess
import sys

# Probe for the third-party packages and pip-install them when any is missing
# (e.g. on a fresh Colab runtime), then retry the imports.
try:
    import gradio as gr
    import youtube_transcript_api
    import transformers
except ImportError:
    subprocess.check_call([sys.executable, '-m', 'pip', 'install',
                          'youtube-transcript-api', 'gradio', 'transformers', 'torch'])
    import gradio as gr
    import youtube_transcript_api
    import transformers

# Bind the names the rest of the cell uses on BOTH paths. Previously `pipeline`
# was only imported after a fresh install, so a kernel that already had the
# packages hit a NameError when the summarizer was constructed.
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline

import re
import urllib.parse
from youtube_transcript_api import YouTubeTranscriptApi

class VideoSummarizer:
    """Turn a YouTube video's transcript into a structured Markdown summary.

    The transcript is fetched with ``YouTubeTranscriptApi``, cleaned, split
    into chunks, summarized chunk by chunk with the
    ``facebook/bart-large-cnn`` pipeline, and assembled into a titled report.
    """

    # Path prefixes whose following segment is the video ID
    # (e.g. ``/embed/<id>``, ``/shorts/<id>``).
    _ID_PATH_PREFIXES = ("embed", "shorts", "live", "v")

    def __init__(self):
        """Load the summarization pipeline (prints progress to stdout)."""
        print("Loading summarization model...")
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        print("Model loaded successfully!")

    def extract_video_id(self, youtube_url):
        """Extract the video ID from a YouTube URL.

        Handles ``youtu.be/<id>`` short links (with or without a query string
        or trailing slash), ``...?v=<id>`` watch URLs, and
        ``/embed/``, ``/shorts/``, ``/live/`` and ``/v/`` paths. A missing
        scheme is tolerated.

        Returns:
            The video ID string, or ``None`` when no ID can be found.
        """
        if not isinstance(youtube_url, str):
            return None
        url = youtube_url.strip()
        if not url:
            return None
        # urlparse only recognises the host when a scheme is present.
        if "://" not in url:
            url = "https://" + url.lstrip("/")

        try:
            parsed = urllib.parse.urlparse(url)
            host = (parsed.hostname or "").lower()
        except ValueError:
            # Raised by urlparse for malformed network locations.
            return None

        segments = [part for part in parsed.path.split("/") if part]

        # Decide on the parsed host rather than a substring test, so that
        # "...watch?v=ID&feature=youtu.be" is not mistaken for a short link.
        if host == "youtu.be" or host.endswith(".youtu.be"):
            return segments[0] if segments else None

        query_ids = urllib.parse.parse_qs(parsed.query).get("v")
        if query_ids:
            return query_ids[0]

        if len(segments) >= 2 and segments[0] in self._ID_PATH_PREFIXES:
            return segments[1]
        return None

    def get_transcript(self, video_id):
        """Fetch the transcript for ``video_id`` and join its entries with spaces.

        Raises:
            Exception: if the transcript cannot be fetched; the original error
                is attached as ``__cause__``.
        """
        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
            return " ".join(entry["text"] for entry in transcript_list)
        except Exception as e:
            raise Exception(f"Error fetching transcript: {str(e)}") from e

    def clean_transcript(self, transcript):
        """Drop everything except word characters, whitespace and ``.,!?``,
        then collapse runs of whitespace to single spaces."""
        cleaned = re.sub(r'[^\w\s.,!?]', '', transcript)
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        return cleaned

    def chunk_text(self, text, max_chunk_size=1000):
        """Split ``text`` on whitespace into chunks of whole words.

        Each word is counted as ``len(word) + 1`` and a chunk is closed before
        the running count would exceed ``max_chunk_size``. A single word longer
        than the limit becomes a chunk of its own.

        Returns:
            A list of non-empty chunk strings (empty list for blank input).
        """
        chunks = []
        current_chunk = []
        current_size = 0

        for word in text.split():
            word_size = len(word) + 1
            # Flush only when there is something to flush; otherwise an
            # over-long first word would produce an empty leading chunk.
            if current_chunk and current_size + word_size > max_chunk_size:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_size = 0
            current_chunk.append(word)
            current_size += word_size

        if current_chunk:
            chunks.append(" ".join(current_chunk))
        return chunks

    def summarize_chunk(self, chunk):
        """Summarize one chunk of text with the loaded pipeline.

        Raises:
            Exception: if the pipeline call fails; the original error is
                attached as ``__cause__``.
        """
        try:
            summary = self.summarizer(chunk, max_length=130, min_length=30, do_sample=False)
            return summary[0]['summary_text']
        except Exception as e:
            raise Exception(f"Error during summarization: {str(e)}") from e

    def generate_summary(self, youtube_url, progress=gr.Progress()):
        """Produce the formatted summary for a YouTube URL.

        Args:
            youtube_url: URL of the video to summarize.
            progress: progress tracker; called with a completion fraction and
                a ``desc`` label at each stage.

        Returns:
            The Markdown report built by ``format_summary_as_book``.

        Raises:
            Exception: wrapping any failure (invalid URL, transcript fetch,
                empty transcript, summarization); the original error is
                attached as ``__cause__``.
        """
        try:
            video_id = self.extract_video_id(youtube_url)
            if not video_id:
                raise ValueError("Invalid YouTube URL")

            progress(0.2, desc="Fetching transcript...")
            transcript = self.get_transcript(video_id)
            cleaned_transcript = self.clean_transcript(transcript)

            progress(0.4, desc="Processing transcript...")
            chunks = self.chunk_text(cleaned_transcript)
            if not chunks:
                # Fail loudly instead of returning a report with empty sections.
                raise ValueError("Transcript is empty after cleaning")

            summaries = []
            total = len(chunks)
            for i, chunk in enumerate(chunks):
                # Summarization spans the 0.4 - 0.9 part of the progress bar.
                progress_val = 0.4 + (0.5 * i / total)
                progress(progress_val, desc=f"Summarizing part {i+1} of {total}...")
                summaries.append(self.summarize_chunk(chunk))

            final_summary = " ".join(summaries)

            progress(0.9, desc="Formatting output...")
            formatted_summary = self.format_summary_as_book(final_summary)

            progress(1.0, desc="Done!")
            return formatted_summary

        except Exception as e:
            raise Exception(f"Error generating summary: {str(e)}") from e

    @staticmethod
    def _with_terminator(sentence):
        """Return ``sentence`` ending in ``.``, ``!`` or ``?`` (adds ``.`` if needed)."""
        return sentence if sentence.endswith(('.', '!', '?')) else sentence + '.'

    def format_summary_as_book(self, summary, title="Video Summary"):
        """Lay the summary out as Markdown: title, executive summary, up to five
        numbered key points, and a conclusion made of the last two sentences.

        Sentences are obtained by splitting on ``'. '``; blank fragments are
        discarded before numbering so the key points are numbered 1..n without
        gaps.
        """
        sentences = [part.strip() for part in summary.split('. ') if part.strip()]

        key_points = [
            f"{i}. {self._with_terminator(sentence)}"
            for i, sentence in enumerate(sentences[:5], 1)
        ]

        conclusion = '. '.join(sentences[-2:])
        if conclusion:
            conclusion = self._with_terminator(conclusion)

        title_section = f"# {title}\n\n"
        executive_summary_section = f"## Executive Summary\n{summary}\n\n"
        key_points_section = "## Key Points\n" + "\n".join(key_points) + "\n\n"
        conclusion_section = f"## Conclusion\n{conclusion}"

        return (
            title_section +
            executive_summary_section +
            key_points_section +
            conclusion_section
        )

def create_gradio_interface():
    """Build the Gradio UI around a new ``VideoSummarizer``.

    Constructs the summarizer (which loads the model), wires its
    ``generate_summary`` method to a single URL textbox and a Markdown output,
    and returns the ``gr.Interface``. The caller is responsible for launching.
    """
    video_summarizer = VideoSummarizer()

    # Single text input for the video URL.
    url_box = gr.Textbox(
        label="YouTube URL",
        placeholder="Enter YouTube video URL...",
        info="Paste a YouTube URL to generate a summary"
    )
    summary_view = gr.Markdown(label="Summary")

    blurb = (
        "This tool generates a comprehensive summary of YouTube videos. "
        "Just paste a YouTube URL and get a structured summary including key points and conclusions. "
        "Note: The video must have English subtitles/captions available."
    )
    # One clickable sample URL; results are not pre-computed (cache_examples=False).
    sample_rows = [["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]]

    return gr.Interface(
        fn=video_summarizer.generate_summary,
        inputs=[url_box],
        outputs=summary_view,
        title="YouTube Video Summarizer",
        description=blurb,
        examples=sample_rows,
        cache_examples=False,
        theme=gr.themes.Soft()
    )

# Check if running in Colab
def is_running_in_colab():
    """Report whether the active IPython shell looks like Google Colab.

    Returns:
        True when ``'google.colab'`` appears in ``str(get_ipython())``;
        False otherwise, including when the lookup itself fails (for example
        when ``get_ipython`` is not available).
    """
    try:
        return 'google.colab' in str(get_ipython())
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return False

# Entry point: build the UI, then launch it with settings suited to the host.
if __name__ == "__main__":
    iface = create_gradio_interface()
    if not is_running_in_colab():
        # Local run: Gradio defaults.
        iface.launch()
    else:
        # Colab: request a public share link and keep the cell attached for logs.
        iface.launch(debug=True, share=True)

Loading summarization model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Model loaded successfully!
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://f3f3616672444935e6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Created dataset file at: .gradio/flagged/dataset1.csv
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://f3f3616672444935e6.gradio.live


In [3]:
# Install required packages
!pip install youtube-transcript-api transformers torch gradio

import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline
import urllib.parse
import re

class YouTubeSummarizer:
    """Summarize a YouTube video from its transcript and format the result as Markdown.

    Uses ``YouTubeTranscriptApi`` for the transcript and the
    ``facebook/bart-large-cnn`` summarization pipeline for the text.
    """

    # Hosts accepted for watch/embed style URLs (subdomains are accepted too).
    _YOUTUBE_HOSTS = ("youtube.com", "youtube-nocookie.com")
    # Path prefixes whose following segment is the video ID.
    _ID_PATH_PREFIXES = ("embed", "shorts", "live", "v")

    def __init__(self):
        """Load the summarization pipeline (prints progress to stdout)."""
        print("Loading the summarization model...")
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        print("✓ Model loaded successfully!")

    def get_video_id(self, url):
        """Extract the video ID from a YouTube URL.

        Supports ``youtu.be/<id>`` short links and, on YouTube hosts,
        ``?v=<id>`` watch URLs plus ``/embed/``, ``/shorts/``, ``/live/`` and
        ``/v/`` paths. Query strings, fragments and a missing scheme are
        tolerated.

        Returns:
            The video ID string, or ``None`` if the URL is not recognised.
        """
        if not isinstance(url, str):
            return None
        candidate = url.strip()
        if not candidate:
            return None
        # urlparse only recognises the host when a scheme is present.
        if "://" not in candidate:
            candidate = "https://" + candidate.lstrip("/")

        try:
            parsed = urllib.parse.urlparse(candidate)
            host = (parsed.hostname or "").lower()
        except ValueError:
            # Raised by urlparse for malformed network locations.
            return None

        segments = [part for part in parsed.path.split("/") if part]

        # Match on the parsed host, not on substrings: "&feature=youtu.be" or
        # "?adv=1&v=ID" used to be mis-split by the string-based version.
        if host == "youtu.be" or host.endswith(".youtu.be"):
            return segments[0] if segments else None

        on_youtube = any(
            host == known or host.endswith("." + known)
            for known in self._YOUTUBE_HOSTS
        )
        if on_youtube:
            query_ids = urllib.parse.parse_qs(parsed.query).get("v")
            if query_ids:
                return query_ids[0]
            if len(segments) >= 2 and segments[0] in self._ID_PATH_PREFIXES:
                return segments[1]
        return None

    def get_transcript(self, video_id):
        """Fetch the transcript for ``video_id`` and join its entries with spaces.

        Raises:
            Exception: if the transcript cannot be fetched; the original error
                is attached as ``__cause__``.
        """
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            return " ".join(entry['text'] for entry in transcript)
        except Exception as e:
            raise Exception(f"⚠️ Error getting transcript: {str(e)}") from e

    def clean_text(self, text):
        """Keep only ASCII letters, digits, whitespace and ``.,!?``, then
        collapse runs of whitespace to single spaces."""
        text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
        return ' '.join(text.split())

    def split_into_chunks(self, text, max_words=500):
        """Split ``text`` into consecutive chunks of at most ``max_words`` words."""
        words = text.split()
        return [
            ' '.join(words[start:start + max_words])
            for start in range(0, len(words), max_words)
        ]

    def summarize(self, url, progress=gr.Progress()):
        """Generate the formatted summary for a YouTube URL.

        Args:
            url: URL of the video to summarize.
            progress: progress tracker; called with a completion fraction and
                a ``desc`` label at each stage.

        Returns:
            The Markdown summary, or a string starting with ``"Error: "``
            describing what went wrong (this method does not raise).
        """
        try:
            # Step 1: Extract video ID
            progress(0.1, desc="Getting video information...")
            video_id = self.get_video_id(url)
            if not video_id:
                raise ValueError("❌ Invalid YouTube URL")

            # Step 2: Get transcript
            progress(0.3, desc="Fetching video transcript...")
            transcript = self.get_transcript(video_id)

            # Step 3: Clean transcript
            progress(0.4, desc="Processing transcript...")
            cleaned_transcript = self.clean_text(transcript)

            # Step 4: Split into chunks
            chunks = self.split_into_chunks(cleaned_transcript)
            if not chunks:
                # Previously this fell through to format_summary and surfaced
                # as "Error: list index out of range".
                raise ValueError("The transcript is empty, so there is nothing to summarize")

            # Step 5: Summarize each chunk (spans 0.5 - 0.9 of the progress bar)
            summaries = []
            total = len(chunks)
            for i, chunk in enumerate(chunks):
                progress(0.5 + (0.4 * (i / total)), desc=f"Summarizing part {i+1} of {total}...")
                summary = self.summarizer(chunk, max_length=150, min_length=40, do_sample=False)
                summaries.append(summary[0]['summary_text'])

            # Step 6: Combine summaries and format
            progress(0.9, desc="Formatting final summary...")
            final_summary = self.format_summary(summaries)

            progress(1.0, desc="✨ Summary ready!")
            return final_summary

        except Exception as e:
            return f"Error: {str(e)}"

    @staticmethod
    def _with_terminator(sentence):
        """Return ``sentence`` ending in ``.``, ``!`` or ``?`` (adds ``.`` if needed)."""
        return sentence if sentence.endswith(('.', '!', '?')) else sentence + '.'

    def format_summary(self, summaries):
        """Combine chunk summaries into a Markdown report.

        The report has an executive summary (all chunk summaries joined), up
        to five numbered key points, and a conclusion built from the last two
        sentences. With no sentences at all the conclusion reads
        ``No conclusion available.`` instead of raising.
        """
        full_text = " ".join(summaries)

        # Split after sentence-ending punctuation that is followed by
        # whitespace, so decimals such as "3.5" stay in one piece.
        sentences = [
            piece.strip()
            for piece in re.split(r'(?<=[.!?])\s+', full_text)
            if piece.strip()
        ]

        key_point_lines = "".join(
            f"{i}. {self._with_terminator(sentence)}\n"
            for i, sentence in enumerate(sentences[:5], 1)
        )

        if sentences:
            conclusion = " ".join(self._with_terminator(s) for s in sentences[-2:])
        else:
            conclusion = "No conclusion available."

        return (
            "# 📚 Video Summary\n"
            "\n"
            "## 📝 Executive Summary\n"
            f"{full_text}\n"
            "\n"
            "## 🔑 Key Points\n"
            f"{key_point_lines}"
            "\n"
            "## 🎯 Conclusion\n"
            f"{conclusion}\n"
        )

def create_interface():
    """Build and return the Gradio UI for a new ``YouTubeSummarizer``.

    The interface has one URL textbox feeding ``YouTubeSummarizer.summarize``
    and renders the result as Markdown. It is returned without being launched.
    """
    youtube_summarizer = YouTubeSummarizer()

    # Single text input for the video URL.
    url_box = gr.Textbox(
        label="YouTube URL",
        placeholder="Paste your YouTube video URL here...",
        info="💡 Works best with videos that have English captions/subtitles"
    )
    summary_view = gr.Markdown(label="Generated Summary")
    # One clickable sample URL shown under the input.
    sample_rows = [["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]]

    return gr.Interface(
        fn=youtube_summarizer.summarize,
        inputs=[url_box],
        outputs=summary_view,
        title="📺 YouTube Video Summarizer",
        description="Transform any YouTube video into a comprehensive written summary. Simply paste the video URL below!",
        examples=sample_rows,
        theme=gr.themes.Soft(),
        # Hide the page footer.
        css="footer {display: none !important;}"
    )

# Launch the interface
# Runs only when this code is executed directly (script or notebook cell),
# not when it is imported as a module.
if __name__ == "__main__":
    # Constructing the interface also constructs YouTubeSummarizer, which loads the model.
    interface = create_interface()
    # share=True requests a public share link for the app.
    interface.launch(share=True)

Loading the summarization model...
✓ Model loaded successfully!
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ce4dc8c625532e719d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
# First, install required packages
!pip install -q youtube-transcript-api transformers torch gradio

import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline
import re

def get_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    Recognised forms (scheme optional, extra query parameters allowed):
      * ``youtu.be/<id>``
      * ``youtube.com/watch?v=<id>`` (including subdomains such as ``www.``/``m.``)
      * ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``
      * the same paths on ``youtube-nocookie.com``

    Args:
        url: the URL text entered by the user.

    Returns:
        The video ID, or ``None`` when ``url`` is not a string, is not on a
        YouTube host, or carries no well-formed ID.
    """
    # Local import: this cell imports ``re`` but not ``urllib``.
    from urllib.parse import parse_qs, urlparse

    if not isinstance(url, str):
        return None
    candidate = url.strip()
    if not candidate:
        return None
    # urlparse only recognises the host when a scheme is present.
    if "://" not in candidate:
        candidate = "https://" + candidate.lstrip("/")

    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Raised by urlparse for malformed network locations.
        return None

    segments = [part for part in parsed.path.split("/") if part]
    video_id = None

    # Check the host explicitly. The earlier regex accepted any "/" followed by
    # 11 id-like characters, so e.g. "https://stackoverflow.com/..." produced
    # the bogus ID "stackoverfl".
    if host == "youtu.be" or host.endswith(".youtu.be"):
        video_id = segments[0] if segments else None
    elif any(
        host == known or host.endswith("." + known)
        for known in ("youtube.com", "youtube-nocookie.com")
    ):
        query_ids = parse_qs(parsed.query).get("v")
        if query_ids:
            video_id = query_ids[0]
        elif len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            video_id = segments[1]

    if video_id and re.fullmatch(r'[0-9A-Za-z_-]{11}', video_id):
        return video_id
    return None

# Holds the loaded pipeline so the large model is created once per process
# instead of on every request.
_SUMMARIZER_CACHE = {}


def _get_summarizer():
    """Return the shared summarization pipeline, creating it on first use."""
    if "pipeline" not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE["pipeline"] = pipeline("summarization", model="facebook/bart-large-cnn")
    return _SUMMARIZER_CACHE["pipeline"]


def _split_on_word_boundaries(text, max_chars):
    """Split ``text`` into chunks of whole words, each at most ``max_chars`` long.

    A single word longer than ``max_chars`` becomes a chunk of its own.
    """
    chunks = []
    current_words = []
    current_len = 0
    for word in text.split():
        # +1 for the joining space, except for the first word of a chunk.
        needed = len(word) + (1 if current_words else 0)
        if current_words and current_len + needed > max_chars:
            chunks.append(" ".join(current_words))
            current_words = []
            current_len = 0
            needed = len(word)
        current_words.append(word)
        current_len += needed
    if current_words:
        chunks.append(" ".join(current_words))
    return chunks


def _split_sentences(text):
    """Split after ``.``/``!``/``?`` followed by whitespace; drop blank pieces.

    Requiring whitespace after the punctuation keeps decimals such as "3.5"
    in one piece.
    """
    return [piece.strip() for piece in re.split(r'(?<=[.!?])\s+', text) if piece.strip()]


def _with_terminator(sentence):
    """Return ``sentence`` ending in ``.``, ``!`` or ``?`` (adds ``.`` if needed)."""
    return sentence if sentence.endswith(('.', '!', '?')) else sentence + '.'


def summarize_video(url, progress=gr.Progress()):
    """Summarize the YouTube video at ``url`` and return a Markdown report.

    Steps: validate the URL, obtain the (cached) summarization pipeline, fetch
    the transcript, strip bracketed annotations and extra whitespace, split the
    text into word-aligned chunks, summarize each chunk, and format the result
    with an executive summary, up to five key points and a conclusion.

    Args:
        url: YouTube URL entered by the user.
        progress: progress tracker; called with a completion fraction and an
            optional ``desc`` label at each stage.

    Returns:
        The Markdown report, or a human-readable error string. This function
        does not raise; unexpected failures are reported in the returned text.
    """
    try:
        # Validate first: rejecting a bad URL should not cost a model load.
        video_id = get_video_id(url)
        if not video_id:
            return "Error: Invalid YouTube URL. Please check the URL and try again."

        progress(0.1, desc="Loading summarization model...")
        summarizer = _get_summarizer()

        # Get transcript
        progress(0.3, desc="Fetching video transcript...")
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            text = " ".join([t["text"] for t in transcript])
        except Exception as e:
            return f"Error: Could not fetch video transcript. Make sure the video has English subtitles enabled. Error details: {str(e)}"

        # Clean transcript: drop bracketed annotations, then normalise all
        # whitespace (including newlines) to single spaces.
        progress(0.4, desc="Processing transcript...")
        text = re.sub(r'\[.*?\]', '', text)
        text = ' '.join(text.split())

        # Keep chunks small for the model, but cut between words rather than
        # at a fixed character offset so no word is split in half.
        max_chunk = 1000
        chunks = _split_on_word_boundaries(text, max_chunk)
        if not chunks:
            return "Error: The video transcript is empty, so there is nothing to summarize."

        # Summarize each chunk (spans 0.6 - 0.9 of the progress bar)
        progress(0.6, desc="Generating summary...")
        summaries = []
        total = len(chunks)
        for i, chunk in enumerate(chunks):
            summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)
            summaries.append(summary[0]['summary_text'])
            progress(0.6 + (0.3 * (i + 1) / total))

        final_summary = " ".join(summaries)

        # Key points are simply the first (up to) five sentences.
        sentences = _split_sentences(final_summary)
        key_point_lines = "".join(
            f"{i}. {_with_terminator(point)}\n"
            for i, point in enumerate(sentences[:5], 1)
        )
        conclusion = _with_terminator(sentences[-1]) if sentences else 'No conclusion available.'

        progress(0.9, desc="Formatting output...")
        output = (
            "# Video Summary\n"
            "\n"
            "## Executive Summary\n"
            f"{final_summary}\n"
            "\n"
            "## Key Points\n"
            f"{key_point_lines}"
            "\n"
            "## Conclusion\n"
            f"{conclusion}"
        )

        progress(1.0, desc="Done!")
        return output

    except Exception as e:
        return f"An error occurred: {str(e)}\nPlease try again with a different video."

# Build the Gradio UI around summarize_video, then serve it.
iface = gr.Interface(
    title="YouTube Video Summarizer",
    description="Get a quick summary of any YouTube video with English subtitles.",
    fn=summarize_video,
    # Single text input for the video URL.
    inputs=gr.Textbox(
        label="YouTube URL",
        placeholder="Paste YouTube URL here...",
        info="Note: Video must have English subtitles/captions available"
    ),
    outputs=gr.Markdown(label="Summary"),
    # One clickable sample URL shown under the input.
    examples=[["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]],
    theme=gr.themes.Soft()
)

# share=True requests a public link; debug=True keeps the cell running so
# errors and logs stay visible.
iface.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://d432cda05465ce34eb.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Your max_length is set to 130, but your input_length is only 106. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)
Your max_length is set to 150, but your input_length is only 99. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
