In [None]:
#!pip install openai-whisper transformers pytube pydub torch

In [None]:
import whisper
from pydub import AudioSegment
import os
from pytube import YouTube
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import math


# Function to convert audio file to WAV format if it's not already in that format
def convert_to_wav(audio_file):
    """Return the path of a WAV version of *audio_file*.

    A path whose extension is already ``.wav`` (compared case-insensitively)
    is returned unchanged. Any other file is loaded with pydub and exported
    next to the original, with the extension replaced by ``.wav``.

    Args:
        audio_file: Path to the input audio file.

    Returns:
        The path of the WAV file: either the input path itself or the newly
        exported ``<name>.wav``.
    """
    stem, ext = os.path.splitext(audio_file)

    # Nothing to do when the extension already says WAV.
    if ext.lower() == '.wav':
        return audio_file

    decoded = AudioSegment.from_file(audio_file)
    target_path = f"{stem}.wav"
    decoded.export(target_path, format="wav")
    return target_path

# Function to download audio from a YouTube video
def download_audio_from_youtube(url):
    """Download the audio track of a YouTube video and return it as WAV.

    The first audio-only stream reported for the video is downloaded to
    ``audio.mp4`` and then passed through ``convert_to_wav``.

    Args:
        url: URL of the YouTube video.

    Returns:
        Path of the WAV file produced from the downloaded audio.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(url)
    audio_stream = yt.streams.filter(only_audio=True).first()

    # An empty stream query yields None; fail with a clear message instead of
    # an AttributeError on ``None.download``.
    if audio_stream is None:
        raise ValueError(f"No audio-only stream available for {url!r}")

    audio_file = audio_stream.download(filename='audio.mp4')

    return convert_to_wav(audio_file)

# Function to transcribe audio using Whisper
def transcribe_audio_with_whisper(audio_file):
    """Transcribe *audio_file* with the Whisper ``base`` model.

    The model is loaded on every call. The input is passed through
    ``convert_to_wav`` before transcription.

    Args:
        audio_file: Path to the audio file to transcribe.

    Returns:
        The ``"text"`` entry of the Whisper transcription result.
    """
    # Load the model first, then make sure the input is a WAV file.
    asr_model = whisper.load_model("base")
    wav_path = convert_to_wav(audio_file)

    transcription_result = asr_model.transcribe(wav_path)
    return transcription_result["text"]

# Function to summarize text using Pegasus-XSUM
def summarize_text_with_pegasus(text):
    """Summarize *text* with the ``google/pegasus-xsum`` model.

    The tokenizer and model are loaded on every call. The input is tokenized
    with truncation enabled, so the summary is based on the (possibly
    truncated) token sequence. Summary length bounds are derived from the
    number of input tokens: at least one fifth and at most one third of it.

    Args:
        text: The text to summarize.

    Returns:
        The generated summary, decoded with special tokens removed.
    """
    # Load the Pegasus tokenizer and model
    model_name = "google/pegasus-xsum"
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name)

    # Tokenize the input text
    inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")

    # Length bounds are measured in tokens (not words) of the encoded input.
    # The minimum must not exceed the maximum, so the smaller fraction (1/5)
    # is the lower bound and the larger fraction (1/3) is the upper bound.
    total_tokens = len(inputs["input_ids"][0])
    min_length = math.ceil(total_tokens / 5)
    max_length = math.ceil(total_tokens / 3)

    # Generate the summary; pass the attention mask produced by the tokenizer
    # alongside the input ids.
    summary_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        num_beams=4,
        min_length=min_length,
        max_length=max_length,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

# URL of the YouTube video whose audio is transcribed and summarized below
youtube_url ="https://youtu.be/pMX2cQdPubk?si=KiU_9GCc54-Air6c"


# Download the audio from the YouTube video; the helper returns the path of
# the WAV file it converts the download to
audio_file = download_audio_from_youtube(youtube_url)

# Transcribe the downloaded audio file with Whisper (returns the text only)
transcription = transcribe_audio_with_whisper(audio_file)

# Summarize the transcription with Pegasus-XSUM
summary = summarize_text_with_pegasus(transcription)


# Print the transcription and its summary; print() puts its default separator
# (a space) between the heading and the text, so each body starts with a space
print("Transcription:\n", transcription)
print("\nSummary:\n", summary)

100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 123MiB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]



Transcription:
  Alright, Tim. Great to see you. Thanks for spending the time. Thank you for coming. So you have the WWDC keynote today. Yeah. Which was really fun to watch as a whole two hours. I want to zoom all the way out, because obviously a lot of talk about AI in general. Yeah. And I'm kind of just left wondering how Apple defines AI in general. Because I know if you ask a regular person, you might hear about generative AI, chat bots, things like that. And these are relatively new additions to Apple's AI portfolio. How do you look at AI as Apple? Well, we've been executing with AI for a long time. Right. It's that you're wearing a watch. It's at the root of the watch. I mean, you think about things like crash detection, fall detection, things like a fib, and all of this kind of stuff is machine learning at the end of the day. And so, but what has captured people's imagination is generative AI. And we see it as the opportunity for a whole new curve of technology and providing, an