In [1]:
import youtube_transcript_api
from youtube_transcript_api import YouTubeTranscriptApi
import nltk
import re
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
#nltk.download('punkt_tab')
import numpy as np
import transformers
from transformers import BartTokenizer, BartForConditionalGeneration

In [16]:
# get the link
link = "https://www.youtube.com/watch?v=e049IoFBnLA" 
# get the unique ID
unique_id = link.split("=")[-1]
# get the transcript of the video based on the unique ID
sub = YouTubeTranscriptApi.get_transcript(unique_id)  
# get only the "text" parts and join them together, ignoring others like "duration"
subtitle = " ".join([x['text'] for x in sub])

In [17]:
print(subtitle)
print(repr(subtitle))  # This will display the raw text with all special characters

Hello everyone. Hello everyone. Some of you dear contestants 
are very young so perhaps you do not know who   Professor Terence Tao is. Just a few words of 
introduction: He participated at the IMO for   the first time when he was 11 years old and he 
received a bronze medal. The next year he came   back and he received a silver medal. After that 
at the age of 13 he received a gold medal and   he was the youngest participant to receive a 
gold medal. Then he went to university and he   didn't participate at the IMO anymore. Now he 
is professor at the University of California   LA and I can say that he is definitely the 
biggest IMO star and of course one of the   most influential mathematicians of our time. 
Especially for you: Professor Terence Tao. Thank you. I'm very happy to be back here 
at the IMO. The time I had at the IMO was   one of the most fun times of my life. I still 
look back on it fondly. I hope we all had fun,   not just in the competition whether you 
get a good sc

In [18]:
subtitle = subtitle.replace("\\", "")
sentences = sent_tokenize(subtitle)
#print(sentences)
organized_sent = {k:v for v,k in enumerate(sentences)}
tf_idf = TfidfVectorizer(min_df=2, 
                                    strip_accents='unicode',
                                    max_features=None,
                                    lowercase = True,
                                    token_pattern=r'w{1,}',
                                    ngram_range=(1, 3), 
                                    use_idf=True,
                                    smooth_idf=True,
                                    sublinear_tf=True,
                                    stop_words = 'english')

sentence_vectors = tf_idf.fit_transform(sentences)
sent_scores = np.array(sentence_vectors.sum(axis=1)).ravel()


In [19]:
N = 3
top_n_sentences = [sentences[index] for index in np.argsort(sent_scores, axis=0)[::-1][:N]]


In [20]:
# mapping the scored sentences with their indexes as in the subtitle
mapped_sentences = [(sentence,organized_sent[sentence]) for sentence in top_n_sentences]
# Ordering the top-n sentences in their original order
mapped_sentences = sorted(mapped_sentences, key = lambda x: x[1])
ordered_sentences = [element[0] for element in mapped_sentences]
# joining the ordered sentence
summary = " ".join(ordered_sentences)

In [21]:
print(summary)

They   found that after training the neural network 
you could give it all the hyperbolic geometry   invariants and like 90% of the time it will 
predict - it will guess the right signature. But if you already know sort of how the 
code works it saves you a lot of time. But we'll be able to do 
lots of things that we can't do right now.


In [8]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [22]:
# encode the subtitle using Bart tokenizer
input_tensor = tokenizer.encode( subtitle, return_tensors="pt", max_length=512)


In [23]:
outputs_tensor = model.generate(input_tensor, max_length=1024, min_length=120, length_penalty=2.0, num_beams=4, early_stopping=True)
#outputs_tensor

In [24]:
print(tokenizer.decode(outputs_tensor[0]))


</s><s>Professor Terence Tao is one of the most influential mathematicians of our time. He participated at the IMO for the first time when he was 11 years old. He was the youngest participant to receive a gold medal. His talk is on AI and more generally machine assistance in mathematics. It's all very exciting and it's beginning to be transformative, but on the other   hand there's also a sense of continuity, he says. He will be talking more about how these tools are beginning to change research mathematics which is different from competition mathematics. The talk will be followed by a presentation on the AI Math Olympia right after my talk.</s>


In [56]:
def summarize(link):
    unique_id = link.split("=")[-1]
    sub = YouTubeTranscriptApi.get_transcript(unique_id)
    subtitle = " ".join([x['text'] for x in sub])
    subtitle = subtitle.replace("\\", "")
    sentences = sent_tokenize(subtitle)
    organized_sent = {k:v for v,k in enumerate(sentences)}
    tf_idf = TfidfVectorizer(min_df=2, 
                                    strip_accents='unicode',
                                    max_features=None,
                                    lowercase = True,
                                    token_pattern=r'w{1,}',
                                    ngram_range=(1, 3), 
                                    use_idf=True,
                                    smooth_idf=True,
                                    sublinear_tf=True,
                                    stop_words = 'english')

    sentence_vectors = tf_idf.fit_transform(sentences)
    sent_scores = np.array(sentence_vectors.sum(axis=1)).ravel()
    N = 3
    top_n_sentences = [sentences[index] for index in np.argsort(sent_scores, axis=0)[::-1][:N]]
    mapped_sentences = [(sentence,organized_sent[sentence]) for sentence in top_n_sentences]
    mapped_sentences = sorted(mapped_sentences, key = lambda x: x[1])
    ordered_sentences = [element[0] for element in mapped_sentences]
    summary = " ".join(ordered_sentences)
    return summary
    