## Importing the required libraries

In [2]:
from tkinter import *
from tkinter import messagebox

In [3]:
from youtube_transcript_api import YouTubeTranscriptApi

In [4]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

In [5]:
from heapq import nlargest

In [6]:
from datetime import date
import time

## Fetching the transcripts

In [7]:
def get_captions(video_url):
    """Look up the transcripts available for a YouTube video.

    The video ID is read from the URL's ``v`` query parameter, so extra
    parameters such as ``&t=42s`` or ``&list=...`` do not leak into the ID and
    the position of ``v`` among the parameters does not matter.  Short
    ``youtu.be/<id>`` links and ``/embed/<id>``, ``/shorts/<id>``,
    ``/live/<id>`` and ``/v/<id>`` paths are accepted as well.

    Args:
        video_url: The video URL as text.

    Returns:
        Whatever ``YouTubeTranscriptApi.list_transcripts`` returns for the ID.

    Raises:
        ValueError: If no video ID can be found in ``video_url``.
    """
    # Local import: the notebook's import cells do not bring in urllib.
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(video_url.strip())
    vdo_id = parse_qs(parsed.query).get('v', [''])[0]
    if not vdo_id:
        # No ?v=... parameter: fall back to the ID carried in the URL path.
        segments = [part for part in parsed.path.split('/') if part]
        if parsed.netloc.lower().endswith('youtu.be') and segments:
            vdo_id = segments[0]
        elif len(segments) >= 2 and segments[0] in ('embed', 'shorts', 'live', 'v'):
            vdo_id = segments[1]
    if not vdo_id:
        raise ValueError("Could not find a YouTube video ID in: %r" % (video_url,))
    return YouTubeTranscriptApi.list_transcripts(vdo_id)

In [8]:
def manual_or_auto(transcript_list):
    """Pick an English transcript, preferring a manually created one.

    Args:
        transcript_list: Object exposing ``find_manually_created_transcript``
            and ``find_generated_transcript`` (as returned by ``get_captions``).

    Returns:
        The manually created transcript for ``en-US``/``en`` when that lookup
        succeeds; otherwise the result of looking up a generated ``en``
        transcript.

    Raises:
        Whatever ``find_generated_transcript`` raises when the fallback lookup
        fails as well.
    """
    try:
        return transcript_list.find_manually_created_transcript(['en-US', 'en'])
    except Exception:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit are not swallowed and turned into a fallback lookup.
        return transcript_list.find_generated_transcript(['en'])

In [9]:
def fetching(ts):
    """Download a transcript and flatten it into one line of text.

    Args:
        ts: Object whose ``fetch()`` returns an iterable of entries, each
            indexable with the key ``'text'``.

    Returns:
        Every entry's text followed by a single space, concatenated in order,
        with each backslash-space pair and each newline replaced by one space.
    """
    transcripts = ts.fetch()
    # One join instead of repeated `+=`; each entry keeps its trailing space.
    text = "".join(txt['text'] + " " for txt in transcripts)
    # Backslash followed by a space, written with an explicit escape: the
    # unescaped form is an invalid escape sequence that newer Pythons warn on.
    text = text.replace("\\ ", " ")
    text = text.replace("\n", " ")
    return text

## NLP Model

In [10]:
def model(text):
    """Run the ``en_core_web_sm`` spaCy pipeline over ``text``.

    The pipeline is loaded on every call and applied to ``text``; the
    resulting document is returned.
    """
    return spacy.load('en_core_web_sm')(text)

## Tokenization

In [11]:
def tokenization(doc):
    """Collect the ``text`` of every item in ``doc`` into a list, in order."""
    tokens = []
    for item in doc:
        tokens.append(item.text)
    return tokens

## Recording Word Frequency 

In [12]:
# Module-level placeholder kept so the name stays defined for any other cell;
# word_freq_table does not read it.
punctuations =''
def word_freq_table(doc,tokens):
    """Count how often each meaningful word occurs in ``doc``.

    Words are compared in lower case.  A token is skipped when it is a stop
    word or when it consists only of punctuation and/or whitespace characters.
    The check is per character, so runs such as ``...`` and whitespace tokens
    are ignored instead of being counted as words.

    Args:
        doc: Iterable of tokens exposing a ``text`` attribute.
        tokens: Unused; kept so existing callers keep working.

    Returns:
        dict mapping each lower-cased word to its number of occurrences.
    """
    stopwords = set(STOP_WORDS)  # set membership instead of scanning a list
    ignorable = set(punctuation)
    word_frequencies = {}
    for word in doc:
        lowered = word.text.lower()
        if lowered in stopwords:
            continue
        # all() is True for an empty string, so empty tokens are skipped too.
        if all(ch in ignorable or ch.isspace() for ch in lowered):
            continue
        word_frequencies[lowered] = word_frequencies.get(lowered, 0) + 1
    return word_frequencies

In [13]:
def normalization(word_frequencies):
    """Scale every count by the largest count, in place.

    Args:
        word_frequencies: dict mapping words to counts.  It is modified in
            place.

    Returns:
        The same dict object, with each value divided by the maximum value.
        An empty dict is returned unchanged instead of raising ``ValueError``
        from ``max()``.
    """
    if not word_frequencies:
        return word_frequencies
    max_freq = max(word_frequencies.values())
    # Only values are reassigned, so iterating the dict while updating is safe.
    for w in word_frequencies:
        word_frequencies[w] = word_frequencies[w] / max_freq
    return word_frequencies

## Sentence Scoring

In [14]:
def sentence_scoring(doc,word_frequencies):
    """Score each sentence by summing the weights of the words it contains.

    Args:
        doc: Object whose ``sents`` attribute yields the sentences; each
            sentence is iterable over tokens that have a ``text`` attribute.
        word_frequencies: Mapping of lower-cased word to weight.

    Returns:
        A pair ``(sentence_tokens, sentence_scores)``: the list of all
        sentences, and a dict from sentence to summed weight.  Sentences
        without any weighted word get no entry in the dict.
    """
    sentence_tokens = list(doc.sents)
    sentence_scores = {}
    for sentence in sentence_tokens:
        for token in sentence:
            key = token.text.lower()
            if key not in word_frequencies:
                continue
            if sentence in sentence_scores:
                sentence_scores[sentence] += word_frequencies[key]
            else:
                sentence_scores[sentence] = word_frequencies[key]
    return sentence_tokens,sentence_scores

## Summarization and Length Regulation

In [15]:
def get_summary(sentence_tokens,sentence_scores,percent):
    """Build the summary from the highest-scoring sentences.

    Args:
        sentence_tokens: All sentences, in their original order.
        sentence_scores: dict mapping sentences to their scores.
        percent: Desired summary length as a percentage (1-100) of the number
            of sentences; anything ``int()`` accepts, e.g. ``"30"``.

    Returns:
        The selected sentences' ``text`` joined by single spaces.  Sentences
        appear in the order they have in ``sentence_tokens`` so the summary
        reads in the same sequence as the transcript.  At least one sentence
        is selected whenever any sentence has a score.

    Raises:
        ValueError: If ``percent`` is not an integer value or lies outside
            1-100.
    """
    pct = int(percent)
    if not 0 < pct <= 100:
        raise ValueError("percent must be between 1 and 100, got %r" % (percent,))
    # Integer arithmetic avoids float rounding (e.g. 0.29 * 100 -> 28.99...).
    req_len = max(1, len(sentence_tokens) * pct // 100)
    chosen = nlargest(req_len, sentence_scores, key = sentence_scores.get)
    # nlargest returns sentences by descending score; restore document order.
    position = {sent: idx for idx, sent in enumerate(sentence_tokens)}
    chosen.sort(key=lambda sent: position.get(sent, len(position)))
    final_summary = ' '.join(sent.text for sent in chosen)
    return final_summary

In [16]:
def summarizer(video_url,percent):
    """Summarize a video's transcript and save the result as a text file.

    Steps: fetch the transcript, run the NLP pipeline, score the sentences,
    build the summary and write it to ``Documents/Summary<date><time>.txt``
    relative to the current working directory.  Every failure is reported
    through a message box and ends the run at that point, so a failed
    transcript lookup no longer falls through to the unrelated
    "% Length invalid" warning.

    Args:
        video_url: URL of the video, as typed by the user.
        percent: Desired summary length in percent, as typed by the user.

    Returns:
        None.  The outcome is communicated through message boxes and the
        written file.
    """
    # Local import: the notebook's import cells do not bring in os.
    import os

    try:
        transcript_list = get_captions(video_url)
        ts = manual_or_auto(transcript_list)
        text = fetching(ts)
    except Exception:
        messagebox.showerror("No transcripts found", "Either no transcripts available or invalid link")
        return

    doc = model(text)
    tokens = tokenization(doc)
    word_frequencies = word_freq_table(doc,tokens)
    if not word_frequencies:
        # Nothing but stop words/punctuation: there is nothing to normalise.
        messagebox.showwarning("Nothing to summarize","The transcript contains no words to summarize")
        return
    word_freq = normalization(word_frequencies)
    sentence_tokens,sentence_scores = sentence_scoring(doc,word_freq)

    try:
        summary = get_summary(sentence_tokens,sentence_scores,percent)
    except (TypeError, ValueError):
        messagebox.showwarning("% Length invalid","The summary length % is either empty or invalid")
        return

    tday = date.today()
    curr = time.strftime("%H%M%S", time.localtime())
    out_dir = "Documents"
    try:
        # The folder is relative to the working directory and may not exist.
        os.makedirs(out_dir, exist_ok=True)
        # Explicit UTF-8 so non-ASCII transcript characters can be written
        # regardless of the platform's default encoding.
        with open("Documents/Summary" + str(tday) + curr + ".txt","w",encoding="utf-8") as file:
            file.write(summary)
    except OSError:
        messagebox.showerror("Could not save summary","The summary file could not be written to the Documents folder")
        return
    messagebox.showinfo("Summarization Complete","Please check your Documents folder")

## GUI Application

In [None]:
# Main window.
root = Tk()
root.geometry("500x280")
root.title("Video Transcript Summarizer")
root.configure(background='lightblue')
# The window icon is optional: a missing or unreadable image file must not
# stop the application from starting.
try:
    icon = PhotoImage(file = 'S-icon.png')
except TclError:
    icon = None
else:
    root.iconphoto(False, icon)

titlelbl = Label(root,text="Video Transcript Summarizer",font=("Times New Roman",20,"italic"),background='lightblue')
titlelbl.place(x=75,y=20)

# Input field for the video URL and its caption.
link = Entry(root, width=55,borderwidth=2)
link.place(x=140,y=100)

urllbl = Label(root,text="Video URL:",font=("Times New Roman",14,"italic"),background='lightblue')
urllbl.place(x=20,y=100)

# Input field for the summary length (percent) and its captions.
percentlbl = Label(root,text="% Length:",font=("Times New Roman",14,"italic"),background='lightblue')
percentlbl.place(x=35,y=150)

percent = Entry(root, width=10,borderwidth=2)
percent.place(x=140,y=150)

suggestlbl = Label(root,text="(Suggested: 30)",font=("Times New Roman",12,"italic"),background='lightblue')
suggestlbl.place(x=220,y=150)

plbl = Label(root,text="(Summary length % compared to original text between 10 to 100)",font=("Times New Roman",12,"italic")
             ,background='lightblue')
plbl.place(x=15,y=180)

def clear_entry():
    """Empty both input fields (video URL and summary length)."""
    link.delete(0,END)
    percent.delete(0,END)

# The lambda reads both fields at click time, not at widget creation time.
s = Button(root, text="Summarize", command=lambda: summarizer(link.get(),percent.get()),padx=20)
s.place(x=140,y=230)

c = Button(root, text="Clear", command=clear_entry,padx=15)
c.place(x=270,y=230)

# Blocks this cell until the window is closed.
root.mainloop()