In [126]:
import sys
import numpy as np
import pandas as pd
import requests 
import urllib
import re
import string
from nltk.stem import WordNetLemmatizer
from youtube_transcript_api import YouTubeTranscriptApi

pattern = "watch\?v=(.*)"
lemmatizer = WordNetLemmatizer()

def get_transcript(link):
    video_id = re.findall(pattern= pattern, string = link)[0]
    srt = YouTubeTranscriptApi.get_transcript(video_id, 
                                          languages=['en'])
    transcript_lines = [timestamp['text'] for timestamp in srt]
    transcript_merged = " ".join(transcript_lines)
    return transcript_merged


def get_frequency(words, start_year = 2018, end_year = 2019, smoothing = 0, corpus = "en-US-2019"):
    words = urllib.parse.quote(words)
    url = 'https://books.google.com/ngrams/json?content=' + words + '&year_start=' + str(start_year) + '&year_end=' + str(end_year) + '&corpus=' + str(corpus) + '&smoothing=' + str(smoothing) + '' 
    print(url)
    response = requests.get(url) 
    outputs = response.json() 
    print(outputs)
    freq = {}
    freq = {output['ngram']: np.max([i for i in output['timeseries'] if i!=0]) for output in outputs}
    df = pd.DataFrame(freq.items(), columns= ["Word", "Value"])
    return df.sort_values(by = "Value", ascending=False)

def clean_text(words):
    pattern = r'[{}]'.format(re.escape(string.punctuation))
    cleaned_text = re.sub(pattern, " ", words)
    cleaned_text = cleaned_text.lower().split()
    cleaned_text = [lemmatizer.lemmatize(word) for word in cleaned_text]
    cleaned_text = set(cleaned_text)
    print(len(cleaned_text))
    cleaned_text = ", ".join(cleaned_text)
    print(cleaned_text)
    return cleaned_text


if __name__ == '__main__':
    link = "https://www.youtube.com/watch?v=ljmifo4Klss"
    text = get_transcript(link)
    cleaned_text = clean_text(text)
    feq = get_frequency(cleaned_text)
    print(feq.tail(n = 30))

327
recorded, power, humanity, about, did, million, victory, out, again, across, them, slave, human, victim, god, covered, thrived, without, done, century, region, no, broken, thousand, war, take, harness, mandated, gave, more, last, europe, drove, every, day, confused, that, this, fifty, those, yet, jenner, had, curse, parent, nation, took, one, beyond, civilian, their, pharaoh, banished, loved, slew, found, 16th, only, american, village, kind, at, know, european, perhaps, destroyed, memory, language, endanger, rash, of, all, single, many, waxed, someone, civil, call, first, place, cry, inhuman, around, powerful, master, child, waned, wa, people, effort, month, and, dozen, dispatching, asia, have, weaker, empire, donald, attempting, ever, unite, weapon, 1979, confronted, none, city, forth, area, bury, conceived, you, view, annihilate, neighbor, devastating, surrounding, horror, foolish, crippled, against, written, are, son, 5, hiding, fought, voice, be, strike, life, egypt, by, push, 

In [125]:
feq

Unnamed: 0,Word,Value
319,the,4.178944e-02
70,of,2.549171e-02
91,and,2.320032e-02
202,to,1.996130e-02
226,a,1.496891e-02
...,...,...
216,peloponnesian,2.426762e-09
295,maow,2.253421e-09
325,zhdanov,1.087101e-09
158,raska,5.200204e-10
