In [38]:
# import sys
import numpy as np
import pandas as pd
import requests
import urllib
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from youtube_transcript_api import YouTubeTranscriptApi
from nltk.corpus import stopwords

# nltk.download('stopwords')

PATTERN = "watch\?v=(.*)"
pd.options.display.float_format = "{:.10f}".format


def get_transcript(link):
    video_id = re.findall(pattern=PATTERN, string=link)[0]
    srt = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
    transcript_lines = [timestamp["text"] for timestamp in srt]
    transcript_merged = " ".join(transcript_lines)
    return transcript_merged


def clean_text(words):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    pattern_cleaning = r"[{}]".format(re.escape(string.punctuation))
    cleaned_text = re.sub(pattern_cleaning, " ", words)
    cleaned_text = cleaned_text.lower().split()
    cleaned_text = [lemmatizer.lemmatize(word) for word in cleaned_text]
    cleaned_text = [word for word in cleaned_text if not word in stop_words]
    return cleaned_text


def preprocess_cleaned_text(cleaned_text):
    unique_words = set(cleaned_text)
    string_unique_words = ",".join(unique_words)
    sample_size = len(cleaned_text)
    words_df = pd.DataFrame(
        [(word, cleaned_text.count(word) / sample_size) for word in unique_words],
        columns=["Word", "Text Frequency"],
    )
    words_df = words_df.sort_values(by="Text Frequency")
    words_df["Text Order"] = range(0, len(words_df))
    return string_unique_words, words_df


def get_request(
    words, start_year=2018, end_year=2019, smoothing=0, corpus="en-US-2019"
):
    words = urllib.parse.quote(words)
    url = (
        f"https://books.google.com/ngrams/json?content={words}"
        f"&year_start= {start_year}"
        f"&year_end= {end_year}"
        f"&corpus={corpus}"
        f"&smoothing= {smoothing}"
    )
    response = requests.get(url)
    outputs = response.json()
    return outputs


def get_frequency(text):
    words, text_df = preprocess_cleaned_text(clean_text(text))
    outputs = get_request(words)
    freq = {
        output["ngram"]: np.mean([i for i in output["timeseries"] if i != 0])
        for output in outputs
    }
    df = pd.DataFrame(freq.items(), columns=["Word", "Language Frequency"])
    sorted_df = df.sort_values(by="Language Frequency", ascending=False)
    sorted_df["Language Order"] = range(0, len(df))
    combined_text_language_df = sorted_df.join(text_df.set_index("Word"), on="Word")
    combined_text_language_df["Order Offset"] = (
        combined_text_language_df["Language Order"]
        - combined_text_language_df["Text Order"]
    )
    combined_text_language_df = combined_text_language_df.set_index("Word")
    return combined_text_language_df[
        [
            "Language Frequency",
            "Text Frequency",
            "Language Order",
            "Text Order",
            "Order Offset",
        ]
    ]

if __name__ == '__main__':
    link = "https://www.youtube.com/watch?v=ljmifo4Klss"
    text = get_transcript(link)
    feq = get_frequency(text)
    print(feq.to_string())

[nltk_data] Downloading package stopwords to C:\Users\Illia
[nltk_data]     Kvashuk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


               Language Frequency  Text Frequency  Language Order  Text Order  Order Offset
Word                                                                                       
one                       0.00180          0.0125               0         260          -260
would                     0.00156          0.0075               1         251          -250
time                      0.00123          0.0075               2         250          -248
could                     0.00111          0.0225               3         265          -262
people                    0.00078          0.0050               4         222          -218
first                     0.00076          0.0075               5         247          -242
back                      0.00073          0.0075               6         244          -238
know                      0.00067          0.0025               7          83           -76
even                      0.00067          0.0025               8         166   

In [15]:
import re
import string
import pandas as pd
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
sample = "sun is in the skies, suns"


words_df

Unnamed: 0,Word,Text Frequency,Order
0,sun,0.333333,0
1,is,0.166667,1
2,in,0.166667,2
3,the,0.166667,3
4,sky,0.166667,4
5,sun,0.333333,5


In [125]:
feq

Unnamed: 0,Word,Value
319,the,4.178944e-02
70,of,2.549171e-02
91,and,2.320032e-02
202,to,1.996130e-02
226,a,1.496891e-02
...,...,...
216,peloponnesian,2.426762e-09
295,maow,2.253421e-09
325,zhdanov,1.087101e-09
158,raska,5.200204e-10
