In [1]:
import pandas as pd
import os

In [2]:

def get_every_transcription(transcribed_path = "transcribed-audio"):
    global documents; documents = {}

    for transcription_name in os.listdir(transcribed_path):
        if os.path.isfile(os.path.join(transcribed_path, transcription_name)):
            dataframe = pd.read_csv(transcribed_path + "/" + transcription_name)
            documents[str(transcription_name)] = "".join(dataframe["text"].astype(str).values.tolist())


In [3]:
get_every_transcription()
documents

{'flagrant-mrBeast-27092022_transcribed.csv': " I know all about YouTube analytics. Do you want me to help you with that? I could use the help. Jesse. Yeah Are you logged into the show's channel? Oh shit, he's really mad. Hold on, hold on, hold on, what are we- Where did you meet your girlfriend? Uh, but God. That one was uh- Hey! You know, the mile high as I was going to Antarctica. Oh, hey, yo. A billion dollars if we could phone the channel and the companies and stuff like that. Like, that sounds enticing, but- Like you've been offered a billion dollars for your YouTube channel. With all the companies. Yeah. What's different about the Squid Game video? What did you tap into? I-I most spent like two million dollars on a video up to that point. That one we spent 4.2 million. That sky up there's not real. Oh wow! Starting at the end of the blue is all CGI? Yeah, all CGI. And I swear to God, I thought this guy should be president. Is that something you thought about? It is appealing. Ye

In [4]:
#Get a document to perform tests with it
test_doc = next(iter(documents.items()))
test_text = test_doc[1]
print(test_doc[0])

flagrant-mrBeast-27092022_transcribed.csv


In [5]:
import nltk
nltk.download('punkt')  #Needed for word_tokenize
from nltk.tokenize import wordpunct_tokenize, word_tokenize, TweetTokenizer
from nltk import TreebankWordTokenizer, sent_tokenize
tbank_tokenizer = TreebankWordTokenizer()
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

[nltk_data] Downloading package punkt to /home/guayo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
#Vader for sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sa = SentimentIntensityAnalyzer()

In [77]:
text_doc_sentences = sent_tokenize(test_text)

In [78]:
sentences_with_sentiment = {}

for sent in text_doc_sentences:
    scores = sa.polarity_scores(text=sent)
    sentences_with_sentiment[sent] = scores['compound']

In [79]:
"""
    Functions to get the n highest or lowest value of an array of them.
    arr: the input array from where we want to extract the number.
    n: = 1 is like using max() or min(); > 1 gets the n-1 highest/lowest number after the maximum/minimum one
"""

def get_n_highest(arr: list, n: int) -> float:
    return sorted(arr, reverse=True)[n-1:n][0]

def get_n_lowest(arr: list, n:int) -> float:
    return sorted(arr, reverse=False)[n-1:n][0]

In [100]:
most_positive = get_n_highest(list(sentences_with_sentiment.values()), 5)
most_negative = get_n_lowest(list(sentences_with_sentiment.values()), 5)

for key, value in sentences_with_sentiment.items():
    if value == most_positive or value == most_negative:
        print(str(value) + ":", key)

-0.8947: But what's the difference between that and just actually being a fucking obsessive loser over something unimportant?
0.8644: But yeah, so Tony was there and he's just like, and he was like, yeah, it wasn't cool at all.


### Extract the most relevant parts with sentiment analysis

In [96]:
def get_n_highest_dict(d: dict, n: int) -> dict:
    """
    Gets the n highest value in a dictionary and returns the key-value pair with that value.
    """
    sorted_items = sorted(d.items(), key=lambda x: abs(x[1]), reverse=True)
    nth_highest_item = sorted_items[n-1] if len(sorted_items) >= n else None
    return nth_highest_item

In [178]:
def get_sentiment_n_sentences(sent_arr, sent_window):

    """
    Get the average sentiment for a group of sentences.

    sent_arr: array of sentences
    sent_window: determines the size of the group of sentences. sent_window=5 will take sentences from 5 to 5
    """

    sentences_with_sentiment_window = {}

    for i, sent in enumerate(sent_arr):
        phrases = sent_arr[i:i+sent_window]
        polarity_sum = 0
        for phrase in phrases:
            score = sa.polarity_scores(text=phrase)
            polarity_sum += score['compound']
        polarity_avg = polarity_sum/sent_window
        sentences_with_sentiment_window[' '.join(phrases)] = polarity_avg

    return sentences_with_sentiment_window

In [180]:
def top_n_moments_SA(document, n, sent_window=5):

    #Separate the document into sentences
    doc_sentences = sent_tokenize(document)

    #Analyze the sentiment of each window of sentences
    sentences_with_sentiment_window = get_sentiment_n_sentences(doc_sentences, sent_window)

    #get the top n windows of sentences with the most sentiment (positive or negative)
    for i in range(1, n+1):
        yield get_n_highest_dict(sentences_with_sentiment_window, i)    

In [187]:
for moment in top_n_moments_SA(test_text, 10, 3): print(moment[1], '\n', moment[0])

0.8097333333333334 
 But at that time it's like she just was like, please, like she was even like, she would be like, she loves me. She'd like, you know, if we got a million dollars that comes in like, hide a hundred grand as a rainy day fund in case I just blow up. And then I'd find it and I'd be like, oh, no, no, perfect.
0.7912666666666667 
 Like I just need a partner who loves learning, you know, and just wants to improve. Like for us now, like an idea of a date is just to like take an IQ test and then study and see if we can get it higher. Like I love it.
0.7902666666666667 
 And once they were just like going ham, I was like, okay, I feel a little bit. They were super really confident when they were speaking to you about it. Yeah, they were, they were, which is why it's great working with professionals because I don't know shit.
0.7815666666666666 
 And then after watching it where you're like, wow, this is fun. Yeah, we should be creative. But in general we're like, whoa, like T

### Tokenization of documents

In [11]:
test_text = test_doc[1]

wordpunct_ex = wordpunct_tokenize(test_text)
wordtokenize_ex = word_tokenize(test_text)
treebank_ex = tbank_tokenizer.tokenize(test_text)
tweettknzr_ex = tweet_tokenizer.tokenize(test_text)

print("", wordpunct_ex, "\n", wordtokenize_ex, "\n", treebank_ex, "\n", tweettknzr_ex)

 ['I', 'know', 'all', 'about', 'YouTube', 'analytics', '.', 'Do', 'you', 'want', 'me', 'to', 'help', 'you', 'with', 'that', '?', 'I', 'could', 'use', 'the', 'help', '.', 'Jesse', '.', 'Yeah', 'Are', 'you', 'logged', 'into', 'the', 'show', "'", 's', 'channel', '?', 'Oh', 'shit', ',', 'he', "'", 's', 'really', 'mad', '.', 'Hold', 'on', ',', 'hold', 'on', ',', 'hold', 'on', ',', 'what', 'are', 'we', '-', 'Where', 'did', 'you', 'meet', 'your', 'girlfriend', '?', 'Uh', ',', 'but', 'God', '.', 'That', 'one', 'was', 'uh', '-', 'Hey', '!', 'You', 'know', ',', 'the', 'mile', 'high', 'as', 'I', 'was', 'going', 'to', 'Antarctica', '.', 'Oh', ',', 'hey', ',', 'yo', '.', 'A', 'billion', 'dollars', 'if', 'we', 'could', 'phone', 'the', 'channel', 'and', 'the', 'companies', 'and', 'stuff', 'like', 'that', '.', 'Like', ',', 'that', 'sounds', 'enticing', ',', 'but', '-', 'Like', 'you', "'", 've', 'been', 'offered', 'a', 'billion', 'dollars', 'for', 'your', 'YouTube', 'channel', '.', 'With', 'all', 'the'

In [12]:
unique_wordpunct_ex = set(wordpunct_ex) - set(wordtokenize_ex) - set(wordtokenize_ex) - set(tweettknzr_ex)
unique_wordtokenize_ex = set(wordtokenize_ex) - set(wordpunct_ex) - set(wordtokenize_ex) - set(tweettknzr_ex)
unique_treebank_ex = set(wordtokenize_ex) - set(wordtokenize_ex) - set(wordpunct_ex) - set(tweettknzr_ex)
unique_tweettknzr_ex = set(tweettknzr_ex) - set(wordtokenize_ex) - set(wordtokenize_ex) - set(wordpunct_ex)

print(list(unique_wordpunct_ex))
print(list(unique_wordtokenize_ex))
print(list(unique_treebank_ex))
print(list(unique_tweettknzr_ex))

['e', 'com', 'non', 'Didn', 'r', 'C', '7', '85', 'hmm', 're', 'Gen', 'isn', 'couldn', 'll', 'multi', 'd', '-$', 'T', 'Nas', '8', 'H', '56', 'pre', 'Doesn', 'didn', 'Fi', 'eastern', 'installed', 'propeying', 'deals', 'mucking', 'downs', '6', 'U', 'wouldn', 'depth', '%,', 'atesunning', 'wasn', 'surfshark', 'nut', 'ators', 'hasn', 'joinhoney', 'weren', 'aren', 'Joinhoney', 'releasing', 'don', 'gigga', 'shouldn', 'haven', 'hadn', '%.', 'Isn', 'Wi', 'L', 'upgrade', 't', 'Don', '000', 'entry', 'millionaire', 's', 'u', 'p', "'?", 've', 'rolls', '%?', 'doesn', '.,', 'y']
[]
[]
["hand's", "hell's", "We've", "boy's", "attorney's", "you're", "Dubai's", "we'd", "You've", "it'll", "phone's", "retention's", "Let's", "TV's", "weren't", "Post's", "That's", "It's", "Wikipedia's", "I'll", "Mom's", "Lord's", 'g-a-m-e-r-s-u-p-p-s', "isn't", "who's", "there's", "Everyone's", "He'd", "There's", "Obama's", "Google's", "he's", "year's", "let's", "she'd", "Today's", "cookie's", 'g-g', "We'll", "you'll", "they'

#### CHECK IF THERE ARE DIFFERENCES BETWEEN THE LARGE MODEL AND THE TINY ONE, WHICH WAS THE ONE USED TO TRANSCRIBE.