In [1]:
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# Sample passage describing supervised learning, kept as one multi-line string.
# NOTE(review): none of the cells visible in this notebook read `doc`; the
# extraction/embedding cells use `docs` (the transcript text) instead —
# confirm whether this cell is still needed.
doc = """
         Supervised learning is the machine learning task of 
         learning a function that maps an input to an output based 
         on example input-output pairs.[1] It infers a function 
         from labeled training data consisting of a set of 
         training examples.[2] In supervised learning, each 
         example is a pair consisting of an input object 
         (typically a vector) and a desired output value (also 
         called the supervisory signal). A supervised learning 
         algorithm analyzes the training data and produces an 
         inferred function, which can be used for mapping new 
         examples. An optimal scenario will allow for the algorithm 
         to correctly determine the class labels for unseen 
         instances. This requires the learning algorithm to  
         generalize from the training data to unseen situations 
         in a 'reasonable' way (see inductive bias).
      """

In [98]:

n_gram_range = (1, 2)
stop_words = 'english'

# Extract candidate words/phrases: unigrams and bigrams with English stop
# words removed.
# NOTE: `docs` is assembled from the YouTube transcript in cell In [84]; that
# cell must have been run before this one.
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([docs])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older installs.
if hasattr(count, "get_feature_names_out"):
    # The new API returns an ndarray; convert to a plain list of str so the
    # result has the same type the old method produced.
    candidates = count.get_feature_names_out().tolist()
else:
    candidates = count.get_feature_names()

In [107]:
# Display the candidate words/phrases extracted by the CountVectorizer.
candidates

['10',
 '10 20',
 '10 67',
 '10 block',
 '10 blocks',
 '10 count',
 '10 going',
 '10 tens',
 '10 times',
 '110',
 '110 number',
 '110 starting',
 '13',
 '13 ll',
 '18',
 '18 57',
 '18 77',
 '18 number',
 '20',
 '20 21',
 '20 30',
 '20 just',
 '20 know',
 '20 let',
 '21',
 '21 22',
 '21 57',
 '21 yep',
 '22',
 '22 23',
 '22 greater',
 '22 know',
 '22 let',
 '22 number',
 '23',
 '23 24',
 '24',
 '24 25',
 '25',
 '25 25',
 '25 26',
 '25 31',
 '25 54',
 '25 greater',
 '25 look',
 '25 looks',
 '25 remember',
 '25 tens',
 '26',
 '26 27',
 '27',
 '27 28',
 '28',
 '28 28',
 '28 base',
 '28 greater',
 '28 let',
 '30',
 '30 40',
 '31',
 '31 31',
 '31 greater',
 '31 little',
 '31 look',
 '31 number',
 '31 numbers',
 '40',
 '40 40',
 '40 50',
 '40 cubes',
 '40 let',
 '40 long',
 '41',
 '41 42',
 '42',
 '42 43',
 '42 number',
 '43',
 '43 44',
 '44',
 '44 45',
 '45',
 '45 46',
 '45 54',
 '45 let',
 '45 means',
 '45 number',
 '46',
 '46 47',
 '47',
 '47 47',
 '47 cubes',
 '50',
 '50 greater',
 '51',


In [99]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model identified by this name.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Embed the whole document (wrapped in a one-element list) and every candidate
# phrase with the same model so the two can be compared directly.
# NOTE: `docs` and `candidates` are produced by other cells (In [84], In [98]).
doc_embedding = model.encode([docs])
candidate_embeddings = model.encode(candidates)

In [106]:
# Sanity-check the embedding matrix shape; the recorded output is (949, 768).
candidate_embeddings.shape

(949, 768)

In [100]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of keywords to keep.
top_n = 5
# NOTE: despite the name, these are cosine *similarities* between the document
# and each candidate (higher means closer), not distances.
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# argsort is ascending, so the last `top_n` indices of row 0 are the most
# similar candidates; the list is ordered from least to most similar of those.
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

In [101]:
# Display the selected top-n keywords.
keywords

['race today',
 'cars participate',
 'girls racetrack',
 'race cars',
 'racing helmet']

In [102]:
import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Pick `top_n` diverse keywords with the Max Sum Similarity heuristic.

    The `nr_candidates` words most similar to the document are shortlisted.
    Among all `top_n`-sized subsets of that shortlist, the one whose members
    are least similar to each other (smallest summed pairwise cosine
    similarity) is returned.

    Args:
        doc_embedding: 2-D array-like; row 0 is the document embedding.
        word_embeddings: 2-D array-like with one embedding row per entry of
            `words`.
        words: Sequence of candidate words/phrases aligned with
            `word_embeddings`.
        top_n: Number of keywords to return. Clamped to the shortlist size so
            an oversized request returns the whole shortlist instead of
            failing.
        nr_candidates: Shortlist size. Values <= 0 yield an empty result.

    Returns:
        List of the selected words, in shortlist order (ascending similarity
        to the document).
    """
    if nr_candidates <= 0:
        # `[-0:]` would silently select *every* word, so bail out explicitly.
        return []

    word_embeddings = np.asarray(word_embeddings)

    # Similarity of every candidate to the document (row 0 is the document).
    doc_similarity = cosine_similarity(doc_embedding, word_embeddings)

    # argsort is ascending, so the tail holds the most similar candidates.
    words_idx = list(doc_similarity.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]

    # Pairwise similarity is only needed among the shortlisted candidates,
    # so avoid building the full N x N matrix.
    shortlisted = word_embeddings[words_idx]
    pairwise = cosine_similarity(shortlisted, shortlisted)

    # Asking for more keywords than were shortlisted would produce no
    # combinations at all; return the whole shortlist in that case.
    top_n = min(top_n, len(words_idx))

    # Exhaustively search for the combination with the lowest summed
    # pairwise similarity (each unordered pair is counted twice, which does
    # not affect the argmin).
    min_sim = np.inf
    candidate = ()
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(pairwise[i][j] for i in combination for j in combination if i != j)
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [103]:
# Shortlist the 30 candidates closest to the document, then keep the 5 of
# those that are least similar to one another.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=30)

['40 cubes', 'girls teacher', 'counting base', 'new videos', 'racing helmet']

In [79]:
from youtube_transcript_api import YouTubeTranscriptApi
# ID of the YouTube video whose captions are analysed.
VIDEO_ID = "Qn87cKHa7v4"

# The static `YouTubeTranscriptApi.get_transcript` was deprecated in
# youtube-transcript-api 1.0 in favour of the instance method `fetch`, and is
# absent from newer releases. Use whichever this install provides; both give
# a list of {'text', 'start', 'duration'} dicts.
if hasattr(YouTubeTranscriptApi, "fetch"):
    raw_transcript = YouTubeTranscriptApi().fetch(VIDEO_ID).to_raw_data()
else:
    raw_transcript = YouTubeTranscriptApi.get_transcript(VIDEO_ID)


In [80]:
# Inspect the raw caption entries (dicts with 'text', 'start' and 'duration').
raw_transcript

[{'text': '[Music]', 'start': 0.61, 'duration': 4.99},
 {'text': 'hello boys and girls we are at the',
  'start': 12.06,
  'duration': 6.55},
 {'text': 'racetrack ready to compare some numbers',
  'start': 15.7,
  'duration': 6.15},
 {'text': "let's see which cars participate in the",
  'start': 18.61,
  'duration': 5.64},
 {'text': 'race today and which numbers we are',
  'start': 21.85,
  'duration': 4.98},
 {'text': 'going to compare you already know about',
  'start': 24.25,
  'duration': 5.4},
 {'text': 'place value and compare numbers using',
  'start': 26.83,
  'duration': 6.42},
 {'text': "base ten blocks now I'll help you find",
  'start': 29.65,
  'duration': 5.85},
 {'text': 'numbers that are greater than or less',
  'start': 33.25,
  'duration': 6.36},
 {'text': 'than other numbers on your mark get set',
  'start': 35.5,
  'duration': 8.79},
 {'text': 'go do you like that racing helmet see',
  'start': 39.61,
  'duration': 9.06},
 {'text': "the number it's number 25 we can 

In [81]:
# Drop caption entries whose text is exactly the '[Music]' marker.
filtered_transcript = [entry for entry in raw_transcript if entry['text'] != '[Music]']

In [82]:
# Inspect the caption entries that remain after removing '[Music]' markers.
filtered_transcript

[{'text': 'hello boys and girls we are at the',
  'start': 12.06,
  'duration': 6.55},
 {'text': 'racetrack ready to compare some numbers',
  'start': 15.7,
  'duration': 6.15},
 {'text': "let's see which cars participate in the",
  'start': 18.61,
  'duration': 5.64},
 {'text': 'race today and which numbers we are',
  'start': 21.85,
  'duration': 4.98},
 {'text': 'going to compare you already know about',
  'start': 24.25,
  'duration': 5.4},
 {'text': 'place value and compare numbers using',
  'start': 26.83,
  'duration': 6.42},
 {'text': "base ten blocks now I'll help you find",
  'start': 29.65,
  'duration': 5.85},
 {'text': 'numbers that are greater than or less',
  'start': 33.25,
  'duration': 6.36},
 {'text': 'than other numbers on your mark get set',
  'start': 35.5,
  'duration': 8.79},
 {'text': 'go do you like that racing helmet see',
  'start': 39.61,
  'duration': 9.06},
 {'text': "the number it's number 25 we can look at",
  'start': 44.29,
  'duration': 7.38},
 {'tex

In [83]:
# Keep only the caption text of each entry, discarding the timing fields.
whole_transcript = [entry['text'] for entry in filtered_transcript]

In [84]:
# Join all caption fragments into one space-separated string. This `docs`
# text is what the keyword-extraction cells (In [98], In [99]) operate on.
docs = " ".join(whole_transcript)

In [85]:
# Display the full transcript text as a single string.
docs

"hello boys and girls we are at the racetrack ready to compare some numbers let's see which cars participate in the race today and which numbers we are going to compare you already know about place value and compare numbers using base ten blocks now I'll help you find numbers that are greater than or less than other numbers on your mark get set go do you like that racing helmet see the number it's number 25 we can look at the digit in the tens place first to help compare numbers how many tens does my number have 25 has two tens it also has five ones nice job there's one of the race cars now what number is it is 31 greater or less than 25 both 25 and 31 have numbers in the tens place see we see that 3 is greater than 2 that means that 31 is greater than 25 we put up the greater than sign with this small pointed side chasing the smaller number so the sign would look like this whoa look there's another car with a new number is that number greater than 25 well first we can look at the larg