In [53]:
yt_videos = [
    'https://www.youtube.com/watch?v=Osh0-J3T2nY',
    'https://www.youtube.com/watch?v=AaTRHFaaPG8',
    'https://www.youtube.com/watch?v=Qyrjgf-_Vdk',
    'https://www.youtube.com/watch?v=_rGXIXyNqpk',
    'https://www.youtube.com/watch?v=klTvEwg3oJ4',
    'https://www.youtube.com/watch?v=oDAMPYfK4p8',
    'https://www.youtube.com/watch?v=vJwBuKPbC34',
    'https://www.youtube.com/watch?v=slL7AW9q8Fc',
]

In [35]:
from typing import List
from html import unescape
from pytube import YouTube, request, Caption
import xml.etree.ElementTree as ElementTree
import json


class PatchedCaption(Caption):
    @property
    def json_captions(self) -> dict:
        # bug fix, json wasn't imported in main file
        """Download and parse the json caption tracks."""
        json_captions_url = self.url.replace('fmt=srv3','fmt=json3')
        text = request.get(json_captions_url)
        parsed = json.loads(text)
        assert parsed['wireMagic'] == 'pb3', 'Unexpected captions format'
        return parsed

    @property
    def scc_captions(self) -> str:
        # Added SCC Support
        """Download and parse the scc caption tracks."""
        scc_captions_url = self.url.replace('fmt=srv3','tfmt=scc')
        text = request.get(scc_captions_url)
        return [{'text': unescape(c.text), **c.attrib} for c in ElementTree.fromstring(text)]



class CustomYouTube(YouTube):
    @property
    def chapters(self):
        def time_to_seconds(time_str):
            time_parts = list(map(int, time_str.split(':')))
            if len(time_parts) == 3:
                h, m, s = time_parts
            elif len(time_parts) == 2:
                h = 0
                m, s = time_parts
            else:
                raise ValueError(f"Invalid time format: {time_str}")
            return h * 3600 + m * 60 + s

        def if_tuple_get_first(t):
            if isinstance(t, tuple):
                return t[0]
            return t

        engagement_panels = self.initial_data.get('engagementPanels')
        chapters = []
        for panel in engagement_panels:
            contents = panel.get('engagementPanelSectionListRenderer', {}).get('content', {}).get(
                'macroMarkersListRenderer', {}).get('contents', [])
            for c in contents:
                title = c.get('macroMarkersListItemRenderer', {}).get('title', {}).get('simpleText'),
                timestamp = c.get('macroMarkersListItemRenderer', {}).get('timeDescription', {}).get('simpleText'),
                a11y_label = c.get('macroMarkersListItemRenderer')['timeDescriptionA11yLabel'],
                relative_url = c.get('macroMarkersListItemRenderer').get('onTap', {}).get('commandMetadata', {}).get('webCommandMetadata', {}).get('url')


                chapter = {
                    'title': if_tuple_get_first(title),
                    'timestamp': if_tuple_get_first(timestamp),
                    'timestamp_seconds': time_to_seconds(if_tuple_get_first(timestamp)),
                    'a11y_label': if_tuple_get_first(a11y_label),
                    'relative_url': if_tuple_get_first(relative_url),
                }
                chapters.append(chapter)
        return chapters


    @property
    def caption_tracks(self) -> List[PatchedCaption]:
        """Get a list of :class:`Caption <Caption>`.

        :rtype: List[Caption]
        """
        raw_tracks = (
            self.vid_info.get("captions", {})
            .get("playerCaptionsTracklistRenderer", {})
            .get("captionTracks", [])
        )
        return [PatchedCaption(track) for track in raw_tracks]

In [139]:
%env OPENAI_ORG=org-iyrjYkOc4l7ITekAEaDR4rT8
%env OPENAI_API_KEY=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


env: OPENAI_ORG=org-iyrjYkOc4l7ITekAEaDR4rT8
env: OPENAI_API_KEY=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


In [None]:
import openai

In [195]:
import tiktoken
import os
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate

openai.organization = os.environ.get("OPENAI_ORG", None)
openai.api_key = os.environ.get("OPENAI_API_KEY", None)


map_prompt = PromptTemplate(
    input_variables=["text"],
    template=
"""I want to you to act as a note-taking assistant. Create a well formatted summary of the following transcript:


"{text}"


SUMMARY:
"""
)

continue_prompt = PromptTemplate(
    input_variables=["text", "last_response"],
    template=
"""I want to you to act as a note-taking assistant. Create a well formatted summary of the following transcript, continuing from the last response:


"{text}"


CONTINUE SUMMARY:
{last_response} """
)
encoder = tiktoken.get_encoding("p50k_base")

class MapReduceContinue:
    def __init__(self, debug=False):
        self.debug = debug
        self.encoding = "p50k_base"
        self.encoder = tiktoken.get_encoding(self.encoding)
        self.initial_prompt = map_prompt
        self.continue_prompt = continue_prompt
        self.ai_model = 'text-davinci-003'
        self.PROMPT_TOKENS = len(encoder.encode(continue_prompt.template))
        self.TOKEN_LIMIT = 4050
        self.RESPONSE_TOKENS = 450
        self.OVERLAP_TOKENS = 100
        self.CONTINUE_TOKENS = self.OVERLAP_TOKENS // 2
        self.MAX_TOKENS = self.TOKEN_LIMIT - self.PROMPT_TOKENS - self.OVERLAP_TOKENS - self.RESPONSE_TOKENS - self.CONTINUE_TOKENS

        self._text_splitter = CharacterTextSplitter(
            length_function=lambda text: len(encoder.encode(text)),
            chunk_overlap=self.OVERLAP_TOKENS,
            chunk_size=self.MAX_TOKENS + self.OVERLAP_TOKENS,
        )

        self.responses = []


    def _call_api(self, prompt, temp):
        return openai.Completion.create(
            model=self.ai_model,
            prompt=prompt,
            temperature=temp,
            max_tokens=self.RESPONSE_TOKENS,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )

    def _split_text(self, text):
        # Calculate the encoded length of the input text
        encoded_length = len(self.encoder.encode(text))

        # If the encoded length is less than or equal to MAX_TOKENS, no need to split
        if encoded_length <= self.MAX_TOKENS:
            return [text]

        # Calculate the number of chunks needed
        num_chunks = encoded_length // self.MAX_TOKENS + 1

        # Find approximate size of each chunk
        chunk_size = len(text) // num_chunks

        chunks = []
        start = 0

        # Split the text into chunks at whitespace positions
        for _ in range(num_chunks - 1):
            end = start + chunk_size

            # Move the end index to the nearest whitespace
            while end < len(text) and text[end] in [' ', '\n', '\t']:
                end += 1

            # Add the chunk to the list of chunks
            chunks.append(text[start:end])

            # Update the start index for the next iteration
            start = end - self.OVERLAP_TOKENS // 3

        # Add the last chunk
        chunks.append(text[start:])

        return chunks

    def _map_reduce(self, text):
        text_chunks = self._split_text(text)
        if self.debug:
            print(f"Split text into {len(text_chunks)} chunks")
        responses = []
        for i, chunk in enumerate(text_chunks):
            if i == 0:
                prompt = self.initial_prompt.format(text=chunk)
            else:
                prompt = self.continue_prompt.format(text=chunk, last_response=responses[-1][self.CONTINUE_TOKENS * -4:])
            if self.debug:
                print(f"Prompt {i}:\n{prompt}\n")
            response = self._call_api(prompt, temp=0.5)
            if self.debug:
                print(f"Response {i}:\n{response}\n")
            responses.append(response.choices[0].text)
            self.responses.append({
                'text': chunk,
                'response': response.choices[0].text,
                'prompt': prompt,
            })
        return ' '.join(responses)

    def summarize(self, text):
        return self._map_reduce(text)


In [196]:
class YoutubeSummarizer:
    def __init__(self, yt_video, debug=False):
        self.yt_video = yt_video
        self.yt = CustomYouTube(yt_video)
        self.mrc = MapReduceContinue(debug=debug)
        self._formatted_transcripts = None
        self._grouped_transcripts = None

    def _group_transcripts_by_chapter(self):
        chapters = self.yt.chapters
        if not chapters:
            return

        grouped_transcripts = []
        chapter_index = 0
        transcripts = self.yt.caption_tracks[0].scc_captions

        for chapter in chapters:
            current_chapter_start = chapter['timestamp_seconds']

            # Check if we have reached the next chapter
            if chapter_index + 1 < len(chapters):
                next_chapter_start = chapters[chapter_index + 1]['timestamp_seconds']
            else:
                next_chapter_start = float('inf')

            # Group transcripts by chapter
            chapter_transcripts = []
            while transcripts and current_chapter_start <= float(transcripts[0]['start']) < next_chapter_start:
                chapter_transcripts.append(transcripts.pop(0))

            # Append the chapter object to the results
            grouped_transcripts.append({
                'title': chapter['title'],
                'transcripts': chapter_transcripts
            })
            chapter_index += 1

        self._grouped_transcripts = grouped_transcripts


    def _format_transcripts(self):
        buffer = ''
        if self.yt.chapters:
            for chapter in self.grouped_transcripts:
                buffer += f"\n\n{chapter['title']}\n"
                buffer += ' '.join([transcript['text'] for transcript in chapter['transcripts']])
            self._formatted_transcripts = buffer
        else:
            self._formatted_transcripts = ' '.join([c['text'] for c in self.yt.caption_tracks[0].scc_captions])


    @property
    def grouped_transcripts(self):
        if self._grouped_transcripts is None:
            self._group_transcripts_by_chapter()
        return self._grouped_transcripts

    @property
    def formatted_transcripts(self):
        if self._formatted_transcripts is None:
            self._format_transcripts()
        return self._formatted_transcripts

    def summarize(self):
        return self.mrc.summarize(self.formatted_transcripts)

    def __repr__(self):
        return f"<YoutubeSummarizer: {self.yt.title}>"


In [199]:
new_summarizer = YoutubeSummarizer(yt_videos[1], debug=True)

In [200]:
summary = new_summarizer.summarize()
print(summary)

Split text into 11 chunks
Prompt 0:
I want to you to act as a note-taking assistant. Create a well formatted summary of the following transcript:


"

Introduction
the problem is that we do not get 50 years to try and try again and observe that we were wrong and come up with a different Theory and realize that the entire thing is going to be like way more difficult and realized at the start because the first time you fail at aligning something much smarter than you are you die the following is a conversation with Eliezer yatkowski a legendary researcher writer and philosopher on the topic of artificial intelligence especially super intelligent AGI and its threat to human civilization this is the Lex Friedman podcast to support it please check out our sponsors in the description and now dear friends here's Eliezer idkowski

GPT-4
what do you think about gpt4 how intelligent is it it is a bit smarter than I thought this technology was going to scale to and I'm a bit worried about what th

In [201]:
print(summary)

Eliezer Yatkowski is a legendary researcher, writer, and philosopher on the topic of artificial intelligence, especially super intelligent AGI and its threat to human civilization. He discusses GPT-4, a technology that is a bit smarter than he thought it would be and worries about what the next one will be like. He talks about the difficulty of trying to figure out if there is something like a mind inside the large language model, and the difficulty of removing all mention of emotions from the GPT data set. He also talks about the difficulty of reasoning, and how reinforcement learning has made GPT worse in some ways. He talks about the beauty and horror of GPT-4, and the moment when someone asked Bing Sydney to describe herself and the AI gave a description. He discusses the difficulty of figuring out what is going on inside GPT and if there is any real caring in there, and the fragility of the moment in the history of the human species.  He then talks about steel manning, which is re

In [14]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def plot_caption_time_diffs(captions, title):
    time_diffs = []
    for i in range(1, len(captions)):
        end_start_diff = round(float(captions[i]['start']) - float(captions[i-1]['start']) - float(captions[i-1]['dur']), 3)
        time_diffs.append(end_start_diff)
    data = np.array(time_diffs)
    median = np.median(data)
    stdev = np.std(data)

    # Set the style for the plot
    sns.set(style="whitegrid")

    # Create a histogram and kernel density estimate (KDE) combined plot
    sns.histplot(data, kde=True, bins=10, color='blue')

    # Draw vertical lines at the first three standard deviations from the median
    for i in range(1, 4):
        plt.axvline(median + i * stdev, color='red', linestyle='--', label=f'+{i} SD' if i == 1 else None)
        plt.axvline(median - i * stdev, color='green', linestyle='--', label=f'-{i} SD' if i == 1 else None)

    # Add labels and title to the plot
    plt.xlabel('Seconds between captions')
    plt.ylabel('Frequency')
    plt.title(title)

    # Display the plot
    plt.show()




In [49]:
for yt_video in yt_videos:
    yt = CustomYouTube(yt_video)
    if len(yt.caption_tracks) > 0:
        caption = yt.caption_tracks[0]
        print(caption.name)
    else:
        print(f'No caption for {yt.title}')


English (auto-generated)
English (auto-generated)


PytubeError: Exception while accessing title of https://youtube.com/watch?v=Qyrjgf-_Vdk. Please file a bug report at https://github.com/pytube/pytube

In [45]:
def pretty_print_captions(captions):
    time_diffs = []
    for i in range(1, len(captions)):
        end_start_diff = round(float(captions[i]['start']) - float(captions[i-1]['start']) - float(captions[i-1]['dur']), 3)
        time_diffs.append(end_start_diff)
    data = np.array(time_diffs)
    median = np.median(data)
    stdev = np.std(data)
    new_time = median + stdev * 2

    text = captions[0]['text'] + ' '
    for i in range(1, len(captions)):
        end_start_diff = round(float(captions[i]['start']) - float(captions[i-1]['start']) - float(captions[i-1]['dur']), 3)
        if end_start_diff > new_time:
            text += '\n'
        text += captions[i]['text'] + ' '
    return text