In [51]:
import openai
import os
# Configure the OpenAI client from the environment. The organization is
# optional; the API key is mandatory.
openai.organization = os.environ.get("OPENAI_ORG")
openai.api_key = os.environ.get("OPENAI_KEY")

# Fail fast on a missing *or blank* key so the problem is reported here rather
# than as an authentication error on the first API call.
if not openai.api_key:
    raise ValueError("OpenAI API key not found. Please set the OPENAI_KEY environment variable.")

In [35]:
import urllib.parse
import requests
from bs4 import BeautifulSoup
import json
import re
from youtube_transcript_api import YouTubeTranscriptApi
import tiktoken



class YoutubeScraper:
    """
    A class to scrape YouTube video metadata.
    This gets the video title, description, chapters, and transcript.

    ...
    Attributes
    ----------
    url : str
        The YouTube video URL
    video_key : str or None
        Video id parsed from ``url``; ``None`` when no id can be found
    data : dict or None
        Filled in by ``get_video_info`` with the keys ``title``,
        ``description``, ``chapters`` and ``transcripts``
    """

    # Seconds to wait for the watch page so a stalled connection cannot hang forever.
    REQUEST_TIMEOUT = 30

    # Path prefixes whose following segment is the video id (e.g. /shorts/<id>).
    _ID_PATH_PREFIXES = ('embed', 'shorts', 'live', 'v')

    def __init__(self, url):
        self.url = url
        self.video_key = self.extract_video_key(url)
        self.data = None

    def get_video_info(self):
        """
        Download the watch page and populate ``self.data``.

        Returns
        -------
        YoutubeScraper or None
            ``self`` on success. ``None`` (after printing the error) when the
            page cannot be parsed or the transcript cannot be fetched.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
        }

        response = requests.get(self.url, headers=headers, timeout=self.REQUEST_TIMEOUT)

        # Best-effort by design: any parsing/transcript failure is reported and
        # signalled to the caller with ``None`` instead of an exception.
        try:
            # Report HTTP errors directly rather than as a confusing
            # "variable not found" message from parsing an error page.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            player_response = self._load_page_json(soup, 'ytInitialPlayerResponse')
            video_details = player_response.get('videoDetails', {})
            yt_initial_data = self._load_page_json(soup, 'ytInitialData')

            self.data = {
                'title': video_details.get('title'),
                'description': video_details.get('shortDescription'),
                'chapters': self._parse_chapters(yt_initial_data),
                'transcripts': YouTubeTranscriptApi.get_transcript(self.video_key)
            }
            return self

        except Exception as e:
            print(f"Error: {str(e)}")
            return None

    @staticmethod
    def _load_page_json(soup, var_name):
        """Find the <script> that declares ``var <var_name>`` and decode its JSON value."""
        script = soup.find('script', string=lambda x: x and f'var {var_name}' in x)
        if not script:
            raise ValueError(f'Failed to find {var_name} in the page source.')
        return YoutubeScraper._extract_json_var(script.string, var_name)

    @staticmethod
    def _extract_json_var(script_text, var_name):
        """
        Decode the JSON object assigned by ``var <var_name> = {...}``.

        The object is decoded with a real JSON parser starting at its opening
        brace, so a ``};`` occurring inside a string value does not cut the
        payload short.

        Raises
        ------
        ValueError
            If the assignment is not present or the payload is not valid JSON.
        """
        match = re.search(r'var\s+' + re.escape(var_name) + r'\s*=\s*(?=\{)', script_text)
        if match is None:
            raise ValueError(f'Failed to find {var_name} in the page source.')
        value, _ = json.JSONDecoder().raw_decode(script_text, match.end())
        return value

    @staticmethod
    def _parse_chapters(yt_initial_data):
        """Return ``{timestamp_text: chapter_title}`` from the engagement panels."""
        chapters = {}
        for panel in yt_initial_data.get('engagementPanels', []):
            markers = (
                panel.get('engagementPanelSectionListRenderer', {})
                .get('content', {})
                .get('macroMarkersListRenderer', {})
                .get('contents', [])
            )
            for marker in markers:
                item = marker.get('macroMarkersListItemRenderer', {})
                timestamp = item.get('timeDescription', {}).get('simpleText')
                chapter_title = item.get('title', {}).get('simpleText')
                # Entries missing either field are not usable as chapters.
                if timestamp is not None and chapter_title is not None:
                    chapters[timestamp] = chapter_title
        return chapters

    def get_data(self):
        """Return the scraped data dict, or ``None`` if nothing has been scraped."""
        return self.data

    def __str__(self):
        return str(self.url)

    @staticmethod
    def extract_video_key(url):
        """
        Extract the video id from a YouTube URL.

        The ``v`` query parameter is used when present. Otherwise the id is
        taken from the path of ``youtu.be/<id>`` links and of
        ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>`` URLs.

        Returns
        -------
        str or None
            The video id, or ``None`` when the URL carries none.
        """
        parsed_url = urllib.parse.urlparse(url)
        query_params = urllib.parse.parse_qs(parsed_url.query)
        video_key = query_params.get('v', [None])[0]
        if video_key:
            return video_key

        path_segments = [part for part in parsed_url.path.split('/') if part]
        if not path_segments:
            return None

        host = (parsed_url.hostname or '').lower()
        if host == 'youtu.be' or host.endswith('.youtu.be'):
            return path_segments[0]
        if len(path_segments) >= 2 and path_segments[0] in YoutubeScraper._ID_PATH_PREFIXES:
            return path_segments[1]
        return None

In [36]:
class YouTubeInfo:
    """
    Title, description, chapters and transcript of one YouTube video.

    The fields start as ``None`` and are filled in by ``get_video_info``.

    :param video_url: The url of the video to load
    :param ignore_chapters: If True, chapters found on the page are discarded.
    """

    def __init__(self, video_url, ignore_chapters=False):
        self.transcripts = None
        self.title = None
        self.description = None
        self.chapters = None
        self.segments = None
        self.video_url = video_url
        self.ignore_chapters = ignore_chapters

    @staticmethod
    def seconds_to_timestamp(seconds):
        """Format a number of seconds (rounded to the nearest second) as ``HH:MM:SS``."""
        seconds = round(float(seconds))
        h, remainder = divmod(seconds, 3600)
        m, s = divmod(remainder, 60)
        return f"{h:02d}:{m:02d}:{s:02d}"

    @staticmethod
    def time_to_seconds(time_str):
        """
        Convert ``H:MM:SS`` or ``M:SS`` text to a number of seconds.

        :raises ValueError: if the text has neither two nor three ``:``-separated parts.
        """
        time_parts = list(map(int, time_str.split(':')))
        if len(time_parts) == 3:
            h, m, s = time_parts
        elif len(time_parts) == 2:
            h = 0
            m, s = time_parts
        else:
            raise ValueError(f"Invalid time format: {time_str}")
        return h * 3600 + m * 60 + s

    def get_video_info(self):
        """
        Scrape the video and store its data on this instance.

        :return: ``self``, so calls can be chained.
        :raises RuntimeError: if the scraper could not retrieve the video data.
        """
        try:
            from YoutubeScraper import YoutubeScraper as scraper_cls
        except ImportError:
            # The scraper may live in the same namespace as this class (e.g.
            # defined in an earlier notebook cell) instead of its own module.
            scraper_cls = globals().get('YoutubeScraper')
            if scraper_cls is None:
                raise

        scraper = scraper_cls(self.video_url).get_video_info()
        # The scraper signals failure by returning None; surface that clearly
        # instead of failing on ``None.get_data()``.
        if scraper is None:
            raise RuntimeError(f"Could not retrieve video info for {self.video_url}")

        data = scraper.get_data()
        self.transcripts = data['transcripts']
        self.title = data['title']
        self.description = data['description']
        if data['chapters'] and not self.ignore_chapters:
            self.chapters = data['chapters']
        else:
            self.chapters = None
        return self

    def group_transcripts_by_chapters(self):
        """
        Bucket the transcript segments under the chapter they start in.

        A segment belongs to the last chapter whose start time is not after
        the segment's ``start``. Segments that begin before the first chapter
        are left out.

        :return: list of ``{"title", "time", "segments"}`` dicts ordered by
            chapter start time; empty when there are no chapters.
        """
        from bisect import bisect_right

        if not self.chapters:
            return []

        chapters = sorted((self.time_to_seconds(k), v) for k, v in self.chapters.items())
        grouped_transcripts = [{"title": c, "time": k, "segments": []} for k, c in chapters]
        chapter_starts = [chapter["time"] for chapter in grouped_transcripts]

        for segment in self.transcripts:
            # Index of the last chapter starting at or before this segment.
            chapter_index = bisect_right(chapter_starts, segment['start']) - 1
            if chapter_index >= 0:
                grouped_transcripts[chapter_index]["segments"].append(segment)

        return grouped_transcripts

    def json(self):
        """Return the video data as a plain dict."""
        return {
            'title': self.title,
            'description': self.description,
            'chapters': self.chapters,
            'transcripts': self.transcripts
        }

    def __repr__(self):
        return str(self.json())

    def __str__(self):
        return str(self.title)

    # NOTE(review): defining ``__dict__`` as a method appears to shadow the
    # usual instance-attribute mapping, so ``obj.__dict__`` / ``vars(obj)``
    # yield this method rather than a dict. Kept so existing
    # ``obj.__dict__()`` callers keep working; prefer ``json()``.
    def __dict__(self):
        return self.json()

    def __len__(self):
        """Number of transcript segments; 0 while no transcript is loaded."""
        return len(self.transcripts) if self.transcripts is not None else 0

In [46]:
class TranscriptSummarizer:
    """
    Takes in transcripts and summarizes them with the OpenAI API.

    Every chapter is summarized on its own (long chapters are split over
    several requests), then the chapter summaries are condensed into an
    executive summary.

    :param video_url: The url of the video to summarize
    :param ignore_chapters: If True, the chapters will be ignored and video will be summarized as a whole.
    """
    def __init__(self, video_url, ignore_chapters=False):
        self.yt_info = YouTubeInfo(video_url, ignore_chapters).get_video_info()
        self.encoding = tiktoken.get_encoding("p50k_base")
        self.processed_chapters = self.process_chapters()
        self.DEFAULT_PROMPT = '\nWrite a summary of this transcript using markdown formatting.\n\n## {}\n'
        self.CONT_PROMPT = 'Continue writing your summary of this transcript. This is where you left off:\n{}'
        # Token budget of one request: the transcript content may use whatever
        # is left after the prompt, the response and a safety buffer.
        self.MAX_TOKENS = 4000
        self.RESPONSE_TOKENS = 500
        self.PROMPT_TOKENS = len(self.encoding.encode(self.DEFAULT_PROMPT))
        self.BUFFER_TOKENS = 100
        self.MAX_CONTENT_TOKENS = self.MAX_TOKENS - self.PROMPT_TOKENS - self.BUFFER_TOKENS - self.RESPONSE_TOKENS
        # NOTE(review): confirm this completion model is still served by the API.
        self.AI_MODEL = 'text-davinci-003'
        self.summary = None

    def process_chapters(self):
        """Return the transcript grouped by chapter, or one "Full Summary" group when there are no chapters."""
        if self.yt_info.chapters:
            processed_chapters = self.yt_info.group_transcripts_by_chapters()
        else:
            processed_chapters = [{"title": "Full Summary", "time": 0, "segments": self.yt_info.transcripts}]
        return processed_chapters

    def calculate_content_segments(self, content):
        """
        Work out how many requests ``content`` has to be spread over.

        :return: ``(segment_count, token_count)``; the count is at least 1 so
            callers can always divide the content by it.
        """
        from math import ceil
        content_tokens = len(self.encoding.encode(content))
        return max(1, ceil(content_tokens / self.MAX_CONTENT_TOKENS)), content_tokens

    def format_content_segments(self, segments):
        """Join transcript segments into one string, each prefixed with its ``[HH:MM:SS]`` start time."""
        return ''.join(f'[{YouTubeInfo.seconds_to_timestamp(s["start"])}]{s["text"]}' for s in segments)

    def split_array(self, array, n_chunks):
        """Split ``array`` into ``n_chunks`` consecutive slices whose sizes differ by at most one."""
        chunk_size, remainder = divmod(len(array), n_chunks)
        result = []
        index = 0

        for i in range(n_chunks):
            # The first ``remainder`` chunks absorb the leftover items.
            size = chunk_size + (1 if i < remainder else 0)
            result.append(array[index:index + size])
            index += size

        return result

    def call_openai_api(self, prompt, temperature):
        """Send ``prompt`` to the completion endpoint and return the raw response."""
        return openai.Completion.create(
            model=self.AI_MODEL,
            prompt=prompt,
            temperature=temperature,
            max_tokens=self.RESPONSE_TOKENS,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )

    def split_summary(self, summary, max_length):
        """
        Cut ``summary`` into pieces of at most ``max_length`` characters.

        Pieces end just before a newline where possible; when the window
        holds no usable newline the text is cut at ``max_length``. Joining the
        pieces gives back the original text.

        :raises ValueError: if ``max_length`` is smaller than 1.
        """
        if max_length < 1:
            raise ValueError(f"max_length must be at least 1, got {max_length}")

        summary_parts = []
        while len(summary) > max_length:
            # Search from index 1: a newline at index 0 would produce an empty
            # piece and leave the remaining text unchanged (endless loop).
            split_point = summary.rfind('\n', 1, max_length)
            if split_point <= 0:
                split_point = max_length
            summary_parts.append(summary[:split_point])
            summary = summary[split_point:]
        summary_parts.append(summary)
        return summary_parts

    def _continuation_prompt(self, segment, summary_so_far):
        """Build the prompt for a follow-up segment, quoting the tail of the summary written so far."""
        tail = summary_so_far[-3 * self.BUFFER_TOKENS:]
        return segment + self.CONT_PROMPT.format(tail)

    def summarize(self):
        """
        Summarize every chapter, then write the executive summary.

        Adds ``processed_segments``, ``summary`` and ``responses`` to each
        chapter dict and stores the executive summary in ``self.summary``.

        :return: ``self``, so calls can be chained.
        """
        for chapter in self.processed_chapters:
            chapter_title = chapter['title']
            chapter['summary'] = ''
            chapter['responses'] = []

            # A chapter without transcript text has nothing to summarize.
            if not chapter['segments']:
                chapter['processed_segments'] = []
                continue

            initial_process = self.format_content_segments(chapter['segments'])
            segments, initial_tokens = self.calculate_content_segments(initial_process)
            print(f'{chapter["title"]} | {initial_tokens = } | {segments = }')
            if segments == 1:
                chapter['processed_segments'] = [initial_process]
            else:
                secondary_segments = self.split_array(chapter['segments'], segments)
                chapter['processed_segments'] = [self.format_content_segments(s) for s in secondary_segments]

            chapter_segments = chapter['processed_segments']
            for i, segment in enumerate(chapter_segments):
                if i == 0:
                    full_segment = segment + self.DEFAULT_PROMPT.format(chapter_title)
                else:
                    full_segment = self._continuation_prompt(segment, chapter['summary'])
                response = self.call_openai_api(
                    prompt=full_segment,
                    temperature=0.3,
                )
                chapter['responses'].append(response)
                chapter['summary'] += response.choices[0].text
                reduction_percent = round(100.00 - response.usage.completion_tokens / response.usage.prompt_tokens * 100, 2)
                print(f'{chapter_title} | {i + 1}/{len(chapter_segments)} | {reduction_percent}%')

        outline_parts = [f'# {self.yt_info.title}\n\n## Chapters\n']
        for chapter in self.processed_chapters:
            outline_parts.append(f'### {chapter["title"]}\n{chapter["summary"]}\n\n')
        first_summary = ''.join(outline_parts)

        # split_summary measures in characters, so the token budget is applied
        # here as a character limit.
        summary_parts = self.split_summary(first_summary, self.MAX_CONTENT_TOKENS)
        self.summary = ''
        for i, summary_part in enumerate(summary_parts):
            if i == 0:
                prompt = summary_part + '\n Write an executive summary of this outline using markdown formatting.\n\n'
            else:
                prompt = summary_part + '\n Continue writing your executive summary of this outline. This is where you left off:\n\n' + \
                    self.summary[-3 * self.BUFFER_TOKENS:]

            response = self.call_openai_api(
                prompt=prompt,
                temperature=0.5,
            )
            self.summary += response.choices[0].text

        return self

    def generate_markdown_output(self):
        """Return the executive summary followed by the chapter summaries as markdown."""
        parts = [
            f'# {self.yt_info.title}\n',
            f'## Executive Summary\n{self.summary}\n\n## Chapters\n',
        ]
        for chapter in self.processed_chapters:
            parts.append(f'### {chapter["title"]}\n')
            parts.append(chapter.get('summary', ''))
            parts.append('\n\n')
        return ''.join(parts)

    def generate_html_output(self):
        """
        Return the executive summary followed by the chapter summaries as HTML.

        The title and the summaries come from the scraped page and from the
        model, so they are HTML-escaped before being placed in the markup.
        """
        from html import escape

        parts = [
            f'<h1>{escape(str(self.yt_info.title))}</h1>',
            f'<h2>Executive Summary</h2><p>{escape(str(self.summary))}</p><h2>Chapters</h2>',
        ]
        for chapter in self.processed_chapters:
            parts.append(f'<h3>{escape(str(chapter["title"]))}</h3>')
            parts.append(escape(chapter.get('summary', '')))
            parts.append('<hr />')
        return ''.join(parts)

    def render_markdown(self):
        """Display the markdown report in the notebook."""
        from IPython.display import display, Markdown
        return display(Markdown(self.generate_markdown_output()))

    def render_html(self):
        """Display the HTML report in the notebook."""
        from IPython.display import display, HTML
        return display(HTML(self.generate_html_output()))


In [None]:
# Build a summarizer for the video (chapters are used by default) and run it.
summarizer = TranscriptSummarizer('https://www.youtube.com/watch?v=oDAMPYfK4p8')
summarizer.summarize()

In [32]:
# Show the generated markdown report in the cell output.
from IPython.display import display, Markdown, display_markdown
display_markdown(summarizer.generate_markdown_output(), raw=True)


# Why Are Animals Symmetrical?
## Executive Summary
Bilaterians are a large group of animals that share the trait of bilateral symmetry. This trait is inherited from a common ancestor and is shared by more than 95% of all land animals. They have a body plan consisting of a tube with two openings and complex sensory organs that give them an advantage over non-bilaterians. This allows them to move quickly and precisely, and is why they are the only animals that have conquered dry land. Additionally, their two-sided symmetry allows them to push from both sides of the body with equal force, giving them an advantage over non-bilaterians.

Cephalization is the process by which sensory organs have become concentrated at one end of an animal, leading to the development of a brain and well-defined head section. This has occurred in groups such as vertebrates, cephalopods, and arthropods. While bilateral symmetry is the most common body plan in the animal kingdom, there are examples of animals that have broken away from this pattern due to selective pressures. These include cross-bill birds, fish like plaice, and some species of male crab.

In conclusion, bilateral symmetry is a common body plan among animals, but exceptions are known to exist. This demonstrates that selective pressures can lead to animals adapting to different body plans in order to survive. Despite the differences, animals still share many similar features, such as the cephalization process which has allowed for the development of a brain in many species.

## Chapters
### Intro
Animals come in all shapes and sizes, but they all share one common trait: bilateral symmetry. This trait is shared by more than 95% of all land animals, and is inherited from a common ancestor hundreds of millions of years ago. This group of animals is known as bilaterans.

## Summary
Bilaterans are a large group of animals that share the trait of bilateral symmetry, which is the mirroring of the left and right sides of the body. This trait is inherited from a common ancestor and is shared by more than 95% of all land animals.

### Types of Symmetry

Animals typically have two-sided symmetry, but some animals have a different type of symmetry, such as radial symmetry, and some animals have no symmetry at all. Nidarians, such as jellyfish, sea anemones, and coral, have radial symmetry and a type of cell called a nidosite that lets them deliver a sting. Sponges have no symmetry and their shape is determined by which side of the sponge is receiving more nutrients. In the Ediacaran period, over 540 million years ago, some creatures had different forms of symmetry. Fossils of an ancient organism called Chania were found in England and a whole fossilized ecosystem of these organisms was discovered on the east coast of Canada, named the Avalon Explosion. These organisms, called rangiomorphs, looked like leaves but lived too deep in the ocean to be able to photosynthesize.

### Ancient Animals
The Ediacaran period (555 million years ago) is known for its strange and unique organisms, such as stem animals and fractal organisms. Towards the end of the period, bilaterians (organisms with bilateral symmetry) began to appear in the fossil record. This included Spraguina, which looked like a trilobite, and Kimberella, which was thought to have lived like a slug. There was also a small worm-like animal called Ikario Warayutia, which was about the size of a grain of rice and burrowed into the sands of the ancient Australian seabed.

### Bilateral Animals
Bilateral animals, which evolved during the Ediacaran period, were initially outnumbered by animals with radial symmetry or other forms of symmetry. However, during the Cambrian explosion, bilaterans became much more successful and are now the most common type of animal. This is likely due to the evolution of certain traits which opened up other evolutionary pathways and gave them an advantage over other animals.

### Body Plan
Bilaterians have a body plan consisting of a tube with two openings, a mouth and an anus, connected by a digestive tract. Sensory organs have developed at the head end above the mouth, giving them a defined front end and the ability to move purposefully towards stimuli. This gives them an advantage over non-bilaterians, who mostly use lures or drift through the ocean waiting for food. Box jellyfish are an exception, actively hunting fish, but they are slower than bilaterians.

### Movement
Bilaterians have complex sensory organs and purposeful precise movements, which helps them move faster and more efficiently than non-bilaterians. This is due to their two-sided symmetry, which allows them to push from both sides of the body with equal force. This is why bilaterians are the only animals that have conquered dry land, as a good stable body shape is more important when out of the water.

### Cephalization
Cephalization is the process where sensory organs have become more concentrated at one end of an animal over time. This process has led to the development of a brain and a well-defined head section in different animal groups such as vertebrates, cephalopods, and arthropods. Although symmetry offers many advantages, there are times when animals have evolved to break their symmetry. Examples of this include cross-bill birds, which have a beak that doesn't meet in the middle to help them access pine cone seeds, fish like plaice which orientate their body on its side to hide on the sea floor, and some species of male crab which have one claw larger than the other.

### Conclusion
Bilateral symmetry is a common body plan among animals, but there are exceptions. Starfish and sea urchins, for example, appear to be radial life forms, but are actually bilaterians. This shows that selective pressures can lead to animals adapting to different body plans in order to survive. Despite the differences, animals still share a lot of similarities.



In [None]:
# Same video, but summarized as a whole: chapters are ignored.
summarizer = TranscriptSummarizer('https://www.youtube.com/watch?v=oDAMPYfK4p8', ignore_chapters=True)

In [39]:
# Run the summarization; progress lines are printed and the summarizer itself is returned.
summarizer.summarize()

Full Summary | initial_tokens = 4156 | segments = 2
Full Summary | 1/2 | 90.09%
Full Summary | 2/2 | 77.46%


<__main__.TranscriptSummarizer at 0x7f90289bcd60>

In [41]:
# Print the raw markdown of the report.
print(summarizer.generate_markdown_output())

# Why Are Animals Symmetrical?
## Executive Summary

Animals come in all shapes and sizes, but the vast majority share one trait: bilateral symmetry. This means that their left and right sides form a mirror image of one another. This type of symmetry is a result of an ancient common ancestor and is found in more than 95% of known land animals. Outside of this group are sponges, which have no symmetry and the nidarians, which have radial symmetry. During the Ediacaran period, some creatures had different forms of symmetry, and the bilaterians became much more successful during the Cambrian explosion. This is likely due to the structure of the bilaterian body and lifestyle, which enabled certain evolutionary pathways. The bilateral body plan offers advantages such as purposeful movement, a platform for pushing forward in a straight line, and a well-defined head section and brain. There are some animals that have broken the trend of two-sided symmetry, but for the most part, bilateral bod

In [208]:
# Load a video's info and inspect it: printing the object shows its title
# (YouTubeInfo.__str__), then the chapter mapping is printed.
lex_and_sam = YouTubeInfo('https://www.youtube.com/watch?v=L_Guz73e6fw').get_video_info()
print(lex_and_sam)
print(lex_and_sam.chapters)

Sam Altman: OpenAI CEO on GPT-4, ChatGPT, and the Future of AI | Lex Fridman Podcast #367
{'0:00': 'Introduction', '4:36': 'GPT-4', '16:02': 'Political bias', '23:03': 'AI safety', '43:43': 'Neural network size', '47:36': 'AGI', '1:09:05': 'Fear', '1:11:14': 'Competition', '1:13:33': 'From non-profit to capped-profit', '1:16:54': 'Power', '1:22:06': 'Elon Musk', '1:30:32': 'Political pressure', '1:48:46': 'Truth and misinformation', '2:01:09': 'Microsoft', '2:05:09': 'SVB bank collapse', '2:10:00': 'Anthropomorphism', '2:14:03': 'Future applications', '2:17:54': 'Advice for young people', '2:20:33': 'Meaning of life'}


In [210]:
# Same inspection for another video: print its title and its chapter mapping.
jack = YouTubeInfo('https://www.youtube.com/watch?v=RY7GDivnc0k').get_video_info()
print(jack)
print(jack.chapters)

This game has a dark secret lurking underneath...
{}


In [42]:
# Summarize per chapter and print the markdown report; summarize() returns the
# summarizer, so construction and summarization are chained.
animals_v_two = TranscriptSummarizer('https://www.youtube.com/watch?v=oDAMPYfK4p8').summarize()
print(animals_v_two.generate_markdown_output())

Intro | initial_tokens = 276 | segments = 1
Intro | 1/1 | 53.24%
Types of Symmetry | initial_tokens = 783 | segments = 1
Types of Symmetry | 1/1 | 87.94%
Ancient Animals | initial_tokens = 656 | segments = 1
Ancient Animals | 1/1 | 80.86%
Bilateral Animals | initial_tokens = 445 | segments = 1
Bilateral Animals | 1/1 | 81.03%
Body Plan | initial_tokens = 370 | segments = 1
Body Plan | 1/1 | 74.23%
Movement | initial_tokens = 365 | segments = 1
Movement | 1/1 | 71.99%
Cephalization | initial_tokens = 807 | segments = 1
Cephalization | 1/1 | 74.73%
Conclusion | initial_tokens = 454 | segments = 1
Conclusion | 1/1 | 78.56%
# Why Are Animals Symmetrical?
## Executive Summary
Bilaterians are animals that have two-sided symmetry, which is the most common type of symmetry found in animals. This symmetry is believed to have been inherited from a common ancestor hundreds of millions of years ago. Bilateral symmetry allows for a more stable body shape, which is beneficial for movement and conque

In [44]:
# Summarize the same video as a whole (chapters ignored) and print the markdown report.
animals_v_two_no_chapters = TranscriptSummarizer('https://www.youtube.com/watch?v=oDAMPYfK4p8', ignore_chapters=True).summarize()
print(animals_v_two_no_chapters.generate_markdown_output())

Full Summary | initial_tokens = 4156 | segments = 2
Full Summary | 1/2 | 89.19%
Full Summary | 2/2 | 84.33%
# Why Are Animals Symmetrical?
## Executive Summary

Animals come in all shapes and sizes, but they all share one trait in common: bilateral symmetry. This type of symmetry, where the left and right sides are mirror images of each other, is found in more than 95% of known land animals and is part of the largest categorization of animals, the bilaterians. Bilateral symmetry has enabled animals to evolve traits that have given them an advantage over other animals with other forms of symmetry, such as increased mobility and intelligence. Bilateral symmetry isn't always the best way to survive for every animal, but it shows how similar we are to even the most distantly related animals.

## Chapters
### Full Summary
Animals range drastically in size, shape, and biology, but they all share one common trait: bilateral symmetry. This type of symmetry, where the left and right sides are m