In [56]:
import re
import json
import requests
from openai import OpenAI
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from youtube_transcript_api import YouTubeTranscriptApi

In [57]:
class Video:
    def __init__(self, id, title, published):
        self.id = id
        self.title = title
        self.published_text = published
        published = self.parse_date(published)
        self.published = published

    @staticmethod
    def parse_date(published):
        # Dictionary to map Ukrainian time unit keywords to timedelta arguments
        time_units = {
            'сек': 'seconds',
            'хв': 'minutes',
            'год': 'hours',
            'дн': 'days',
            'тижн': 'weeks',
            'місяц': 'months',
            'р': 'years'
        }

        # Current time
        now = datetime.now()

        # Search for number and unit in the string
        match = re.search(r'(\d*)\s*(\w+)', published)
        if match:
            number, unit = match.groups()

            # Default to 1 if no number is found
            number = int(number) if number else 1

            # Find the correct time unit
            for key, value in time_units.items():
                if key in unit:
                    # Special handling for months and years as timedelta does not support them directly
                    if value == 'months':
                        return now - timedelta(days=30 * number)
                    elif value == 'years':
                        return now - timedelta(days=365 * number)
                    else:
                        return now - timedelta(**{value: number})

        # If no match, return current time (or handle as appropriate)
        return now

In [58]:
class Channel:
    def __init__(self, url, title=None, videos=None):
        self.url = url
        self.title = title
        self.videos = videos

In [71]:
class YoutubeChannelParser:
    videos_string_template = '"canonicalBaseUrl":"/@"}},"title":"Відео","selected":true,"content":{"richGridRenderer":{"contents":'.split("@")
    allowed_languages = ["uk"]
    
    def __init__(self, canonical_name, soup=None):
        canonical_name = canonical_name.replace("@", "")
        self.name = canonical_name
        self.curl = f"https://www.youtube.com/@{self.name}/videos"
        self.soup = soup
        self.vstx = self.videos_string_template[0] + f"@{self.name}" + self.videos_string_template[1]
        

    def parse(self):
        response = requests.get(self.curl)
        self.soup = BeautifulSoup(response.content, 'html.parser')
        chanel_title = self.soup.body.find("title").text
        chanel_videos = self.find_front_page_videos()
        return Channel(self.curl, chanel_title, chanel_videos)

    def find_front_page_videos(self):
        start_index = str(self.soup).find(self.vstx)
        started_text = str(self.soup)[start_index + len(self.vstx):]
        end_index = start_index + 1
        
        stack = []
        for i in range(len(started_text)):
            char = started_text[i]
            if char == "[": 
                stack.append(1)
            if char == "]": 
                stack.pop()
            if len(stack) < 1:
                end_index = i
                break
                
        data = json.loads(started_text[:end_index+1])

        videos = []
        for i in range(len(data) - 1):
            video_data = data[i]['richItemRenderer']['content']['videoRenderer']
            title = video_data['title']['runs'][0]['text']
            published = video_data['publishedTimeText']['simpleText']
            video_id = video_data['videoId']
            videos.append(Video(video_id, title, published))

        return videos

In [68]:
class VideoTranscript:
    def __init__(self, video):
        self.video = video

    def get_transcript(self):
        video_transcript = YouTubeTranscriptApi.get_transcript(video.id, languages=['uk', 'ru', 'en'])
        video_text = " ".join([chunk['text'] for chunk in video_transcript])
        return  video_text

In [73]:
class TranscriptSummarizer:
    def __init__(self, video, transcript, summary=None):
        self.video = video
        self.transcript = transcript
        self.summary = summary
        self.client = OpenAI()
        self.role_prompt = \
        f"""
        You are a transcript summarizer that receives the transcript text and return the summary in up to 8 sentences.
        The video is called: "{self.video.title}"
        Summary must be precise and follow the key points of the video. You output only the summary!
        Summary should be in English. The audience for your summary is Ukrainian people. Keep in mind that 
        transcript does not have punctuation, so please, use common sense to separate sentences. Transcript is
        auto-generated, so some of the words may be incorrect, use common sense to derive the correct information
        """

    def get_summary(self):
        completion = self.client.chat.completions.create(model="gpt-3.5-turbo-1106",
                                                         messages=[{"role": "system", "content": self.role_prompt},
                                                                   {"role": "user", "content": self.transcript}])
        self.summary = completion.choices[0].message.content
        return self.summary

    def save_summary_audio(self):
        if self.summary is not None:
            response = self.client.audio.speech.create(model="tts-1",
                                            voice="echo",
                                            input=self.summary)
            response.stream_to_file("output.mp3")
        else:
            print("Summary is None")

In [74]:
channel = YoutubeChannelParser("portnikov.argumenty").parse()
print(channel.title)
video = channel.videos[5]
print(video.title)
transcript = VideoTranscript(video).get_transcript()
summarizer = TranscriptSummarizer(video, transcript)
summary = summarizer.get_summary()
print(summary)
summarizer.save_summary_audio()

Портников. Аргументы - YouTube
США хотят отдать российские деньги Украине | Виталий Портников
The White House is considering the confiscation of Russian assets worth $300 billion in favor of Ukraine. This is indicated by a memorandum from the United States National Security Council. The US wants to ensure support from allies in resolving this matter and to prevent Russia from filing numerous lawsuits to impede the allocation of funds for Ukraine's needs and the confiscation of these assets. However, there are concerns about the impact on the US financial system and the potential for authoritarian regimes to shift their money away from US banks. The idea of confiscating assets is gaining support in the US, but it is also a challenge to the Russian President Vladimir Putin, who is already strained due to ongoing aggression against Ukraine. Confiscation of Russian assets could be a response to Russian plans for aggression against other former Soviet republics. Despite potential economic c

  response.stream_to_file("output.mp3")


In [None]:
# client = OpenAI()
# response = client.speech.create(
#     model="tts-1",
#     voice="echo",
#     input=summary,
# )

# response.stream_to_file("output.mp3")