/
youtube-summarize.py
93 lines (66 loc) · 2.69 KB
/
youtube-summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import sys
import openai
from youtube_transcript_api import (YouTubeTranscriptApi, NoTranscriptFound)
from config import OPENAI_API_KEY
MAX_VIDEO_LENGTH_MINUTES = 20
SUMMARY_WORDS = 200
def format_transcript_as_raw_text(transcript, max_video_length_minutes):
    """Flatten transcript entries into newline-separated plain text.

    Args:
        transcript: Sequence of entries, each indexable by ``"text"``
            (a string) and ``"start"`` (a number that is compared against
            the limit expressed in seconds).
        max_video_length_minutes: Upper bound, in minutes, on the start
            time of the last entry.

    Returns:
        The entry texts joined by newlines, skipping entries that are
        entirely wrapped in square brackets. Returns ``""`` when the
        transcript is empty or when its last entry starts after the limit.
    """
    if not transcript:
        # Nothing to format; this also avoids an IndexError on
        # transcript[-1] below.
        return ""

    max_length = 60 * max_video_length_minutes
    # The start time of the last entry stands in for the video length.
    last_timestamp = transcript[-1]["start"]
    if last_timestamp > max_length:
        print(f"> Video is longer than {max_video_length_minutes} minutes, unsuitable for GPT. Aborting")
        return ""

    # remove lines with just music or other non-speech
    return "\n".join(
        line["text"]
        for line in transcript
        if not (line["text"].startswith("[") and line["text"].endswith("]"))
    )
def save_content_to_file(content: str, filename: str):
    """Write ``content`` to ``<filename>.txt`` as UTF-8 text.

    The ``.txt`` suffix is always appended to ``filename``. The file is
    opened in write mode, so existing content at that path is replaced.
    """
    target_path = f"{filename}.txt"
    with open(target_path, mode="w", encoding="utf-8") as out:
        out.write(content)
def summarize_transcript(transcript: str):
    """Request a summary of ``transcript`` from the OpenAI chat completions API.

    Sends one user message that embeds the transcript between triple ``@``
    delimiters and asks for a summary of at least ``SUMMARY_WORDS`` words.

    Returns:
        The message content of the first choice in the completion.
    """
    print("> Summarizing transcript")

    request_text = f"""
Generate a summary of at minimum {SUMMARY_WORDS} words from the content below, delimited by triple @ symbols.
Content: @@@{transcript}@@@
"""

    response = openai.chat.completions.create(
        # https://platform.openai.com/docs/models/gpt-3-5-turbo
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": request_text}],
        # no randomness
        temperature=0,
    )
    first_choice = response.choices[0]
    return first_choice.message.content
def download_english_transcript(video_id):
    """Download the English transcript of a YouTube video.

    Args:
        video_id: The YouTube video id to fetch the transcript for.

    Returns:
        Whatever ``YouTubeTranscriptApi.get_transcript`` returns for the
        first available language among ``"en"`` and ``"en-US"``.

    Raises:
        NoTranscriptFound: If neither language has a transcript.
    """
    print(f"> Downloading transcript of video {video_id}")
    # `languages` is a priority list: "en" is tried first and "en-US" is the
    # fallback. One call replaces the try/except retry, which fetched the
    # video's transcript listing a second time.
    return YouTubeTranscriptApi.get_transcript(video_id, languages=["en", "en-US"])
if __name__ == "__main__":
    # Command-line entry point.
    # Usage: youtube-summarize.py <video id or watch URL> [max length in minutes]
    from urllib.parse import parse_qs, urlsplit

    openai.api_key = OPENAI_API_KEY

    # Check the argument count before indexing: sys.argv[1] raises IndexError
    # when no argument is given, so the "no id" check below was unreachable.
    video_id = sys.argv[1] if len(sys.argv) > 1 else ""
    if not video_id:
        raise Exception("No Youtube video id provided")

    if video_id.startswith("https://www.youtube.com/watch"):
        # Read the "v" query parameter instead of splitting on "v=", which
        # raised IndexError for watch URLs that carry no such parameter.
        query = parse_qs(urlsplit(video_id).query)
        video_id = query.get("v", [""])[0]
        if not video_id:
            raise Exception("No Youtube video id provided")

    # Accept the optional length limit even when extra arguments follow it;
    # anything after the second argument is ignored.
    if len(sys.argv) >= 3:
        max_video_length_minutes = int(sys.argv[2])
    else:
        max_video_length_minutes = MAX_VIDEO_LENGTH_MINUTES

    transcript = download_english_transcript(video_id)
    transcript = format_transcript_as_raw_text(transcript, max_video_length_minutes)

    # An empty string means the transcript was rejected; skip summarizing.
    if transcript:
        save_content_to_file(transcript, f"{video_id}_transcript")
        summary = summarize_transcript(transcript)
        save_content_to_file(summary, f"{video_id}_summary")
        print("> Done")
        print(f"> Summary:\n{summary}")