In [1]:
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv

In [3]:
# Load settings from a local .env file (returned True in the recorded run).
# NOTE(review): presumably this supplies the API key for the OpenAI client
# created below — confirm.
load_dotenv()

True

In [4]:
from openai import OpenAI

# Shared client used by `llm` and `llm_structured`. No credentials are passed
# explicitly; presumably they are read from the environment — confirm.
openai_client = OpenAI()

In [10]:
def llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    """Send a prompt through `openai_client.responses.create` and return the text reply.

    Args:
        user_prompt: Content of the user message.
        instructions: Optional system prompt. When falsy, no system message
            is added and only the user message is sent.
        model: Model name forwarded to the API call.

    Returns:
        The `output_text` attribute of the API response.
    """
    messages = []

    if instructions:
        messages.append({
            "role": "system",
            "content": instructions
        })

    # The user message must be appended unconditionally. Nesting it under the
    # `if instructions:` branch would send an empty input list whenever the
    # caller omits the system prompt.
    messages.append({
        "role": "user",
        "content": user_prompt
    })

    response = openai_client.responses.create(
        model=model,
        input=messages
    )

    return response.output_text

In [11]:
!uv add youtube-transcript-api

[2mResolved [1m152 packages[0m [2min 1.55s[0m[0m
[2mPrepared [1m1 package[0m [2min 420ms[0m[0m
[2mInstalled [1m1 package[0m [2min 80ms[0m[0m
 [32m+[39m [1myoutube-transcript-api[0m[2m==1.2.2[0m


In [16]:
from youtube_transcript_api import YouTubeTranscriptApi

In [22]:
# ID of the YouTube video whose transcript the rest of the notebook works on.
video_id = 'ph1PxZIkz1o'

# Fetch the transcript. The result is sliceable and its items expose
# `text`, `start` and `duration` (see the preview a few cells below).
ytt_api = YouTubeTranscriptApi()
transcript = ytt_api.fetch(video_id)

In [23]:
import pickle

In [24]:
# Persist the fetched transcript to "<video_id>.bin" with pickle.
# NOTE(review): no visible cell loads this file back; if it is meant as a
# cache, the fetch cell still runs on every execution.
with open(f'{video_id}.bin', 'wb') as f_out:
    pickle.dump(transcript, f_out)

In [26]:
# Inspect the first ten transcript snippets.
transcript[:10]

[FetchedTranscriptSnippet(text='So hi everyone. Uh today we are going to', start=0.0, duration=5.04),
 FetchedTranscriptSnippet(text='talk about our upcoming course. The', start=2.96, duration=3.52),
 FetchedTranscriptSnippet(text='upcoming course is called machine', start=5.04, duration=5.92),
 FetchedTranscriptSnippet(text='learning zoom camp. And um this is', start=6.48, duration=5.92),
 FetchedTranscriptSnippet(text='already I put the link in the', start=10.96, duration=3.599),
 FetchedTranscriptSnippet(text="description. So if you're watching um", start=12.4, duration=4.719),
 FetchedTranscriptSnippet(text="this video in recording or you're", start=14.559, duration=4.88),
 FetchedTranscriptSnippet(text='watching it live, you go here in the', start=17.119, duration=4.561),
 FetchedTranscriptSnippet(text='description after under this video and', start=19.439, duration=5.6),
 FetchedTranscriptSnippet(text='then you see a link course. uh click on', start=21.68, duration=6.24)]

In [27]:


def format_timestamp(seconds: float) -> str:
    """Render a second offset as ``H:MM:SS`` when it spans an hour or more, else ``M:SS``.

    The fractional part of ``seconds`` is dropped via ``int()`` before formatting.
    """
    whole = int(seconds)
    hh, leftover = divmod(whole, 3600)
    mm, ss = divmod(leftover, 60)

    # Short form first: no hour component to show.
    if not hh > 0:
        return f"{mm}:{ss:02d}"

    return f"{hh}:{mm:02d}:{ss:02d}"

def make_subtitles(transcript) -> str:
    """Turn transcript entries into newline-separated ``"<timestamp> <text>"`` lines.

    Each entry contributes one line: its ``start`` formatted by
    ``format_timestamp``, a space, and its ``text`` with embedded newlines
    replaced by spaces.
    """
    def render(snippet):
        stamp = format_timestamp(snippet.start)
        flattened = snippet.text.replace('\n', ' ')
        return stamp + ' ' + flattened

    return '\n'.join(map(render, transcript))




In [28]:
# Render the whole transcript as "timestamp text" lines; this string is what
# the summarization cells send to the model.
subtitles = make_subtitles(transcript)

In [29]:
# Preview the first 500 characters of the rendered subtitles.
print(subtitles[:500])

0:00 So hi everyone. Uh today we are going to
0:02 talk about our upcoming course. The
0:05 upcoming course is called machine
0:06 learning zoom camp. And um this is
0:10 already I put the link in the
0:12 description. So if you're watching um
0:14 this video in recording or you're
0:17 watching it live, you go here in the
0:19 description after under this video and
0:21 then you see a link course. uh click on
0:25 that link and this bring you will bring
0:27 you to
0:29 this website this GitHub


In [30]:
# System prompt for the summarization calls: asks for a summary plus
# timestamped chapters in sentence case, wrapped in <OUTPUT> tags.
# NOTE(review): a later cell rebinds `instructions` to the RAG prompt, so
# re-running the summarization cells after that one would use the wrong prompt.
instructions = """
Summarize the transcript and describe the main purpose of the video
and the main ideas. 

Also output chapters with time. Use usual sentence case, not Title Case for the chapter.

Output format: 

<OUTPUT>
Summary

timestamp chapter 
timestamp chapter
...
timestamp chapter
</OUTPUT>
"""

In [31]:
# Free-text summary: the full subtitles go in as the user message and the
# summarization prompt as the system message.
answer = llm(subtitles, instructions=instructions)

In [33]:
from pydantic import BaseModel

In [34]:
# One chapter marker in the structured summary output.
# NOTE(review): documented with comments rather than a docstring because
# pydantic may surface a class docstring in the generated schema — confirm
# before adding one.
class Chapter(BaseModel):
    timestamp: str  # chapter start time as text (e.g. "0:48" in the recorded output)
    title: str  # chapter heading

# Schema passed as `output_type` to `llm_structured` for the video summary.
class YTSummaryResponse(BaseModel):
    summary: str  # free-text summary of the video
    chapters: list[Chapter]  # chapter markers, printed in list order further down

In [39]:
def llm_structured(instructions, user_prompt, output_type, model="gpt-4o-mini"):
    """Call `openai_client.responses.parse` and return the parsed result.

    Args:
        instructions: System prompt; always sent as the first message.
        user_prompt: Content of the user message.
        output_type: Passed to the API as `text_format`.
        model: Model name forwarded to the API call.

    Returns:
        The `output_parsed` attribute of the API response.
    """
    system_message = {"role": "system", "content": instructions}
    user_message = {"role": "user", "content": user_prompt}

    parsed_response = openai_client.responses.parse(
        model=model,
        input=[system_message, user_message],
        text_format=output_type,
    )

    return parsed_response.output_parsed

In [40]:
# Same summarization request as the free-text call, but parsed into a
# YTSummaryResponse object.
summary = llm_structured(
    instructions=instructions,
    user_prompt=subtitles,
    output_type=YTSummaryResponse
)

In [43]:
# Show the summary text, a blank line, then one "timestamp title" line per chapter.
print(summary.summary)
print()
for c in summary.chapters:
    print(c.timestamp, c.title)

The video introduces the upcoming "Machine Learning Zoom Camp" course, focusing on the course structure, prerequisites, and what participants can expect to learn. The instructor discusses the updates being made to the curriculum, the engineering focus of the course, and how it prepares students for careers in machine learning and ML engineering. They emphasize the importance of having programming skills and being comfortable with command line operations as prerequisites. Participants are encouraged to ask questions and interact through a dedicated platform. The video highlights milestones, such as upcoming deadlines, potential for job placements, and the inclusion of projects that will provide certificates upon completion.

0:00 Introduction to the course
0:48 Course overview and sign-up
2:38 Course updates and module content
3:50 Job placement opportunities
4:56 Deep dive into topics covered
6:09 Course prerequisites
10:00 Command line usage
10:56 Using PyTorch and TensorFlow
12:43 Ta

In [44]:
# Longer preview (first 1000 characters) of the same subtitles string.
print(subtitles[:1000])

0:00 So hi everyone. Uh today we are going to
0:02 talk about our upcoming course. The
0:05 upcoming course is called machine
0:06 learning zoom camp. And um this is
0:10 already I put the link in the
0:12 description. So if you're watching um
0:14 this video in recording or you're
0:17 watching it live, you go here in the
0:19 description after under this video and
0:21 then you see a link course. uh click on
0:25 that link and this bring you will bring
0:27 you to
0:29 this website this GitHub page.
0:34 This GitHub page is the main entry point
0:36 to our course and um yeah I think it's
0:41 more or less self-explanatory. If you
0:43 want to sign up this is the button you
0:45 click and the actual course starts in on
0:48 September 15th. it means that it's uh
0:51 slightly less than one one month before
0:53 the course starts and the purpose of
0:55 today's um session is to just answer
0:58 your questions. So you have some
1:00 questions and uh you can ask these
1:03 questions using

In [46]:
def sliding_window(seq, size, step):
    """Slice ``seq`` into windows of up to ``size`` items, advancing ``step`` each time.

    Windows overlap when ``step < size``. Iteration stops with the first
    window that reaches the end of ``seq``, so that final window may be
    shorter than ``size``. An empty ``seq`` yields an empty list.

    Raises:
        ValueError: if ``size`` or ``step`` is not positive.
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []

    for start in range(0, total, step):
        stop = start + size
        windows.append(seq[start:stop])
        # This window already covers the tail; further ones would only repeat it.
        if stop >= total:
            break

    return windows



In [49]:
# Small sample (first ten snippets).
# NOTE(review): `chunk` is rebound by the chunking loop in a later cell.
chunk = transcript[:10]

In [50]:
def join_lines(transcript) -> str:
    """Concatenate the text of every entry into one space-separated string.

    Newlines inside an entry's ``text`` are replaced by spaces first.
    """
    return ' '.join(
        snippet.text.replace('\n', ' ')
        for snippet in transcript
    )

def format_chunk(chunk):
    """Summarize a window of transcript entries as a ``start``/``end``/``text`` dict.

    ``start`` is the formatted start time of the first entry. ``end`` is the
    formatted *start* time of the last entry (its duration is not added).
    ``text`` is the joined text of all entries in the window.
    """
    first_entry = chunk[0]
    last_entry = chunk[-1]

    return {
        'start': format_timestamp(first_entry.start),
        'end': format_timestamp(last_entry.start),
        'text': join_lines(chunk),
    }


In [52]:
# Split the transcript into windows of 60 snippets that advance by 30, so
# consecutive windows overlap, and convert each one to a start/end/text dict.
chunks = []

for chunk in sliding_window(transcript, 60, 30):
    processed = format_chunk(chunk)
    chunks.append(processed)



In [53]:
# Report how many chunks the windowing produced.
print(f"Created {len(chunks)} chunks")

Created 46 chunks


In [55]:
from minsearch import Index

# Build a minsearch index over the `text` field of each chunk dict.
index = Index(text_fields=["text"])
index.fit(chunks)



<minsearch.minsearch.Index at 0x1952699ed50>

In [57]:
# Ad-hoc query against the index, keeping the top 5 hits.
# NOTE(review): `results` is not read by any visible later cell.
results = index.search('Can I find a job after the course?', num_results=5)

In [59]:

import json

def search(query, num_results=15):
    """Search the module-level `index` for documents relevant to `query`.

    Args:
        query: Search text.
        num_results: Maximum number of hits to request. Defaults to 15, the
            value that was previously hard-coded, so existing callers are
            unaffected.

    Returns:
        Whatever `index.search` returns for the query.
    """
    return index.search(
        query=query,
        num_results=num_results
    )

# System prompt for `rag`: answer only from the supplied CONTEXT and cite the
# video URL at the relevant timestamp, as plain text.
# NOTE(review): this rebinds the module-level `instructions` that the earlier
# summarization cells also read; consider a distinct name so re-running those
# cells after this one does not silently pick up the RAG prompt.
instructions = """
Answer the QUESTION based on the CONTEXT from the subtitles of a YouTube video.

Use only the facts from the CONTEXT when answering the QUESTION.

When answering the question, 
provide the citation in form of the video URL pointing at the timestamp where
this is discussed. If the question is discussed in multiple documents,
cite all of them.

Don't use markdown or any formatting in the output.
""".strip()

# Template for the user message sent by `rag`; `build_prompt` fills the
# {video_id}, {question} and {context} placeholders.
prompt_template = """
<VIDEO_ID>
{video_id}
</VIDEO_ID>

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    """Fill `prompt_template` with the question, the JSON-encoded hits, and the video ID.

    `search_results` is serialized with `json.dumps`; the video ID is read
    from the module-level `video_id`. The filled template is stripped of
    surrounding whitespace before being returned.
    """
    serialized_context = json.dumps(search_results)
    filled = prompt_template.format(
        video_id=video_id,
        question=question,
        context=serialized_context,
    )
    return filled.strip()

def rag(query):
    """Answer `query` from the indexed transcript chunks.

    Retrieves hits with `search`, wraps them with the question via
    `build_prompt`, and sends the result to `llm` together with the
    module-level `instructions` as the system prompt.
    """
    hits = search(query)
    return llm(build_prompt(query, hits), instructions=instructions)

# Test it:


In [60]:
# End-to-end check of the RAG pipeline with a sample question.
answer = rag('Can I find a job after the course?')
print(answer)

Yes, according to the video, many participants of the course have successfully found jobs after completing it. While the course does not provide job placement services, it teaches important skills for machine learning that increase the chances of finding a job. Additionally, engaging in projects and possibly volunteering can further enhance job readiness. 

For more details, you can refer to the discussion at this timestamp: https://www.youtube.com/watch?v=ph1PxZIkz1o&t=1m21s.
