In [1]:
from openai import OpenAI

# Shared OpenAI client; llm_structured() and llm() in later cells send their requests through it.
# NOTE(review): no API key is passed explicitly, so credentials presumably come from the environment — confirm.
openai_client = OpenAI()

In [2]:
#uv add youtube-transcript-api

In [3]:
import pickle

In [4]:
#!wget https://github.com/alexeygrigorev/ai-bootcamp-codespace/raw/refs/heads/main/week1/ph1PxZIkz1o.bin

In [5]:
# ID of the video being processed; it names the local pickle file here and is
# later inserted into the RAG prompt by prompt_builder().
video_id = 'ph1PxZIkz1o'
# SECURITY: pickle.load can execute arbitrary code during unpickling — only load
# this file if you trust where it was downloaded from.
with open(f'{video_id}.bin','rb') as f_in:
    # Deserialized transcript: a sequence of snippet objects exposing .text, .start and .duration.
    transcript = pickle.load(f_in)

In [6]:
transcript[:10]

[FetchedTranscriptSnippet(text='So hi everyone. Uh today we are going to', start=0.0, duration=5.04),
 FetchedTranscriptSnippet(text='talk about our upcoming course. The', start=2.96, duration=3.52),
 FetchedTranscriptSnippet(text='upcoming course is called machine', start=5.04, duration=5.92),
 FetchedTranscriptSnippet(text='learning zoom camp. And um this is', start=6.48, duration=5.92),
 FetchedTranscriptSnippet(text='already I put the link in the', start=10.96, duration=3.599),
 FetchedTranscriptSnippet(text="description. So if you're watching um", start=12.4, duration=4.719),
 FetchedTranscriptSnippet(text="this video in recording or you're", start=14.559, duration=4.88),
 FetchedTranscriptSnippet(text='watching it live, you go here in the', start=17.119, duration=4.561),
 FetchedTranscriptSnippet(text='description after under this video and', start=19.439, duration=5.6),
 FetchedTranscriptSnippet(text='then you see a link course. uh click on', start=21.68, duration=6.24)]

In [7]:

def format_timestamp(seconds: float) -> str:
    """Format an offset in seconds as ``M:SS`` or, from one hour on, ``H:MM:SS``.

    Args:
        seconds: Offset in seconds; any fractional part is truncated.

    Returns:
        ``"M:SS"`` when the offset is below one hour (minutes are not
        zero-padded), otherwise ``"H:MM:SS"`` with zero-padded minutes and
        seconds.
    """
    total_seconds = int(seconds)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours > 0:
        # No space after the hours separator, so the result reads H:MM:SS.
        return f"{hours}:{minutes:02}:{secs:02}"
    return f"{minutes}:{secs:02}"

def make_subtitles(transcript):
    """Render a transcript as subtitle text, one ``"<timestamp> <text>"`` line per entry.

    Each entry must expose ``.start`` (seconds) and ``.text``. Newlines inside
    an entry's text are replaced by spaces so every entry stays on one line.
    The lines are joined with ``"\\n"``; an empty transcript gives ``""``.
    """
    def render(snippet):
        stamp = format_timestamp(snippet.start)
        flattened = snippet.text.replace('\n', ' ')
        return stamp + ' ' + flattened

    return '\n'.join(map(render, transcript))

# Whole transcript as "timestamp text" lines; passed as the user prompt to the summarization call.
subtitles = make_subtitles(transcript)
         

In [8]:
print(subtitles[0:500])

0:00 So hi everyone. Uh today we are going to
0:02 talk about our upcoming course. The
0:05 upcoming course is called machine
0:06 learning zoom camp. And um this is
0:10 already I put the link in the
0:12 description. So if you're watching um
0:14 this video in recording or you're
0:17 watching it live, you go here in the
0:19 description after under this video and
0:21 then you see a link course. uh click on
0:25 that link and this bring you will bring
0:27 you to
0:29 this website this GitHub


In [9]:
# System prompt for the summarization call (summary plus timestamped chapters).
# NOTE(review): the summarization cell sends this prompt through llm_structured()
# with a response schema, so the plain-text layout described below may be
# redundant — confirm it is still wanted.
# NOTE(review): "don'T" in the last prompt line looks like a typo; left as is
# because the string is sent to the model verbatim.
instructions = """
Summarize the transcript and describe the main purpose of the video
and the main ideas. 

Also output chapters with time. Use usual sentence case, not Title Case for the chapter.

Output format: 

<OUTPUT>
Summary

timestamp chapter 
timestamp chapter
...
timestamp chapter
</OUTPUT>
don'T include <OUTPUT> in the output
"""

In [10]:
def llm_structured(instructions, user_prompt, output_type, model="gpt-4o-mini"):
    """Send a system/user message pair and return the response's parsed output.

    Args:
        instructions: Content of the system message.
        user_prompt: Content of the user message.
        output_type: Passed as ``text_format`` to ``responses.parse``.
        model: Model name forwarded to the API.

    Returns:
        The ``output_parsed`` attribute of the response from the module-level
        ``openai_client``.
    """
    system_message = {"role": "system", "content": instructions}
    user_message = {"role": "user", "content": user_prompt}
    parsed_response = openai_client.responses.parse(
        model=model,
        input=[system_message, user_message],
        text_format=output_type,
    )
    return parsed_response.output_parsed

In [11]:
from pydantic import BaseModel

In [12]:
# One chapter marker of the video summary.
# Used as the element type of YTSummaryResponse.chapters.
class Chapter(BaseModel):
    timestamp:str  # chapter start time as text (e.g. '1:14' in the printed output)
    title:str  # chapter heading

# Structured result of the summarization call; passed as output_type to llm_structured().
class YTSummaryResponse(BaseModel):
    summary:str  # free-text summary of the video
    chapters: list[Chapter]  # chapter markers; a later cell prints them one per line

In [13]:
#print(answer)

In [14]:
# Summarize the full subtitle text; the reply is parsed against YTSummaryResponse.
summary = llm_structured(instructions, subtitles, YTSummaryResponse)

In [15]:
print(summary)

summary="In this video, the instructor outlines the details of the upcoming 'Machine Learning Zoom Camp' course, set to begin on September 15. The discussion covers course structure, content updates, prerequisites, and common queries related to placement and learning outcomes. Notably, the course focuses on Machine Learning engineering with practical applications, unlike traditional data science courses. Questions from participants highlight topics such as job placements, programming prerequisites, and course materials. The video aims to inform and answer queries for prospective students, enhancing their understanding prior to enrollment." chapters=[Chapter(timestamp='0:00', title='Introduction to the course'), Chapter(timestamp='1:14', title='Course content and updates'), Chapter(timestamp='2:50', title='Job placement opportunities'), Chapter(timestamp='5:06', title='Computer vision and deep learning modules'), Chapter(timestamp='6:08', title='Prerequisites for the course'), Chapter(t

In [16]:
# List the chapters as "timestamp title", one per line.
for c in summary.chapters:
    print(f"{c.timestamp} {c.title}")

0:00 Introduction to the course
1:14 Course content and updates
2:50 Job placement opportunities
5:06 Computer vision and deep learning modules
6:08 Prerequisites for the course
10:03 Using command line and programming languages
11:44 Learning PyTorch and TensorFlow
12:42 Target audience for the course
13:36 Recommended resources
15:02 Mathematical prerequisites
18:12 Computer requirements for the course
19:35 Using AI tools for learning
20:33 MLOps course offerings
21:38 Job suitability for each course
22:55 New material vs. old videos
24:42 Assignment expectations
26:06 Certificates and portfolio projects
29:24 Live session structure and deadlines
32:12 Project requirements for certificates
34:03 The importance of pair reviews
36:04 Next steps after the course
37:46 Access to recorded videos
38:30 Homework importance
39:42 Closing remarks and course enrollment details


In [17]:
def sliding_window(seq, size, step):
    """Cut ``seq`` into consecutive slices of up to ``size`` items, ``step`` apart.

    Windows start at offsets 0, step, 2*step, ... and the scan stops with the
    first window that reaches the end of ``seq``, so the last window may be
    shorter than ``size``. An empty ``seq`` produces an empty list.

    Args:
        seq: Any sliceable sequence supporting ``len()``.
        size: Maximum number of items per window; must be positive.
        step: Distance between consecutive window starts; must be positive.

    Returns:
        A list of slices of ``seq``.

    Raises:
        ValueError: If ``size`` or ``step`` is not positive.
    """
    if size <= 0 or step <= 0:
        raise ValueError("Size and Step must be positive")

    total = len(seq)
    windows = []
    start = 0
    while start < total:
        windows.append(seq[start:start + size])
        if start + size >= total:
            # This window already covers the tail of the sequence.
            break
        start += step
    return windows

In [18]:
sliding_window(list(range(18)),5,2)

[[0, 1, 2, 3, 4],
 [2, 3, 4, 5, 6],
 [4, 5, 6, 7, 8],
 [6, 7, 8, 9, 10],
 [8, 9, 10, 11, 12],
 [10, 11, 12, 13, 14],
 [12, 13, 14, 15, 16],
 [14, 15, 16, 17]]

In [19]:
def join_lines(transcript) -> str:
    """Concatenate the ``.text`` of every entry into one space-separated string.

    Newlines inside an entry's text are turned into spaces first, so the
    result contains no line breaks from the entries.
    """
    return ' '.join(snippet.text.replace('\n', ' ') for snippet in transcript)

def format_chunk(chunk):
    """Summarize a non-empty run of transcript entries as a dict.

    Returns a dict with three keys:
        'start': formatted start time of the first entry,
        'end':   formatted start time of the last entry (its duration is not added),
        'text':  the entries' text joined by ``join_lines``.
    """
    first_entry, last_entry = chunk[0], chunk[-1]
    return {
        'start': format_timestamp(first_entry.start),
        'end': format_timestamp(last_entry.start),
        'text': join_lines(chunk),
    }

In [20]:
# Window and step are counted in transcript entries: 30 entries per chunk,
# advancing 10 entries at a time. Adjust both to change chunk size and overlap.
chunks = [
    format_chunk(window)
    for window in sliding_window(transcript, 30, 10)
]

print(f"Created {len(chunks)} chunks")

Created 139 chunks


In [21]:
from minsearch import Index
# Index the chunk dicts on their 'text' field so they can be queried with index.search().
index=Index(text_fields=["text"])
# fit() is the cell's last expression, so the notebook displays the Index object it returns.
index.fit(chunks)

<minsearch.minsearch.Index at 0x7424a00ba060>

In [22]:
# Retrieval sanity check: ask the index for up to 5 results for a sample question.
results=index.search("Can I find a job after the course?", num_results=5)

In [23]:
results

[{'start': '53:30',
  'end': '54:42',
  'text': "path to follow after the completing the course? Uh to step into advanced stuff, find a job. That's the best way. Um cuz you can do courses forever, but I think you need to work on projects. This is where the real experience comes from. So you need to find something that is a job. Maybe at the beginning could be difficult but then find a volunteering job. I don't know there are so many places where you can volunteer. So put your skills into practice and then by doing this you will again do this project based learning that I um talked about and then it will force you to learn new things in order to solve a problem you have and then you build a portfolio of things and then it will make you even more job ready. Um yeah so I wouldn't recommend taking another course of course you can do a melops course that we have you can find a lot of courses like we have this page yeah wait no not this one 23 free online courses on machine learning so there

In [34]:
import json

def search(query, num_results=15):
    """Query the module-level ``index`` and return its results.

    Args:
        query: Text to look up in the index.
        num_results: Number of results requested from ``index.search``.
            Defaults to 15, the value that was previously hard-coded here.

    Returns:
        Whatever ``index.search`` returns; in this notebook that is a list of
        chunk dicts with 'start', 'end' and 'text' keys.
    """
    return index.search(query=query, num_results=num_results)

# System prompt for the question-answering (RAG) flow; rag() passes it to llm().
# NOTE: this rebinds the module-level name `instructions` that an earlier cell
# assigned for summarization, so re-running that summarization call after this
# cell would send this prompt instead.
instructions = """
Answer the QUESTION based on the CONTEXT from the subtitles of a YouTube video.

Use only the facts from the CONTEXT when answering the QUESTION.

When answering the question, 
provide the citation in form of the video URL pointing at the timestamp where
this is discussed. If the question is discussed in multiple documents,
cite all of them.

Don't use markdown or any formatting in the output.
""".strip()

# User-prompt skeleton; prompt_builder() fills the three placeholders via str.format.
prompt_template = """
<VIDEO_ID>
{video_id}
</VIDEO_ID>

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def prompt_builder(question, search_results):
    """Build the user prompt for one question.

    Fills ``prompt_template`` with the question, the search results serialized
    by ``json.dumps``, and the module-level ``video_id``; surrounding
    whitespace is stripped from the result.
    """
    filled = prompt_template.format(
        video_id=video_id,
        question=question,
        context=json.dumps(search_results),
    )
    return filled.strip()

def llm(instructions, user_prompt, model="gpt-4o-mini"):
    """Send a system/user message pair and return the response text.

    Args:
        instructions: Content of the system message.
        user_prompt: Content of the user message.
        model: Model name forwarded to the API.

    Returns:
        The ``output_text`` attribute of the response from the module-level
        ``openai_client``.
    """
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": user_prompt},
    ]
    reply = openai_client.responses.create(model=model, input=conversation)
    return reply.output_text

def rag(query):
    """Answer ``query`` from the indexed transcript chunks.

    Retrieves chunks with ``search()``, builds the user prompt with
    ``prompt_builder()``, and sends it to ``llm()`` together with the
    module-level ``instructions`` as the system prompt.

    Returns:
        The text returned by ``llm()``.
    """
    retrieved = search(query)
    user_prompt = prompt_builder(query, retrieved)
    return llm(instructions, user_prompt)

In [36]:
rag("what is the main focus of the course?")

'The main focus of the course is on machine learning engineering, where it covers both the foundational machine learning concepts (using libraries like Scikit-learn) and delves deeper into deployment aspects which many data scientists might find challenging. There is an emphasis on engineering skills related to machine learning, including containerization and deployment of models, making it more suitable for those aspiring to become machine learning engineers. The course is a blend of theoretical knowledge and practical deployment skills necessary for real-world applications. \n\nFor more details, see the discussion at timestamp 3:03 - 4:14 and 34:21 - 35:38 in the video: https://www.youtube.com/watch?v=ph1PxZIkz1o.'